Files
bigwo/parsers/doc_parser.py

218 lines
6.5 KiB
Python
Raw Permalink Normal View History

"""Word 文档解析器,使用 python-docx 提取文本并转换为 Markdown 格式"""
import os
from typing import List, Optional
from docx import Document
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph as DocxParagraph
from exceptions import ParseError
from parsers.base import BaseParser
# 字体大小 → 标题层级映射(需要 bold
# (min_pt, max_pt) → heading_level
_FONT_SIZE_HEADING_MAP = [
(36, 100, 1),
(26, 36, 2),
(24, 26, 3),
(22, 24, 4),
(18, 22, 5),
(16, 18, 6),
]
class DocParser(BaseParser):
    """Word (.docx) parser that renders a document as Markdown.

    The top-level children of the document body are visited in document
    order. Paragraphs become headings or plain text depending on their style
    name (or, failing that, the first run's bold flag and font size); tables
    become Markdown tables.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".docx"]

    def parse(self, file_path: str) -> str:
        """Parse a Word document and return its content as Markdown.

        Args:
            file_path: Path of the .docx file to read.

        Returns:
            The document content as Markdown text.

        Raises:
            ParseError: If the file cannot be opened or converting it fails.
        """
        file_name = os.path.basename(file_path)
        try:
            doc = Document(file_path)
        except Exception as e:
            # Chain the cause so the underlying failure stays in the traceback.
            raise ParseError(file_name, f"Word 文档打开失败: {e}") from e
        try:
            return self._to_md(doc)
        except ParseError:
            raise
        except Exception as e:
            raise ParseError(file_name, f"Word 文档解析失败: {e}") from e

    @staticmethod
    def _get_title_level(paragraph: DocxParagraph) -> Optional[int]:
        """Determine the heading level of a paragraph.

        The style name is checked first ("Heading X", "TOC 标题", "标题 X").
        If it does not yield a level, the first run is inspected: it must be
        bold and carry an explicit font size that falls inside one of the
        ranges of ``_FONT_SIZE_HEADING_MAP``.

        Args:
            paragraph: python-docx paragraph object.

        Returns:
            The heading level (1-6), or None when the paragraph is not a
            heading.
        """
        # 1. Style name.
        style_name = paragraph.style.name if paragraph.style else ""
        if style_name:
            for prefix in ("Heading", "TOC 标题", "标题"):
                if not style_name.startswith(prefix):
                    continue
                suffix = style_name[len(prefix):].strip()
                if not suffix:
                    # Prefix matched with no number after it: treat as level 1.
                    return 1
                # isdecimal() (unlike isdigit()) only accepts characters that
                # int() can actually convert, so int() cannot raise here.
                if suffix.isdecimal():
                    level = int(suffix)
                    if 1 <= level <= 6:
                        return level
                # Any other suffix: keep trying the remaining prefixes, then
                # fall through to the font-size check.

        # 2. Fallback: bold first run with a mapped font size.
        runs = paragraph.runs
        if not runs:
            return None
        first_run = runs[0]
        if not first_run.bold:
            return None
        font_size = first_run.font.size
        if font_size is None:
            return None
        pt = font_size.pt
        for min_pt, max_pt, level in _FONT_SIZE_HEADING_MAP:
            if min_pt <= pt < max_pt:
                return level
        return None

    @staticmethod
    def _paragraph_to_md(paragraph: DocxParagraph, level: Optional[int]) -> str:
        """Convert a paragraph to Markdown text.

        Args:
            paragraph: python-docx paragraph object.
            level: Heading level, or None for a plain paragraph.

        Returns:
            The stripped paragraph text, prefixed with ``level`` ``#``
            characters and a space when ``level`` is given. An empty string
            when the paragraph has no text after stripping.
        """
        text = paragraph.text.strip()
        if not text:
            return ""
        if level is not None:
            return "#" * level + " " + text
        return text

    @staticmethod
    def _table_to_md(table: DocxTable) -> str:
        """Convert a table to a Markdown table.

        The first row is used as the header, followed by a delimiter row and
        the remaining rows as data. In cell text, ``|`` is escaped as
        ``&#124;`` and line breaks become ``<br>``. Rows with fewer cells
        than the widest row are padded with empty cells so every line of the
        table has the same number of columns.

        Args:
            table: python-docx table object.

        Returns:
            The Markdown table text, or an empty string when the table has
            no rows.
        """
        rows = table.rows
        if not rows:
            return ""

        def cell_text(cell) -> str:
            """Return the cell's text with paragraphs joined and ``|`` escaped."""
            text = "<br>".join(p.text for p in cell.paragraphs)
            text = text.replace("|", "&#124;")
            return text.replace("\n", "<br>")

        def render(cells: List[str]) -> str:
            """Format one list of cell strings as a Markdown table line."""
            return "| " + " | ".join(cells) + " |"

        matrix = [[cell_text(cell) for cell in row.cells] for row in rows]
        # Pad ragged rows: a Markdown table needs the header, the delimiter
        # row and the data rows to agree on the column count.
        width = max(len(cells) for cells in matrix)
        matrix = [cells + [""] * (width - len(cells)) for cells in matrix]

        lines = [render(matrix[0]), render(["---"] * width)]
        lines.extend(render(cells) for cells in matrix[1:])
        return "\n".join(lines)

    def _to_md(self, doc: Document) -> str:
        """Convert the whole document to Markdown.

        Iterates over the children of ``doc.element.body`` in document order
        and converts each top-level paragraph or table. Body children that
        are neither are skipped, as are blocks that convert to an empty
        string.

        Consecutive paragraphs are joined with a single newline. A blank line
        is placed between a table and any neighbouring block; without it the
        line following a table would be read as one more table row, and two
        adjacent tables would merge into one.

        Args:
            doc: python-docx Document object.

        Returns:
            The complete Markdown text.
        """
        # Map XML element -> wrapper object. The wrappers obtained from
        # doc.paragraphs / doc.tables are built with their parent set, which
        # is what makes attributes such as ``style`` usable later on.
        para_by_element = {p._element: p for p in doc.paragraphs}
        table_by_element = {t._element: t for t in doc.tables}

        lines: List[str] = []
        prev_was_table = False
        for element in doc.element.body:
            table = table_by_element.get(element)
            if table is not None:
                md = self._table_to_md(table)
                is_table = True
            else:
                paragraph = para_by_element.get(element)
                if paragraph is None:
                    # Neither a top-level paragraph nor a top-level table.
                    continue
                level = self._get_title_level(paragraph)
                md = self._paragraph_to_md(paragraph, level)
                is_table = False
            if not md:
                continue
            if lines and (is_table or prev_was_table):
                lines.append("")  # blank line isolating the table
            lines.append(md)
            prev_was_table = is_table
        return "\n".join(lines)