218 lines
6.5 KiB
Python
218 lines
6.5 KiB
Python
"""Word 文档解析器,使用 python-docx 提取文本并转换为 Markdown 格式"""
|
||
|
||
import os
|
||
from typing import List, Optional
|
||
|
||
from docx import Document
|
||
from docx.table import Table as DocxTable
|
||
from docx.text.paragraph import Paragraph as DocxParagraph
|
||
|
||
from exceptions import ParseError
|
||
from parsers.base import BaseParser
|
||
|
||
# 字体大小 → 标题层级映射(需要 bold)
|
||
# (min_pt, max_pt) → heading_level
|
||
_FONT_SIZE_HEADING_MAP = [
|
||
(36, 100, 1),
|
||
(26, 36, 2),
|
||
(24, 26, 3),
|
||
(22, 24, 4),
|
||
(18, 22, 5),
|
||
(16, 18, 6),
|
||
]
|
||
|
||
|
||
class DocParser(BaseParser):
    """Word document parser that renders ``.docx`` content as Markdown.

    The top-level children of the document body are visited in document
    order. Paragraphs become Markdown headings or plain text lines depending
    on their style name or font size; tables become Markdown tables.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions handled by this parser."""
        return [".docx"]

    def parse(self, file_path: str) -> str:
        """Parse a Word document and return its content as Markdown.

        Args:
            file_path: Path of the document to parse.

        Returns:
            The Markdown text produced from the document body.

        Raises:
            ParseError: If the document cannot be opened, or if converting
                it to Markdown fails. The original exception is attached as
                the cause.
        """
        file_name = os.path.basename(file_path)

        try:
            doc = Document(file_path)
        except Exception as e:
            raise ParseError(file_name, f"Word 文档打开失败: {e}") from e

        try:
            return self._to_md(doc)
        except ParseError:
            # Already in the parser's own error type; do not re-wrap.
            raise
        except Exception as e:
            raise ParseError(file_name, f"Word 文档解析失败: {e}") from e

    @staticmethod
    def _get_title_level(paragraph: DocxParagraph) -> Optional[int]:
        """Determine the heading level of a paragraph.

        The style name is checked first: a name starting with ``Heading``,
        ``TOC 标题`` or ``标题`` followed by a number in 1-6 yields that
        level, and such a name with no suffix at all yields level 1.
        Otherwise the first run is inspected: if it is bold and has an
        explicit font size, the size is looked up in
        ``_FONT_SIZE_HEADING_MAP``.

        Args:
            paragraph: python-docx paragraph object.

        Returns:
            The heading level (1-6), or ``None`` for a non-heading paragraph.
        """
        # 1. Style-name based detection.
        style_name = paragraph.style.name if paragraph.style else ""
        if style_name:
            for prefix in ("Heading", "TOC 标题", "标题"):
                if not style_name.startswith(prefix):
                    continue
                suffix = style_name[len(prefix):].strip()
                if not suffix:
                    # Prefix matched with no level number: treat as level 1.
                    return 1
                # isdecimal() (not isdigit()) so that int() cannot fail on
                # digit-like characters such as superscripts.
                if suffix.isdecimal():
                    level = int(suffix)
                    if 1 <= level <= 6:
                        return level

        # 2. Fallback: bold first run with an explicit font size.
        if not paragraph.runs:
            return None

        first_run = paragraph.runs[0]
        if not first_run.bold:
            return None

        font_size = first_run.font.size
        if font_size is None:
            return None

        pt = font_size.pt
        for min_pt, max_pt, level in _FONT_SIZE_HEADING_MAP:
            if min_pt <= pt < max_pt:
                return level

        return None

    @staticmethod
    def _paragraph_to_md(paragraph: DocxParagraph, level: Optional[int]) -> str:
        """Convert a paragraph to a line of Markdown.

        Args:
            paragraph: python-docx paragraph object.
            level: Heading level, or ``None`` for a plain paragraph.

        Returns:
            The stripped paragraph text, prefixed with ``level`` ``#``
            characters and a space when ``level`` is given. An empty string
            if the paragraph has no text after stripping.
        """
        text = paragraph.text.strip()
        if not text:
            return ""

        if level is not None:
            return "#" * level + " " + text
        return text

    @staticmethod
    def _table_to_md(table: DocxTable) -> str:
        """Convert a table to a Markdown table.

        The first row is used as the header, followed by a ``---`` separator
        row sized to the header, then one line per remaining row. In cell
        text, ``|`` is escaped as ``\\|`` and line breaks become ``<br>``.

        Args:
            table: python-docx table object.

        Returns:
            The Markdown table text, or an empty string if the table has no
            rows.
        """
        rows = table.rows
        if not rows:
            return ""

        def cell_text(cell) -> str:
            """Return the cell's text flattened to one Markdown-safe line."""
            text = "<br>".join(p.text for p in cell.paragraphs)
            # An unescaped pipe would be read as a column delimiter.
            text = text.replace("|", "\\|")
            text = text.replace("\n", "<br>")
            return text

        def render_row(cells) -> str:
            """Join already-converted cell strings into one table line."""
            return "| " + " | ".join(cells) + " |"

        header_cells = [cell_text(cell) for cell in rows[0].cells]

        lines = [
            render_row(header_cells),
            render_row("---" for _ in header_cells),
        ]
        for row in rows[1:]:
            lines.append(render_row(cell_text(cell) for cell in row.cells))

        return "\n".join(lines)

    def _to_md(self, doc: Document) -> str:
        """Convert the whole document to Markdown.

        Iterates over the children of ``doc.element.body`` so paragraphs and
        tables are emitted in their original order, converts each one, and
        joins the non-empty results with newlines.

        Args:
            doc: python-docx document object.

        Returns:
            The Markdown text for the full document.
        """
        # Map raw XML elements to the wrapper objects obtained from
        # doc.paragraphs / doc.tables, so each wrapper is the one python-docx
        # constructed itself (the body iteration below only yields elements).
        para_elements = {p._element: p for p in doc.paragraphs}
        table_elements = {t._element: t for t in doc.tables}

        parts = []
        for element in doc.element.body:
            # Look the element up directly instead of inspecting its tag:
            # children that are neither a known table nor a known paragraph
            # fall through and are skipped, and no assumption is made that
            # ``element.tag`` is a string.
            table = table_elements.get(element)
            if table is not None:
                md = self._table_to_md(table)
                if md:
                    parts.append(md)
                continue

            paragraph = para_elements.get(element)
            if paragraph is None:
                continue

            level = self._get_title_level(paragraph)
            md = self._paragraph_to_md(paragraph, level)
            if md:
                parts.append(md)

        return "\n".join(parts)
|