Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
217
parsers/doc_parser.py
Normal file
217
parsers/doc_parser.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""Word 文档解析器,使用 python-docx 提取文本并转换为 Markdown 格式"""
|
||||
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from docx import Document
|
||||
from docx.table import Table as DocxTable
|
||||
from docx.text.paragraph import Paragraph as DocxParagraph
|
||||
|
||||
from exceptions import ParseError
|
||||
from parsers.base import BaseParser
|
||||
|
||||
# Font-size bands used to infer a heading level when a paragraph has no
# heading style (the caller additionally requires the first run to be bold).
# Each entry is (min_pt, max_pt, heading_level); a size matches a band when
# min_pt <= size < max_pt.
_FONT_SIZE_HEADING_MAP = [
    (36, 100, 1),  # level 1
    (26, 36, 2),   # level 2
    (24, 26, 3),   # level 3
    (22, 24, 4),   # level 4
    (18, 22, 5),   # level 5
    (16, 18, 6),   # level 6
]
|
||||
|
||||
|
||||
class DocParser(BaseParser):
    """Word (.docx) parser that renders a document body as Markdown.

    Top-level body elements are visited in document order. Paragraphs become
    plain text or ``#``-prefixed headings (level taken from the style name,
    falling back to bold + font size of the first run); tables become
    Markdown tables.
    """

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this parser handles."""
        return [".docx"]

    def parse(self, file_path: str) -> str:
        """Parse a Word document and return its content as Markdown.

        Modelled on the core logic of MaxKB's DocSplitHandle: walk
        ``doc.element.body``, derive heading levels from style name or font
        size, convert tables to Markdown tables, and join the pieces.

        Args:
            file_path: Path of the document to read.

        Returns:
            The document content as Markdown text.

        Raises:
            ParseError: If the file cannot be opened or the conversion fails.
        """
        file_name = os.path.basename(file_path)

        try:
            doc = Document(file_path)
        except Exception as e:
            # Chain the cause explicitly so the original traceback is kept.
            raise ParseError(file_name, f"Word 文档打开失败: {e}") from e

        try:
            return self._to_md(doc)
        except ParseError:
            # Already in the caller-facing form; do not wrap it again.
            raise
        except Exception as e:
            raise ParseError(file_name, f"Word 文档解析失败: {e}") from e

    @staticmethod
    def _get_title_level(paragraph: DocxParagraph) -> Optional[int]:
        """Determine the heading level of a paragraph.

        The style name is checked first (``Heading N`` / ``TOC 标题`` /
        ``标题 N``); if that yields nothing, the first run's bold flag and
        font size are matched against ``_FONT_SIZE_HEADING_MAP``.

        Args:
            paragraph: python-docx paragraph object.

        Returns:
            Heading level in 1-6, or ``None`` for a non-heading paragraph.
        """
        # 1. Style name.
        style_name = paragraph.style.name if paragraph.style else ""
        if style_name:
            for prefix in ("Heading", "TOC 标题", "标题"):
                if not style_name.startswith(prefix):
                    continue
                suffix = style_name[len(prefix):].strip()
                # isdecimal() rather than isdigit(): isdigit() also accepts
                # characters such as superscript digits that int() rejects.
                if suffix.isdecimal():
                    level = int(suffix)
                    if 1 <= level <= 6:
                        return level
                # Prefix matched with no level suffix: treat as level 1.
                if not suffix:
                    return 1

        # 2. Fallback: the first run must be bold and have an explicit size.
        if not paragraph.runs:
            return None

        first_run = paragraph.runs[0]
        if not first_run.bold:
            return None

        font_size = first_run.font.size
        if font_size is None:
            return None

        pt = font_size.pt
        for min_pt, max_pt, level in _FONT_SIZE_HEADING_MAP:
            if min_pt <= pt < max_pt:
                return level

        return None

    @staticmethod
    def _paragraph_to_md(paragraph: DocxParagraph, level: Optional[int]) -> str:
        """Convert a paragraph to Markdown text.

        Args:
            paragraph: python-docx paragraph object.
            level: Heading level, or ``None`` for an ordinary paragraph.

        Returns:
            The stripped paragraph text, prefixed with ``level`` ``#``
            characters and a space when it is a heading; an empty string
            when the paragraph has no text.
        """
        text = paragraph.text.strip()
        if not text:
            return ""

        if level is not None:
            return "#" * level + " " + text
        return text

    @staticmethod
    def _table_to_md(table: DocxTable) -> str:
        """Convert a table to a Markdown table.

        The first row is emitted as the header, followed by a ``---``
        separator row and then the remaining rows as data. Inside a cell,
        ``|`` is replaced by the HTML entity ``&#124;`` and line breaks by
        ``<br>``.

        Args:
            table: python-docx table object.

        Returns:
            Markdown table text, or an empty string for a table with no rows.
        """
        rows = table.rows
        if not rows:
            return ""

        def cell_text(cell) -> str:
            """Return the cell text flattened to one Markdown-safe line."""
            text = "<br>".join(p.text for p in cell.paragraphs)
            # A raw pipe would be read as a column delimiter and shift every
            # following cell, so emit it as an entity instead.
            text = text.replace("|", "&#124;")
            text = text.replace("\n", "<br>")
            return text

        def render(cells) -> str:
            """Join already-escaped cell strings into one table row."""
            return "| " + " | ".join(cells) + " |"

        header_cells = [cell_text(cell) for cell in rows[0].cells]
        lines = [
            render(header_cells),
            render("---" for _ in header_cells),
        ]
        lines.extend(
            render(cell_text(cell) for cell in row.cells) for row in rows[1:]
        )
        return "\n".join(lines)

    def _to_md(self, doc: Document) -> str:
        """Convert the whole document to Markdown.

        Iterates the children of ``doc.element.body`` so paragraphs and
        tables keep their document order, converts each, and joins the
        non-empty results with newlines.

        Args:
            doc: python-docx Document object.

        Returns:
            The complete Markdown text.
        """
        # Map raw XML elements to the wrapper objects produced by
        # doc.paragraphs / doc.tables; those wrappers carry the parent chain
        # needed to resolve style and part.
        para_elements = {p._element: p for p in doc.paragraphs}
        table_elements = {t._element: t for t in doc.tables}

        parts = []
        for element in doc.element.body:
            # Looking the element up in the two maps replaces the old
            # tag-suffix test, which matched any tag that merely ended in
            # "tbl" or "p" and required element.tag to be a string.
            table = table_elements.get(element)
            if table is not None:
                md = self._table_to_md(table)
            else:
                paragraph = para_elements.get(element)
                if paragraph is None:
                    # Neither a top-level paragraph nor a table: skip.
                    continue
                level = self._get_title_level(paragraph)
                md = self._paragraph_to_md(paragraph, level)

            if md:
                parts.append(md)

        return "\n".join(parts)
|
||||
Reference in New Issue
Block a user