Initial commit: AI 知识库文档智能分块工具

2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions
--- a/parsers/doc_parser.py
+++ b/parsers/doc_parser.py
@@ -0,0 +1,217 @@
+"""Word 文档解析器，使用 python-docx 提取文本并转换为 Markdown 格式"""
+
+import os
+from typing import List, Optional
+
+from docx import Document
+from docx.table import Table as DocxTable
+from docx.text.paragraph import Paragraph as DocxParagraph
+
+from exceptions import ParseError
+from parsers.base import BaseParser
+
+# Font size -> heading level mapping (only applied to bold text).
+# Each entry: (min_pt, max_pt, heading_level); matched as min_pt <= size < max_pt.
+_FONT_SIZE_HEADING_MAP = [
+    (36, 100, 1),
+    (26, 36, 2),
+    (24, 26, 3),
+    (22, 24, 4),
+    (18, 22, 5),
+    (16, 18, 6),
+]
+
+
+class DocParser(BaseParser):
+    """Word 文档解析器，遍历文档 body 元素，段落按 style/字体大小判断标题层级，表格转 Markdown"""
+
+    def supported_extensions(self) -> List[str]:
+        return [".docx"]
+
+    def parse(self, file_path: str) -> str:
+        """
+        解析 Word 文档，提取文本并转换为 Markdown 格式。
+
+        参考 MaxKB DocSplitHandle 核心逻辑：
+        遍历 doc.element.body → 段落按 style name 或字体大小判断标题层级
+        → 表格转 Markdown 表格 → 拼接为 Markdown
+
+        Args:
+            file_path: 文件路径
+
+        Returns:
+            Markdown 格式的文本内容
+
+        Raises:
+            ParseError: 文件无法读取或解析失败时抛出
+        """
+        file_name = os.path.basename(file_path)
+
+        try:
+            doc = Document(file_path)
+        except Exception as e:
+            raise ParseError(file_name, f"Word 文档打开失败: {e}")
+
+        try:
+            return self._to_md(doc)
+        except ParseError:
+            raise
+        except Exception as e:
+            raise ParseError(file_name, f"Word 文档解析失败: {e}")
+
+    @staticmethod
+    def _get_title_level(paragraph: DocxParagraph) -> Optional[int]:
+        """
+        判断段落的标题层级。
+
+        优先检查 style name（Heading X / TOC 标题 / 标题），
+        回退到字体大小 + bold 判断。
+
+        Args:
+            paragraph: python-docx 段落对象
+
+        Returns:
+            标题层级 (1-6)，非标题返回 None
+        """
+        # 1. 检查 style name
+        style_name = paragraph.style.name if paragraph.style else ""
+        if style_name:
+            for prefix in ("Heading", "TOC 标题", "标题"):
+                if style_name.startswith(prefix):
+                    # 提取层级数字
+                    suffix = style_name[len(prefix):].strip()
+                    if suffix.isdigit():
+                        level = int(suffix)
+                        if 1 <= level <= 6:
+                            return level
+                    # 如果没有数字后缀但匹配了前缀，默认为 1
+                    if not suffix:
+                        return 1
+
+        # 2. 回退到字体大小 + bold 判断
+        if not paragraph.runs:
+            return None
+
+        first_run = paragraph.runs[0]
+        if not first_run.bold:
+            return None
+
+        font_size = first_run.font.size
+        if font_size is None:
+            return None
+
+        pt = font_size.pt
+        for min_pt, max_pt, level in _FONT_SIZE_HEADING_MAP:
+            if min_pt <= pt < max_pt:
+                return level
+
+        return None
+
+    @staticmethod
+    def _paragraph_to_md(paragraph: DocxParagraph, level: Optional[int]) -> str:
+        """
+        将段落转换为 Markdown 文本。
+
+        Args:
+            paragraph: python-docx 段落对象
+            level: 标题层级，None 表示普通段落
+
+        Returns:
+            Markdown 格式的文本
+        """
+        text = paragraph.text.strip()
+        if not text:
+            return ""
+
+        if level is not None:
+            return "#" * level + " " + text
+        return text
+
+    @staticmethod
+    def _table_to_md(table: DocxTable) -> str:
+        """
+        将表格转换为 Markdown 表格格式。
+
+        第一行作为表头，第二行为分隔行，其余为数据行。
+        Cell 文本中的 | 转义为 &#124;，换行转为 <br>。
+
+        Args:
+            table: python-docx 表格对象
+
+        Returns:
+            Markdown 表格文本
+        """
+        rows = table.rows
+        if not rows:
+            return ""
+
+        def cell_text(cell) -> str:
+            """提取单元格文本，处理多段落和特殊字符"""
+            text = "<br>".join(p.text for p in cell.paragraphs)
+            text = text.replace("|", "&#124;")
+            text = text.replace("\n", "<br>")
+            return text
+
+        lines = []
+
+        # 表头行
+        header_cells = [cell_text(cell) for cell in rows[0].cells]
+        lines.append("| " + " | ".join(header_cells) + " |")
+
+        # 分隔行
+        lines.append("| " + " | ".join("---" for _ in header_cells) + " |")
+
+        # 数据行
+        for row in rows[1:]:
+            data_cells = [cell_text(cell) for cell in row.cells]
+            lines.append("| " + " | ".join(data_cells) + " |")
+
+        return "\n".join(lines)
+
+    def _to_md(self, doc: Document) -> str:
+        """
+        将整个文档转换为 Markdown。
+
+        遍历 doc.element.body 的子元素，根据 tag 判断是段落还是表格，
+        分别转换后拼接。
+
+        Args:
+            doc: python-docx Document 对象
+
+        Returns:
+            Markdown 格式的完整文本
+        """
+        parts = []
+
+        # 建立 element → 对象的映射，使用 doc.paragraphs/doc.tables 获取
+        # 正确构造的对象（带完整 parent chain，可访问 style/part）
+        para_elements = {}
+        for paragraph in doc.paragraphs:
+            para_elements[paragraph._element] = paragraph
+
+        table_elements = {}
+        for table in doc.tables:
+            table_elements[table._element] = table
+
+        for element in doc.element.body:
+            tag = element.tag
+
+            if tag.endswith("}tbl") or tag.endswith("tbl"):
+                # 表格元素
+                table = table_elements.get(element)
+                if table is not None:
+                    md = self._table_to_md(table)
+                    if md:
+                        parts.append(md)
+
+            elif tag.endswith("}p") or tag.endswith("p"):
+                # 段落元素
+                paragraph = para_elements.get(element)
+                if paragraph is None:
+                    continue
+                level = self._get_title_level(paragraph)
+                md = self._paragraph_to_md(paragraph, level)
+                if md:
+                    parts.append(md)
+
+        return "\n".join(parts)