Initial commit: AI 知识库文档智能分块工具

2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions
--- a/tests/test_doc_parser.py
+++ b/tests/test_doc_parser.py
@@ -0,0 +1,260 @@
+"""DocParser 单元测试"""
+
+import pytest
+from docx import Document
+from docx.shared import Pt
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+
+from exceptions import ParseError
+from parsers.doc_parser import DocParser
+
+
+@pytest.fixture
+def parser():
+    return DocParser()
+
+
+def _create_docx(path, paragraphs=None, tables=None):
+    """
+    创建测试用 Word 文档。
+
+    Args:
+        path: 输出文件路径
+        paragraphs: 列表，每个元素是 dict:
+            - text: 段落文本
+            - style: 可选，样式名（如 'Heading 1'）
+            - font_size: 可选，字体大小 (Pt)
+            - bold: 可选，是否加粗
+        tables: 列表，每个元素是二维列表（行×列的文本）
+    """
+    doc = Document()
+    # 清除默认的空段落
+    for p in doc.paragraphs:
+        p._element.getparent().remove(p._element)
+
+    if paragraphs:
+        for para_info in paragraphs:
+            if isinstance(para_info, str):
+                doc.add_paragraph(para_info)
+            else:
+                text = para_info.get("text", "")
+                style = para_info.get("style", None)
+                font_size = para_info.get("font_size", None)
+                bold = para_info.get("bold", None)
+
+                if style:
+                    p = doc.add_paragraph(text, style=style)
+                else:
+                    p = doc.add_paragraph(text)
+
+                if font_size is not None or bold is not None:
+                    # 需要通过 run 设置字体属性
+                    # 清除默认 run，重新添加
+                    for run in p.runs:
+                        if font_size is not None:
+                            run.font.size = Pt(font_size)
+                        if bold is not None:
+                            run.bold = bold
+
+    if tables:
+        for table_data in tables:
+            if not table_data:
+                continue
+            rows = len(table_data)
+            cols = len(table_data[0]) if table_data else 0
+            table = doc.add_table(rows=rows, cols=cols)
+            for i, row_data in enumerate(table_data):
+                for j, cell_text in enumerate(row_data):
+                    table.rows[i].cells[j].text = cell_text
+
+    doc.save(str(path))
+
+
+class TestSupportedExtensions:
+    def test_supports_docx(self, parser):
+        assert ".docx" in parser.supported_extensions()
+
+    def test_only_one_extension(self, parser):
+        assert len(parser.supported_extensions()) == 1
+
+
+class TestParse:
+    def test_parse_simple_text(self, parser, tmp_path):
+        docx_path = tmp_path / "simple.docx"
+        _create_docx(docx_path, paragraphs=["Hello, world!"])
+        result = parser.parse(str(docx_path))
+        assert "Hello, world!" in result
+
+    def test_parse_multiple_paragraphs(self, parser, tmp_path):
+        docx_path = tmp_path / "multi.docx"
+        _create_docx(docx_path, paragraphs=["First paragraph", "Second paragraph"])
+        result = parser.parse(str(docx_path))
+        assert "First paragraph" in result
+        assert "Second paragraph" in result
+
+    def test_heading_by_style_name(self, parser, tmp_path):
+        """Heading style should produce Markdown heading"""
+        docx_path = tmp_path / "heading.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "Main Title", "style": "Heading 1"},
+            {"text": "Body text"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "# Main Title" in result
+        # Should be exactly H1, not H2
+        assert "## Main Title" not in result
+
+    def test_heading2_by_style_name(self, parser, tmp_path):
+        docx_path = tmp_path / "h2.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "Section Title", "style": "Heading 2"},
+            {"text": "Some content"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "## Section Title" in result
+        assert "### Section Title" not in result
+
+    def test_heading3_by_style_name(self, parser, tmp_path):
+        docx_path = tmp_path / "h3.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "Subsection", "style": "Heading 3"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "### Subsection" in result
+
+    def test_heading_by_font_size_bold(self, parser, tmp_path):
+        """Bold text with large font size should be detected as heading"""
+        docx_path = tmp_path / "font_heading.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "Big Bold Title", "font_size": 36, "bold": True},
+            {"text": "Normal text"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "# Big Bold Title" in result
+
+    def test_heading_h2_by_font_size(self, parser, tmp_path):
+        docx_path = tmp_path / "font_h2.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "H2 Title", "font_size": 28, "bold": True},
+            {"text": "Normal text"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "## H2 Title" in result
+
+    def test_heading_h5_by_font_size(self, parser, tmp_path):
+        docx_path = tmp_path / "font_h5.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "H5 Title", "font_size": 20, "bold": True},
+            {"text": "Normal text"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "##### H5 Title" in result
+
+    def test_no_heading_without_bold(self, parser, tmp_path):
+        """Large font without bold should NOT be detected as heading via font size"""
+        docx_path = tmp_path / "no_bold.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "Large Not Bold", "font_size": 36, "bold": False},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "# Large Not Bold" not in result
+        assert "Large Not Bold" in result
+
+    def test_simple_table(self, parser, tmp_path):
+        docx_path = tmp_path / "table.docx"
+        _create_docx(docx_path, tables=[
+            [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]],
+        ])
+        result = parser.parse(str(docx_path))
+        assert "| Name | Age |" in result
+        assert "| --- | --- |" in result
+        assert "| Alice | 30 |" in result
+        assert "| Bob | 25 |" in result
+
+    def test_table_with_pipe_in_cell(self, parser, tmp_path):
+        """Pipe characters in cells should be escaped"""
+        docx_path = tmp_path / "pipe.docx"
+        _create_docx(docx_path, tables=[
+            [["Header"], ["value|with|pipes"]],
+        ])
+        result = parser.parse(str(docx_path))
+        assert "&#124;" in result
+        assert "value&#124;with&#124;pipes" in result
+
+    def test_mixed_paragraphs_and_tables(self, parser, tmp_path):
+        """Document with both paragraphs and tables"""
+        docx_path = tmp_path / "mixed.docx"
+        doc = Document()
+        # Clear default paragraph
+        for p in doc.paragraphs:
+            p._element.getparent().remove(p._element)
+
+        doc.add_paragraph("Introduction", style="Heading 1")
+        doc.add_paragraph("Some intro text.")
+        table = doc.add_table(rows=2, cols=2)
+        table.rows[0].cells[0].text = "Col1"
+        table.rows[0].cells[1].text = "Col2"
+        table.rows[1].cells[0].text = "A"
+        table.rows[1].cells[1].text = "B"
+        doc.add_paragraph("Conclusion")
+        doc.save(str(docx_path))
+
+        result = parser.parse(str(docx_path))
+        assert "# Introduction" in result
+        assert "Some intro text." in result
+        assert "| Col1 | Col2 |" in result
+        assert "| A | B |" in result
+        assert "Conclusion" in result
+
+    def test_empty_document(self, parser, tmp_path):
+        docx_path = tmp_path / "empty.docx"
+        doc = Document()
+        # Clear default paragraph
+        for p in doc.paragraphs:
+            p._element.getparent().remove(p._element)
+        doc.save(str(docx_path))
+        result = parser.parse(str(docx_path))
+        assert result.strip() == ""
+
+    def test_empty_paragraphs_skipped(self, parser, tmp_path):
+        docx_path = tmp_path / "empty_para.docx"
+        _create_docx(docx_path, paragraphs=["", "Actual content", ""])
+        result = parser.parse(str(docx_path))
+        assert "Actual content" in result
+        # Empty paragraphs should not produce extra lines
+        assert result.strip() == "Actual content"
+
+    def test_nonexistent_file_raises(self, parser):
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse("/nonexistent/path/file.docx")
+        assert "file.docx" in exc_info.value.file_name
+        assert exc_info.value.reason != ""
+
+    def test_corrupted_file_raises(self, parser, tmp_path):
+        docx_path = tmp_path / "corrupted.docx"
+        docx_path.write_bytes(b"this is not a docx file at all")
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse(str(docx_path))
+        assert "corrupted.docx" in exc_info.value.file_name
+
+    def test_parse_error_contains_filename(self, parser):
+        with pytest.raises(ParseError) as exc_info:
+            parser.parse("/no/such/report.docx")
+        assert exc_info.value.file_name == "report.docx"
+
+    def test_multiple_heading_levels(self, parser, tmp_path):
+        """Test document with multiple heading levels via styles"""
+        docx_path = tmp_path / "levels.docx"
+        _create_docx(docx_path, paragraphs=[
+            {"text": "Title", "style": "Heading 1"},
+            {"text": "Chapter", "style": "Heading 2"},
+            {"text": "Section", "style": "Heading 3"},
+            {"text": "Body text"},
+        ])
+        result = parser.parse(str(docx_path))
+        assert "# Title" in result
+        assert "## Chapter" in result
+        assert "### Section" in result
+        assert "Body text" in result
+        # Body text should not have heading prefix
+        assert "# Body text" not in result