Initial commit: AI 知识库文档智能分块工具

This commit is contained in:
AI Knowledge Splitter
2026-03-02 17:38:28 +08:00
commit 92e7fc5bda
160 changed files with 9577 additions and 0 deletions

260
tests/test_doc_parser.py Normal file
View File

@@ -0,0 +1,260 @@
"""DocParser 单元测试"""
import pytest
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from exceptions import ParseError
from parsers.doc_parser import DocParser
@pytest.fixture
def parser():
    """Pytest fixture that builds a DocParser with its default constructor."""
    doc_parser = DocParser()
    return doc_parser
def _create_docx(path, paragraphs=None, tables=None):
    """Create a Word document to be used as a test fixture.

    Paragraphs are written first, in order, followed by all tables.

    Args:
        path: Output file path; converted with ``str()`` before saving.
        paragraphs: Optional list. Each item is either a plain string
            (added as an unstyled paragraph) or a dict with the keys:
            - text: paragraph text (defaults to "")
            - style: optional style name (e.g. 'Heading 1')
            - font_size: optional font size in points
            - bold: optional bold flag
        tables: Optional list of tables, each a two-dimensional list
            (rows x columns) of cell values. Empty tables are skipped.
            Rows may differ in length; the table is sized by the widest
            row and missing cells are left blank.
    """
    doc = Document()
    # Remove any paragraphs that come with the default template so the body
    # contains only what the caller asked for. Iterate over a copy because
    # elements are detached while looping.
    for default_para in list(doc.paragraphs):
        default_para._element.getparent().remove(default_para._element)

    for para_info in paragraphs or []:
        if isinstance(para_info, str):
            doc.add_paragraph(para_info)
            continue

        text = para_info.get("text", "")
        style = para_info.get("style")
        font_size = para_info.get("font_size")
        bold = para_info.get("bold")

        # A falsy style (None or "") falls back to the document default.
        para = doc.add_paragraph(text, style=style or None)

        # Font attributes have to be set on the paragraph's runs.
        for run in para.runs:
            if font_size is not None:
                run.font.size = Pt(font_size)
            if bold is not None:
                run.bold = bold

    for table_data in tables or []:
        if not table_data:
            continue
        # Size the table by its widest row so that a row longer than the
        # first one does not raise IndexError when its cells are filled.
        col_count = max(len(row_data) for row_data in table_data)
        table = doc.add_table(rows=len(table_data), cols=col_count)
        for i, row_data in enumerate(table_data):
            # Fetch the row's cells once instead of once per cell.
            cells = table.rows[i].cells
            for j, cell_text in enumerate(row_data):
                # Cell text must be a string; str() is a no-op for str input.
                cells[j].text = str(cell_text)

    doc.save(str(path))
class TestSupportedExtensions:
    """Checks on the file extensions the parser reports as supported."""

    def test_supports_docx(self, parser):
        """The ``.docx`` extension is among the supported extensions."""
        extensions = parser.supported_extensions()
        assert ".docx" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension is reported."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 1
class TestParse:
    """Tests for ``DocParser.parse``: text, headings, tables and errors."""

    def test_parse_simple_text(self, parser, tmp_path):
        """A single plain paragraph appears in the parsed output."""
        docx_path = tmp_path / "simple.docx"
        _create_docx(docx_path, paragraphs=["Hello, world!"])
        result = parser.parse(str(docx_path))
        assert "Hello, world!" in result

    def test_parse_multiple_paragraphs(self, parser, tmp_path):
        """Every plain paragraph appears in the parsed output."""
        docx_path = tmp_path / "multi.docx"
        _create_docx(docx_path, paragraphs=["First paragraph", "Second paragraph"])
        result = parser.parse(str(docx_path))
        assert "First paragraph" in result
        assert "Second paragraph" in result

    def test_heading_by_style_name(self, parser, tmp_path):
        """Heading style should produce Markdown heading"""
        docx_path = tmp_path / "heading.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Main Title", "style": "Heading 1"},
            {"text": "Body text"},
        ])
        result = parser.parse(str(docx_path))
        assert "# Main Title" in result
        # Should be exactly H1, not H2
        assert "## Main Title" not in result

    def test_heading2_by_style_name(self, parser, tmp_path):
        """'Heading 2' style yields an H2, not a deeper level."""
        docx_path = tmp_path / "h2.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Section Title", "style": "Heading 2"},
            {"text": "Some content"},
        ])
        result = parser.parse(str(docx_path))
        assert "## Section Title" in result
        assert "### Section Title" not in result

    def test_heading3_by_style_name(self, parser, tmp_path):
        """'Heading 3' style yields an H3 heading."""
        docx_path = tmp_path / "h3.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Subsection", "style": "Heading 3"},
        ])
        result = parser.parse(str(docx_path))
        assert "### Subsection" in result

    def test_heading_by_font_size_bold(self, parser, tmp_path):
        """Bold text with large font size should be detected as heading"""
        docx_path = tmp_path / "font_heading.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Big Bold Title", "font_size": 36, "bold": True},
            {"text": "Normal text"},
        ])
        result = parser.parse(str(docx_path))
        assert "# Big Bold Title" in result

    def test_heading_h2_by_font_size(self, parser, tmp_path):
        """Bold 28pt text is rendered with an H2 prefix."""
        docx_path = tmp_path / "font_h2.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "H2 Title", "font_size": 28, "bold": True},
            {"text": "Normal text"},
        ])
        result = parser.parse(str(docx_path))
        assert "## H2 Title" in result

    def test_heading_h5_by_font_size(self, parser, tmp_path):
        """Bold 20pt text is rendered with an H5 prefix."""
        docx_path = tmp_path / "font_h5.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "H5 Title", "font_size": 20, "bold": True},
            {"text": "Normal text"},
        ])
        result = parser.parse(str(docx_path))
        assert "##### H5 Title" in result

    def test_no_heading_without_bold(self, parser, tmp_path):
        """Large font without bold should NOT be detected as heading via font size"""
        docx_path = tmp_path / "no_bold.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Large Not Bold", "font_size": 36, "bold": False},
        ])
        result = parser.parse(str(docx_path))
        assert "# Large Not Bold" not in result
        assert "Large Not Bold" in result

    def test_simple_table(self, parser, tmp_path):
        """A table becomes a Markdown table with a header separator row."""
        docx_path = tmp_path / "table.docx"
        _create_docx(docx_path, tables=[
            [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]],
        ])
        result = parser.parse(str(docx_path))
        assert "| Name | Age |" in result
        assert "| --- | --- |" in result
        assert "| Alice | 30 |" in result
        assert "| Bob | 25 |" in result

    def test_table_with_pipe_in_cell(self, parser, tmp_path):
        """Pipe characters in cells should be escaped"""
        docx_path = tmp_path / "pipe.docx"
        _create_docx(docx_path, tables=[
            [["Header"], ["value|with|pipes"]],
        ])
        result = parser.parse(str(docx_path))
        # A bare "|" appears in every Markdown table, so check for the
        # backslash-escaped form to actually exercise the escaping.
        assert "\\|" in result
        assert "value\\|with\\|pipes" in result
        # The unescaped cell text must not leak through, or it would be
        # read as extra column separators.
        assert "value|with|pipes" not in result

    def test_mixed_paragraphs_and_tables(self, parser, tmp_path):
        """Document with both paragraphs and tables"""
        docx_path = tmp_path / "mixed.docx"
        doc = Document()
        # Clear default paragraph (iterate over a copy while detaching).
        for default_para in list(doc.paragraphs):
            default_para._element.getparent().remove(default_para._element)
        doc.add_paragraph("Introduction", style="Heading 1")
        doc.add_paragraph("Some intro text.")
        table = doc.add_table(rows=2, cols=2)
        table.rows[0].cells[0].text = "Col1"
        table.rows[0].cells[1].text = "Col2"
        table.rows[1].cells[0].text = "A"
        table.rows[1].cells[1].text = "B"
        doc.add_paragraph("Conclusion")
        doc.save(str(docx_path))

        result = parser.parse(str(docx_path))
        assert "# Introduction" in result
        assert "Some intro text." in result
        assert "| Col1 | Col2 |" in result
        assert "| A | B |" in result
        assert "Conclusion" in result

    def test_empty_document(self, parser, tmp_path):
        """A document with no content parses to an empty (blank) string."""
        docx_path = tmp_path / "empty.docx"
        doc = Document()
        # Clear default paragraph (iterate over a copy while detaching).
        for default_para in list(doc.paragraphs):
            default_para._element.getparent().remove(default_para._element)
        doc.save(str(docx_path))

        result = parser.parse(str(docx_path))
        assert result.strip() == ""

    def test_empty_paragraphs_skipped(self, parser, tmp_path):
        """Empty paragraphs contribute nothing to the output."""
        docx_path = tmp_path / "empty_para.docx"
        _create_docx(docx_path, paragraphs=["", "Actual content", ""])
        result = parser.parse(str(docx_path))
        assert "Actual content" in result
        # Empty paragraphs should not produce extra lines
        assert result.strip() == "Actual content"

    def test_nonexistent_file_raises(self, parser):
        """A missing file raises ParseError carrying file name and reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.docx")
        # Checked after the ``with`` block: statements following the raising
        # call inside the block would never execute.
        assert "file.docx" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """A file that is not a valid docx raises ParseError."""
        docx_path = tmp_path / "corrupted.docx"
        docx_path.write_bytes(b"this is not a docx file at all")
        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(docx_path))
        assert "corrupted.docx" in exc_info.value.file_name

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name is the base name, not the full path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/report.docx")
        assert exc_info.value.file_name == "report.docx"

    def test_multiple_heading_levels(self, parser, tmp_path):
        """Test document with multiple heading levels via styles"""
        docx_path = tmp_path / "levels.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Title", "style": "Heading 1"},
            {"text": "Chapter", "style": "Heading 2"},
            {"text": "Section", "style": "Heading 3"},
            {"text": "Body text"},
        ])
        result = parser.parse(str(docx_path))
        assert "# Title" in result
        assert "## Chapter" in result
        assert "### Section" in result
        assert "Body text" in result
        # Body text should not have heading prefix
        assert "# Body text" not in result