"""Unit tests for DocParser."""
|
|||
|
|
|
|||
|
|
import pytest
|
|||
|
|
from docx import Document
|
|||
|
|
from docx.shared import Pt
|
|||
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|||
|
|
|
|||
|
|
from exceptions import ParseError
|
|||
|
|
from parsers.doc_parser import DocParser
|
|||
|
|
|
|||
|
|
|
|||
|
|
@pytest.fixture
def parser():
    """Build the ``DocParser`` instance handed to tests that request ``parser``."""
    doc_parser = DocParser()
    return doc_parser
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _create_docx(path, paragraphs=None, tables=None):
    """
    Build a Word document for tests and save it to ``path``.

    Args:
        path: Output file path; converted with ``str()`` before saving.
        paragraphs: Optional list. Each item is either a plain string
            (used as the paragraph text) or a dict with the keys:
            - text: paragraph text (defaults to "")
            - style: optional style name (e.g. 'Heading 1')
            - font_size: optional font size in points
            - bold: optional bold flag
        tables: Optional list of tables. Each table is a 2-D list
            (rows x columns) of cell values; values are converted with
            ``str()``. Empty tables are skipped. Rows may differ in
            length: the table is as wide as its longest row and cells
            without a value are left untouched.
    """
    doc = Document()
    # Remove any paragraphs the freshly created document already holds so
    # the saved file contains only what the caller asked for.
    for default_para in doc.paragraphs:
        default_para._element.getparent().remove(default_para._element)

    for para_info in paragraphs or ():
        if isinstance(para_info, str):
            doc.add_paragraph(para_info)
            continue

        text = para_info.get("text", "")
        font_size = para_info.get("font_size")
        bold = para_info.get("bold")
        # A missing or falsy style falls back to the document default.
        para = doc.add_paragraph(text, style=para_info.get("style") or None)

        # Font attributes are set per run, so update the runs that already
        # exist on the new paragraph (no run is removed or re-created).
        # NOTE(review): a paragraph created with empty text presumably has
        # no runs, in which case font_size/bold are silently not applied.
        for run in para.runs:
            if font_size is not None:
                run.font.size = Pt(font_size)
            if bold is not None:
                run.bold = bold

    for table_data in tables or ():
        if not table_data:
            continue
        # Size the grid by the widest row; sizing it by the first row alone
        # raised IndexError whenever a later row was longer.
        col_count = max(len(row) for row in table_data)
        table = doc.add_table(rows=len(table_data), cols=col_count)
        for i, row_data in enumerate(table_data):
            cells = table.rows[i].cells
            for j, cell_text in enumerate(row_data):
                cells[j].text = str(cell_text)

    doc.save(str(path))
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestSupportedExtensions:
    """Checks on the extension list returned by ``supported_extensions()``."""

    def test_supports_docx(self, parser):
        """``.docx`` must be among the reported extensions."""
        extensions = parser.supported_extensions()
        assert ".docx" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension must be reported."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 1
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestParse:
    """Tests that feed generated .docx files to ``parser.parse`` and inspect the text it returns."""

    def test_parse_simple_text(self, parser, tmp_path):
        """A single plain paragraph shows up in the output."""
        doc_file = tmp_path / "simple.docx"
        _create_docx(doc_file, paragraphs=["Hello, world!"])
        output = parser.parse(str(doc_file))
        assert "Hello, world!" in output

    def test_parse_multiple_paragraphs(self, parser, tmp_path):
        """Every plain paragraph shows up in the output."""
        doc_file = tmp_path / "multi.docx"
        texts = ["First paragraph", "Second paragraph"]
        _create_docx(doc_file, paragraphs=texts)
        output = parser.parse(str(doc_file))
        assert "First paragraph" in output
        assert "Second paragraph" in output

    def test_heading_by_style_name(self, parser, tmp_path):
        """A 'Heading 1' paragraph is expected as a level-1 Markdown heading."""
        doc_file = tmp_path / "heading.docx"
        content = [
            {"text": "Main Title", "style": "Heading 1"},
            {"text": "Body text"},
        ]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "# Main Title" in output
        # "# Main Title" is also a substring of "## Main Title", so rule
        # out the deeper level explicitly.
        assert "## Main Title" not in output

    def test_heading2_by_style_name(self, parser, tmp_path):
        """A 'Heading 2' paragraph is expected as a level-2 Markdown heading."""
        doc_file = tmp_path / "h2.docx"
        content = [
            {"text": "Section Title", "style": "Heading 2"},
            {"text": "Some content"},
        ]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "## Section Title" in output
        assert "### Section Title" not in output

    def test_heading3_by_style_name(self, parser, tmp_path):
        """A 'Heading 3' paragraph is expected with a '###' prefix."""
        doc_file = tmp_path / "h3.docx"
        content = [{"text": "Subsection", "style": "Heading 3"}]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "### Subsection" in output

    def test_heading_by_font_size_bold(self, parser, tmp_path):
        """Bold 36pt text (no heading style) is expected with a '#' prefix."""
        doc_file = tmp_path / "font_heading.docx"
        content = [
            {"text": "Big Bold Title", "font_size": 36, "bold": True},
            {"text": "Normal text"},
        ]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "# Big Bold Title" in output

    def test_heading_h2_by_font_size(self, parser, tmp_path):
        """Bold 28pt text is expected with a '##' prefix."""
        doc_file = tmp_path / "font_h2.docx"
        content = [
            {"text": "H2 Title", "font_size": 28, "bold": True},
            {"text": "Normal text"},
        ]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "## H2 Title" in output

    def test_heading_h5_by_font_size(self, parser, tmp_path):
        """Bold 20pt text is expected with a '#####' prefix."""
        doc_file = tmp_path / "font_h5.docx"
        content = [
            {"text": "H5 Title", "font_size": 20, "bold": True},
            {"text": "Normal text"},
        ]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "##### H5 Title" in output

    def test_no_heading_without_bold(self, parser, tmp_path):
        """Large but non-bold text must keep its text and get no '# ' prefix."""
        doc_file = tmp_path / "no_bold.docx"
        content = [{"text": "Large Not Bold", "font_size": 36, "bold": False}]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        assert "# Large Not Bold" not in output
        assert "Large Not Bold" in output

    def test_simple_table(self, parser, tmp_path):
        """A 3x2 table is expected as header, separator and two data rows."""
        doc_file = tmp_path / "table.docx"
        grid = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
        _create_docx(doc_file, tables=[grid])
        output = parser.parse(str(doc_file))
        expected_rows = (
            "| Name | Age |",
            "| --- | --- |",
            "| Alice | 30 |",
            "| Bob | 25 |",
        )
        for row in expected_rows:
            assert row in output

    def test_table_with_pipe_in_cell(self, parser, tmp_path):
        """A table cell containing pipe characters still appears in the output.

        NOTE(review): the original description said pipes "should be
        escaped", but the assertions below require the raw text
        ``value|with|pipes`` to be present and only check that some ``|``
        exists. Confirm the intended behaviour against DocParser.
        """
        doc_file = tmp_path / "pipe.docx"
        grid = [["Header"], ["value|with|pipes"]]
        _create_docx(doc_file, tables=[grid])
        output = parser.parse(str(doc_file))
        assert "|" in output
        assert "value|with|pipes" in output

    def test_mixed_paragraphs_and_tables(self, parser, tmp_path):
        """Heading, body text, a table and a trailing paragraph all appear."""
        doc_file = tmp_path / "mixed.docx"
        # Built by hand because the table sits between paragraphs, which
        # _create_docx (paragraphs first, then tables) cannot express.
        document = Document()
        # Remove whatever paragraphs the new document already contains.
        for existing in document.paragraphs:
            existing._element.getparent().remove(existing._element)

        document.add_paragraph("Introduction", style="Heading 1")
        document.add_paragraph("Some intro text.")
        grid = document.add_table(rows=2, cols=2)
        for row_idx, row_values in enumerate((("Col1", "Col2"), ("A", "B"))):
            for col_idx, value in enumerate(row_values):
                grid.rows[row_idx].cells[col_idx].text = value
        document.add_paragraph("Conclusion")
        document.save(str(doc_file))

        output = parser.parse(str(doc_file))
        assert "# Introduction" in output
        assert "Some intro text." in output
        assert "| Col1 | Col2 |" in output
        assert "| A | B |" in output
        assert "Conclusion" in output

    def test_empty_document(self, parser, tmp_path):
        """A document with no paragraphs or tables parses to blank text."""
        doc_file = tmp_path / "empty.docx"
        # With no paragraphs/tables the helper creates the document, clears
        # its existing paragraphs and saves it.
        _create_docx(doc_file)
        output = parser.parse(str(doc_file))
        assert output.strip() == ""

    def test_empty_paragraphs_skipped(self, parser, tmp_path):
        """Empty paragraphs around real content leave nothing but that content."""
        doc_file = tmp_path / "empty_para.docx"
        texts = ["", "Actual content", ""]
        _create_docx(doc_file, paragraphs=texts)
        output = parser.parse(str(doc_file))
        assert "Actual content" in output
        # After stripping, only the real paragraph may remain.
        assert output.strip() == "Actual content"

    def test_nonexistent_file_raises(self, parser):
        """A missing file raises ParseError carrying a file name and a reason."""
        missing = "/nonexistent/path/file.docx"
        with pytest.raises(ParseError) as exc_info:
            parser.parse(missing)
        error = exc_info.value
        assert "file.docx" in error.file_name
        assert error.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """A file that is not a real .docx raises ParseError naming the file."""
        doc_file = tmp_path / "corrupted.docx"
        doc_file.write_bytes(b"this is not a docx file at all")
        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(doc_file))
        error = exc_info.value
        assert "corrupted.docx" in error.file_name

    def test_parse_error_contains_filename(self, parser):
        """``ParseError.file_name`` is the base name, not the full path."""
        missing = "/no/such/report.docx"
        with pytest.raises(ParseError) as exc_info:
            parser.parse(missing)
        error = exc_info.value
        assert error.file_name == "report.docx"

    def test_multiple_heading_levels(self, parser, tmp_path):
        """Heading 1-3 styles plus body text in one document."""
        doc_file = tmp_path / "levels.docx"
        content = [
            {"text": "Title", "style": "Heading 1"},
            {"text": "Chapter", "style": "Heading 2"},
            {"text": "Section", "style": "Heading 3"},
            {"text": "Body text"},
        ]
        _create_docx(doc_file, paragraphs=content)
        output = parser.parse(str(doc_file))
        for fragment in ("# Title", "## Chapter", "### Section", "Body text"):
            assert fragment in output
        # The body paragraph must not carry a heading prefix.
        assert "# Body text" not in output
|