261 lines
9.7 KiB
Python
261 lines
9.7 KiB
Python
"""DocParser 单元测试"""
|
||
|
||
import pytest
|
||
from docx import Document
|
||
from docx.shared import Pt
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
|
||
from exceptions import ParseError
|
||
from parsers.doc_parser import DocParser
|
||
|
||
|
||
@pytest.fixture
def parser():
    """Provide a newly constructed ``DocParser`` to the requesting test."""
    doc_parser = DocParser()
    return doc_parser
|
||
|
||
|
||
def _create_docx(path, paragraphs=None, tables=None):
    """
    Create a Word document for use in tests and save it to ``path``.

    All paragraphs are written first, in the order given, followed by all
    tables in the order given.

    Args:
        path: Output file path; it is converted with ``str()`` before saving.
        paragraphs: Optional list. Each item is either a plain string (the
            paragraph text) or a dict with the keys:
            - text: paragraph text (defaults to "")
            - style: optional style name (e.g. 'Heading 1')
            - font_size: optional font size in points
            - bold: optional bold flag
        tables: Optional list of tables. Each table is a list of rows and
            each row is a list of cell texts. Rows may differ in length: the
            table is made as wide as its longest row and cells that have no
            corresponding text are left untouched. Empty tables are skipped.
    """
    doc = Document()

    # Remove any default empty paragraphs so the document holds only what
    # the caller asked for.
    for stale in doc.paragraphs:
        stale._element.getparent().remove(stale._element)

    for para_info in paragraphs or ():
        if isinstance(para_info, str):
            doc.add_paragraph(para_info)
            continue

        text = para_info.get("text", "")
        style = para_info.get("style")
        font_size = para_info.get("font_size")
        bold = para_info.get("bold")

        if style:
            p = doc.add_paragraph(text, style=style)
        else:
            p = doc.add_paragraph(text)

        # Font attributes are set on runs rather than on the paragraph, so
        # apply the requested ones to every run of the new paragraph.
        if font_size is not None or bold is not None:
            for run in p.runs:
                if font_size is not None:
                    run.font.size = Pt(font_size)
                if bold is not None:
                    run.bold = bold

    for table_data in tables or ():
        if not table_data:
            continue

        # Size the table by its widest row; using only the first row's length
        # would raise IndexError as soon as a later row had more cells.
        cols = max(len(row_data) for row_data in table_data)
        table = doc.add_table(rows=len(table_data), cols=cols)
        for i, row_data in enumerate(table_data):
            # Look the row's cells up once instead of once per cell.
            cells = table.rows[i].cells
            for j, cell_text in enumerate(row_data):
                cells[j].text = cell_text

    doc.save(str(path))
|
||
|
||
|
||
class TestSupportedExtensions:
    """Checks on the extensions reported by ``supported_extensions()``."""

    def test_supports_docx(self, parser):
        """``.docx`` is among the reported extensions."""
        extensions = parser.supported_extensions()
        assert ".docx" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension is reported."""
        extensions = parser.supported_extensions()
        assert len(extensions) == 1
|
||
|
||
|
||
class TestParse:
    """Tests for ``parser.parse`` run against .docx files built on the fly."""

    def test_parse_simple_text(self, parser, tmp_path):
        """The text of a single paragraph appears in the parsed output."""
        source = tmp_path / "simple.docx"
        _create_docx(source, paragraphs=["Hello, world!"])

        output = parser.parse(str(source))

        assert "Hello, world!" in output

    def test_parse_multiple_paragraphs(self, parser, tmp_path):
        """Every paragraph's text appears in the parsed output."""
        source = tmp_path / "multi.docx"
        _create_docx(source, paragraphs=["First paragraph", "Second paragraph"])

        output = parser.parse(str(source))

        for expected in ("First paragraph", "Second paragraph"):
            assert expected in output

    def test_heading_by_style_name(self, parser, tmp_path):
        """A 'Heading 1' styled paragraph should produce a Markdown H1."""
        source = tmp_path / "heading.docx"
        content = [
            {"text": "Main Title", "style": "Heading 1"},
            {"text": "Body text"},
        ]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        assert "# Main Title" in output
        # Must be exactly H1: any deeper level would contain "## Main Title".
        assert "## Main Title" not in output

    def test_heading2_by_style_name(self, parser, tmp_path):
        """A 'Heading 2' styled paragraph should produce a Markdown H2."""
        source = tmp_path / "h2.docx"
        content = [
            {"text": "Section Title", "style": "Heading 2"},
            {"text": "Some content"},
        ]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        assert "## Section Title" in output
        # Any level deeper than H2 would contain "### Section Title".
        assert "### Section Title" not in output

    def test_heading3_by_style_name(self, parser, tmp_path):
        """A 'Heading 3' styled paragraph should produce a Markdown H3."""
        source = tmp_path / "h3.docx"
        content = [{"text": "Subsection", "style": "Heading 3"}]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        # Substring check only: a deeper heading level would also match.
        assert "### Subsection" in output

    def test_heading_by_font_size_bold(self, parser, tmp_path):
        """Bold text with a large font size should be detected as a heading."""
        source = tmp_path / "font_heading.docx"
        content = [
            {"text": "Big Bold Title", "font_size": 36, "bold": True},
            {"text": "Normal text"},
        ]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        # Substring check only: it is satisfied by a heading of any level.
        assert "# Big Bold Title" in output

    def test_heading_h2_by_font_size(self, parser, tmp_path):
        """Bold 28pt text is expected to come out as (at least) an H2."""
        source = tmp_path / "font_h2.docx"
        content = [
            {"text": "H2 Title", "font_size": 28, "bold": True},
            {"text": "Normal text"},
        ]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        assert "## H2 Title" in output

    def test_heading_h5_by_font_size(self, parser, tmp_path):
        """Bold 20pt text is expected to come out as (at least) an H5."""
        source = tmp_path / "font_h5.docx"
        content = [
            {"text": "H5 Title", "font_size": 20, "bold": True},
            {"text": "Normal text"},
        ]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        assert "##### H5 Title" in output

    def test_no_heading_without_bold(self, parser, tmp_path):
        """A large font without bold should NOT be detected as a heading."""
        source = tmp_path / "no_bold.docx"
        content = [{"text": "Large Not Bold", "font_size": 36, "bold": False}]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        assert "# Large Not Bold" not in output
        # The text itself must still be present, just without a heading mark.
        assert "Large Not Bold" in output

    def test_simple_table(self, parser, tmp_path):
        """A table is rendered as a Markdown table with a separator row."""
        source = tmp_path / "table.docx"
        grid = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
        _create_docx(source, tables=[grid])

        output = parser.parse(str(source))

        expected_rows = (
            "| Name | Age |",
            "| --- | --- |",
            "| Alice | 30 |",
            "| Bob | 25 |",
        )
        for row in expected_rows:
            assert row in output

    def test_table_with_pipe_in_cell(self, parser, tmp_path):
        """A cell containing pipe characters still shows up in the output.

        NOTE(review): the stated intent of this test was that pipe characters
        in cells get escaped, yet the assertion below looks for the raw,
        unescaped cell text - confirm what the parser is expected to emit.
        """
        source = tmp_path / "pipe.docx"
        grid = [["Header"], ["value|with|pipes"]]
        _create_docx(source, tables=[grid])

        output = parser.parse(str(source))

        assert "|" in output
        assert "value|with|pipes" in output

    def test_mixed_paragraphs_and_tables(self, parser, tmp_path):
        """A document containing both paragraphs and a table is parsed."""
        source = tmp_path / "mixed.docx"

        # Built by hand because the table sits between paragraphs, a layout
        # that _create_docx (paragraphs first, then tables) cannot produce.
        doc = Document()
        # Clear any default paragraph.
        for leftover in doc.paragraphs:
            leftover._element.getparent().remove(leftover._element)

        doc.add_paragraph("Introduction", style="Heading 1")
        doc.add_paragraph("Some intro text.")
        table = doc.add_table(rows=2, cols=2)
        grid = [["Col1", "Col2"], ["A", "B"]]
        for r, row in enumerate(grid):
            for c, value in enumerate(row):
                table.rows[r].cells[c].text = value
        doc.add_paragraph("Conclusion")
        doc.save(str(source))

        output = parser.parse(str(source))

        expected_fragments = (
            "# Introduction",
            "Some intro text.",
            "| Col1 | Col2 |",
            "| A | B |",
            "Conclusion",
        )
        for fragment in expected_fragments:
            assert fragment in output

    def test_empty_document(self, parser, tmp_path):
        """A document with no content parses to blank output."""
        source = tmp_path / "empty.docx"
        # With no paragraphs or tables the helper only clears the default
        # paragraphs and saves the file.
        _create_docx(source)

        output = parser.parse(str(source))

        assert output.strip() == ""

    def test_empty_paragraphs_skipped(self, parser, tmp_path):
        """Empty paragraphs contribute nothing to the output."""
        source = tmp_path / "empty_para.docx"
        _create_docx(source, paragraphs=["", "Actual content", ""])

        output = parser.parse(str(source))

        assert "Actual content" in output
        # Empty paragraphs should not produce extra lines.
        assert output.strip() == "Actual content"

    def test_nonexistent_file_raises(self, parser):
        """Parsing a missing file raises ParseError with name and reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.docx")

        error = exc_info.value
        assert "file.docx" in error.file_name
        assert error.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """Parsing a file that is not a real .docx raises ParseError."""
        source = tmp_path / "corrupted.docx"
        source.write_bytes(b"this is not a docx file at all")

        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(source))

        error = exc_info.value
        assert "corrupted.docx" in error.file_name

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name is the bare file name, without directories."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/report.docx")

        error = exc_info.value
        assert error.file_name == "report.docx"

    def test_multiple_heading_levels(self, parser, tmp_path):
        """A document with several heading levels set via styles."""
        source = tmp_path / "levels.docx"
        content = [
            {"text": "Title", "style": "Heading 1"},
            {"text": "Chapter", "style": "Heading 2"},
            {"text": "Section", "style": "Heading 3"},
            {"text": "Body text"},
        ]
        _create_docx(source, paragraphs=content)

        output = parser.parse(str(source))

        for fragment in ("# Title", "## Chapter", "### Section", "Body text"):
            assert fragment in output
        # Body text should not have a heading prefix.
        assert "# Body text" not in output
|