Files
bigwo/tests/test_doc_parser.py
2026-03-02 17:38:28 +08:00

261 lines
9.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""DocParser 单元测试"""
import pytest
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from exceptions import ParseError
from parsers.doc_parser import DocParser
@pytest.fixture
def parser():
    """Provide a newly constructed DocParser to each test that requests it."""
    doc_parser = DocParser()
    return doc_parser
def _create_docx(path, paragraphs=None, tables=None):
    """Create a Word document on disk for use as a test fixture.

    All paragraphs are written first, followed by all tables.

    Args:
        path: Output file path; converted with ``str()`` before saving.
        paragraphs: Optional list. Each element is either a plain string
            (added as a paragraph with the default style) or a dict with:
            - text: paragraph text (defaults to "")
            - style: optional style name (e.g. 'Heading 1')
            - font_size: optional font size in points
            - bold: optional bold flag
        tables: Optional list of tables. Each table is a list of rows and
            each row is a list of cell texts. Empty tables are skipped.
            Rows may differ in length; the table is as wide as its longest
            row and cells with no supplied text are left empty.
    """
    doc = Document()
    # Drop any paragraphs the default template starts with so the document
    # contains only what the caller asked for.
    for p in doc.paragraphs:
        p._element.getparent().remove(p._element)

    for para_info in paragraphs or ():
        if isinstance(para_info, str):
            doc.add_paragraph(para_info)
            continue

        text = para_info.get("text", "")
        # A missing or empty style falls back to the document default.
        para = doc.add_paragraph(text, style=para_info.get("style") or None)

        font_size = para_info.get("font_size")
        bold = para_info.get("bold")
        # Font attributes live on runs rather than on the paragraph, so they
        # are applied to every run created for the paragraph text.
        for run in para.runs:
            if font_size is not None:
                run.font.size = Pt(font_size)
            if bold is not None:
                run.bold = bold

    for table_data in tables or ():
        if not table_data:
            continue
        # Size the grid to the widest row; using only the first row's length
        # would index past the last column for a longer later row.
        col_count = max(len(row_data) for row_data in table_data)
        table = doc.add_table(rows=len(table_data), cols=col_count)
        for row, row_data in zip(table.rows, table_data):
            for cell, cell_text in zip(row.cells, row_data):
                cell.text = cell_text

    doc.save(str(path))
class TestSupportedExtensions:
    """Checks on the value returned by ``parser.supported_extensions()``."""

    def test_supports_docx(self, parser):
        """The returned extensions include ``.docx``."""
        extensions = parser.supported_extensions()
        assert ".docx" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension is returned."""
        extensions = parser.supported_extensions()
        assert len(extensions) == 1
class TestParse:
    """Tests for ``parser.parse()``: text, headings, tables and error cases."""

    def test_parse_simple_text(self, parser, tmp_path):
        """A single paragraph appears in the output."""
        docx_path = tmp_path / "simple.docx"
        _create_docx(docx_path, paragraphs=["Hello, world!"])
        result = parser.parse(str(docx_path))
        assert "Hello, world!" in result

    def test_parse_multiple_paragraphs(self, parser, tmp_path):
        """Every paragraph of a multi-paragraph document appears in the output."""
        docx_path = tmp_path / "multi.docx"
        _create_docx(docx_path, paragraphs=["First paragraph", "Second paragraph"])
        result = parser.parse(str(docx_path))
        assert "First paragraph" in result
        assert "Second paragraph" in result

    def test_heading_by_style_name(self, parser, tmp_path):
        """Heading style should produce Markdown heading"""
        docx_path = tmp_path / "heading.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Main Title", "style": "Heading 1"},
            {"text": "Body text"},
        ])
        result = parser.parse(str(docx_path))
        assert "# Main Title" in result
        # Should be exactly H1, not H2
        assert "## Main Title" not in result

    def test_heading2_by_style_name(self, parser, tmp_path):
        """'Heading 2' style should produce exactly a level-2 heading."""
        docx_path = tmp_path / "h2.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Section Title", "style": "Heading 2"},
            {"text": "Some content"},
        ])
        result = parser.parse(str(docx_path))
        assert "## Section Title" in result
        assert "### Section Title" not in result

    def test_heading3_by_style_name(self, parser, tmp_path):
        """'Heading 3' style should produce exactly a level-3 heading."""
        docx_path = tmp_path / "h3.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Subsection", "style": "Heading 3"},
        ])
        result = parser.parse(str(docx_path))
        assert "### Subsection" in result
        # A substring check alone would also accept a deeper heading level.
        assert "#### Subsection" not in result

    def test_heading_by_font_size_bold(self, parser, tmp_path):
        """Bold text with large font size should be detected as heading"""
        docx_path = tmp_path / "font_heading.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Big Bold Title", "font_size": 36, "bold": True},
            {"text": "Normal text"},
        ])
        result = parser.parse(str(docx_path))
        assert "# Big Bold Title" in result
        # Should be exactly H1, not H2 or deeper
        assert "## Big Bold Title" not in result

    def test_heading_h2_by_font_size(self, parser, tmp_path):
        """Bold 28pt text should be detected as exactly a level-2 heading."""
        docx_path = tmp_path / "font_h2.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "H2 Title", "font_size": 28, "bold": True},
            {"text": "Normal text"},
        ])
        result = parser.parse(str(docx_path))
        assert "## H2 Title" in result
        assert "### H2 Title" not in result

    def test_heading_h5_by_font_size(self, parser, tmp_path):
        """Bold 20pt text should be detected as exactly a level-5 heading."""
        docx_path = tmp_path / "font_h5.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "H5 Title", "font_size": 20, "bold": True},
            {"text": "Normal text"},
        ])
        result = parser.parse(str(docx_path))
        assert "##### H5 Title" in result
        assert "###### H5 Title" not in result

    def test_no_heading_without_bold(self, parser, tmp_path):
        """Large font without bold should NOT be detected as heading via font size"""
        docx_path = tmp_path / "no_bold.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Large Not Bold", "font_size": 36, "bold": False},
        ])
        result = parser.parse(str(docx_path))
        assert "# Large Not Bold" not in result
        assert "Large Not Bold" in result

    def test_simple_table(self, parser, tmp_path):
        """A table is rendered as a Markdown table with a separator row."""
        docx_path = tmp_path / "table.docx"
        _create_docx(docx_path, tables=[
            [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]],
        ])
        result = parser.parse(str(docx_path))
        assert "| Name | Age |" in result
        assert "| --- | --- |" in result
        assert "| Alice | 30 |" in result
        assert "| Bob | 25 |" in result

    def test_table_with_pipe_in_cell(self, parser, tmp_path):
        """Pipe characters in cells should be escaped"""
        docx_path = tmp_path / "pipe.docx"
        _create_docx(docx_path, tables=[
            [["Header"], ["value|with|pipes"]],
        ])
        result = parser.parse(str(docx_path))
        # An unescaped pipe would be read as a column separator, so the raw
        # cell text must not survive verbatim.
        # NOTE(review): assumes DocParser uses the standard Markdown
        # backslash escape (\|) -- confirm against the parser implementation.
        assert r"value\|with\|pipes" in result
        assert "value|with|pipes" not in result

    def test_mixed_paragraphs_and_tables(self, parser, tmp_path):
        """Document with both paragraphs and tables"""
        docx_path = tmp_path / "mixed.docx"
        # Built by hand rather than with _create_docx because the table has
        # to sit between paragraphs, and the helper writes all paragraphs
        # before any table.
        doc = Document()
        # Clear default paragraph
        for p in doc.paragraphs:
            p._element.getparent().remove(p._element)
        doc.add_paragraph("Introduction", style="Heading 1")
        doc.add_paragraph("Some intro text.")
        table = doc.add_table(rows=2, cols=2)
        table.rows[0].cells[0].text = "Col1"
        table.rows[0].cells[1].text = "Col2"
        table.rows[1].cells[0].text = "A"
        table.rows[1].cells[1].text = "B"
        doc.add_paragraph("Conclusion")
        doc.save(str(docx_path))

        result = parser.parse(str(docx_path))
        assert "# Introduction" in result
        assert "Some intro text." in result
        assert "| Col1 | Col2 |" in result
        assert "| A | B |" in result
        assert "Conclusion" in result

    def test_empty_document(self, parser, tmp_path):
        """A document with no paragraphs or tables parses to blank output."""
        docx_path = tmp_path / "empty.docx"
        # With no paragraphs or tables the helper saves an empty document.
        _create_docx(docx_path)
        result = parser.parse(str(docx_path))
        assert result.strip() == ""

    def test_empty_paragraphs_skipped(self, parser, tmp_path):
        """Empty paragraphs contribute nothing to the output."""
        docx_path = tmp_path / "empty_para.docx"
        _create_docx(docx_path, paragraphs=["", "Actual content", ""])
        result = parser.parse(str(docx_path))
        assert "Actual content" in result
        # Empty paragraphs should not produce extra lines
        assert result.strip() == "Actual content"

    def test_nonexistent_file_raises(self, parser):
        """A missing file raises ParseError carrying the file name and a reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.docx")
        assert "file.docx" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """A file that is not a valid docx raises ParseError naming the file."""
        docx_path = tmp_path / "corrupted.docx"
        docx_path.write_bytes(b"this is not a docx file at all")
        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(docx_path))
        assert "corrupted.docx" in exc_info.value.file_name

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name is the base name only, without directories."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/report.docx")
        assert exc_info.value.file_name == "report.docx"

    def test_multiple_heading_levels(self, parser, tmp_path):
        """Test document with multiple heading levels via styles"""
        docx_path = tmp_path / "levels.docx"
        _create_docx(docx_path, paragraphs=[
            {"text": "Title", "style": "Heading 1"},
            {"text": "Chapter", "style": "Heading 2"},
            {"text": "Section", "style": "Heading 3"},
            {"text": "Body text"},
        ])
        result = parser.parse(str(docx_path))
        assert "# Title" in result
        assert "## Chapter" in result
        assert "### Section" in result
        assert "Body text" in result
        # Each heading should be at exactly its own level, not a deeper one
        assert "## Title" not in result
        assert "### Chapter" not in result
        assert "#### Section" not in result
        # Body text should not have heading prefix
        assert "# Body text" not in result