"""DocParser 单元测试""" import pytest from docx import Document from docx.shared import Pt from docx.enum.text import WD_ALIGN_PARAGRAPH from exceptions import ParseError from parsers.doc_parser import DocParser @pytest.fixture def parser(): return DocParser() def _create_docx(path, paragraphs=None, tables=None): """ 创建测试用 Word 文档。 Args: path: 输出文件路径 paragraphs: 列表,每个元素是 dict: - text: 段落文本 - style: 可选,样式名(如 'Heading 1') - font_size: 可选,字体大小 (Pt) - bold: 可选,是否加粗 tables: 列表,每个元素是二维列表(行×列的文本) """ doc = Document() # 清除默认的空段落 for p in doc.paragraphs: p._element.getparent().remove(p._element) if paragraphs: for para_info in paragraphs: if isinstance(para_info, str): doc.add_paragraph(para_info) else: text = para_info.get("text", "") style = para_info.get("style", None) font_size = para_info.get("font_size", None) bold = para_info.get("bold", None) if style: p = doc.add_paragraph(text, style=style) else: p = doc.add_paragraph(text) if font_size is not None or bold is not None: # 需要通过 run 设置字体属性 # 清除默认 run,重新添加 for run in p.runs: if font_size is not None: run.font.size = Pt(font_size) if bold is not None: run.bold = bold if tables: for table_data in tables: if not table_data: continue rows = len(table_data) cols = len(table_data[0]) if table_data else 0 table = doc.add_table(rows=rows, cols=cols) for i, row_data in enumerate(table_data): for j, cell_text in enumerate(row_data): table.rows[i].cells[j].text = cell_text doc.save(str(path)) class TestSupportedExtensions: def test_supports_docx(self, parser): assert ".docx" in parser.supported_extensions() def test_only_one_extension(self, parser): assert len(parser.supported_extensions()) == 1 class TestParse: def test_parse_simple_text(self, parser, tmp_path): docx_path = tmp_path / "simple.docx" _create_docx(docx_path, paragraphs=["Hello, world!"]) result = parser.parse(str(docx_path)) assert "Hello, world!" in result def test_parse_multiple_paragraphs(self, parser, tmp_path): docx_path = tmp_path / "multi.docx" _create_docx(docx_path, paragraphs=["First paragraph", "Second paragraph"]) result = parser.parse(str(docx_path)) assert "First paragraph" in result assert "Second paragraph" in result def test_heading_by_style_name(self, parser, tmp_path): """Heading style should produce Markdown heading""" docx_path = tmp_path / "heading.docx" _create_docx(docx_path, paragraphs=[ {"text": "Main Title", "style": "Heading 1"}, {"text": "Body text"}, ]) result = parser.parse(str(docx_path)) assert "# Main Title" in result # Should be exactly H1, not H2 assert "## Main Title" not in result def test_heading2_by_style_name(self, parser, tmp_path): docx_path = tmp_path / "h2.docx" _create_docx(docx_path, paragraphs=[ {"text": "Section Title", "style": "Heading 2"}, {"text": "Some content"}, ]) result = parser.parse(str(docx_path)) assert "## Section Title" in result assert "### Section Title" not in result def test_heading3_by_style_name(self, parser, tmp_path): docx_path = tmp_path / "h3.docx" _create_docx(docx_path, paragraphs=[ {"text": "Subsection", "style": "Heading 3"}, ]) result = parser.parse(str(docx_path)) assert "### Subsection" in result def test_heading_by_font_size_bold(self, parser, tmp_path): """Bold text with large font size should be detected as heading""" docx_path = tmp_path / "font_heading.docx" _create_docx(docx_path, paragraphs=[ {"text": "Big Bold Title", "font_size": 36, "bold": True}, {"text": "Normal text"}, ]) result = parser.parse(str(docx_path)) assert "# Big Bold Title" in result def test_heading_h2_by_font_size(self, parser, tmp_path): docx_path = tmp_path / "font_h2.docx" _create_docx(docx_path, paragraphs=[ {"text": "H2 Title", "font_size": 28, "bold": True}, {"text": "Normal text"}, ]) result = parser.parse(str(docx_path)) assert "## H2 Title" in result def test_heading_h5_by_font_size(self, parser, tmp_path): docx_path = tmp_path / "font_h5.docx" _create_docx(docx_path, paragraphs=[ {"text": "H5 Title", "font_size": 20, "bold": True}, {"text": "Normal text"}, ]) result = parser.parse(str(docx_path)) assert "##### H5 Title" in result def test_no_heading_without_bold(self, parser, tmp_path): """Large font without bold should NOT be detected as heading via font size""" docx_path = tmp_path / "no_bold.docx" _create_docx(docx_path, paragraphs=[ {"text": "Large Not Bold", "font_size": 36, "bold": False}, ]) result = parser.parse(str(docx_path)) assert "# Large Not Bold" not in result assert "Large Not Bold" in result def test_simple_table(self, parser, tmp_path): docx_path = tmp_path / "table.docx" _create_docx(docx_path, tables=[ [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]], ]) result = parser.parse(str(docx_path)) assert "| Name | Age |" in result assert "| --- | --- |" in result assert "| Alice | 30 |" in result assert "| Bob | 25 |" in result def test_table_with_pipe_in_cell(self, parser, tmp_path): """Pipe characters in cells should be escaped""" docx_path = tmp_path / "pipe.docx" _create_docx(docx_path, tables=[ [["Header"], ["value|with|pipes"]], ]) result = parser.parse(str(docx_path)) assert "|" in result assert "value|with|pipes" in result def test_mixed_paragraphs_and_tables(self, parser, tmp_path): """Document with both paragraphs and tables""" docx_path = tmp_path / "mixed.docx" doc = Document() # Clear default paragraph for p in doc.paragraphs: p._element.getparent().remove(p._element) doc.add_paragraph("Introduction", style="Heading 1") doc.add_paragraph("Some intro text.") table = doc.add_table(rows=2, cols=2) table.rows[0].cells[0].text = "Col1" table.rows[0].cells[1].text = "Col2" table.rows[1].cells[0].text = "A" table.rows[1].cells[1].text = "B" doc.add_paragraph("Conclusion") doc.save(str(docx_path)) result = parser.parse(str(docx_path)) assert "# Introduction" in result assert "Some intro text." in result assert "| Col1 | Col2 |" in result assert "| A | B |" in result assert "Conclusion" in result def test_empty_document(self, parser, tmp_path): docx_path = tmp_path / "empty.docx" doc = Document() # Clear default paragraph for p in doc.paragraphs: p._element.getparent().remove(p._element) doc.save(str(docx_path)) result = parser.parse(str(docx_path)) assert result.strip() == "" def test_empty_paragraphs_skipped(self, parser, tmp_path): docx_path = tmp_path / "empty_para.docx" _create_docx(docx_path, paragraphs=["", "Actual content", ""]) result = parser.parse(str(docx_path)) assert "Actual content" in result # Empty paragraphs should not produce extra lines assert result.strip() == "Actual content" def test_nonexistent_file_raises(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/nonexistent/path/file.docx") assert "file.docx" in exc_info.value.file_name assert exc_info.value.reason != "" def test_corrupted_file_raises(self, parser, tmp_path): docx_path = tmp_path / "corrupted.docx" docx_path.write_bytes(b"this is not a docx file at all") with pytest.raises(ParseError) as exc_info: parser.parse(str(docx_path)) assert "corrupted.docx" in exc_info.value.file_name def test_parse_error_contains_filename(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/no/such/report.docx") assert exc_info.value.file_name == "report.docx" def test_multiple_heading_levels(self, parser, tmp_path): """Test document with multiple heading levels via styles""" docx_path = tmp_path / "levels.docx" _create_docx(docx_path, paragraphs=[ {"text": "Title", "style": "Heading 1"}, {"text": "Chapter", "style": "Heading 2"}, {"text": "Section", "style": "Heading 3"}, {"text": "Body text"}, ]) result = parser.parse(str(docx_path)) assert "# Title" in result assert "## Chapter" in result assert "### Section" in result assert "Body text" in result # Body text should not have heading prefix assert "# Body text" not in result