"""PdfParser 单元测试""" import pytest import fitz from exceptions import ParseError from parsers.pdf_parser import PdfParser @pytest.fixture def parser(): return PdfParser() def _create_pdf(path, pages): """ 创建测试用 PDF 文件。 Args: path: 输出文件路径 pages: 列表,每个元素是 (text, fontsize) 元组的列表,代表一页中的文本行 """ doc = fitz.open() for page_items in pages: page = doc.new_page() y = 72 for text, fontsize in page_items: page.insert_text((72, y), text, fontsize=fontsize) y += fontsize + 10 doc.save(str(path)) doc.close() class TestSupportedExtensions: def test_supports_pdf(self, parser): assert ".pdf" in parser.supported_extensions() def test_only_one_extension(self, parser): assert len(parser.supported_extensions()) == 1 class TestParse: def test_parse_simple_text(self, parser, tmp_path): pdf_path = tmp_path / "simple.pdf" _create_pdf(pdf_path, [ [("Hello, world!", 12)], ]) result = parser.parse(str(pdf_path)) assert "Hello, world!" in result def test_parse_multiline_text(self, parser, tmp_path): pdf_path = tmp_path / "multi.pdf" _create_pdf(pdf_path, [ [("Line one", 12), ("Line two", 12)], ]) result = parser.parse(str(pdf_path)) assert "Line one" in result assert "Line two" in result def test_parse_multiple_pages(self, parser, tmp_path): pdf_path = tmp_path / "pages.pdf" _create_pdf(pdf_path, [ [("Page one content", 12)], [("Page two content", 12)], ]) result = parser.parse(str(pdf_path)) assert "Page one content" in result assert "Page two content" in result def test_heading_level2_detection(self, parser, tmp_path): """Font size > body_mode + 2 should produce ## heading""" pdf_path = tmp_path / "h2.pdf" # Body text at size 12 (will be the mode), heading at size 18 (diff=6 > 2) _create_pdf(pdf_path, [ [ ("Body text line one", 12), ("Body text line two", 12), ("Body text line three", 12), ("Big Heading", 18), ], ]) result = parser.parse(str(pdf_path)) assert "## Big Heading" in result def test_heading_level3_detection(self, parser, tmp_path): """Font size > body_mode + 0.5 but <= body_mode + 2 should produce ### heading""" pdf_path = tmp_path / "h3.pdf" # Body text at size 12 (mode), heading at size 13.5 (diff=1.5, >0.5 and <=2) _create_pdf(pdf_path, [ [ ("Body text one", 12), ("Body text two", 12), ("Body text three", 12), ("Sub Heading", 13.5), ], ]) result = parser.parse(str(pdf_path)) assert "### Sub Heading" in result def test_body_text_no_heading_prefix(self, parser, tmp_path): """Text at body font size should not have heading prefix""" pdf_path = tmp_path / "body.pdf" _create_pdf(pdf_path, [ [("Normal text", 12), ("More normal text", 12)], ]) result = parser.parse(str(pdf_path)) assert "## Normal text" not in result assert "### Normal text" not in result assert "Normal text" in result def test_empty_pdf(self, parser, tmp_path): """Empty PDF (no text) should return empty string""" pdf_path = tmp_path / "empty.pdf" doc = fitz.open() doc.new_page() doc.save(str(pdf_path)) doc.close() result = parser.parse(str(pdf_path)) assert result.strip() == "" def test_nonexistent_file_raises(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/nonexistent/path/file.pdf") assert "file.pdf" in exc_info.value.file_name assert exc_info.value.reason != "" def test_corrupted_file_raises(self, parser, tmp_path): pdf_path = tmp_path / "corrupted.pdf" pdf_path.write_bytes(b"this is not a pdf file at all") with pytest.raises(ParseError) as exc_info: parser.parse(str(pdf_path)) assert "corrupted.pdf" in exc_info.value.file_name def test_parse_error_contains_filename(self, parser): with pytest.raises(ParseError) as exc_info: parser.parse("/no/such/report.pdf") assert exc_info.value.file_name == "report.pdf" def test_mixed_headings_and_body(self, parser, tmp_path): """Test a document with mixed heading levels and body text""" pdf_path = tmp_path / "mixed.pdf" _create_pdf(pdf_path, [ [ ("Body one", 12), ("Body two", 12), ("Body three", 12), ("Body four", 12), ("Body five", 12), ("Main Title", 20), ("Section Title", 14), ("Paragraph text", 12), ], ]) result = parser.parse(str(pdf_path)) assert "## Main Title" in result assert "### Section Title" in result # Body text should not have heading markers assert "## Body one" not in result assert "## Paragraph text" not in result