Files
bigwo/tests/test_pdf_parser.py

160 lines
5.3 KiB
Python
Raw Permalink Normal View History

"""PdfParser 单元测试"""
import pytest
import fitz
from exceptions import ParseError
from parsers.pdf_parser import PdfParser
@pytest.fixture
def parser():
    """Return a newly constructed ``PdfParser`` for the requesting test."""
    pdf_parser = PdfParser()
    return pdf_parser
def _create_pdf(path, pages):
    """Create a PDF file for use in tests.

    Args:
        path: Output file path; it is converted with ``str()`` before saving.
        pages: A list with one entry per page. Each entry is a list of
            ``(text, fontsize)`` tuples, one per text line on that page.
            An empty entry produces a page without any text.
    """
    doc = fitz.open()
    try:
        for page_items in pages:
            page = doc.new_page()
            # Every page starts its first line at the same vertical offset.
            y = 72
            for text, fontsize in page_items:
                page.insert_text((72, y), text, fontsize=fontsize)
                # Step down by the font size plus a fixed gap so that
                # consecutive lines of differing sizes do not overlap.
                y += fontsize + 10
        doc.save(str(path))
    finally:
        # Release the document even when page creation, text insertion
        # or saving fails, so a failing test does not leak the handle.
        doc.close()
class TestSupportedExtensions:
    """Checks on the extensions reported by ``supported_extensions()``."""

    def test_supports_pdf(self, parser):
        """``.pdf`` is among the reported extensions."""
        extensions = parser.supported_extensions()
        assert ".pdf" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension is reported."""
        extension_count = len(parser.supported_extensions())
        assert extension_count == 1
class TestParse:
    """Tests for ``PdfParser.parse``: text extraction, headings and errors."""

    def test_parse_simple_text(self, parser, tmp_path):
        """A single line of text appears in the parsed output."""
        pdf_path = tmp_path / "simple.pdf"
        _create_pdf(pdf_path, [
            [("Hello, world!", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "Hello, world!" in result

    def test_parse_multiline_text(self, parser, tmp_path):
        """Every line of a multi-line page appears in the parsed output."""
        pdf_path = tmp_path / "multi.pdf"
        _create_pdf(pdf_path, [
            [("Line one", 12), ("Line two", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "Line one" in result
        assert "Line two" in result

    def test_parse_multiple_pages(self, parser, tmp_path):
        """Text from every page appears in the parsed output."""
        pdf_path = tmp_path / "pages.pdf"
        _create_pdf(pdf_path, [
            [("Page one content", 12)],
            [("Page two content", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "Page one content" in result
        assert "Page two content" in result

    def test_heading_level2_detection(self, parser, tmp_path):
        """Font size > body_mode + 2 should produce ## heading"""
        pdf_path = tmp_path / "h2.pdf"
        # Body text at size 12 (will be the mode), heading at size 18 (diff=6 > 2)
        _create_pdf(pdf_path, [
            [
                ("Body text line one", 12),
                ("Body text line two", 12),
                ("Body text line three", 12),
                ("Big Heading", 18),
            ],
        ])
        result = parser.parse(str(pdf_path))
        assert "## Big Heading" in result
        # "## Big Heading" is a substring of "### Big Heading", so the check
        # above alone cannot tell level 2 from level 3; rule level 3 out.
        assert "### Big Heading" not in result

    def test_heading_level3_detection(self, parser, tmp_path):
        """Font size > body_mode + 0.5 but <= body_mode + 2 should produce ### heading"""
        pdf_path = tmp_path / "h3.pdf"
        # Body text at size 12 (mode), heading at size 13.5 (diff=1.5, >0.5 and <=2)
        _create_pdf(pdf_path, [
            [
                ("Body text one", 12),
                ("Body text two", 12),
                ("Body text three", 12),
                ("Sub Heading", 13.5),
            ],
        ])
        result = parser.parse(str(pdf_path))
        assert "### Sub Heading" in result

    def test_body_text_no_heading_prefix(self, parser, tmp_path):
        """Text at body font size should not have heading prefix"""
        pdf_path = tmp_path / "body.pdf"
        _create_pdf(pdf_path, [
            [("Normal text", 12), ("More normal text", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "## Normal text" not in result
        assert "### Normal text" not in result
        assert "Normal text" in result

    def test_empty_pdf(self, parser, tmp_path):
        """Empty PDF (no text) should return empty string"""
        pdf_path = tmp_path / "empty.pdf"
        # One page with no text lines; the shared helper handles saving
        # and closing the document.
        _create_pdf(pdf_path, [[]])
        result = parser.parse(str(pdf_path))
        assert result.strip() == ""

    def test_nonexistent_file_raises(self, parser):
        """A missing file raises ParseError carrying the file name and a reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.pdf")
        # These checks sit after the ``with`` block on purpose: statements
        # placed after the raising call inside the block would never run.
        assert "file.pdf" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """A file that is not a valid PDF raises ParseError naming the file."""
        pdf_path = tmp_path / "corrupted.pdf"
        pdf_path.write_bytes(b"this is not a pdf file at all")
        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(pdf_path))
        assert "corrupted.pdf" in exc_info.value.file_name

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name is the base name, not the full path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/report.pdf")
        assert exc_info.value.file_name == "report.pdf"

    def test_mixed_headings_and_body(self, parser, tmp_path):
        """Test a document with mixed heading levels and body text"""
        pdf_path = tmp_path / "mixed.pdf"
        _create_pdf(pdf_path, [
            [
                ("Body one", 12),
                ("Body two", 12),
                ("Body three", 12),
                ("Body four", 12),
                ("Body five", 12),
                ("Main Title", 20),
                ("Section Title", 14),
                ("Paragraph text", 12),
            ],
        ])
        result = parser.parse(str(pdf_path))
        assert "## Main Title" in result
        # "## Main Title" also matches inside "### Main Title"; make sure the
        # title was emitted at level 2 and not level 3.
        assert "### Main Title" not in result
        assert "### Section Title" in result
        # Body text should not have heading markers
        assert "## Body one" not in result
        assert "## Paragraph text" not in result