160 lines
5.3 KiB
Python
160 lines
5.3 KiB
Python
"""PdfParser 单元测试"""
|
|
|
|
import pytest
|
|
import fitz
|
|
|
|
from exceptions import ParseError
|
|
from parsers.pdf_parser import PdfParser
|
|
|
|
|
|
@pytest.fixture
def parser():
    """Build and return a new ``PdfParser`` for the requesting test."""
    instance = PdfParser()
    return instance
|
|
|
|
|
|
def _create_pdf(path, pages):
    """
    Create a PDF file for use in tests.

    One page is added per entry in ``pages``. On each page the lines are
    inserted at point ``(72, y)``, with ``y`` starting at 72 and advancing
    by the line's font size plus 10 after every line.

    Args:
        path: Output file path; converted with ``str()`` before saving.
        pages: A list with one entry per page. Each entry is a list of
            ``(text, fontsize)`` tuples, one per text line on that page.
            An empty entry yields a page with no text.
    """
    doc = fitz.open()
    try:
        for page_items in pages:
            page = doc.new_page()
            y = 72
            for text, fontsize in page_items:
                page.insert_text((72, y), text, fontsize=fontsize)
                y += fontsize + 10
        doc.save(str(path))
    finally:
        # Release the document even when building or saving fails, so a
        # failing test does not leak the open handle.
        doc.close()
|
|
|
|
|
|
class TestSupportedExtensions:
    """Checks on the extensions reported by ``supported_extensions()``."""

    def test_supports_pdf(self, parser):
        """``.pdf`` is among the reported extensions."""
        extensions = parser.supported_extensions()
        assert ".pdf" in extensions

    def test_only_one_extension(self, parser):
        """Exactly one extension is reported."""
        extensions = parser.supported_extensions()
        assert len(extensions) == 1
|
|
|
|
|
|
class TestParse:
    """Tests for ``parser.parse()`` on generated, missing and invalid PDFs."""

    def test_parse_simple_text(self, parser, tmp_path):
        """A single line of text appears in the parse result."""
        pdf_path = tmp_path / "simple.pdf"
        _create_pdf(pdf_path, [
            [("Hello, world!", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "Hello, world!" in result

    def test_parse_multiline_text(self, parser, tmp_path):
        """Every line on a page appears in the parse result."""
        pdf_path = tmp_path / "multi.pdf"
        _create_pdf(pdf_path, [
            [("Line one", 12), ("Line two", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "Line one" in result
        assert "Line two" in result

    def test_parse_multiple_pages(self, parser, tmp_path):
        """Text from every page appears in the parse result."""
        pdf_path = tmp_path / "pages.pdf"
        _create_pdf(pdf_path, [
            [("Page one content", 12)],
            [("Page two content", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "Page one content" in result
        assert "Page two content" in result

    def test_heading_level2_detection(self, parser, tmp_path):
        """Font size > body_mode + 2 should produce ## heading"""
        pdf_path = tmp_path / "h2.pdf"
        # Body text at size 12 (will be the mode), heading at size 18 (diff=6 > 2)
        _create_pdf(pdf_path, [
            [
                ("Body text line one", 12),
                ("Body text line two", 12),
                ("Body text line three", 12),
                ("Big Heading", 18),
            ],
        ])
        result = parser.parse(str(pdf_path))
        assert "## Big Heading" in result
        # "## Big Heading" is a substring of "### Big Heading", so the check
        # above alone cannot tell level 2 from level 3.
        assert "### Big Heading" not in result

    def test_heading_level3_detection(self, parser, tmp_path):
        """Font size > body_mode + 0.5 but <= body_mode + 2 should produce ### heading"""
        pdf_path = tmp_path / "h3.pdf"
        # Body text at size 12 (mode), heading at size 13.5 (diff=1.5, >0.5 and <=2)
        _create_pdf(pdf_path, [
            [
                ("Body text one", 12),
                ("Body text two", 12),
                ("Body text three", 12),
                ("Sub Heading", 13.5),
            ],
        ])
        result = parser.parse(str(pdf_path))
        assert "### Sub Heading" in result

    def test_body_text_no_heading_prefix(self, parser, tmp_path):
        """Text at body font size should not have heading prefix"""
        pdf_path = tmp_path / "body.pdf"
        _create_pdf(pdf_path, [
            [("Normal text", 12), ("More normal text", 12)],
        ])
        result = parser.parse(str(pdf_path))
        assert "## Normal text" not in result
        assert "### Normal text" not in result
        assert "Normal text" in result

    def test_empty_pdf(self, parser, tmp_path):
        """Empty PDF (no text) should return empty string"""
        pdf_path = tmp_path / "empty.pdf"
        # One page with no text lines: the helper adds the page, inserts
        # nothing, then saves and closes the document.
        _create_pdf(pdf_path, [[]])
        result = parser.parse(str(pdf_path))
        assert result.strip() == ""

    def test_nonexistent_file_raises(self, parser):
        """Parsing a missing file raises ParseError with file name and reason."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/nonexistent/path/file.pdf")
        # Checked after the ``with`` block: statements placed after the
        # raising call inside the block would never execute.
        assert "file.pdf" in exc_info.value.file_name
        assert exc_info.value.reason != ""

    def test_corrupted_file_raises(self, parser, tmp_path):
        """Parsing a file that is not a PDF raises ParseError naming the file."""
        pdf_path = tmp_path / "corrupted.pdf"
        pdf_path.write_bytes(b"this is not a pdf file at all")
        with pytest.raises(ParseError) as exc_info:
            parser.parse(str(pdf_path))
        assert "corrupted.pdf" in exc_info.value.file_name

    def test_parse_error_contains_filename(self, parser):
        """ParseError.file_name holds only the base name of the path."""
        with pytest.raises(ParseError) as exc_info:
            parser.parse("/no/such/report.pdf")
        assert exc_info.value.file_name == "report.pdf"

    def test_mixed_headings_and_body(self, parser, tmp_path):
        """Test a document with mixed heading levels and body text"""
        pdf_path = tmp_path / "mixed.pdf"
        _create_pdf(pdf_path, [
            [
                ("Body one", 12),
                ("Body two", 12),
                ("Body three", 12),
                ("Body four", 12),
                ("Body five", 12),
                ("Main Title", 20),
                ("Section Title", 14),
                ("Paragraph text", 12),
            ],
        ])
        result = parser.parse(str(pdf_path))
        assert "## Main Title" in result
        # Guard against a level-3 marker satisfying the substring check above.
        assert "### Main Title" not in result
        assert "### Section Title" in result
        # Body text should not have heading markers
        assert "## Body one" not in result
        assert "## Paragraph text" not in result
|