Files
bigwo/tests/test_writer.py

226 lines
7.6 KiB
Python
Raw Normal View History

"""MarkdownWriter 单元测试"""
import pytest
from models import Chunk
from writer import MarkdownWriter
@pytest.fixture
def writer():
    """Provide a newly constructed ``MarkdownWriter`` to the requesting test."""
    markdown_writer = MarkdownWriter()
    return markdown_writer
@pytest.fixture
def tmp_output(tmp_path):
    """Return the path of ``output.md`` inside ``tmp_path`` as a string.

    Only the path is built here; the file itself is not created.
    """
    output_file = tmp_path / "output.md"
    return str(output_file)
class TestSingleChunk:
    """Output tests for a single Chunk."""

    def test_single_chunk_no_delimiter(self, writer, tmp_output):
        """A lone chunk must not produce a ``---`` delimiter in the body."""
        chunks = [Chunk(title="摘要标题", content="这是内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        # Use a context manager so the file handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # Only the text following the first "-->" is inspected.
        assert "---" not in content.split("-->", 1)[1]

    def test_single_chunk_has_title(self, writer, tmp_output):
        """The chunk title is written as a ``## `` heading."""
        chunks = [Chunk(title="摘要标题", content="这是内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 摘要标题" in content

    def test_single_chunk_has_content(self, writer, tmp_output):
        """The chunk content appears in the output file."""
        chunks = [Chunk(title="摘要标题", content="这是内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "这是内容" in content
class TestMultipleChunks:
    """Output tests for several Chunks written to one file."""

    def test_delimiter_between_chunks(self, writer, tmp_output):
        """Three chunks are separated by exactly two ``---`` delimiter lines."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
            Chunk(title="标题3", content="内容3"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")
        # Use a context manager so the file handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # Only the text following the first "-->" is inspected.
        after_meta = content.split("-->", 1)[1]
        assert after_meta.count("\n---\n") == 2

    def test_all_titles_present(self, writer, tmp_output):
        """Every chunk title is written as a ``## `` heading."""
        chunks = [
            Chunk(title="标题A", content="内容A"),
            Chunk(title="标题B", content="内容B"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 标题A" in content
        assert "## 标题B" in content

    def test_all_contents_present(self, writer, tmp_output):
        """Every chunk content appears in the output file."""
        chunks = [
            Chunk(title="标题A", content="内容A"),
            Chunk(title="标题B", content="内容B"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "内容A" in content
        assert "内容B" in content

    def test_no_trailing_delimiter(self, writer, tmp_output):
        """No delimiter is written after the last chunk."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        after_meta = content.split("-->", 1)[1]
        # The last chunk content should appear after the last delimiter
        # and there should be no delimiter after the last content.
        last_delimiter_pos = after_meta.rfind("\n---\n")
        last_content_pos = after_meta.rfind("内容2")
        # rfind() returns -1 when nothing is found; without this guard the
        # ordering assertion below would pass even if no delimiter was written.
        assert last_delimiter_pos != -1
        assert last_content_pos > last_delimiter_pos
        # Also catches a trailing delimiter that lacks a final newline.
        assert "---" not in after_meta[last_content_pos:]
class TestMetaInfo:
    """Tests for the metadata comment written to the output file."""

    def test_contains_source_file(self, writer, tmp_output):
        """The output records the source file name passed to ``write``."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "example.pdf")
        # Use a context manager so the file handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "源文件: example.pdf" in content

    def test_contains_process_time(self, writer, tmp_output):
        """The output contains a processing-time label."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "处理时间:" in content

    def test_contains_chunk_count(self, writer, tmp_output):
        """The output records the total number of chunks written."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
            Chunk(title="标题3", content="内容3"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "分块总数: 3" in content

    def test_meta_is_html_comment(self, writer, tmp_output):
        """The file starts with ``<!-- `` and contains a closing ``-->``."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert content.startswith("<!-- ")
        assert "-->" in content

    def test_meta_at_file_start(self, writer, tmp_output):
        """The comment terminator ``-->`` precedes the first chunk heading."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        comment_end = content.index("-->")
        title_pos = content.index("## 标题")
        assert comment_end < title_pos
class TestFileOverwrite:
    """Tests for writing to a path that may already exist."""

    def test_overwrites_existing_file(self, writer, tmp_output):
        """Existing file content is replaced by the new output."""
        with open(tmp_output, "w", encoding="utf-8") as f:
            f.write("旧内容")
        chunks = [Chunk(title="新标题", content="新内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        # Read back with a context manager so the handle is closed
        # deterministically, matching the write above.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "旧内容" not in content
        assert "新内容" in content

    def test_prints_warning_on_overwrite(self, writer, tmp_output, capsys):
        """Overwriting prints a warning naming the output path to stdout."""
        with open(tmp_output, "w", encoding="utf-8") as f:
            f.write("旧内容")
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        captured = capsys.readouterr()
        assert "警告" in captured.out
        assert tmp_output in captured.out

    def test_no_warning_for_new_file(self, writer, tmp_output, capsys):
        """No warning is printed to stdout when the output file is new."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")
        captured = capsys.readouterr()
        assert "警告" not in captured.out
class TestCustomDelimiter:
    """Tests for a caller-supplied delimiter."""

    def test_custom_delimiter(self, writer, tmp_output):
        """Passing ``delimiter="==="`` replaces the ``---`` separator line."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
        ]
        writer.write(chunks, tmp_output, "test.pdf", delimiter="===")
        # Use a context manager so the file handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # Only the text following the first "-->" is inspected.
        after_meta = content.split("-->", 1)[1]
        assert "\n===\n" in after_meta
        assert "\n---\n" not in after_meta
class TestEmptyContent:
    """Tests for Chunks whose content is an empty string."""

    def test_empty_content_chunk(self, writer, tmp_output):
        """A chunk with empty content still gets its ``## `` heading."""
        chunks = [Chunk(title="空内容标题", content="")]
        writer.write(chunks, tmp_output, "test.pdf")
        # Use a context manager so the file handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 空内容标题" in content

    def test_empty_content_with_multiple_chunks(self, writer, tmp_output):
        """An empty chunk does not suppress the headings or content of others."""
        chunks = [
            Chunk(title="标题1", content=""),
            Chunk(title="标题2", content="有内容"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 标题1" in content
        assert "## 标题2" in content
        assert "有内容" in content
class TestUTF8Encoding:
    """Tests that non-ASCII text survives a UTF-8 write and read back."""

    def test_utf8_encoding(self, writer, tmp_output):
        """Chinese text and special symbols are readable as UTF-8."""
        chunks = [Chunk(title="中文标题", content="中文内容,包含特殊字符:①②③")]
        writer.write(chunks, tmp_output, "测试文件.pdf")
        # Use a context manager so the file handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "中文标题" in content
        assert "①②③" in content
        assert "测试文件.pdf" in content