226 lines
7.6 KiB
Python
226 lines
7.6 KiB
Python
"""MarkdownWriter 单元测试"""
|
|
|
|
import pytest
|
|
|
|
from models import Chunk
|
|
from writer import MarkdownWriter
|
|
|
|
|
|
@pytest.fixture
def writer():
    """Provide a newly constructed ``MarkdownWriter`` to the requesting test."""
    markdown_writer = MarkdownWriter()
    return markdown_writer
|
|
|
|
|
|
@pytest.fixture
def tmp_output(tmp_path):
    """Return the path of ``output.md`` inside ``tmp_path``, as a string."""
    output_file = tmp_path.joinpath("output.md")
    return str(output_file)
|
|
|
|
|
|
class TestSingleChunk:
    """Output checks for a document that consists of a single Chunk."""

    def test_single_chunk_no_delimiter(self, writer, tmp_output):
        """A lone chunk must not produce any ``---`` after the meta comment."""
        chunks = [Chunk(title="摘要标题", content="这是内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # Only inspect the text after the closing "-->" of the meta comment.
        assert "---" not in content.split("-->", 1)[1]

    def test_single_chunk_has_title(self, writer, tmp_output):
        """The chunk title is expected as a level-2 Markdown heading."""
        chunks = [Chunk(title="摘要标题", content="这是内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 摘要标题" in content

    def test_single_chunk_has_content(self, writer, tmp_output):
        """The chunk content must appear in the written file."""
        chunks = [Chunk(title="摘要标题", content="这是内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "这是内容" in content
|
|
|
|
|
|
class TestMultipleChunks:
    """Output checks for documents that contain several Chunks."""

    def test_delimiter_between_chunks(self, writer, tmp_output):
        """Three chunks are expected to be separated by exactly two delimiters."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
            Chunk(title="标题3", content="内容3"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # Only inspect the text after the closing "-->" of the meta comment.
        after_meta = content.split("-->", 1)[1]
        assert after_meta.count("\n---\n") == 2

    def test_all_titles_present(self, writer, tmp_output):
        """Every chunk title is expected as a level-2 Markdown heading."""
        chunks = [
            Chunk(title="标题A", content="内容A"),
            Chunk(title="标题B", content="内容B"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 标题A" in content
        assert "## 标题B" in content

    def test_all_contents_present(self, writer, tmp_output):
        """Every chunk's content must appear in the written file."""
        chunks = [
            Chunk(title="标题A", content="内容A"),
            Chunk(title="标题B", content="内容B"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "内容A" in content
        assert "内容B" in content

    def test_no_trailing_delimiter(self, writer, tmp_output):
        """No delimiter may follow the content of the last chunk."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        after_meta = content.split("-->", 1)[1]

        last_delimiter_pos = after_meta.rfind("\n---\n")
        last_content_pos = after_meta.rfind("内容2")

        # Without this guard, rfind() returning -1 (no delimiter at all) would
        # make the position comparison below pass vacuously.
        assert last_delimiter_pos != -1
        # The last chunk's content must come after the last delimiter.
        assert last_content_pos > last_delimiter_pos
        # Also catch a trailing "---" that has no final newline, which the
        # "\n---\n" search above cannot see.
        assert not after_meta.rstrip().endswith("---")
|
|
|
|
|
|
class TestMetaInfo:
    """Checks for the meta-information comment at the top of the output."""

    def test_contains_source_file(self, writer, tmp_output):
        """The source file name passed to ``write`` must be recorded."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "example.pdf")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "源文件: example.pdf" in content

    def test_contains_process_time(self, writer, tmp_output):
        """A processing-time label must be present in the output."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "处理时间:" in content

    def test_contains_chunk_count(self, writer, tmp_output):
        """The total number of chunks written must be recorded."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
            Chunk(title="标题3", content="内容3"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "分块总数: 3" in content

    def test_meta_is_html_comment(self, writer, tmp_output):
        """The file must open with ``<!-- `` and contain a closing ``-->``."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert content.startswith("<!-- ")
        assert "-->" in content

    def test_meta_at_file_start(self, writer, tmp_output):
        """The meta comment must end before the first chunk heading begins."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # str.index raises ValueError if either marker is missing, which also
        # fails the test.
        comment_end = content.index("-->")
        title_pos = content.index("## 标题")
        assert comment_end < title_pos
|
|
|
|
|
|
class TestFileOverwrite:
    """Checks for writing to a path that may already exist."""

    def test_overwrites_existing_file(self, writer, tmp_output):
        """Pre-existing file content must be replaced by the new output."""
        with open(tmp_output, "w", encoding="utf-8") as f:
            f.write("旧内容")

        chunks = [Chunk(title="新标题", content="新内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "旧内容" not in content
        assert "新内容" in content

    def test_prints_warning_on_overwrite(self, writer, tmp_output, capsys):
        """Overwriting must print a warning to stdout that names the path."""
        with open(tmp_output, "w", encoding="utf-8") as f:
            f.write("旧内容")

        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        captured = capsys.readouterr()
        assert "警告" in captured.out
        assert tmp_output in captured.out

    def test_no_warning_for_new_file(self, writer, tmp_output, capsys):
        """Writing to a path that does not exist yet must not print a warning."""
        chunks = [Chunk(title="标题", content="内容")]
        writer.write(chunks, tmp_output, "test.pdf")

        captured = capsys.readouterr()
        assert "警告" not in captured.out
|
|
|
|
|
|
class TestCustomDelimiter:
    """Checks for the ``delimiter`` keyword argument of ``write``."""

    def test_custom_delimiter(self, writer, tmp_output):
        """A custom delimiter must be used instead of ``---``."""
        chunks = [
            Chunk(title="标题1", content="内容1"),
            Chunk(title="标题2", content="内容2"),
        ]
        writer.write(chunks, tmp_output, "test.pdf", delimiter="===")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        # Only inspect the text after the closing "-->" of the meta comment.
        after_meta = content.split("-->", 1)[1]
        assert "\n===\n" in after_meta
        assert "\n---\n" not in after_meta
|
|
|
|
|
|
class TestEmptyContent:
    """Checks for Chunks whose content is the empty string."""

    def test_empty_content_chunk(self, writer, tmp_output):
        """A chunk with empty content must still have its heading written."""
        chunks = [Chunk(title="空内容标题", content="")]
        writer.write(chunks, tmp_output, "test.pdf")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 空内容标题" in content

    def test_empty_content_with_multiple_chunks(self, writer, tmp_output):
        """An empty chunk must not suppress its own or a following chunk's output."""
        chunks = [
            Chunk(title="标题1", content=""),
            Chunk(title="标题2", content="有内容"),
        ]
        writer.write(chunks, tmp_output, "test.pdf")

        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "## 标题1" in content
        assert "## 标题2" in content
        assert "有内容" in content
|
|
|
|
|
|
class TestUTF8Encoding:
    """Checks that non-ASCII text is readable back as UTF-8."""

    def test_utf8_encoding(self, writer, tmp_output):
        """Chinese text and circled digits must be present when read as UTF-8."""
        chunks = [Chunk(title="中文标题", content="中文内容,包含特殊字符:①②③")]
        writer.write(chunks, tmp_output, "测试文件.pdf")

        # Read through a context manager so the handle is closed deterministically.
        with open(tmp_output, encoding="utf-8") as f:
            content = f.read()
        assert "中文标题" in content
        assert "①②③" in content
        assert "测试文件.pdf" in content
|