"""MarkdownWriter 单元测试""" import pytest from models import Chunk from writer import MarkdownWriter @pytest.fixture def writer(): return MarkdownWriter() @pytest.fixture def tmp_output(tmp_path): return str(tmp_path / "output.md") class TestSingleChunk: """单个 Chunk 输出测试""" def test_single_chunk_no_delimiter(self, writer, tmp_output): chunks = [Chunk(title="摘要标题", content="这是内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "---" not in content.split("-->", 1)[1] def test_single_chunk_has_title(self, writer, tmp_output): chunks = [Chunk(title="摘要标题", content="这是内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "## 摘要标题" in content def test_single_chunk_has_content(self, writer, tmp_output): chunks = [Chunk(title="摘要标题", content="这是内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "这是内容" in content class TestMultipleChunks: """多个 Chunk 输出测试""" def test_delimiter_between_chunks(self, writer, tmp_output): chunks = [ Chunk(title="标题1", content="内容1"), Chunk(title="标题2", content="内容2"), Chunk(title="标题3", content="内容3"), ] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() after_meta = content.split("-->", 1)[1] assert after_meta.count("\n---\n") == 2 def test_all_titles_present(self, writer, tmp_output): chunks = [ Chunk(title="标题A", content="内容A"), Chunk(title="标题B", content="内容B"), ] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "## 标题A" in content assert "## 标题B" in content def test_all_contents_present(self, writer, tmp_output): chunks = [ Chunk(title="标题A", content="内容A"), Chunk(title="标题B", content="内容B"), ] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "内容A" in content assert "内容B" in content def test_no_trailing_delimiter(self, writer, tmp_output): chunks = [ Chunk(title="标题1", content="内容1"), Chunk(title="标题2", content="内容2"), ] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() after_meta = content.split("-->", 1)[1] # The last chunk content should appear after the last delimiter # and there should be no delimiter after the last content last_delimiter_pos = after_meta.rfind("\n---\n") last_content_pos = after_meta.rfind("内容2") assert last_content_pos > last_delimiter_pos class TestMetaInfo: """元信息注释测试""" def test_contains_source_file(self, writer, tmp_output): chunks = [Chunk(title="标题", content="内容")] writer.write(chunks, tmp_output, "example.pdf") content = open(tmp_output, encoding="utf-8").read() assert "源文件: example.pdf" in content def test_contains_process_time(self, writer, tmp_output): chunks = [Chunk(title="标题", content="内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "处理时间:" in content def test_contains_chunk_count(self, writer, tmp_output): chunks = [ Chunk(title="标题1", content="内容1"), Chunk(title="标题2", content="内容2"), Chunk(title="标题3", content="内容3"), ] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "分块总数: 3" in content def test_meta_is_html_comment(self, writer, tmp_output): chunks = [Chunk(title="标题", content="内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert content.startswith("" in content def test_meta_at_file_start(self, writer, tmp_output): chunks = [Chunk(title="标题", content="内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() comment_end = content.index("-->") title_pos = content.index("## 标题") assert comment_end < title_pos class TestFileOverwrite: """文件覆盖测试""" def test_overwrites_existing_file(self, writer, tmp_output): with open(tmp_output, "w", encoding="utf-8") as f: f.write("旧内容") chunks = [Chunk(title="新标题", content="新内容")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "旧内容" not in content assert "新内容" in content def test_prints_warning_on_overwrite(self, writer, tmp_output, capsys): with open(tmp_output, "w", encoding="utf-8") as f: f.write("旧内容") chunks = [Chunk(title="标题", content="内容")] writer.write(chunks, tmp_output, "test.pdf") captured = capsys.readouterr() assert "警告" in captured.out assert tmp_output in captured.out def test_no_warning_for_new_file(self, writer, tmp_output, capsys): chunks = [Chunk(title="标题", content="内容")] writer.write(chunks, tmp_output, "test.pdf") captured = capsys.readouterr() assert "警告" not in captured.out class TestCustomDelimiter: """自定义分隔符测试""" def test_custom_delimiter(self, writer, tmp_output): chunks = [ Chunk(title="标题1", content="内容1"), Chunk(title="标题2", content="内容2"), ] writer.write(chunks, tmp_output, "test.pdf", delimiter="===") content = open(tmp_output, encoding="utf-8").read() after_meta = content.split("-->", 1)[1] assert "\n===\n" in after_meta assert "\n---\n" not in after_meta class TestEmptyContent: """空内容 Chunk 测试""" def test_empty_content_chunk(self, writer, tmp_output): chunks = [Chunk(title="空内容标题", content="")] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "## 空内容标题" in content def test_empty_content_with_multiple_chunks(self, writer, tmp_output): chunks = [ Chunk(title="标题1", content=""), Chunk(title="标题2", content="有内容"), ] writer.write(chunks, tmp_output, "test.pdf") content = open(tmp_output, encoding="utf-8").read() assert "## 标题1" in content assert "## 标题2" in content assert "有内容" in content class TestUTF8Encoding: """UTF-8 编码测试""" def test_utf8_encoding(self, writer, tmp_output): chunks = [Chunk(title="中文标题", content="中文内容,包含特殊字符:①②③")] writer.write(chunks, tmp_output, "测试文件.pdf") content = open(tmp_output, encoding="utf-8").read() assert "中文标题" in content assert "①②③" in content assert "测试文件.pdf" in content