Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
118
writer.py
Normal file
118
writer.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Markdown / JSON 输出写入器"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
|
||||
from models import Chunk
|
||||
|
||||
|
||||
class MarkdownWriter:
    """Writes a list of Chunk objects out as a single Markdown file."""

    def write(
        self,
        chunks: List[Chunk],
        output_path: str,
        source_file: str,
        delimiter: str = "---",
    ) -> None:
        """Render *chunks* to *output_path* as Markdown.

        Behavior:
        - The parent directory is created on demand.
        - An existing file is overwritten after printing a console warning.
        - The document opens with an HTML comment carrying metadata
          (source file, processing time, chunk count/size stats, upload hint).
        - Every chunk gets an ``##`` heading (tag prefixed in backticks when
          present), and consecutive chunks are separated by *delimiter*.
        """
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if os.path.exists(output_path):
            # Overwriting silently would be surprising — warn first.
            print(f"警告: 输出文件已存在,将覆盖: {output_path}")

        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        sizes = [len(c.content) for c in chunks]
        mean_len = sum(sizes) // max(len(sizes), 1)  # guard against empty input
        peak_len = max(sizes) if sizes else 0

        # Leading HTML comment with provenance and upload hints.
        out = [
            "<!-- ",
            f" 源文件: {source_file}",
            f" 处理时间: {stamp}",
            f" 分块总数: {len(chunks)}",
            f" 分块大小: 平均 {mean_len} 字, 最大 {peak_len} 字",
            f" 上传提示: 使用「自定义分段」,分隔符设为 {delimiter}",
            "-->",
        ]

        last = len(chunks) - 1
        for idx, chunk in enumerate(chunks):
            heading = (
                f"## `{chunk.tag}` {chunk.title}"
                if chunk.tag
                else f"## {chunk.title}"
            )
            out += ["", heading, "", chunk.content]
            # No delimiter after the final chunk.
            if idx != last:
                out += ["", delimiter]

        out.append("")  # ensure the file ends with a newline

        with open(output_path, "w", encoding="utf-8") as fh:
            fh.write("\n".join(out))
|
||||
|
||||
|
||||
class JsonWriter:
    """Writes a list of Chunk objects as JSON, suited to vector-DB import."""

    def write(
        self,
        chunks: List[Chunk],
        output_path: str,
        source_file: str,
        delimiter: str = "---",
    ) -> None:
        """Serialize *chunks* to a ``.json`` file derived from *output_path*.

        Behavior:
        - The parent directory of *output_path* is created on demand.
        - The output file keeps *output_path*'s stem but always uses a
          ``.json`` extension; an existing file is overwritten after a
          console warning.
        - The JSON document holds metadata (source file, ISO timestamp,
          chunk total) plus one record per chunk.
        - *delimiter* is accepted only for signature parity with
          ``MarkdownWriter.write`` and is not used here.
        """
        folder = os.path.dirname(output_path)
        if folder:
            os.makedirs(folder, exist_ok=True)

        # Force a .json extension regardless of what the caller passed.
        target = os.path.splitext(output_path)[0] + ".json"

        if os.path.exists(target):
            print(f"警告: 输出文件已存在,将覆盖: {target}")

        payload = {
            "source_file": source_file,
            "process_time": datetime.now().isoformat(),
            "total_chunks": len(chunks),
            "chunks": [],
        }
        for idx, chunk in enumerate(chunks):
            payload["chunks"].append(
                {
                    "index": idx,
                    "tag": chunk.tag,
                    "title": chunk.title,
                    "content": chunk.content,
                    "char_count": len(chunk.content),
                }
            )

        with open(target, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
|
||||
Reference in New Issue
Block a user