Initial commit: AI 知识库文档智能分块工具
This commit is contained in:
118
writer.py
Normal file
118
writer.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Markdown / JSON 输出写入器"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
|
||||
from models import Chunk
|
||||
|
||||
|
||||
class MarkdownWriter:
    """Writes a list of Chunk objects out as a single Markdown file."""

    def write(
        self,
        chunks: List[Chunk],
        output_path: str,
        source_file: str,
        delimiter: str = "---",
    ) -> None:
        """Render *chunks* to *output_path* as Markdown.

        Behavior:
        - The parent directory is created on demand.
        - An existing file is overwritten after printing a console warning.
        - The document opens with an HTML comment carrying metadata
          (source file, processing time, chunk count/size stats, upload hint).
        - Every chunk gets an ``##`` heading (tag prefixed in backticks when
          present), and consecutive chunks are separated by *delimiter*.
        """
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if os.path.exists(output_path):
            # Overwriting silently would be surprising — warn first.
            print(f"警告: 输出文件已存在,将覆盖: {output_path}")

        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        sizes = [len(c.content) for c in chunks]
        mean_len = sum(sizes) // max(len(sizes), 1)  # guard against empty input
        peak_len = max(sizes) if sizes else 0

        # Leading HTML comment with provenance and upload hints.
        out = [
            "<!-- ",
            f" 源文件: {source_file}",
            f" 处理时间: {stamp}",
            f" 分块总数: {len(chunks)}",
            f" 分块大小: 平均 {mean_len} 字, 最大 {peak_len} 字",
            f" 上传提示: 使用「自定义分段」,分隔符设为 {delimiter}",
            "-->",
        ]

        last = len(chunks) - 1
        for idx, chunk in enumerate(chunks):
            heading = (
                f"## `{chunk.tag}` {chunk.title}"
                if chunk.tag
                else f"## {chunk.title}"
            )
            out += ["", heading, "", chunk.content]
            # No delimiter after the final chunk.
            if idx != last:
                out += ["", delimiter]

        out.append("")  # ensure the file ends with a newline

        with open(output_path, "w", encoding="utf-8") as fh:
            fh.write("\n".join(out))
|
||||
|
||||
|
||||
class JsonWriter:
    """Writes a list of Chunk objects as JSON, suited to vector-DB import."""

    def write(
        self,
        chunks: List[Chunk],
        output_path: str,
        source_file: str,
        delimiter: str = "---",
    ) -> None:
        """Serialize *chunks* to a ``.json`` file derived from *output_path*.

        Behavior:
        - The parent directory of *output_path* is created on demand.
        - The output file keeps *output_path*'s stem but always uses a
          ``.json`` extension; an existing file is overwritten after a
          console warning.
        - The JSON document holds metadata (source file, ISO timestamp,
          chunk total) plus one record per chunk.
        - *delimiter* is accepted only for signature parity with
          ``MarkdownWriter.write`` and is not used here.
        """
        folder = os.path.dirname(output_path)
        if folder:
            os.makedirs(folder, exist_ok=True)

        # Force a .json extension regardless of what the caller passed.
        target = os.path.splitext(output_path)[0] + ".json"

        if os.path.exists(target):
            print(f"警告: 输出文件已存在,将覆盖: {target}")

        payload = {
            "source_file": source_file,
            "process_time": datetime.now().isoformat(),
            "total_chunks": len(chunks),
            "chunks": [],
        }
        for idx, chunk in enumerate(chunks):
            payload["chunks"].append(
                {
                    "index": idx,
                    "tag": chunk.tag,
                    "title": chunk.title,
                    "content": chunk.content,
                    "char_count": len(chunk.content),
                }
            )

        with open(target, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)
|
||||
Reference in New Issue
Block a user