119 lines
3.7 KiB
Python
119 lines
3.7 KiB
Python
|
|
"""Markdown / JSON 输出写入器"""
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
from datetime import datetime
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
from models import Chunk
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MarkdownWriter:
    """Write a list of Chunk objects to a Markdown file."""

    def write(
        self,
        chunks: List[Chunk],
        output_path: str,
        source_file: str,
        delimiter: str = "---",
    ) -> None:
        """Render ``chunks`` as Markdown and save the result to ``output_path``.

        The output directory is created when it does not exist. The file
        starts with an HTML comment holding metadata (source file name,
        processing time, chunk count, average/maximum chunk length and a
        hint naming the delimiter). Each chunk is emitted as a ``##``
        heading (prefixed with the chunk's tag in backticks when the tag is
        truthy) followed by the chunk content; consecutive chunks are
        separated by ``delimiter`` on its own line. An existing file is
        overwritten after a warning is printed to the console.

        Args:
            chunks: Objects exposing ``tag``, ``title`` and ``content``.
            output_path: Destination path of the Markdown file.
            source_file: Source file name recorded in the metadata comment.
            delimiter: Separator line placed between consecutive chunks.
        """
        # Make sure the destination directory exists (a bare file name has none).
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        if os.path.exists(output_path):
            print(f"警告: 输出文件已存在,将覆盖: {output_path}")

        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Size statistics for the metadata header; all zero for an empty list.
        lengths = [len(item.content) for item in chunks]
        mean_len = sum(lengths) // max(len(lengths), 1)
        longest = max(lengths, default=0)

        # Metadata header wrapped in an HTML comment.
        out = [
            "<!-- ",
            f" 源文件: {source_file}",
            f" 处理时间: {stamp}",
            f" 分块总数: {len(chunks)}",
            f" 分块大小: 平均 {mean_len} 字, 最大 {longest} 字",
            f" 上传提示: 使用「自定义分段」,分隔符设为 {delimiter}",
            "-->",
        ]

        final_pos = len(chunks) - 1
        for pos, chunk in enumerate(chunks):
            # Heading: optional tag in backticks, then the title.
            heading = (
                f"## `{chunk.tag}` {chunk.title}"
                if chunk.tag
                else f"## {chunk.title}"
            )
            out += ["", heading, "", chunk.content]

            # The delimiter goes between chunks only, never after the last one.
            if pos != final_pos:
                out += ["", delimiter]

        # A trailing empty entry makes the joined text end with a newline.
        out.append("")

        with open(output_path, "w", encoding="utf-8") as handle:
            handle.write("\n".join(out))
|
|||
|
|
|
|||
|
|
|
|||
|
|
class JsonWriter:
    """Write a list of Chunk objects to a JSON file.

    The structured output is intended to make importing the chunks into a
    vector database convenient.
    """

    def write(
        self,
        chunks: List[Chunk],
        output_path: str,
        source_file: str,
        delimiter: str = "---",
    ) -> None:
        """Dump ``chunks`` plus metadata as JSON next to ``output_path``.

        The output directory is created when it does not exist. The target
        file is ``output_path`` with its extension replaced by ``.json``.
        The document holds the source file name, an ISO-format processing
        time, the chunk count and one record per chunk (index, tag, title,
        content, character count). An existing file is overwritten after a
        warning is printed to the console.

        Args:
            chunks: Objects exposing ``tag``, ``title`` and ``content``.
            output_path: Path whose stem and directory determine the JSON file.
            source_file: Source file name stored in the metadata.
            delimiter: Not used by this writer.
        """
        # Make sure the destination directory exists (a bare file name has none).
        parent = os.path.dirname(output_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        # Swap whatever extension was given for ".json".
        stem, _old_ext = os.path.splitext(output_path)
        json_path = stem + ".json"

        if os.path.exists(json_path):
            print(f"警告: 输出文件已存在,将覆盖: {json_path}")

        # Metadata first so the key order in the output stays stable.
        payload = {
            "source_file": source_file,
            "process_time": datetime.now().isoformat(),
            "total_chunks": len(chunks),
        }

        records = []
        for position, chunk in enumerate(chunks):
            record = {
                "index": position,
                "tag": chunk.tag,
                "title": chunk.title,
                "content": chunk.content,
                "char_count": len(chunk.content),
            }
            records.append(record)
        payload["chunks"] = records

        # ensure_ascii=False keeps non-ASCII text readable in the file.
        with open(json_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)
|