ztb/test_random_extract.py
# -*- coding: utf-8 -*-
"""
Randomly select a few raw data records, run the extraction pipeline on them,
and generate an analysis report.
"""
import csv
import logging
import random
from datetime import datetime

from processors.content_fetcher import ContentFetcher
from processors.deepseek import DeepSeekProcessor
from config import REGION_CONFIGS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Path to the raw data CSV file
CSV_FILE = "data/浙江省公共资源交易中心_20260213_142920.csv"
# Output file for the report
OUTPUT_MD = "随机提取分析报告.md"
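
# Note: the CSV is expected to contain at least the columns 标题 (title),
# 链接 (link), 发布日期 (publish date) and 地区 (region) -- these are the
# keys read in main() and generate_md_report() below.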


def read_csv_data(file_path):
    """Read the rows of the CSV file into a list of dicts."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            data.append(row)
    return data


def extract_data(url, title):
    """Run the extraction pipeline for a single record."""
    logger.info(f"\n开始提取: {title}")
    logger.info(f"URL: {url}")

    # 1. Fetch the page content
    fetcher = ContentFetcher(temp_dir="temp_files")
    content = fetcher.get_full_content(url)
    if not content:
        logger.error("无法获取网页内容")
        return None
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # 2. Extract the configured fields
    processor = DeepSeekProcessor()
    # Look up the configuration for Zhejiang tender document publicity announcements
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return None
    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]

    # 3. Run the extraction
    extracted = processor.extract_fields(content, ai_fields, "浙江")
    return extracted
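
# Assumption: DeepSeekProcessor.extract_fields returns a dict mapping each
# configured field name to its extracted value, and uses the literal string
# "文档未提及" when the document does not mention a field -- this is the
# convention generate_md_report() checks for below.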


def generate_md_report(data_list, extracted_results):
    """Generate the analysis report in Markdown format."""
    md_content = "# 随机提取分析报告\n\n"
    md_content += f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    md_content += "## 分析结果\n\n"

    for i, (data, extracted) in enumerate(zip(data_list, extracted_results)):
        title = data.get("标题", "未知")
        url = data.get("链接", "")
        publish_date = data.get("发布日期", "")
        region = data.get("地区", "")

        md_content += f"### 项目 {i+1}: {title}\n"
        md_content += f"- 发布日期: {publish_date}\n"
        md_content += f"- 地区: {region}\n"
        md_content += f"- 链接: {url}\n\n"

        if not extracted:
            md_content += "**提取失败: 无法获取内容**\n\n"
            continue

        md_content += "#### 提取结果\n"
        md_content += "| 字段 | 提取值 | 分析 |\n"
        md_content += "|------|--------|------|\n"
        for field, value in extracted.items():
            if value == "文档未提及":
                analysis = "**空白原因**: 文档中未明确提及该信息"
            elif value:
                analysis = "提取成功"
            else:
                analysis = "**空白原因**: 提取结果为空"
            # Strip characters that would break the Markdown table layout
            value_clean = str(value).replace("|", "").replace("\n", " ") if value else ""
            md_content += f"| {field} | {value_clean} | {analysis} |\n"
        md_content += "\n"

    # Save the Markdown report
    with open(OUTPUT_MD, 'w', encoding='utf-8') as f:
        f.write(md_content)
    logger.info(f"报告已生成: {OUTPUT_MD}")


def main():
    """Entry point: sample records, extract fields, write the report."""
    # Read the raw data
    data = read_csv_data(CSV_FILE)
    logger.info(f"原始数据条数: {len(data)}")

    # Randomly select up to 5 records
    random.seed(42)  # fixed seed so the sample is reproducible
    selected_data = random.sample(data, min(5, len(data)))
    logger.info(f"随机选择了 {len(selected_data)} 条数据")

    # Run the extraction for each selected record
    extracted_results = []
    for item in selected_data:
        url = item.get("链接", "")
        title = item.get("标题", "")
        if url:
            result = extract_data(url, title)
            extracted_results.append(result)
        else:
            extracted_results.append(None)

    # Generate the report
    generate_md_report(selected_data, extracted_results)


if __name__ == "__main__":
    main()
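
# Usage (assumed, given the relative paths above): run from the directory
# that contains the data/ and temp_files/ folders as well as the processors
# and config modules, e.g.
#   python test_random_extract.py
# The Markdown report is written to 随机提取分析报告.md in that directory.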