# -*- coding: utf-8 -*-
"""Randomly pick a few raw data rows, run the extraction pipeline on each,
and generate a markdown analysis report."""
import csv
import logging
import random
from datetime import datetime

from processors.content_fetcher import ContentFetcher
from processors.deepseek import DeepSeekProcessor
from config import REGION_CONFIGS

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Path to the raw CSV data file
CSV_FILE = "data/浙江省公共资源交易中心_20260213_142920.csv"
# Output report file
OUTPUT_MD = "随机提取分析报告.md"


def read_csv_data(file_path):
    """Read all rows of a CSV file.

    Args:
        file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        list[dict]: One dict per data row, keyed by the header columns.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return list(csv.DictReader(f))


def extract_data(url, title):
    """Fetch the page at *url* and run AI field extraction on its content.

    Args:
        url: Announcement detail-page URL.
        title: Announcement title (used for logging only).

    Returns:
        dict | None: Extracted field values, or None when the page content
        cannot be fetched or the region config key is missing.
    """
    logger.info(f"\n开始提取: {title}")
    logger.info(f"URL: {url}")

    # 1. Fetch the full page content
    fetcher = ContentFetcher(temp_dir="temp_files")
    content = fetcher.get_full_content(url)
    if not content:
        logger.error("无法获取网页内容")
        return None
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # 2. Look up the field config for Zhejiang tender-document notices
    processor = DeepSeekProcessor()
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return None
    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]

    # 3. Run the extraction
    extracted = processor.extract_fields(content, ai_fields, "浙江")
    return extracted


def generate_md_report(data_list, extracted_results):
    """Write a markdown report pairing each selected row with its extraction result.

    Args:
        data_list: Selected raw CSV rows (dicts).
        extracted_results: Extraction result (dict) or None per row,
            aligned with *data_list*.
    """
    md_content = "# 随机提取分析报告\n\n"
    # FIX: was an abuse of logging.Formatter/LogRecord to obtain the current
    # time; datetime is the straightforward way to format "now".
    md_content += f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    md_content += "## 分析结果\n\n"

    for i, (data, extracted) in enumerate(zip(data_list, extracted_results)):
        title = data.get("标题", "未知")
        url = data.get("链接", "")
        publish_date = data.get("发布日期", "")
        region = data.get("地区", "")

        md_content += f"### 项目 {i+1}: {title}\n"
        md_content += f"- 发布日期: {publish_date}\n"
        md_content += f"- 地区: {region}\n"
        md_content += f"- 链接: {url}\n\n"

        if not extracted:
            md_content += "**提取失败: 无法获取内容**\n\n"
            continue

        md_content += "#### 提取结果\n"
        md_content += "| 字段 | 提取值 | 分析 |\n"
        md_content += "|------|--------|------|\n"
        for field, value in extracted.items():
            if value == "文档未提及":
                analysis = "**空白原因**: 文档中未明确提及该信息"
            elif value:
                analysis = "提取成功"
            else:
                analysis = "**空白原因**: 提取结果为空"
            # FIX: escape pipes so cell text cannot break the markdown table
            # (the original replace("|", "|") was a no-op), and coerce to str
            # so non-string values (e.g. None) don't raise AttributeError.
            value_clean = str(value).replace("|", "\\|").replace("\n", " ")
            md_content += f"| {field} | {value_clean} | {analysis} |\n"
        md_content += "\n"

    # Save the markdown report
    with open(OUTPUT_MD, 'w', encoding='utf-8') as f:
        f.write(md_content)
    logger.info(f"报告已生成: {OUTPUT_MD}")


def main():
    """Entry point: sample rows, extract fields for each, write the report."""
    # Read the raw data
    data = read_csv_data(CSV_FILE)
    logger.info(f"原始数据条数: {len(data)}")

    # Randomly pick up to 5 rows; fixed seed keeps the run reproducible.
    # FIX: bound the sample size so a short CSV doesn't raise ValueError.
    random.seed(42)
    selected_data = random.sample(data, min(5, len(data)))
    logger.info(f"随机选择了 {len(selected_data)} 条数据")

    # Run extraction per row; keep results aligned with selected_data.
    extracted_results = []
    for item in selected_data:
        url = item.get("链接", "")
        title = item.get("标题", "")
        if url:
            extracted_results.append(extract_data(url, title))
        else:
            extracted_results.append(None)

    # Generate the report
    generate_md_report(selected_data, extracted_results)


if __name__ == "__main__":
    main()