139 lines
4.2 KiB
Python
139 lines
4.2 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
随机选择几条原始数据执行提取流程,并生成分析报告
|
|||
|
|
"""
|
|||
|
|
import csv
import logging
import random
from datetime import datetime

from processors.content_fetcher import ContentFetcher
from processors.deepseek import DeepSeekProcessor
from config import REGION_CONFIGS
|
|||
|
|
|
|||
|
|
# Configure logging: INFO level, "timestamp - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|||
|
|
|
|||
|
|
# Path to the raw scraped-data CSV (Zhejiang public resource trading center export).
CSV_FILE = "data/浙江省公共资源交易中心_20260213_142920.csv"

# Output path for the generated Markdown analysis report.
OUTPUT_MD = "随机提取分析报告.md"
|
|||
|
|
|
|||
|
|
def read_csv_data(file_path):
    """Read a CSV file and return its rows as a list of dicts.

    Args:
        file_path: path to a UTF-8 encoded CSV file with a header row.

    Returns:
        list[dict]: one dict per data row, keyed by the header columns.
    """
    # newline='' is required by the csv module so that quoted fields
    # containing embedded newlines are parsed correctly.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))
|
|||
|
|
|
|||
|
|
def extract_data(url, title):
    """Fetch a listing page and run AI field extraction on its content.

    Args:
        url: page URL to fetch.
        title: human-readable title, used only for logging.

    Returns:
        dict of extracted fields, or None when the page cannot be fetched
        or the region configuration is missing.
    """
    logger.info(f"\n开始提取: {title}")
    logger.info(f"URL: {url}")

    # Step 1: download the full page content.
    page = ContentFetcher(temp_dir="temp_files").get_full_content(url)
    if not page:
        logger.error("无法获取网页内容")
        return None
    logger.info(f"获取到内容长度: {len(page)} 字符")

    # Step 2: look up the Zhejiang tender-document-notice configuration.
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return None

    # Step 3: run the field extraction.
    return DeepSeekProcessor().extract_fields(
        page, REGION_CONFIGS[config_key]["ai_fields"], "浙江"
    )
|
|||
|
|
|
|||
|
|
def generate_md_report(data_list, extracted_results):
    """Render extraction results as a Markdown report and save it to OUTPUT_MD.

    Args:
        data_list: the selected original CSV rows (dicts with 标题/链接/发布日期/地区).
        extracted_results: per-row extraction dict, or None on failure;
            zipped positionally with data_list.

    Side effects: writes the report file and logs its path.
    """
    md_content = "# 随机提取分析报告\n\n"
    # Fix: the original abused logging.Formatter/LogRecord just to obtain a
    # timestamp string; datetime.now() is the direct, correct way.
    md_content += f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    md_content += "## 分析结果\n\n"

    for i, (data, extracted) in enumerate(zip(data_list, extracted_results)):
        title = data.get("标题", "未知")
        url = data.get("链接", "")
        publish_date = data.get("发布日期", "")
        region = data.get("地区", "")

        md_content += f"### 项目 {i+1}: {title}\n"
        md_content += f"- 发布日期: {publish_date}\n"
        md_content += f"- 地区: {region}\n"
        md_content += f"- 链接: {url}\n\n"

        if not extracted:
            md_content += "**提取失败: 无法获取内容**\n\n"
            continue

        md_content += "#### 提取结果\n"
        md_content += "| 字段 | 提取值 | 分析 |\n"
        md_content += "|------|--------|------|\n"

        for field, value in extracted.items():
            if value == "文档未提及":
                analysis = "**空白原因**: 文档中未明确提及该信息"
            elif value:
                analysis = "提取成功"
            else:
                analysis = "**空白原因**: 提取结果为空"

            # Fix: the original replaced "|" with an identical "|" (a no-op);
            # escape pipes so values cannot break the Markdown table, and
            # guard None so .replace() cannot crash on a missing value.
            raw = "" if value is None else str(value)
            value_clean = raw.replace("|", "\\|").replace("\n", " ")
            md_content += f"| {field} | {value_clean} | {analysis} |\n"

        md_content += "\n"

    with open(OUTPUT_MD, 'w', encoding='utf-8') as f:
        f.write(md_content)

    logger.info(f"报告已生成: {OUTPUT_MD}")
|
|||
|
|
|
|||
|
|
def main():
    """Read the raw CSV, randomly sample up to 5 rows, extract, and report."""
    # Load the raw scraped data.
    data = read_csv_data(CSV_FILE)
    logger.info(f"原始数据条数: {len(data)}")

    # Guard: nothing to sample from an empty file.
    if not data:
        logger.error("原始数据为空,无法抽样")
        return

    random.seed(42)  # fixed seed so the sample is reproducible across runs
    # Fix: random.sample raises ValueError when fewer than 5 rows exist;
    # cap the sample size at the number of available rows.
    selected_data = random.sample(data, min(5, len(data)))

    logger.info(f"随机选择了 {len(selected_data)} 条数据")

    # Run extraction for each sampled row; None marks rows without a URL.
    extracted_results = []
    for item in selected_data:
        url = item.get("链接", "")
        title = item.get("标题", "")
        extracted_results.append(extract_data(url, title) if url else None)

    # Emit the Markdown report.
    generate_md_report(selected_data, extracted_results)
|
|||
|
|
|
|||
|
|
# Run the sampling/extraction pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|