# -*- coding: utf-8 -*-
"""Test the extraction pipeline against the fixed configuration file.

Fetches one known tender-notice page, runs the (simulated) field
extraction, and logs whether the qualification/performance fields
were recovered.
"""
import logging
import sys
import os
import requests
from bs4 import BeautifulSoup
import json
import re

# Make this script's directory importable so config_fixed resolves
# regardless of the current working directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Fixed configuration: per-region extraction settings and AI prompts.
from config_fixed import REGION_CONFIGS, DEEPSEEK_PROMPTS

# Module-level logger; configured below via basicConfig.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class ContentFetcher:
    """Simplified page fetcher: downloads a notice page and flattens it to text."""

    def __init__(self, temp_dir="temp_files"):
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() guard.
        self.temp_dir = temp_dir
        os.makedirs(temp_dir, exist_ok=True)

    def get_full_content(self, url):
        """Fetch *url* and return its title, body paragraphs and attachment
        links joined with newlines, or None on any failure."""
        try:
            response = requests.get(url, timeout=30)
            # Fix: without this, an HTTP 404/500 error page would be parsed
            # and returned as if it were real content.
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            content = []

            # Page title.
            title = soup.find('h1')
            if title:
                content.append(title.get_text(strip=True))

            # Main article body (site-specific container class).
            content_div = soup.find('div', class_='ewb-article')
            if content_div:
                for p in content_div.find_all('p'):
                    text = p.get_text(strip=True)
                    if text:
                        content.append(text)

            # Attachment links (pdf/doc/docx).
            attachments = soup.find_all('a', href=re.compile(r'\.(pdf|doc|docx)$'))
            if attachments:
                content.append("\n附件:")
                for attachment in attachments:
                    content.append(f"- {attachment.get_text(strip=True)}: {attachment['href']}")

            return "\n".join(content)
        except Exception as e:
            # Best-effort fetcher: log and signal failure with None.
            # Use the module logger for consistency with the rest of the file.
            logger.error(f"获取内容失败: {e}")
            return None


class DeepSeekProcessor:
    """Simplified stand-in for the DeepSeek-based field extractor."""

    def extract_fields(self, content, fields, region_name):
        """Simulate extracting *fields* from *content*.

        A real implementation would send DEEPSEEK_PROMPTS[field] to the
        DeepSeek API; here we keyword-match to produce canned answers.
        *region_name* is kept for interface compatibility.
        Returns a dict mapping each field to its (simulated) value.
        """
        results = {}
        for field in fields:
            if field in DEEPSEEK_PROMPTS:
                # Keyword-based simulation of the API call.
                if field == "资质要求" and any(keyword in content for keyword in ["资质", "资格"]):
                    results[field] = "建筑工程施工总承包三级及以上"
                elif field == "业绩要求" and any(keyword in content for keyword in ["业绩", "经验"]):
                    results[field] = "近3年类似工程业绩不少于2项"
                else:
                    results[field] = "文档未提及"
            else:
                results[field] = "文档未提及"
        return results


# Target page for the end-to-end test.
TEST_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/9a7966d8-80f4-475b-897e-f7631bc64d0c.html"


def main():
    """主函数"""
    logger.info(f"开始测试: {TEST_URL}")

    # Fetch the page content.
    fetcher = ContentFetcher(temp_dir="temp_files")
    content = fetcher.get_full_content(TEST_URL)
    if not content:
        logger.error("无法获取内容")
        return
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # Look up the Zhejiang tender-document configuration.
    processor = DeepSeekProcessor()
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return
    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]
    logger.info(f"需要提取的字段: {ai_fields}")

    # Run extraction and report every field.
    extracted = processor.extract_fields(content, ai_fields, "浙江")
    logger.info("\n提取结果:")
    for field, value in extracted.items():
        logger.info(f"  {field}: {value}")

    # Highlight the two fields this test specifically cares about.
    for field in ["资质要求", "业绩要求"]:
        if field in extracted:
            value = extracted[field]
            logger.info(f"\n{field}提取结果: {value}")
            if value != "文档未提及":
                logger.info(f"✓ {field}提取成功!")
            else:
                logger.warning(f"✗ {field}未提取到")


if __name__ == "__main__":
    main()