Files
ztb/test_with_fixed_config.py
2026-02-13 18:15:20 +08:00

139 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
使用修复后的配置文件测试提取功能
"""
import logging
import sys
import os
import requests
from bs4 import BeautifulSoup
import json
import re
# 添加当前目录到模块搜索路径
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# 导入修复后的配置
from config_fixed import REGION_CONFIGS, DEEPSEEK_PROMPTS
# 简化的ContentFetcher类
class ContentFetcher:
    """Fetch a tender-announcement page and extract its readable text.

    Extraction is tailored to the ggzy.zj.gov.cn page layout: the <h1>
    title, paragraph text inside ``div.ewb-article``, and any
    pdf/doc/docx attachment links.
    """

    def __init__(self, temp_dir="temp_files"):
        # Directory for intermediate files.  exist_ok=True avoids the
        # check-then-create race of an exists()+makedirs() pair.
        self.temp_dir = temp_dir
        os.makedirs(temp_dir, exist_ok=True)

    def get_full_content(self, url):
        """Download *url* and return the extracted text, or None on failure.

        Returns a newline-joined string of title, body paragraphs and an
        attachment section.  Any network/parse error is logged and
        swallowed, returning None (callers treat None as "no content").
        """
        try:
            response = requests.get(url, timeout=30)
            # Fail fast on HTTP errors instead of parsing an error page
            # as if it were real tender content.
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            content = []
            # Page title
            title = soup.find('h1')
            if title:
                content.append(title.get_text(strip=True))
            # Main article body: non-empty <p> texts only
            content_div = soup.find('div', class_='ewb-article')
            if content_div:
                for p in content_div.find_all('p'):
                    text = p.get_text(strip=True)
                    if text:
                        content.append(text)
            # Attachment links (pdf/doc/docx by href suffix)
            attachments = soup.find_all('a', href=re.compile(r'\.(pdf|doc|docx)$'))
            if attachments:
                content.append("\n附件:")
                for attachment in attachments:
                    content.append(f"- {attachment.get_text(strip=True)}: {attachment['href']}")
            return "\n".join(content)
        except Exception as e:
            # Broad catch is deliberate: this is a best-effort fetcher and
            # the caller handles the None result.
            logging.error(f"获取内容失败: {e}")
            return None
# 简化的DeepSeekProcessor类
class DeepSeekProcessor:
    """Simulated field extractor standing in for the real DeepSeek API."""

    def __init__(self):
        pass

    def extract_fields(self, content, fields, region_name):
        """Return a {field: value} mapping for *fields*, simulating AI extraction.

        A field receives a canned value only when it has a configured
        prompt in DEEPSEEK_PROMPTS *and* *content* mentions one of its
        trigger keywords; every other case yields the sentinel
        "文档未提及" ("not mentioned in the document").

        *region_name* is accepted for signature compatibility with the
        real processor but is unused in this simulation.
        """
        results = {}
        for field in fields:
            # Single default instead of duplicated else-branches.
            value = "文档未提及"
            if field in DEEPSEEK_PROMPTS:
                # The real implementation would send DEEPSEEK_PROMPTS[field]
                # to the DeepSeek API; here we keyword-match instead.
                if field == "资质要求" and any(k in content for k in ("资质", "资格")):
                    value = "建筑工程施工总承包三级及以上"
                elif field == "业绩要求" and any(k in content for k in ("业绩", "经验")):
                    value = "近3年类似工程业绩不少于2项"
            results[field] = value
        return results
# Configure root logging: INFO level with timestamped, leveled messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# URL of the Zhejiang tender-document announcement page used for this test.
TEST_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/9a7966d8-80f4-475b-897e-f7631bc64d0c.html"
def main():
    """Run the end-to-end extraction test against TEST_URL."""
    logger.info(f"开始测试: {TEST_URL}")

    # Step 1: download and extract the page text.
    content = ContentFetcher(temp_dir="temp_files").get_full_content(TEST_URL)
    if not content:
        logger.error("无法获取内容")
        return
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # Step 2: look up the field list configured for Zhejiang
    # bid-document announcements.
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return
    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]
    logger.info(f"需要提取的字段: {ai_fields}")

    # Step 3: run the (simulated) AI extraction.
    extracted = DeepSeekProcessor().extract_fields(content, ai_fields, "浙江")

    # Step 4: dump every extracted field.
    logger.info("\n提取结果:")
    for field, value in extracted.items():
        logger.info(f" {field}: {value}")

    # Step 5: highlight the two fields this test specifically verifies.
    for field in ("资质要求", "业绩要求"):
        if field not in extracted:
            continue
        value = extracted[field]
        logger.info(f"\n{field}提取结果: {value}")
        if value == "文档未提及":
            logger.warning(f"{field}未提取到")
        else:
            logger.info(f"{field}提取成功!")


if __name__ == "__main__":
    main()