139 lines
4.5 KiB
Python
139 lines
4.5 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
使用修复后的配置文件测试提取功能
|
|||
|
|
"""
|
|||
|
|
import logging
|
|||
|
|
import sys
|
|||
|
|
import os
|
|||
|
|
import requests
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
import json
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
# 添加当前目录到模块搜索路径
|
|||
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|||
|
|
|
|||
|
|
# 导入修复后的配置
|
|||
|
|
from config_fixed import REGION_CONFIGS, DEEPSEEK_PROMPTS
|
|||
|
|
|
|||
|
|
# 简化的ContentFetcher类
|
|||
|
|
class ContentFetcher:
    """Fetch a bidding-notice page and flatten it into plain text.

    Simplified fetcher for this test script: it extracts the page title,
    the article body paragraphs, and links to downloadable attachments.
    """

    def __init__(self, temp_dir="temp_files"):
        """Remember *temp_dir* and make sure the directory exists.

        Args:
            temp_dir: Directory for temporary files (created if missing).
        """
        self.temp_dir = temp_dir
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() pair.
        os.makedirs(temp_dir, exist_ok=True)

    def get_full_content(self, url):
        """Download *url* and return its text content, or None on failure.

        The result is a newline-joined string: the <h1> title first, then
        every non-empty <p> inside <div class="ewb-article">, then a list
        of .pdf/.doc/.docx attachment links (if any).

        Args:
            url: Page URL to fetch.

        Returns:
            The flattened page text, or None if the fetch/parse failed.
        """
        try:
            response = requests.get(url, timeout=30)
            # Fail fast on 4xx/5xx instead of silently parsing an error
            # page; the except-block below converts this into None.
            response.raise_for_status()
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            content = []

            # Page title.
            title = soup.find('h1')
            if title:
                content.append(title.get_text(strip=True))

            # Article body paragraphs.
            content_div = soup.find('div', class_='ewb-article')
            if content_div:
                for p in content_div.find_all('p'):
                    text = p.get_text(strip=True)
                    if text:
                        content.append(text)

            # Attachment links (pdf/doc/docx only).
            attachments = soup.find_all('a', href=re.compile(r'\.(pdf|doc|docx)$'))
            if attachments:
                content.append("\n附件:")
                for attachment in attachments:
                    content.append(f"- {attachment.get_text(strip=True)}: {attachment['href']}")

            return "\n".join(content)
        except Exception as e:
            # Best-effort fetch: callers treat None as "could not fetch".
            # Lazy %-style args avoid formatting when logging is disabled.
            logging.error("获取内容失败: %s", e)
            return None
|
|||
|
|
|
|||
|
|
# 简化的DeepSeekProcessor类
|
|||
|
|
class DeepSeekProcessor:
    """Mock of the DeepSeek-based field extractor.

    The real implementation would call the DeepSeek API with the prompt
    configured for each field; this test double only keyword-matches the
    content so the rest of the pipeline can be exercised offline.
    """

    def __init__(self):
        # The mock keeps no state (no API client / credentials needed).
        pass

    def extract_fields(self, content, fields, region_name):
        """Return a {field: extracted value} mapping for *fields*.

        A field with no configured prompt in DEEPSEEK_PROMPTS, or whose
        trigger keywords are absent from *content*, maps to the sentinel
        string "文档未提及" ("not mentioned in the document").

        Args:
            content: Flattened document text to search.
            fields: Iterable of field names to extract.
            region_name: Accepted for API compatibility with the real
                processor; unused by the mock.

        Returns:
            dict mapping each field name to its (simulated) value.
        """
        results = {}
        for field in fields:
            # No prompt configured for this field -> sentinel value.
            if field not in DEEPSEEK_PROMPTS:
                results[field] = "文档未提及"
                continue
            # Simulated extraction: a keyword hit yields a canned answer.
            # (The real project would send DEEPSEEK_PROMPTS[field] to the API.)
            if field == "资质要求" and any(keyword in content for keyword in ["资质", "资格"]):
                results[field] = "建筑工程施工总承包三级及以上"
            elif field == "业绩要求" and any(keyword in content for keyword in ["业绩", "经验"]):
                results[field] = "近3年类似工程业绩不少于2项"
            else:
                results[field] = "文档未提及"
        return results
|
|||
|
|
|
|||
|
|
# Configure root logging for the test run (timestamped INFO output).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# URL of the Zhejiang bidding-document notice used as the test fixture.
TEST_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/9a7966d8-80f4-475b-897e-f7631bc64d0c.html"
|
|||
|
|
|
|||
|
|
def main():
    """Run one end-to-end extraction test against TEST_URL."""
    logger.info(f"开始测试: {TEST_URL}")

    # Step 1: fetch the page text.
    fetcher = ContentFetcher(temp_dir="temp_files")
    page_text = fetcher.get_full_content(TEST_URL)
    if not page_text:
        logger.error("无法获取内容")
        return

    logger.info(f"获取到内容长度: {len(page_text)} 字符")

    # Step 2: look up the Zhejiang "bidding document notice" configuration.
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return

    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]
    logger.info(f"需要提取的字段: {ai_fields}")

    # Step 3: run the (mock) field extraction.
    extractor = DeepSeekProcessor()
    extraction = extractor.extract_fields(page_text, ai_fields, "浙江")

    # Step 4: report every extracted field.
    logger.info("\n提取结果:")
    for field_name, field_value in extraction.items():
        logger.info(f"  {field_name}: {field_value}")

    # Step 5: spot-check the two fields this test cares about.
    for field_name in ["资质要求", "业绩要求"]:
        if field_name not in extraction:
            continue
        field_value = extraction[field_name]
        logger.info(f"\n{field_name}提取结果: {field_value}")

        if field_value == "文档未提及":
            logger.warning(f"✗ {field_name}未提取到")
        else:
            logger.info(f"✓ {field_name}提取成功!")


if __name__ == "__main__":
    main()
|