109 lines
4.0 KiB
Python
109 lines
4.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
Re-extraction test for a specific URL.

Analyzes why the performance requirement (业绩要求) was not extracted.
"""
|
|
import logging
|
|
from processors.content_fetcher import ContentFetcher
|
|
from processors.deepseek import DeepSeekProcessor
|
|
from config import REGION_CONFIGS
|
|
|
|
# Logging setup: INFO level, each record rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# URL of the page whose extraction is being re-run and diagnosed.
TARGET_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/d2f95295-6cb0-40c9-8023-cdbbf7e660ae.html"
|
|
|
|
def _log_keyword_contexts(content, keywords):
    """Log a snippet around the first occurrence of each keyword in content.

    Args:
        content: Fetched page text to search.
        keywords: Keywords to look for, in reporting order.

    Returns:
        The keywords that occur in ``content``, in the order given.
    """
    found = []
    for keyword in keywords:
        # One find() replaces the membership test plus two repeated find()
        # calls for the same keyword.
        pos = content.find(keyword)
        if pos == -1:
            continue
        found.append(keyword)
        # The snippet starts up to 200 characters before the keyword and is
        # capped at 500 characters; a wider upper bound would be cut off by
        # that cap anyway, so a single slice yields the same text.
        start = max(0, pos - 200)
        snippet = content[start:start + 500]
        logger.info(f"\n找到关键词 '{keyword}' 的上下文:")
        logger.info(f"{snippet}...")
    return found


def _diagnose_missing_performance(content):
    """Log whether performance-related keywords appear in the raw content.

    Args:
        content: Fetched page text that was handed to the extractor.
    """
    keywords = ["业绩要求", "业绩条件", "投标人业绩", "类似项目", "工程经验"]
    found = _log_keyword_contexts(content, keywords)
    if found:
        logger.info(f"\n在内容中找到相关关键词: {found}")
        logger.info("可能的问题: 关键词存在但提取逻辑未正确识别")
    else:
        logger.info("\n在内容中未找到业绩相关关键词")
        logger.info("可能的问题: 内容中确实没有业绩要求信息")


def _check_prepared_content(processor, content, ai_fields):
    """Log whether the processor's prepared content still has the keywords.

    Args:
        processor: Object exposing ``_prepare_content(content, ai_fields)``.
        content: Fetched page text.
        ai_fields: Field configuration passed through to the processor.
    """
    logger.info("\n分析内容预处理...")
    # NOTE(review): this calls a private method of the processor; its exact
    # contract is not visible from this file.
    prepared = processor._prepare_content(content, ai_fields)
    logger.info(f"预处理后内容长度: {len(prepared)} 字符")

    keywords = ["业绩要求", "业绩条件", "投标人业绩"]
    present = [keyword for keyword in keywords if keyword in prepared]
    if present:
        logger.info(f"预处理后内容中包含业绩相关关键词: {present}")
    else:
        logger.warning("预处理后内容中不包含业绩相关关键词")
        logger.warning("可能的问题: 内容预处理时未包含业绩相关部分")


def test_reextract():
    """Re-run field extraction for TARGET_URL and log a diagnosis.

    Steps:
        1. Fetch the page content with ``ContentFetcher.get_full_content``.
        2. Read ``ai_fields`` from ``REGION_CONFIGS["zhejiang:招标文件公示"]``.
        3. Call ``DeepSeekProcessor.extract_fields`` and log every field.
        4. If the "业绩要求" field equals "文档未提及", log keyword contexts
           from the raw content to help explain the miss.
        5. Log whether the processor's prepared content still contains the
           performance-related keywords.

    Returns early, after logging an error, when the content cannot be
    fetched or the region configuration is missing. Nothing is returned;
    all findings go to the module logger.
    """
    logger.info(f"开始测试重新提取: {TARGET_URL}")

    # 1. Fetch the content.
    fetcher = ContentFetcher(temp_dir="temp_files")
    content = fetcher.get_full_content(TARGET_URL)
    if not content:
        logger.error("无法获取网页内容")
        return
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # 2. Resolve the fields to extract from the region configuration.
    processor = DeepSeekProcessor()
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return
    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]
    logger.info(f"需要提取的字段: {ai_fields}")

    # 3. Run the extraction.
    extracted = processor.extract_fields(content, ai_fields, "浙江")
    if extracted is None:
        # NOTE(review): extract_fields' failure contract is not visible here.
        # Treating None as "no fields" keeps the diagnosis running instead
        # of failing on .items() below.
        logger.error("字段提取失败: extract_fields 未返回结果")
        extracted = {}

    # 4. Report the result, with extra attention to the performance field.
    logger.info("\n提取结果:")
    for field, value in extracted.items():
        logger.info(f"  {field}: {value}")

    if "业绩要求" not in extracted:
        logger.error("提取结果中没有业绩要求字段")
    else:
        performance_req = extracted["业绩要求"]
        logger.info(f"\n业绩要求提取结果: {performance_req}")
        # "文档未提及" is compared as the value that marks the field as not
        # found — presumably the extractor's sentinel; confirm in processor.
        if performance_req == "文档未提及":
            logger.warning("业绩要求未提取到,开始分析原因...")
            _diagnose_missing_performance(content)
        else:
            logger.info("业绩要求提取成功")

    # 5. Check what survives the processor's content preparation.
    _check_prepared_content(processor, content, ai_fields)
|
|
|
|
# Script entry point: run the diagnostic only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_reextract()
|