# -*- coding: utf-8 -*-
"""Analyze, in detail, the field-extraction problems for one specific URL.

Debug script: fetches the page at TEST_URL, checks which requirement
keywords appear in the raw content, runs the configured AI extraction,
and logs diagnostics for any field that comes back empty.
"""

import logging

from processors.content_fetcher import ContentFetcher
from processors.deepseek import DeepSeekProcessor
from config import REGION_CONFIGS

# Logging configuration: INFO level, timestamped single-line records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# URL under test — presumably a Zhejiang public-resource trading notice
# page (ggzy.zj.gov.cn); confirm against the project's region configs.
TEST_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/9a7966d8-80f4-475b-897e-f7631bc64d0c.html"
def _report_keyword_presence(content, keywords):
    """Log whether each keyword appears in *content*, with a context snippet.

    Fixes the original's duplicated scans: ``keyword in content`` plus two
    ``content.find(keyword)`` calls meant three linear passes per keyword;
    a single ``find`` now serves both the presence test and the window.
    """
    for keyword in keywords:
        idx = content.find(keyword)
        if idx == -1:
            logger.warning(f"✗ 不包含关键词: {keyword}")
            continue
        logger.info(f"✓ 包含关键词: {keyword}")
        # Context window: 300 chars before the hit, 500 after, clamped.
        start_idx = max(0, idx - 300)
        end_idx = min(len(content), idx + 500)
        context = content[start_idx:end_idx]
        logger.info(f" 上下文: {context[:300]}...")


def _diagnose_missing_field(processor, content, ai_fields, field):
    """Diagnose why *field* was extracted as "文档未提及" despite being present.

    Inspects the processor's prepared (pre-processed) content, the prompt
    configured for the field, and the bidder-instructions preamble section.
    """
    # Hoisted out of the caller's loop: the original re-imported this on
    # every iteration. Kept as a lazy local import, matching the original.
    from config import DEEPSEEK_PROMPTS

    logger.warning(f"{field}未提取到,但内容中确实存在相关信息")

    # How much content survives pre-processing, and does it keep the field?
    prepared_content = processor._prepare_content(content, ai_fields)
    logger.info(f"预处理后内容长度: {len(prepared_content)} 字符")
    if field in prepared_content:
        logger.info(f"✓ 预处理后内容包含 {field}")
    else:
        logger.warning(f"✗ 预处理后内容不包含 {field}")

    # Show the first 100 chars of the prompt used for this field, if any.
    if field in DEEPSEEK_PROMPTS:
        logger.info(f"提示词: {DEEPSEEK_PROMPTS[field][:100]}...")

    # Check the "投标人须知前附表" (bidder-instructions preamble) section,
    # which is where qualification/performance requirements usually live.
    if "投标人须知前附表" in prepared_content:
        logger.info("✓ 预处理后内容包含 投标人须知前附表")
        start_idx = prepared_content.find("投标人须知前附表")
        end_idx = min(len(prepared_content), start_idx + 5000)
        preamble_content = prepared_content[start_idx:end_idx]
        logger.info(f"前附表内容片段: {preamble_content[:300]}...")


def main():
    """Fetch TEST_URL, run the extraction pipeline, and log diagnostics.

    Steps: fetch content -> check keyword presence -> run AI extraction
    for the configured Zhejiang fields -> diagnose fields that came back
    empty -> compare against the local (non-AI) extraction path.
    """
    logger.info(f"开始分析: {TEST_URL}")

    # 1. Fetch the full page content (early-exit on fetch failure).
    fetcher = ContentFetcher(temp_dir="temp_files")
    content = fetcher.get_full_content(TEST_URL)
    if not content:
        logger.error("无法获取网页内容")
        return
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # 2. Verify the key requirement phrases exist in the raw content.
    _report_keyword_presence(
        content, ["资质要求", "业绩要求", "资格要求", "类似工程业绩"]
    )

    # 3. Look up the extraction config for Zhejiang tender-document notices.
    processor = DeepSeekProcessor()
    config_key = "zhejiang:招标文件公示"
    if config_key not in REGION_CONFIGS:
        logger.error(f"未找到配置: {config_key}")
        return
    ai_fields = REGION_CONFIGS[config_key]["ai_fields"]
    logger.info(f"需要提取的字段: {ai_fields}")

    # 4. Run the AI extraction over the fetched content.
    extracted = processor.extract_fields(content, ai_fields, "浙江")

    # 5. Report all results, then diagnose the two fields that go missing.
    logger.info("\n提取结果:")
    for field, value in extracted.items():
        logger.info(f" {field}: {value}")

    for field in ["资质要求", "业绩要求"]:
        if field not in extracted:
            continue
        value = extracted[field]
        logger.info(f"\n{field}提取结果: {value}")
        if value == "文档未提及":
            _diagnose_missing_field(processor, content, ai_fields, field)

    # 6. Compare against the local (non-AI) extraction path.
    logger.info("\n尝试本地提取:")
    local_extracted = processor._local_extract(content, ai_fields)
    for field, value in local_extracted.items():
        logger.info(f" {field}: {value}")
# Script entry point: run the analysis when executed directly.
if __name__ == "__main__":
    main()