# -*- coding: utf-8 -*-
"""Diagnose field-extraction problems for one specific announcement URL.

The script fetches the page content, reports which requirement keywords
occur in it, runs the DeepSeek field extraction with the Zhejiang
"招标文件公示" configuration, and, for the fields of special interest that
came back as "not mentioned", logs what the pre-processing step kept so the
failure can be investigated.
"""

import logging
from typing import Optional

from processors.content_fetcher import ContentFetcher
from processors.deepseek import DeepSeekProcessor
from config import DEEPSEEK_PROMPTS, REGION_CONFIGS

# Logging configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# URL analysed when main() is called without an argument.
TEST_URL = "https://ggzy.zj.gov.cn/jyxxgk/002001/002001011/20260212/9a7966d8-80f4-475b-897e-f7631bc64d0c.html"

# Key into REGION_CONFIGS and the region label passed to the extractor.
_CONFIG_KEY = "zhejiang:招标文件公示"
_REGION_NAME = "浙江"

# Keywords whose presence in the raw content is reported.
_KEYWORDS = ("资质要求", "业绩要求", "资格要求", "类似工程业绩")

# Fields that get the detailed diagnosis when extraction reports them missing.
_FOCUS_FIELDS = ("资质要求", "业绩要求")
_NOT_MENTIONED = "文档未提及"

# Section heading looked up in the pre-processed content.
_PREAMBLE_MARKER = "投标人须知前附表"

# Size of the context window around a keyword hit, in characters.
_CONTEXT_BEFORE = 300
_CONTEXT_AFTER = 500

# Length of the short previews logged for the preamble section and prompts.
_PREAMBLE_PREVIEW_CHARS = 300
_PROMPT_PREVIEW_CHARS = 100


def _keyword_context(content: str, keyword: str) -> Optional[str]:
    """Return the text surrounding the first occurrence of *keyword*.

    The window spans ``_CONTEXT_BEFORE`` characters before the keyword's
    start and ``_CONTEXT_AFTER`` characters from it, clamped to the bounds of
    *content*. Returns ``None`` when the keyword does not occur.
    """
    position = content.find(keyword)
    if position == -1:
        return None
    window_start = max(0, position - _CONTEXT_BEFORE)
    # Slicing clamps the upper bound to len(content) on its own.
    return content[window_start:position + _CONTEXT_AFTER]


def _report_keywords(content: str) -> None:
    """Log, for each keyword, whether it occurs in *content* and its context."""
    for keyword in _KEYWORDS:
        context = _keyword_context(content, keyword)
        if context is None:
            logger.warning(f"✗ 不包含关键词: {keyword}")
            continue
        logger.info(f"✓ 包含关键词: {keyword}")
        # Log the whole window: truncating it to its first 300 characters
        # would show only the text *preceding* the keyword whenever the
        # keyword sits 300 or more characters into the content.
        logger.info(f" 上下文: {context}...")


def _diagnose_missing_field(processor, content, ai_fields, field: str) -> None:
    """Log why *field* may have been missed by the extraction.

    Reports whether the pre-processed content still contains the field name,
    shows the start of the prompt configured for the field (if any), and
    shows the start of the preamble-table section when it is present.
    """
    # NOTE(review): _prepare_content is a private method of the processor,
    # called here on purpose to inspect what the extractor actually sees.
    prepared_content = processor._prepare_content(content, ai_fields)
    logger.info(f"预处理后内容长度: {len(prepared_content)} 字符")

    if field in prepared_content:
        logger.info(f"✓ 预处理后内容包含 {field}")
    else:
        logger.warning(f"✗ 预处理后内容不包含 {field}")

    # Show the beginning of the prompt used for this field.
    if field in DEEPSEEK_PROMPTS:
        prompt_preview = DEEPSEEK_PROMPTS[field][:_PROMPT_PREVIEW_CHARS]
        logger.info(f"提示词: {prompt_preview}...")

    # Show the beginning of the preamble-table section, if it survived.
    marker_at = prepared_content.find(_PREAMBLE_MARKER)
    if marker_at != -1:
        logger.info("✓ 预处理后内容包含 投标人须知前附表")
        preamble_preview = prepared_content[
            marker_at:marker_at + _PREAMBLE_PREVIEW_CHARS
        ]
        logger.info(f"前附表内容片段: {preamble_preview}...")


def main(url: str = TEST_URL) -> None:
    """Run the full extraction diagnosis for *url*.

    Args:
        url: Page to analyse. Defaults to ``TEST_URL``.

    The function returns early (after logging an error) when the page
    content cannot be fetched or when the region configuration is missing.
    """
    logger.info(f"开始分析: {url}")

    # 1. Fetch the content.
    fetcher = ContentFetcher(temp_dir="temp_files")
    content = fetcher.get_full_content(url)
    if not content:
        logger.error("无法获取网页内容")
        return
    logger.info(f"获取到内容长度: {len(content)} 字符")

    # 2. Check whether the key information is present in the raw content.
    _report_keywords(content)

    # 3. Create the processor and look up the extraction configuration.
    processor = DeepSeekProcessor()
    if _CONFIG_KEY not in REGION_CONFIGS:
        logger.error(f"未找到配置: {_CONFIG_KEY}")
        return
    ai_fields = REGION_CONFIGS[_CONFIG_KEY]["ai_fields"]
    logger.info(f"需要提取的字段: {ai_fields}")

    # 4. Run the extraction.
    extracted = processor.extract_fields(content, ai_fields, _REGION_NAME)

    # 5. Report the results.
    logger.info("\n提取结果:")
    for field, value in extracted.items():
        logger.info(f" {field}: {value}")

    # Look more closely at the qualification and track-record requirements.
    for field in _FOCUS_FIELDS:
        if field not in extracted:
            continue
        value = extracted[field]
        logger.info(f"\n{field}提取结果: {value}")
        if value == _NOT_MENTIONED:
            logger.warning(f"{field}未提取到,但内容中确实存在相关信息")
            _diagnose_missing_field(processor, content, ai_fields, field)

    # 6. Try the local (non-AI) extraction directly for comparison.
    logger.info("\n尝试本地提取:")
    local_extracted = processor._local_extract(content, ai_fields)
    for field, value in local_extracted.items():
        logger.info(f" {field}: {value}")


if __name__ == "__main__":
    main()