# -*- coding: utf-8 -*-
"""
DeepSeek AI processor: extract structured fields from tender-document text.
"""

import json
import logging
import re
import time

import urllib3
import requests

from config import DEEPSEEK_API_KEY, DEEPSEEK_PROMPTS, PROCESSING_CONFIG

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

logger = logging.getLogger("ztb")

# Sentinel value the prompt instructs the model to return for absent fields.
_NOT_MENTIONED = "文档未提及"

# year / month / day separated by CJK date markers, "/" or "-".
_DATE_RE = re.compile(r'(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})')

# "similar project experience" clause: the label, an ASCII or full-width
# colon, then text up to the first semicolon (ASCII or full-width), CJK full
# stop, or the end of the string.
_PERFORMANCE_RE = re.compile(
    r'类似工程业绩[:\uff1a]\s*(.*?)(?:[;\uff1b。]|$)', re.DOTALL)

# group -> (fields that activate the group, keywords searched in the content).
_CONTEXT_KEYWORDS = {
    "价格": (["最高限价", "最高投标限价", "预估金额", "预估合同金额"],
           ["最高投标限价", "招标控制价", "最高限价", "控制价", "限价",
            "投标须知", "万元"]),
    "评标": (["评标办法", "评分说明与资信评分标准"],
           ["评标办法", "评分标准", "资信标", "技术标", "商务标",
            "综合评估法", "评定分离"]),
    "资质": (["资质要求", "业绩要求"],
           ["资质要求", "资格要求", "施工总承包", "资质等级",
            "业绩要求", "业绩条件"]),
    "日期": (["投标截止日"],
           ["投标截止", "截止时间", "开标时间", "递交截止"]),
    "付款": (["造价付款方式"],
           ["付款方式", "工程款支付", "预付款", "进度款", "结算款",
            "质保金", "合同条款"]),
}

# Groups whose snippets use the wider (800 character) context window.
_WIDE_WINDOW_GROUPS = ("评标", "付款")

# Regex fallbacks per field; the first matching pattern wins.
_LOCAL_PATTERNS = {
    "投标截止日": [
        r'投标截止时间[:\uff1a]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
        r'投标截止[:\uff1a]\s*(\d{4}-\d{1,2}-\d{1,2})',
        r'开标时间[:\uff1a]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
    ],
    "招标人": [
        r'招标人[:\uff1a]\s*([^\n]+)',
        r'招标单位[:\uff1a]\s*([^\n]+)',
        r'建设单位[:\uff1a]\s*([^\n]+)',
    ],
    "业绩要求": [
        r'业绩要求[:\uff1a]\s*([^\n]+)',
        r'类似工程业绩[:\uff1a]\s*([^\n]+)',
        r'投标人业绩[:\uff1a]\s*([^\n]+)',
    ],
}

# Project type -> keywords; dict order decides which type wins.
_TYPE_KEYWORDS = {
    "勘察": ["勘察", "地质", "岩土", "测量"],
    "设计": ["设计", "规划", "施工图"],
    "监理": ["监理", "监督"],
    "EPC": ["EPC"],
    "采购": ["采购", "设备"],
    "咨询": ["咨询", "造价", "招标代理"],
}

# Price fallbacks. The amount may contain thousands separators, which are
# stripped after matching.
_PRICE_PATTERNS = [
    r'最高投标限价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
    r'招标控制价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
    r'最高限价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
    r'控制价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
]


def _normalize_date(value: str) -> str:
    """Return the first date in *value* as zero-padded ``YYYY-MM-DD``.

    *value* is returned unchanged when it contains no recognisable date.
    """
    m = _DATE_RE.search(value)
    if not m:
        return value
    return f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"


def _keyword_windows(content: str, keyword: str, window: int) -> list:
    """Return non-overlapping snippets of *content* around *keyword*.

    Each snippet holds up to *window* characters before an occurrence and up
    to *window* characters after one. This is a linear-time replacement for
    scanning with the regex ``.{0,window}keyword.{0,window}`` (DOTALL), whose
    cost grows with ``len(content) * window``.
    """
    snippets = []
    kw_len = len(keyword)
    if not kw_len:
        return snippets
    cursor = 0  # end of the previous snippet; the next one starts at or after
    while True:
        first = content.find(keyword, cursor)
        if first == -1:
            return snippets
        start = max(cursor, first - window)
        # Mirror the greedy regex: anchor on the last occurrence that still
        # begins within `window` characters of the snippet start.
        last = content.rfind(keyword, start, start + window + kw_len)
        end = min(len(content), last + kw_len + window)
        snippets.append(content[start:end])
        cursor = end


class DeepSeekProcessor:
    """Field extractor backed by the DeepSeek chat-completions API.

    When the API request or the parsing of its reply fails, extraction falls
    back to local regular expressions (see ``_local_extract``).
    """

    def __init__(self, api_key: str = None):
        """Store the API settings.

        Args:
            api_key: API key; ``DEEPSEEK_API_KEY`` from config is used when
                this is empty or None.
        """
        self.api_key = api_key or DEEPSEEK_API_KEY
        self.api_url = "https://api.deepseek.com/chat/completions"
        self.model = "deepseek-chat"
        self.timeout = PROCESSING_CONFIG.get("request_timeout", 90)
        self.max_content = PROCESSING_CONFIG.get("max_content_length", 120000)

    def extract_fields(self, content: str, fields: list,
                       region_name: str = "") -> dict:
        """Extract the requested fields from *content* with DeepSeek.

        Args:
            content: Merged text of the page and its attachments.
            fields: Names of the fields to extract.
            region_name: Region name; currently not used by this method.

        Returns:
            A ``{field name: extracted value}`` dict. An empty dict when
            *content* or *fields* is empty. When the request, the reply
            parsing or the post-processing fails, the result of the local
            regex fallback is returned instead.
        """
        if not content or not fields:
            return {}

        # Per-field instructions: a configured prompt when available,
        # otherwise a generic one.
        field_prompts = []
        for field in fields:
            if field in DEEPSEEK_PROMPTS:
                field_prompts.append(f"【{field}】\n{DEEPSEEK_PROMPTS[field]}")
            else:
                field_prompts.append(
                    f'【{field}】请从文档中提取{field}信息。如果未找到,返回"文档未提及"。')

        # Trim oversized documents down to the relevant passages.
        selected_content = self._prepare_content(content, fields)

        system_prompt = (
            "你是一个专业的招标文件分析助手,擅长从招标文件中准确提取关键信息。"
            "请特别注意:1) 仔细检查PDF附件内容 2) 识别不同表述的同一概念 "
            "3) 提取详细完整的信息 4) 严格按照JSON格式返回结果。"
        )

        prompt = f"""请从以下招标文件内容中提取指定字段信息。

提取规则:
1. 只提取文档中明确存在的信息,严禁推测或编造
2. 如果某字段在文档中未提及,必须返回"文档未提及"
3. 对于价格信息,确保提取完整的价格数值和单位
4. 评标办法和评分说明必须来自文档正文而非目录页

需要提取的字段:
{chr(10).join(field_prompts)}

请以JSON格式返回结果:
{{
    "字段名1": "提取的内容",
    "字段名2": "提取的内容"
}}

招标文件内容:
{selected_content}
"""

        try:
            response = requests.post(
                self.api_url,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 3000,
                    "top_p": 0.95,
                },
                timeout=self.timeout,
                # NOTE(review): verify=False disables TLS certificate
                # verification, so the bearer token above is sent without
                # authenticating the server. Kept to preserve existing
                # behaviour; confirm whether it is still required.
                verify=False,
            )
            response.raise_for_status()
            result = response.json()

            # The model's reply is expected to contain one JSON object.
            content_text = result["choices"][0]["message"]["content"]
            extracted = self._parse_json_response(content_text)
            if not isinstance(extracted, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(extracted).__name__}")

            # Normalise dates/prices and cross-fill related fields.
            return self._post_process(extracted, fields, content)

        except json.JSONDecodeError as e:
            logger.warning("DeepSeek 返回 JSON 解析失败: %s", e)
            return self._local_extract(content, fields)
        except requests.RequestException as e:
            logger.warning("DeepSeek API 请求失败: %s", e)
            return self._local_extract(content, fields)
        except Exception as e:
            # Best-effort boundary: any other failure also uses the fallback.
            logger.warning("DeepSeek 处理异常: %s", e)
            return self._local_extract(content, fields)

    # ---------- content preparation ----------

    def _prepare_content(self, content: str, fields: list) -> str:
        """Return *content* reduced to at most ``self.max_content`` characters.

        Short content is returned unchanged. Otherwise the result is the
        first 10000 characters followed by keyword-centred snippets chosen
        according to the requested *fields*, truncated to the limit.
        """
        if len(content) <= self.max_content:
            return content

        logger.debug("内容过长(%s字符),使用预筛选", len(content))

        header = content[:10000]
        contexts = []

        for group, (target_fields, keywords) in _CONTEXT_KEYWORDS.items():
            if not any(f in fields for f in target_fields):
                continue
            window = 800 if group in _WIDE_WINDOW_GROUPS else 500
            for kw in keywords:
                contexts.extend(_keyword_windows(content, kw, window))

        # Qualification/experience requirements: also include up to 10000
        # characters starting at the first "投标人须知前附表" heading.
        if "业绩要求" in fields or "资质要求" in fields:
            start_idx = content.find("投标人须知前附表")
            if start_idx != -1:
                end_idx = min(len(content), start_idx + 10000)
                contexts.append(
                    "=== 投标人须知前附表 ===\n" + content[start_idx:end_idx])

        # De-duplicate while keeping first-seen order, so the prompt (and
        # what survives the truncation below) is the same on every run.
        unique = list(dict.fromkeys(contexts))
        combined = ("=== 文档头部信息 ===\n" + header + "\n\n"
                    + "\n\n".join(unique))
        return combined[:self.max_content]

    # ---------- response parsing ----------

    @staticmethod
    def _parse_json_response(text: str) -> dict:
        """Parse the JSON payload contained in a DeepSeek reply.

        Handles a fenced code block (with or without the ``json`` tag) and
        bare text with a JSON object embedded in it. When the selected text
        is not valid JSON, one more attempt is made on its outermost
        ``{...}`` span, which covers fences carrying another language tag or
        extra prose.

        Raises:
            json.JSONDecodeError: When no parsable JSON is found.
        """
        if "```json" in text:
            candidate = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            candidate = text.split("```")[1].split("```")[0]
        else:
            candidate = text
            start = text.find("{")
            end = text.rfind("}") + 1
            if start != -1 and end > 0:
                candidate = text[start:end]

        candidate = candidate.strip()
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            start = candidate.find("{")
            end = candidate.rfind("}")
            if start == -1 or end <= start:
                raise
            narrowed = candidate[start:end + 1]
            if narrowed == candidate:
                raise
            return json.loads(narrowed)

    # ---------- post-processing ----------

    def _post_process(self, extracted: dict, fields: list,
                      content: str) -> dict:
        """Validate and normalise the model output in place.

        Steps: zero-pad the bid deadline date, reduce price fields to
        ``<number><unit>``, copy one ceiling-price field to the other when
        only one was found, blank out text fields shorter than 3 characters,
        and derive the experience requirement from the scoring text when it
        is missing. Non-string values are left untouched by the string-only
        steps instead of raising.

        Args:
            extracted: Parsed model output; mutated and returned.
            fields: Requested field names; gates the price cross-fill.
            content: Original document text; not used by this method.

        Returns:
            The same *extracted* dict.
        """
        # Bid deadline -> YYYY-MM-DD.
        deadline = extracted.get("投标截止日")
        if (isinstance(deadline, str) and deadline
                and deadline != _NOT_MENTIONED):
            extracted["投标截止日"] = _normalize_date(deadline)

        # Price fields: keep only the amount and unit, drop thousands commas.
        for price_field in ("最高限价", "最高投标限价"):
            price = extracted.get(price_field)
            if isinstance(price, str) and price != _NOT_MENTIONED:
                pm = re.search(r'([\d,]+\.?\d*)\s*(万元|元)', price)
                if pm:
                    extracted[price_field] = (
                        pm.group(1).replace(",", "") + pm.group(2))

        # Cross-fill the two ceiling-price fields when only one was found.
        h1 = extracted.get("最高限价", _NOT_MENTIONED)
        h2 = extracted.get("最高投标限价", _NOT_MENTIONED)
        if (h1 != _NOT_MENTIONED and h2 == _NOT_MENTIONED
                and "最高投标限价" in fields):
            extracted["最高投标限价"] = h1
        elif (h2 != _NOT_MENTIONED and h1 == _NOT_MENTIONED
                and "最高限价" in fields):
            extracted["最高限价"] = h2

        # Text fields shorter than 3 characters are treated as not found.
        for text_field in ("资质要求", "业绩要求", "项目概况", "造价付款方式"):
            value = extracted.get(text_field)
            if (isinstance(value, str)
                    and value not in (_NOT_MENTIONED, "")
                    and len(value) < 3):
                extracted[text_field] = _NOT_MENTIONED

        # Missing experience requirement: try the scoring description.
        if extracted.get("业绩要求") == _NOT_MENTIONED:
            score_info = extracted.get("评分说明与资信评分标准")
            if isinstance(score_info, str) and "类似工程业绩" in score_info:
                matches = _PERFORMANCE_RE.findall(score_info)
                performance_info = " ".join(matches).strip()
                if performance_info:
                    extracted["业绩要求"] = performance_info

        return extracted

    # ---------- local fallback extraction ----------

    @staticmethod
    def _local_extract(content: str, fields: list) -> dict:
        """Regex/keyword fallback used when the API path fails.

        Handles the project type, the defence flag, the bid deadline, the
        tenderer, the experience requirement and the ceiling price; other
        fields are ignored.

        Returns:
            ``{field name: value}`` containing only non-empty values. A price
            match fills both ceiling-price keys.
        """
        result = {}
        head = content[:5000]  # type detection only looks at the beginning

        for field in fields:
            if field == "类型":
                matched = next(
                    (name for name, kws in _TYPE_KEYWORDS.items()
                     if any(k in head for k in kws)),
                    "其他",
                )
                if matched == "其他" and any(
                        k in head for k in ("施工", "建筑", "安装", "市政")):
                    matched = "施工"
                result["类型"] = matched

            elif field == "有无答辩":
                has_defence = any(
                    k in content for k in ("答辩", "面试", "现场汇报"))
                result["有无答辩"] = "有" if has_defence else "无"

            elif field in _LOCAL_PATTERNS:
                for pat in _LOCAL_PATTERNS[field]:
                    m = re.search(pat, content)
                    if m:
                        val = m.group(1).strip()
                        if field == "投标截止日":
                            # Same zero-padded format as the API path.
                            val = _normalize_date(val)
                        result[field] = val
                        break

            elif field in ("最高限价", "最高投标限价"):
                if "最高限价" in result:
                    # Already filled while handling the sibling price field.
                    continue
                for pat in _PRICE_PATTERNS:
                    m = re.search(pat, content, re.DOTALL)
                    if m:
                        price = m.group(1).replace(",", "") + m.group(2)
                        result["最高限价"] = price
                        result["最高投标限价"] = price
                        break

        # Experience requirement: last resort, look in the scoring criteria.
        if "业绩要求" in fields and "业绩要求" not in result:
            m = _PERFORMANCE_RE.search(content)
            if m:
                result["业绩要求"] = m.group(1).strip()

        return {k: v for k, v in result.items() if v}