# -*- coding: utf-8 -*-
"""
DeepSeek AI processor - extracts structured fields from tender document content.
"""
|
||
import json
|
||
import logging
|
||
import re
|
||
import time
|
||
import urllib3
|
||
|
||
import requests
|
||
|
||
from config import DEEPSEEK_API_KEY, DEEPSEEK_PROMPTS, PROCESSING_CONFIG
|
||
|
||
# Suppress urllib3's InsecureRequestWarning, the warning urllib3 emits when an
# HTTPS request is sent without certificate verification.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Module-wide logger, obtained under the fixed name "ztb".
logger = logging.getLogger("ztb")
|
||
|
||
|
||
class DeepSeekProcessor:
    """Extract structured fields from tender-document text via the DeepSeek chat API.

    The main entry point is :meth:`extract_fields`. When the API call or the
    parsing of its answer fails, a local regex-based extraction is used instead.
    """

    # Sentinel the prompt tells the model to return for fields that are absent.
    _NOT_FOUND = "文档未提及"

    # Fields whose values are inspected as text during post-processing.
    _TEXT_HANDLED_FIELDS = (
        "投标截止日", "最高限价", "最高投标限价", "资质要求", "业绩要求",
        "项目概况", "造价付款方式", "评分说明与资信评分标准",
    )

    # Year/month/day written with 年/月 or with "/" or "-" separators.
    _DATE_RE = re.compile(r'(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})')
    # Amount (thousands separators allowed) followed by its unit.
    _PRICE_RE = re.compile(r'(\d[\d,]*\.?\d*)\s*(万元|元)')
    # "Similar project experience" clause; accepts full-width and ASCII colons.
    _PERFORMANCE_RE = re.compile(r'类似工程业绩[::]\s*(.*?)(?:;|。|$)', re.DOTALL)

    # group name -> (fields that activate the group, keywords searched in the text)
    _KEYWORD_GROUPS = {
        "价格": (["最高限价", "最高投标限价", "预估金额", "预估合同金额"],
                 ["最高投标限价", "招标控制价", "最高限价", "控制价", "限价",
                  "投标须知", "万元"]),
        "评标": (["评标办法", "评分说明与资信评分标准"],
                 ["评标办法", "评分标准", "资信标", "技术标", "商务标",
                  "综合评估法", "评定分离"]),
        "资质": (["资质要求", "业绩要求"],
                 ["资质要求", "资格要求", "施工总承包", "资质等级",
                  "业绩要求", "业绩条件"]),
        "日期": (["投标截止日"],
                 ["投标截止", "截止时间", "开标时间", "递交截止"]),
        "付款": (["造价付款方式"],
                 ["付款方式", "工程款支付", "预付款", "进度款",
                  "结算款", "质保金", "合同条款"]),
    }

    # Regex fallbacks per field; the first matching pattern wins.
    _LOCAL_PATTERNS = {
        "投标截止日": [
            r'投标截止时间[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
            r'投标截止[::]\s*(\d{4}-\d{1,2}-\d{1,2})',
            r'开标时间[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
        ],
        "招标人": [
            r'招标人[::]\s*([^\n]+)',
            r'招标单位[::]\s*([^\n]+)',
            r'建设单位[::]\s*([^\n]+)',
        ],
        "业绩要求": [
            r'业绩要求[::]\s*([^\n]+)',
            r'类似工程业绩[::]\s*([^\n]+)',
            r'投标人业绩[::]\s*([^\n]+)',
        ],
    }

    # Keywords that introduce a ceiling price, most specific first.
    _PRICE_KEYWORDS = ("最高投标限价", "招标控制价", "最高限价", "控制价")

    # Project type -> keywords; checked in this order, first hit wins.
    _TYPE_KEYWORDS = {
        "勘察": ["勘察", "地质", "岩土", "测量"],
        "设计": ["设计", "规划", "施工图"],
        "监理": ["监理", "监督"],
        "EPC": ["EPC"],
        "采购": ["采购", "设备"],
        "咨询": ["咨询", "造价", "招标代理"],
    }

    def __init__(self, api_key: str = None):
        """Configure the API endpoint and limits.

        Args:
            api_key: DeepSeek API key; falls back to ``DEEPSEEK_API_KEY``.
        """
        self.api_key = api_key or DEEPSEEK_API_KEY
        self.api_url = "https://api.deepseek.com/chat/completions"
        self.model = "deepseek-chat"
        self.timeout = PROCESSING_CONFIG.get("request_timeout", 90)
        self.max_content = PROCESSING_CONFIG.get("max_content_length", 120000)
        # The request carries the API key, so certificates are verified unless
        # the configuration explicitly opts out with "verify_ssl": False.
        self.verify_ssl = PROCESSING_CONFIG.get("verify_ssl", True)

    def extract_fields(self, content: str, fields: list,
                       region_name: str = "") -> dict:
        """Extract the requested fields from *content* using DeepSeek.

        Args:
            content: Combined text of the page and its attachments.
            fields: Names of the fields to extract.
            region_name: Region name; when non-empty it prefixes warning logs.

        Returns:
            A ``{field name: extracted value}`` dict. Empty when *content* or
            *fields* is empty. On any API or parsing failure the result of the
            local regex fallback is returned instead.
        """
        if not content or not fields:
            return {}

        # One instruction per field: the configured prompt, or a generic one.
        field_prompts = []
        for field in fields:
            if field in DEEPSEEK_PROMPTS:
                field_prompts.append(f"【{field}】\n{DEEPSEEK_PROMPTS[field]}")
            else:
                field_prompts.append(
                    f'【{field}】请从文档中提取{field}信息。如果未找到,返回"文档未提及"。')

        # Trim over-long documents down to the relevant passages.
        selected_content = self._prepare_content(content, fields)

        system_prompt = (
            "你是一个专业的招标文件分析助手,擅长从招标文件中准确提取关键信息。"
            "请特别注意:1) 仔细检查PDF附件内容 2) 识别不同表述的同一概念 "
            "3) 提取详细完整的信息 4) 严格按照JSON格式返回结果。"
        )

        prompt = f"""请从以下招标文件内容中提取指定字段信息。

提取规则:
1. 只提取文档中明确存在的信息,严禁推测或编造
2. 如果某字段在文档中未提及,必须返回"文档未提及"
3. 对于价格信息,确保提取完整的价格数值和单位
4. 评标办法和评分说明必须来自文档正文而非目录页

需要提取的字段:
{chr(10).join(field_prompts)}

请以JSON格式返回结果:
{{
"字段名1": "提取的内容",
"字段名2": "提取的内容"
}}

招标文件内容:
{selected_content}
"""

        log_tag = f"[{region_name}] " if region_name else ""

        try:
            response = requests.post(
                self.api_url,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 3000,
                    "top_p": 0.95,
                },
                timeout=self.timeout,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
            result = response.json()

            # Parse the JSON object embedded in the model's answer.
            content_text = result["choices"][0]["message"]["content"]
            extracted = self._parse_json_response(content_text)

            # Normalise dates and prices, and cross-fill related fields.
            return self._post_process(extracted, fields, content)

        except json.JSONDecodeError as e:
            logger.warning("%sDeepSeek 返回 JSON 解析失败: %s", log_tag, e)
            return self._local_extract(content, fields)
        except requests.RequestException as e:
            logger.warning("%sDeepSeek API 请求失败: %s", log_tag, e)
            return self._local_extract(content, fields)
        except Exception as e:
            # Deliberate best-effort boundary: any other failure (unexpected
            # response shape, etc.) still yields the local fallback result.
            logger.warning("%sDeepSeek 处理异常: %s", log_tag, e)
            return self._local_extract(content, fields)

    # ---------- Content pre-processing ----------

    @staticmethod
    def _keyword_windows(content: str, keyword: str, window: int) -> list:
        """Return non-overlapping snippets of *content* around each *keyword* hit.

        Each snippet spans up to *window* characters before and after the hit.
        Hits that fall inside an already emitted snippet are not repeated.
        """
        snippets = []
        covered_until = 0
        pos = content.find(keyword)
        while pos != -1:
            start = max(covered_until, pos - window)
            end = min(len(content), pos + len(keyword) + window)
            snippets.append(content[start:end])
            covered_until = end
            pos = content.find(keyword, end)
        return snippets

    def _prepare_content(self, content: str, fields: list) -> str:
        """Shrink *content* to at most ``self.max_content`` characters.

        Short content is returned unchanged. Otherwise the result is the first
        10000 characters plus keyword-centred snippets chosen according to the
        requested *fields*, de-duplicated in first-seen order and truncated.
        """
        if len(content) <= self.max_content:
            return content

        logger.debug("内容过长(%d字符),使用预筛选", len(content))
        header = content[:10000]
        contexts = []

        for group, (target_fields, keywords) in self._KEYWORD_GROUPS.items():
            if not any(f in fields for f in target_fields):
                continue
            # Evaluation and payment clauses get a wider window.
            window = 800 if group in ("评标", "付款") else 500
            for kw in keywords:
                contexts.extend(self._keyword_windows(content, kw, window))

        # Also pull a large slice of the section starting at this heading.
        if "业绩要求" in fields or "资质要求" in fields:
            start_idx = content.find("投标人须知前附表")
            if start_idx != -1:
                contexts.append("=== 投标人须知前附表 ===\n"
                                + content[start_idx:start_idx + 10000])

        # dict.fromkeys keeps first-seen order, so truncation is deterministic.
        unique = list(dict.fromkeys(contexts))
        combined = "=== 文档头部信息 ===\n" + header + "\n\n" + "\n\n".join(unique)
        return combined[:self.max_content]

    # ---------- Response parsing ----------

    @staticmethod
    def _parse_json_response(text: str) -> dict:
        """Extract the JSON object from the model's reply text.

        Handles replies wrapped in a Markdown code fence and replies with text
        around the object.

        Raises:
            json.JSONDecodeError: if no valid JSON object can be decoded.
        """
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]

        # Trim anything outside the outermost braces.
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]

        parsed = json.loads(text.strip())
        if not isinstance(parsed, dict):
            raise json.JSONDecodeError("expected a JSON object", text, 0)
        return parsed

    # ---------- Post-processing ----------

    @classmethod
    def _as_text(cls, value) -> str:
        """Coerce a decoded JSON value to text; ``None`` becomes the not-found sentinel."""
        if value is None:
            return cls._NOT_FOUND
        if isinstance(value, str):
            return value
        if isinstance(value, (list, dict)):
            return json.dumps(value, ensure_ascii=False)
        return str(value)

    @classmethod
    def _normalize_date(cls, text: str) -> str:
        """Return the first date in *text* as ``YYYY-MM-DD``, or *text* if none is found."""
        m = cls._DATE_RE.search(text)
        if not m:
            return text
        return f"{m.group(1)}-{m.group(2).zfill(2)}-{m.group(3).zfill(2)}"

    def _post_process(self, extracted: dict, fields: list,
                      content: str) -> dict:
        """Validate and normalise the fields returned by the model.

        Args:
            extracted: Parsed model output; modified in place and returned.
            fields: Field names the caller requested.
            content: Original document text (currently not consulted).

        Returns:
            *extracted* with the deadline formatted, prices cleaned and synced,
            too-short text fields reset, and the performance requirement
            back-filled from the scoring description where possible.
        """
        not_found = self._NOT_FOUND

        # The model may return null/number/list; coerce so text checks cannot raise.
        for key in self._TEXT_HANDLED_FIELDS:
            if key in extracted and not isinstance(extracted[key], str):
                extracted[key] = self._as_text(extracted[key])

        # Bid deadline -> YYYY-MM-DD.
        deadline = extracted.get("投标截止日")
        if deadline and deadline != not_found:
            extracted["投标截止日"] = self._normalize_date(deadline)

        # Price fields -> "<number><unit>" without thousands separators.
        for pf in ("最高限价", "最高投标限价"):
            value = extracted.get(pf, not_found)
            if value == not_found:
                continue
            pm = self._PRICE_RE.search(value)
            if pm:
                extracted[pf] = pm.group(1).replace(",", "") + pm.group(2)

        # The two ceiling-price fields mirror each other when one is missing.
        h1 = extracted.get("最高限价", not_found)
        h2 = extracted.get("最高投标限价", not_found)
        if h1 != not_found and h2 == not_found and "最高投标限价" in fields:
            extracted["最高投标限价"] = h1
        elif h2 != not_found and h1 == not_found and "最高限价" in fields:
            extracted["最高限价"] = h2

        # Text fields shorter than 3 characters are treated as not found.
        for tf in ("资质要求", "业绩要求", "项目概况", "造价付款方式"):
            value = extracted.get(tf)
            if value is not None and value not in (not_found, "") and len(value) < 3:
                extracted[tf] = not_found

        # Back-fill the performance requirement from the scoring description.
        if extracted.get("业绩要求") == not_found:
            score_info = extracted.get("评分说明与资信评分标准")
            if score_info and "类似工程业绩" in score_info:
                matches = self._PERFORMANCE_RE.findall(score_info)
                performance_info = " ".join(matches).strip()
                if performance_info:
                    extracted["业绩要求"] = performance_info

        return extracted

    # ---------- Local fallback extraction ----------

    @staticmethod
    def _local_extract(content: str, fields: list) -> dict:
        """Regex/keyword extraction used when the API path fails.

        Args:
            content: Document text.
            fields: Field names to extract.

        Returns:
            A dict holding only the fields for which a non-empty value was
            found. A requested price field fills both "最高限价" and
            "最高投标限价".
        """
        cls = DeepSeekProcessor
        result = {}
        head = content[:5000]  # project type is judged from the document head only

        for field in fields:
            if field == "类型":
                matched = "其他"
                for tname, kws in cls._TYPE_KEYWORDS.items():
                    if any(k in head for k in kws):
                        matched = tname
                        break
                if matched == "其他" and any(
                        k in head for k in ("施工", "建筑", "安装", "市政")):
                    matched = "施工"
                result["类型"] = matched

            elif field == "有无答辩":
                has_defence = any(
                    k in content for k in ("答辩", "面试", "现场汇报"))
                result["有无答辩"] = "有" if has_defence else "无"

            elif field in cls._LOCAL_PATTERNS:
                for pat in cls._LOCAL_PATTERNS[field]:
                    m = re.search(pat, content)
                    if not m:
                        continue
                    val = m.group(1).strip()
                    if field == "投标截止日":
                        val = cls._normalize_date(val)
                    result[field] = val
                    break

            elif field in ("最高限价", "最高投标限价"):
                if field in result:
                    continue  # already filled when the sibling field was handled
                for kw in cls._PRICE_KEYWORDS:
                    m = re.search(
                        re.escape(kw) + r'.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
                        content, re.DOTALL)
                    if m:
                        price = m.group(1).replace(",", "") + m.group(2)
                        result["最高限价"] = price
                        result["最高投标限价"] = price
                        break

        # Last resort for the performance requirement: the scoring-criteria clause.
        if "业绩要求" in fields and "业绩要求" not in result:
            m = cls._PERFORMANCE_RE.search(content)
            if m:
                result["业绩要求"] = m.group(1).strip()

        return {k: v for k, v in result.items() if v}
|