Files
ztb/processors/deepseek.py
2026-02-13 18:15:20 +08:00

344 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
DeepSeek AI processor - extracts structured fields from tender-document content.
"""
import json
import logging
import re
import time
import urllib3
import requests
from config import DEEPSEEK_API_KEY, DEEPSEEK_PROMPTS, PROCESSING_CONFIG
# Silence the InsecureRequestWarning that urllib3 emits for HTTPS requests made
# without certificate verification (the DeepSeek request in this module can be
# sent with verification disabled).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Module-wide logger, registered under the fixed name "ztb".
logger = logging.getLogger("ztb")
class DeepSeekProcessor:
    """Extract structured fields from tender-document text via the DeepSeek chat API.

    ``extract_fields`` is the entry point: it builds a prompt, posts it to the
    chat-completions endpoint, parses the JSON reply and normalises a few
    fields.  Whenever the request or the parsing fails it falls back to the
    regex-only ``_local_extract``.
    """

    # Placeholder the prompt tells the model to return for a field that the
    # document does not mention.
    _NOT_MENTIONED = "文档未提及"

    # Captures the text that follows "类似工程业绩:" (ASCII or full-width colon)
    # up to the next semicolon, full stop, or the end of the text.
    _PERFORMANCE_PATTERN = r'类似工程业绩[::]\s*(.*?)(?:[;;。]|$)'

    # Year / month / day separated by 年 月, "/" or "-".
    _DATE_PATTERN = r'(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})'

    def __init__(self, api_key: str = None):
        """Store the API settings.

        Args:
            api_key: DeepSeek API key; falls back to ``DEEPSEEK_API_KEY`` from
                ``config`` when omitted or empty.
        """
        self.api_key = api_key or DEEPSEEK_API_KEY
        self.api_url = "https://api.deepseek.com/chat/completions"
        self.model = "deepseek-chat"
        self.timeout = PROCESSING_CONFIG.get("request_timeout", 90)
        self.max_content = PROCESSING_CONFIG.get("max_content_length", 120000)
        # SECURITY NOTE(review): certificate verification has always been
        # disabled for this request, which exposes the bearer token to
        # man-in-the-middle interception.  The default stays False so existing
        # deployments keep working; set PROCESSING_CONFIG["verify_ssl"] = True
        # to turn verification on.
        self.verify_ssl = PROCESSING_CONFIG.get("verify_ssl", False)

    def extract_fields(self, content: str, fields: list,
                       region_name: str = "") -> dict:
        """Extract the requested fields from ``content`` using DeepSeek.

        Args:
            content: Merged text of the page and its attachments.
            fields: Names of the fields to extract.
            region_name: Region name intended for log context; accepted for
                interface compatibility but not used by this method.

        Returns:
            A ``{field name: extracted value}`` dict.  Empty when ``content``
            or ``fields`` is empty.  When the API call or the parsing of its
            reply fails, the result of ``_local_extract`` is returned instead.
        """
        if not content or not fields:
            return {}

        # One prompt entry per field: the configured prompt when there is one,
        # otherwise a generic instruction.
        field_prompts = []
        for field in fields:
            if field in DEEPSEEK_PROMPTS:
                field_prompts.append(f"{field}\n{DEEPSEEK_PROMPTS[field]}")
            else:
                field_prompts.append(
                    f'【{field}】请从文档中提取{field}信息。如果未找到,返回"文档未提及"')

        # Trim over-long documents down to the parts relevant to the fields.
        selected_content = self._prepare_content(content, fields)

        system_prompt = (
            "你是一个专业的招标文件分析助手,擅长从招标文件中准确提取关键信息。"
            "请特别注意1) 仔细检查PDF附件内容 2) 识别不同表述的同一概念 "
            "3) 提取详细完整的信息 4) 严格按照JSON格式返回结果。"
        )
        prompt = f"""请从以下招标文件内容中提取指定字段信息。
提取规则:
1. 只提取文档中明确存在的信息,严禁推测或编造
2. 如果某字段在文档中未提及,必须返回"文档未提及"
3. 对于价格信息,确保提取完整的价格数值和单位
4. 评标办法和评分说明必须来自文档正文而非目录页
需要提取的字段:
{chr(10).join(field_prompts)}
请以JSON格式返回结果
{{
"字段名1": "提取的内容",
"字段名2": "提取的内容"
}}
招标文件内容:
{selected_content}
"""
        try:
            response = requests.post(
                self.api_url,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 3000,
                    "top_p": 0.95,
                },
                timeout=self.timeout,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
            result = response.json()
            content_text = result["choices"][0]["message"]["content"]
            extracted = self._parse_json_response(content_text)
            if not isinstance(extracted, dict):
                # The post-processing below needs a mapping; anything else is
                # handled like every other processing failure.
                raise ValueError("DeepSeek reply is not a JSON object")
            # Normalise dates and prices, sync the two price fields, etc.
            return self._post_process(extracted, fields, content)
        except json.JSONDecodeError as e:
            logger.warning("DeepSeek 返回 JSON 解析失败: %s", e)
            return self._local_extract(content, fields)
        except requests.RequestException as e:
            logger.warning("DeepSeek API 请求失败: %s", e)
            return self._local_extract(content, fields)
        except Exception as e:
            # Deliberate best-effort boundary: any other failure also falls
            # back to the local regex extraction.
            logger.warning("DeepSeek 处理异常: %s", e)
            return self._local_extract(content, fields)

    # ---------- content pre-processing ----------
    def _prepare_content(self, content: str, fields: list) -> str:
        """Shrink ``content`` to at most ``self.max_content`` characters.

        Content that already fits is returned unchanged.  Otherwise the result
        is the first 10000 characters followed by keyword-centred snippets
        chosen according to the requested ``fields``, truncated to the limit.

        Args:
            content: Full document text.
            fields: Requested field names; they decide which keyword groups
                are searched.

        Returns:
            The original or the condensed text.
        """
        if len(content) <= self.max_content:
            return content
        logger.debug("内容过长(%s字符),使用预筛选", len(content))

        header = content[:10000]
        contexts = []
        # group -> (fields that activate the group, keywords to search for)
        keyword_map = {
            "价格": (["最高限价", "最高投标限价", "预估金额", "预估合同金额"],
                   ["最高投标限价", "招标控制价", "最高限价", "控制价", "限价",
                    "投标须知", "万元"]),
            "评标": (["评标办法", "评分说明与资信评分标准"],
                   ["评标办法", "评分标准", "资信标", "技术标", "商务标",
                    "综合评估法", "评定分离"]),
            "资质": (["资质要求", "业绩要求"],
                   ["资质要求", "资格要求", "施工总承包", "资质等级",
                    "业绩要求", "业绩条件"]),
            "日期": (["投标截止日"],
                   ["投标截止", "截止时间", "开标时间", "递交截止"]),
            "付款": (["造价付款方式"],
                   ["付款方式", "工程款支付", "预付款", "进度款",
                    "结算款", "质保金", "合同条款"]),
        }
        for group, (target_fields, keywords) in keyword_map.items():
            if not any(f in fields for f in target_fields):
                continue
            # Evaluation and payment clauses get a wider window around each hit.
            window = 800 if group in ("评标", "付款") else 500
            span = ".{0,%d}" % window
            for kw in keywords:
                pattern = re.compile(span + re.escape(kw) + span, re.DOTALL)
                contexts.extend(m.group(0) for m in pattern.finditer(content))

        # The bidder-instructions front table is included as one large slice
        # when qualification or track-record fields are requested.
        if "业绩要求" in fields or "资质要求" in fields:
            start_idx = content.find("投标人须知前附表")
            if start_idx != -1:
                end_idx = min(len(content), start_idx + 10000)
                contexts.append(
                    "=== 投标人须知前附表 ===\n" + content[start_idx:end_idx])

        # De-duplicate while keeping discovery order, so the same document
        # always yields the same prompt and the same snippets survive the
        # truncation below.
        unique = list(dict.fromkeys(contexts))
        combined = ("=== 文档头部信息 ===\n" + header + "\n\n"
                    + "\n\n".join(unique))
        return combined[:self.max_content]

    # ---------- response parsing ----------
    @staticmethod
    def _parse_json_response(text: str) -> dict:
        """Parse the JSON object contained in a DeepSeek reply.

        A Markdown code fence (with or without a language tag) is unwrapped
        first; the text is then narrowed to the span between the first ``{``
        and the last ``}`` so that surrounding commentary or a fence tag does
        not break parsing.

        Args:
            text: Raw message content returned by the model.

        Returns:
            The decoded JSON value.

        Raises:
            json.JSONDecodeError: If no valid JSON can be decoded.
        """
        if "```json" in text:
            text = text.split("```json", 1)[1].split("```", 1)[0]
        elif "```" in text:
            text = text.split("```", 2)[1]
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
        return json.loads(text.strip())

    # ---------- post-processing ----------
    @staticmethod
    def _format_date(value: str) -> str:
        """Return the first date found in ``value`` as ``YYYY-MM-DD``.

        ``value`` is returned unchanged when it contains no recognisable date.
        """
        m = re.search(DeepSeekProcessor._DATE_PATTERN, value)
        if not m:
            return value
        year, month, day = m.groups()
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    def _post_process(self, extracted: dict, fields: list,
                      content: str) -> dict:
        """Validate and normalise the model's answer in place.

        Only string values are normalised; numbers, ``null`` or lists that the
        model may return are left untouched instead of raising.

        Args:
            extracted: Mapping parsed from the model reply (modified in place).
            fields: Field names that were requested.
            content: Original document text (not used here).

        Returns:
            The same ``extracted`` mapping.
        """
        missing = self._NOT_MENTIONED

        def has_value(value) -> bool:
            return value not in (None, "", missing)

        # Bid deadline -> YYYY-MM-DD.
        deadline = extracted.get("投标截止日")
        if isinstance(deadline, str) and has_value(deadline):
            extracted["投标截止日"] = self._format_date(deadline)

        # Price fields: keep only "<number><unit>", dropping thousands separators.
        for pf in ("最高限价", "最高投标限价"):
            price = extracted.get(pf)
            if isinstance(price, str) and price != missing:
                pm = re.search(r'([\d,]+\.?\d*)\s*(万元|元)', price)
                if pm:
                    extracted[pf] = pm.group(1).replace(",", "") + pm.group(2)

        # The two price fields name the same figure: copy whichever one was
        # found into the other, if the other was requested.
        h1 = extracted.get("最高限价", missing)
        h2 = extracted.get("最高投标限价", missing)
        if has_value(h1) and not has_value(h2) and "最高投标限价" in fields:
            extracted["最高投标限价"] = h1
        elif has_value(h2) and not has_value(h1) and "最高限价" in fields:
            extracted["最高限价"] = h2

        # Free-text answers shorter than three characters are treated as noise.
        for tf in ("资质要求", "业绩要求", "项目概况", "造价付款方式"):
            text_val = extracted.get(tf)
            if (isinstance(text_val, str) and text_val not in (missing, "")
                    and len(text_val) < 3):
                extracted[tf] = missing

        # Cross-field recovery: when the track-record requirement is missing,
        # try to lift it out of the scoring description.
        if extracted.get("业绩要求") == missing:
            score_info = extracted.get("评分说明与资信评分标准")
            if isinstance(score_info, str) and "类似工程业绩" in score_info:
                matches = re.findall(
                    self._PERFORMANCE_PATTERN, score_info, re.DOTALL)
                performance_info = " ".join(matches).strip()
                if performance_info:
                    extracted["业绩要求"] = performance_info
        return extracted

    # ---------- local fallback extraction ----------
    @staticmethod
    def _local_extract(content: str, fields: list) -> dict:
        """Regex/keyword fallback used when the API path fails.

        Args:
            content: Full document text.
            fields: Requested field names.  Only 类型, 有无答辩, 投标截止日, 招标人,
                业绩要求, 最高限价 and 最高投标限价 are handled; other names are ignored.

        Returns:
            ``{field name: value}`` for every field that produced a non-empty
            value.  A price match fills both price fields.
        """
        result = {}
        # Colons are matched in both ASCII and full-width form.
        field_patterns = {
            "投标截止日": [
                r'投标截止时间[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
                r'投标截止[::]\s*(\d{4}-\d{1,2}-\d{1,2})',
                r'开标时间[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
            ],
            "招标人": [
                r'招标人[::]\s*([^\n]+)',
                r'招标单位[::]\s*([^\n]+)',
                r'建设单位[::]\s*([^\n]+)',
            ],
            "业绩要求": [
                r'业绩要求[::]\s*([^\n]+)',
                r'类似工程业绩[::]\s*([^\n]+)',
                r'投标人业绩[::]\s*([^\n]+)',
            ],
        }
        # Digits may carry thousands separators, e.g. "1,234.56万元".
        price_patterns = [
            r'最高投标限价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
            r'招标控制价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
            r'最高限价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
            r'控制价.*?(\d[\d,]*(?:\.\d+)?)\s*(万元|元)',
        ]
        # Project type is judged from the first 5000 characters only.
        head = content[:5000]

        for field in fields:
            if field == "类型":
                # Earlier entries win; "施工" is only a last-resort match.
                type_kw = {
                    "勘察": ["勘察", "地质", "岩土", "测量"],
                    "设计": ["设计", "规划", "施工图"],
                    "监理": ["监理", "监督"],
                    "EPC": ["EPC"],
                    "采购": ["采购", "设备"],
                    "咨询": ["咨询", "造价", "招标代理"],
                }
                matched = "其他"
                for tname, kws in type_kw.items():
                    if any(k in head for k in kws):
                        matched = tname
                        break
                if matched == "其他" and any(
                        k in head for k in ["施工", "建筑", "安装", "市政"]):
                    matched = "施工"
                result["类型"] = matched
            elif field == "有无答辩":
                # NOTE(review): both result literals were empty strings in the
                # file as received, so this field was always dropped.  "有"/"无"
                # is inferred from the field name - confirm against consumers.
                has_defence = any(
                    k in content for k in ["答辩", "面试", "现场汇报"])
                result["有无答辩"] = "有" if has_defence else "无"
            elif field in field_patterns:
                for pat in field_patterns[field]:
                    m = re.search(pat, content)
                    if m:
                        val = m.group(1).strip()
                        if field == "投标截止日":
                            val = DeepSeekProcessor._format_date(val)
                        result[field] = val
                        break
            elif field in ("最高限价", "最高投标限价"):
                if field in result:
                    # Already filled by the sibling price field.
                    continue
                for pat in price_patterns:
                    m = re.search(pat, content, re.DOTALL)
                    if m:
                        price = m.group(1).replace(",", "") + m.group(2)
                        result["最高限价"] = price
                        result["最高投标限价"] = price
                        break

        # Last resort for the track-record requirement: multi-line text that
        # follows "类似工程业绩:" in the scoring criteria.
        if "业绩要求" in fields and "业绩要求" not in result:
            m = re.search(
                DeepSeekProcessor._PERFORMANCE_PATTERN, content, re.DOTALL)
            if m:
                result["业绩要求"] = m.group(1).strip()
        return {k: v for k, v in result.items() if v}