Initial commit: 招标信息爬虫与分析系统

This commit is contained in:
ztb-system
2026-02-13 18:15:20 +08:00
commit d2fa06801f
38 changed files with 5415 additions and 0 deletions

343
processors/deepseek.py Normal file
View File

@@ -0,0 +1,343 @@
# -*- coding: utf-8 -*-
"""
DeepSeek AI 处理器 - 从招标文件内容中提取结构化字段
"""
import json
import logging
import re
import time
import urllib3
import requests
from config import DEEPSEEK_API_KEY, DEEPSEEK_PROMPTS, PROCESSING_CONFIG
# Suppress urllib3's InsecureRequestWarning globally; the API request in this
# module is sent with certificate verification turned off (verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Logger obtained under the name "ztb".
logger = logging.getLogger("ztb")
class DeepSeekProcessor:
    """Extract structured fields from tender-document text with DeepSeek.

    The processor sends the (possibly pre-filtered) document text to the
    DeepSeek chat-completions endpoint, parses the JSON object found in the
    reply and normalises a few well-known fields.  When the request, the
    parsing or the post-processing fails, it falls back to a purely local,
    regex-based extraction.
    """

    # Sentinel the model is instructed to return for a field that the
    # document does not mention.
    _NOT_MENTIONED = "文档未提及"

    # "<number> 万元|元".  The number may carry thousands separators, which
    # are stripped after matching.
    _AMOUNT = r'(\d[\d,]*(?:\.\d+)?)\s*(万元|元)'
    _PRICE_RE = re.compile(_AMOUNT)

    # yyyy年m月d日, yyyy-m-d or yyyy/m/d.
    _DATE_RE = re.compile(r'(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})')

    # Text following a "类似工程业绩" label (ASCII or full-width colon) up to
    # the first semicolon / full stop, or the end of the text.
    _PERFORMANCE_RE = re.compile(
        r'类似工程业绩[:：]\s*(.*?)(?:[;；。]|$)', re.DOTALL)

    # Labels that introduce a ceiling price, most specific first.
    _PRICE_LABELS = ("最高投标限价", "招标控制价", "最高限价", "控制价")

    def __init__(self, api_key: str = None):
        """Create a processor.

        Args:
            api_key: DeepSeek API key.  Falls back to ``DEEPSEEK_API_KEY``
                from the configuration module when omitted or empty.
        """
        self.api_key = api_key or DEEPSEEK_API_KEY
        self.api_url = "https://api.deepseek.com/chat/completions"
        self.model = "deepseek-chat"
        self.timeout = PROCESSING_CONFIG.get("request_timeout", 90)
        self.max_content = PROCESSING_CONFIG.get("max_content_length", 120000)
        # NOTE(security): certificate verification has historically been
        # disabled for this request, which exposes the API key and document
        # text to interception.  The default is kept for backward
        # compatibility; set PROCESSING_CONFIG["verify_ssl"] = True to enable
        # verification.
        self.verify_ssl = PROCESSING_CONFIG.get("verify_ssl", False)

    def extract_fields(self, content: str, fields: list,
                       region_name: str = "") -> dict:
        """Extract the requested fields from ``content`` using DeepSeek.

        Args:
            content: Combined text of the page and its attachments.
            fields: Names of the fields to extract.
            region_name: Region name; accepted for the caller's convenience
                and not used by the extraction itself.

        Returns:
            A ``{field name: extracted value}`` dict.  Empty when ``content``
            or ``fields`` is empty.  If the API call, the JSON parsing or the
            post-processing fails, the result of the local regex fallback is
            returned instead.
        """
        if not content or not fields:
            return {}

        # One prompt entry per field: the configured prompt when available,
        # otherwise a generic instruction.
        field_prompts = []
        for field in fields:
            if field in DEEPSEEK_PROMPTS:
                field_prompts.append(f"{field}\n{DEEPSEEK_PROMPTS[field]}")
            else:
                field_prompts.append(
                    f'【{field}】请从文档中提取{field}信息。如果未找到，返回"文档未提及"')

        # Trim over-long documents down to the relevant excerpts.
        selected_content = self._prepare_content(content, fields)

        system_prompt = (
            "你是一个专业的招标文件分析助手，擅长从招标文件中准确提取关键信息。"
            "请特别注意1) 仔细检查PDF附件内容 2) 识别不同表述的同一概念 "
            "3) 提取详细完整的信息 4) 严格按照JSON格式返回结果。"
        )
        field_block = "\n".join(field_prompts)
        prompt = (
            "请从以下招标文件内容中提取指定字段信息。\n"
            "提取规则：\n"
            "1. 只提取文档中明确存在的信息，严禁推测或编造\n"
            '2. 如果某字段在文档中未提及，必须返回"文档未提及"\n'
            "3. 对于价格信息，确保提取完整的价格数值和单位\n"
            "4. 评标办法和评分说明必须来自文档正文而非目录页\n"
            "需要提取的字段：\n"
            f"{field_block}\n"
            "请以JSON格式返回结果\n"
            "{\n"
            '"字段名1": "提取的内容",\n'
            '"字段名2": "提取的内容"\n'
            "}\n"
            "招标文件内容：\n"
            f"{selected_content}\n"
        )

        try:
            response = requests.post(
                self.api_url,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 3000,
                    "top_p": 0.95,
                },
                timeout=self.timeout,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
            result = response.json()

            content_text = result["choices"][0]["message"]["content"]
            extracted = self._parse_json_response(content_text)
            # Normalise dates/prices and reconcile related fields.
            return self._post_process(extracted, fields, content)
        except json.JSONDecodeError as e:
            logger.warning("DeepSeek 返回 JSON 解析失败: %s", e)
            return self._local_extract(content, fields)
        except requests.RequestException as e:
            logger.warning("DeepSeek API 请求失败: %s", e)
            return self._local_extract(content, fields)
        except Exception as e:
            # Deliberate best-effort boundary: any other failure (unexpected
            # response shape, etc.) degrades to the local extraction.
            logger.warning("DeepSeek 处理异常: %s", e)
            return self._local_extract(content, fields)

    # ---------- content pre-processing ----------
    def _prepare_content(self, content: str, fields: list) -> str:
        """Return ``content`` trimmed to at most ``self.max_content`` chars.

        Short documents are returned unchanged.  For longer ones the result
        is the first 10000 characters followed by keyword-centred excerpts
        chosen according to the requested ``fields``, de-duplicated in order
        of discovery and finally cut to ``self.max_content`` characters.
        """
        if len(content) <= self.max_content:
            return content
        logger.debug("内容过长(%s字符)，使用预筛选", len(content))

        header = content[:10000]
        contexts = []

        # group -> (fields that activate the group, keywords to search for)
        keyword_map = {
            "价格": (["最高限价", "最高投标限价", "预估金额", "预估合同金额"],
                   ["最高投标限价", "招标控制价", "最高限价", "控制价", "限价",
                    "投标须知", "万元"]),
            "评标": (["评标办法", "评分说明与资信评分标准"],
                   ["评标办法", "评分标准", "资信标", "技术标", "商务标",
                    "综合评估法", "评定分离"]),
            "资质": (["资质要求", "业绩要求"],
                   ["资质要求", "资格要求", "施工总承包", "资质等级",
                    "业绩要求", "业绩条件"]),
            "日期": (["投标截止日"],
                   ["投标截止", "截止时间", "开标时间", "递交截止"]),
            "付款": (["造价付款方式"],
                   ["付款方式", "工程款支付", "预付款", "进度款",
                    "结算款", "质保金", "合同条款"]),
        }
        for group, (target_fields, keywords) in keyword_map.items():
            if not any(f in fields for f in target_fields):
                continue
            # Evaluation and payment clauses get a wider window.
            window = 800 if group in ("评标", "付款") else 500
            for kw in keywords:
                pattern = re.compile(
                    r'.{0,' + str(window) + '}' + re.escape(kw) +
                    r'.{0,' + str(window) + '}', re.DOTALL)
                contexts.extend(m.group(0) for m in pattern.finditer(content))

        # The bidder-instructions data sheet gets a larger excerpt when
        # qualification or track-record fields are requested.
        if "业绩要求" in fields or "资质要求" in fields:
            start_idx = content.find("投标人须知前附表")
            if start_idx != -1:
                contexts.append(
                    "=== 投标人须知前附表 ===\n"
                    + content[start_idx:start_idx + 10000])

        # De-duplicate while keeping discovery order, so that the text kept
        # after truncation is the same on every run (set() ordering is not).
        unique = list(dict.fromkeys(contexts))
        combined = ("=== 文档头部信息 ===\n" + header + "\n\n"
                    + "\n\n".join(unique))
        return combined[:self.max_content]

    # ---------- response parsing ----------
    @staticmethod
    def _parse_json_response(text: str) -> dict:
        """Extract the JSON object contained in a model reply.

        A fenced code block is preferred when present; inside the selected
        text everything from the first ``{`` to the last ``}`` is parsed, so
        fence language tags and surrounding prose are ignored.

        Raises:
            json.JSONDecodeError: if no valid JSON object can be parsed.
        """
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
        text = text.strip()
        parsed = json.loads(text)
        if not isinstance(parsed, dict):
            # Callers rely on dict methods; report other JSON types with the
            # exception type they already handle.
            raise json.JSONDecodeError("expected a JSON object", text, 0)
        return parsed

    # ---------- post-processing ----------
    @classmethod
    def _has_value(cls, value) -> bool:
        """Return True for a non-blank string other than the sentinel."""
        if not isinstance(value, str):
            return False
        stripped = value.strip()
        return bool(stripped) and stripped != cls._NOT_MENTIONED

    def _post_process(self, extracted: dict, fields: list,
                      content: str) -> dict:
        """Validate and normalise the model's answer in place.

        Non-string values (``None``, numbers, lists) are left untouched
        instead of raising, so one odd value does not discard the whole
        answer.

        Args:
            extracted: Parsed model answer; modified and returned.
            fields: Field names that were requested.
            content: Original document text (currently unused).

        Returns:
            The same ``extracted`` dict.
        """
        not_mentioned = self._NOT_MENTIONED

        # Bid deadline -> YYYY-MM-DD.
        deadline = extracted.get("投标截止日")
        if self._has_value(deadline):
            m = self._DATE_RE.search(deadline)
            if m:
                year, month, day = m.groups()
                extracted["投标截止日"] = (
                    f"{year}-{month.zfill(2)}-{day.zfill(2)}")

        # Prices -> "<number><unit>" without thousands separators.
        for pf in ("最高限价", "最高投标限价"):
            price = extracted.get(pf)
            if self._has_value(price):
                pm = self._PRICE_RE.search(price)
                if pm:
                    extracted[pf] = pm.group(1).replace(",", "") + pm.group(2)

        # The two ceiling-price fields describe the same figure: fill the
        # missing one from the other when it was requested.
        h1 = extracted.get("最高限价")
        h2 = extracted.get("最高投标限价")
        if (self._has_value(h1) and not self._has_value(h2)
                and "最高投标限价" in fields):
            extracted["最高投标限价"] = h1
        elif (self._has_value(h2) and not self._has_value(h1)
                and "最高限价" in fields):
            extracted["最高限价"] = h2

        # Free-text answers shorter than three characters are treated as
        # "not mentioned".
        for tf in ("资质要求", "业绩要求", "项目概况", "造价付款方式"):
            val = extracted.get(tf)
            if (isinstance(val, str) and val not in (not_mentioned, "")
                    and len(val) < 3):
                extracted[tf] = not_mentioned

        # Cross-field recovery: when the track-record requirement is
        # missing, look for it inside the scoring description.
        if ("业绩要求" in extracted
                and not self._has_value(extracted["业绩要求"])):
            score_info = extracted.get("评分说明与资信评分标准")
            if isinstance(score_info, str) and "类似工程业绩" in score_info:
                pieces = (p.strip()
                          for p in self._PERFORMANCE_RE.findall(score_info))
                performance_info = " ".join(p for p in pieces if p)
                if performance_info:
                    extracted["业绩要求"] = performance_info
        return extracted

    # ---------- local fallback extraction ----------
    @staticmethod
    def _local_extract(content: str, fields: list) -> dict:
        """Regex/keyword fallback used when the API path fails.

        Handles only the fields 类型, 有无答辩, 投标截止日, 招标人, 业绩要求,
        最高限价 and 最高投标限价; other names are ignored.  Labels may be
        followed by an ASCII or a full-width colon.

        Returns:
            A dict of the fields that produced a non-empty value.  A price
            hit fills both 最高限价 and 最高投标限价.
        """
        cls = DeepSeekProcessor
        content = content or ""
        head = content[:5000]  # type keywords are only looked up here
        result = {}

        field_patterns = {
            "投标截止日": [
                r'投标截止时间[:：]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
                r'投标截止[:：]\s*(\d{4}-\d{1,2}-\d{1,2})',
                r'开标时间[:：]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
            ],
            "招标人": [
                r'招标人[:：]\s*([^\n]+)',
                r'招标单位[:：]\s*([^\n]+)',
                r'建设单位[:：]\s*([^\n]+)',
            ],
            "业绩要求": [
                r'业绩要求[:：]\s*([^\n]+)',
                r'类似工程业绩[:：]\s*([^\n]+)',
                r'投标人业绩[:：]\s*([^\n]+)',
            ],
        }
        # Checked in insertion order; the first type with a hit wins.
        type_keywords = {
            "勘察": ["勘察", "地质", "岩土", "测量"],
            "设计": ["设计", "规划", "施工图"],
            "监理": ["监理", "监督"],
            "EPC": ["EPC"],
            "采购": ["采购", "设备"],
            "咨询": ["咨询", "造价", "招标代理"],
        }

        for field in fields:
            if field == "类型":
                matched = next(
                    (name for name, kws in type_keywords.items()
                     if any(k in head for k in kws)),
                    None)
                if matched is None:
                    # Construction is only considered once every more
                    # specific type has been ruled out.
                    is_construction = any(
                        k in head for k in ("施工", "建筑", "安装", "市政"))
                    matched = "施工" if is_construction else "其他"
                result["类型"] = matched
            elif field == "有无答辩":
                has_defense = any(
                    k in content for k in ("答辩", "面试", "现场汇报"))
                result["有无答辩"] = "有" if has_defense else "无"
            elif field in field_patterns:
                for pat in field_patterns[field]:
                    m = re.search(pat, content)
                    if not m:
                        continue
                    val = m.group(1).strip()
                    if field == "投标截止日":
                        # Normalise either accepted notation to YYYY-MM-DD.
                        dm = cls._DATE_RE.search(val)
                        if dm:
                            year, month, day = dm.groups()
                            val = (f"{year}-{month.zfill(2)}-"
                                   f"{day.zfill(2)}")
                    result[field] = val
                    break
            elif field in ("最高限价", "最高投标限价"):
                if "最高限价" in result:
                    continue  # already found for the sibling field
                for label in cls._PRICE_LABELS:
                    # First amount with a unit after the label.  The gap is
                    # unbounded, so this is a best-effort guess.
                    m = re.search(
                        re.escape(label) + r'.*?' + cls._AMOUNT,
                        content, re.DOTALL)
                    if m:
                        price = m.group(1).replace(",", "") + m.group(2)
                        result["最高限价"] = price
                        result["最高投标限价"] = price
                        break

        # Last resort for the track-record requirement: the clause after a
        # "类似工程业绩" label, e.g. inside the scoring criteria.
        if "业绩要求" in fields and "业绩要求" not in result:
            m = cls._PERFORMANCE_RE.search(content)
            if m:
                result["业绩要求"] = m.group(1).strip()

        return {k: v for k, v in result.items() if v}