Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
4
processors/__init__.py
Normal file
4
processors/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
"""processors package: re-exports the processing pipeline entry point."""
from .pipeline import ProcessingPipeline

__all__ = ['ProcessingPipeline']
293
processors/content_fetcher.py
Normal file
293
processors/content_fetcher.py
Normal file
@@ -0,0 +1,293 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
内容获取器 - 获取详情页文本 + 附件内容
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import time
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
import pdfplumber
|
||||
from bs4 import BeautifulSoup
|
||||
from docx import Document
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
class ContentFetcher:
    """Fetches a notice detail page's text plus the parsed text of its
    PDF/Word attachments, with request-rate throttling.

    One HTML request per page; attachments discovered in the same HTML
    are downloaded into ``temp_dir``, parsed, and then deleted.
    """

    # Rate-control parameters
    RPM_LIMIT = 12        # max requests per minute
    DELAY_MIN = 1.5       # minimum delay between requests (seconds)
    DELAY_MAX = 3.0       # maximum delay between requests (seconds)
    MAX_DOWNLOAD_MB = 50  # size cap for a single attachment (MB)

    def __init__(self, temp_dir: str = "temp_files"):
        self.temp_dir = temp_dir
        os.makedirs(temp_dir, exist_ok=True)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
        }
        self._req_timestamps = []  # sliding 60s window for RPM throttling

    # ---------- public API ----------

    def get_full_content(self, url: str, max_attachments: int = 2) -> str:
        """Fetch page text plus parsed attachment text (single page request).

        Args:
            url: detail-page URL.
            max_attachments: maximum number of attachments to process.

        Returns:
            Combined full text; "" when the page could not be fetched.
        """
        # 1. Fetch the page HTML (single request).
        html = self._fetch_html(url)
        if not html:
            return ""

        # 2. Extract the page's plain text.
        soup = BeautifulSoup(html, "html.parser")
        page_content = soup.get_text(separator="\n", strip=True)

        # 3. Prepend the publish timestamp, when one can be found.
        publish_time = self._extract_publish_time(soup, page_content)
        if publish_time:
            page_content = f"发布时间: {publish_time}\n\n" + page_content

        # 4. Discover and parse attachments from the same HTML.
        attachments = self._find_attachments(soup, url)
        attachment_content = ""
        for att in attachments[:max_attachments]:
            att_text = self._download_and_parse(att["url"], att["name"])
            if att_text:
                attachment_content += f"\n\n=== 附件: {att['name']} ===\n{att_text}"

        full_content = page_content
        if attachment_content:
            full_content += attachment_content

        return full_content

    @staticmethod
    def _extract_publish_time(soup: BeautifulSoup, page_content: str) -> str:
        """Extract the publish timestamp from the page.

        Args:
            soup: parsed HTML document.
            page_content: the page's plain text.

        Returns:
            Timestamp string such as "2026-02-13 16:12:28"; "" if not found.
        """
        # 1. Labelled patterns in the page text, datetime forms before
        #    date-only forms so the fuller match wins.
        patterns = [
            r'信息发布时间[::]\s*([\d-]+\s[\d:]+)',
            r'发布时间[::]\s*([\d-]+\s[\d:]+)',
            r'发布日期[::]\s*([\d-]+\s[\d:]+)',
            r'发布时间[::]\s*([\d-]+)',
            r'发布日期[::]\s*([\d-]+)',
        ]

        for pattern in patterns:
            match = re.search(pattern, page_content)
            if match:
                return match.group(1).strip()

        # 2. Fall back to tags whose class name hints at a date/time.
        time_tags = soup.find_all(['time', 'span', 'div'],
                                  class_=re.compile(r'time|date|publish', re.I))
        for tag in time_tags:
            text = tag.get_text(strip=True)
            match = re.search(r'([\d-]+\s[\d:]+)', text)
            if match:
                return match.group(1).strip()

        return ""

    # ---------- rate control ----------

    def _throttle(self):
        """Pre-request throttling: enforce the RPM cap plus a random delay."""
        now = time.time()
        # Drop timestamps older than one minute from the window.
        self._req_timestamps = [
            t for t in self._req_timestamps if now - t < 60]
        if len(self._req_timestamps) >= self.RPM_LIMIT:
            wait = 60 - (now - self._req_timestamps[0]) + random.uniform(1, 3)
            if wait > 0:
                logger.debug(f"ContentFetcher 限速等待 {wait:.0f}s")
                time.sleep(wait)
        self._req_timestamps.append(time.time())
        time.sleep(random.uniform(self.DELAY_MIN, self.DELAY_MAX))

    # ---------- page fetching ----------

    def _fetch_html(self, url: str, max_retries: int = 3) -> str:
        """Fetch the raw HTML of a page, retrying on failure.

        Returns "" after exhausting retries.
        """
        self._throttle()
        for retry in range(max_retries):
            try:
                resp = requests.get(url, headers=self.headers,
                                    timeout=45, verify=False)
                # Target sites serve UTF-8; force it so requests does not
                # mis-guess the charset.
                resp.encoding = "utf-8"
                if resp.status_code != 200:
                    logger.warning(f"页面返回 {resp.status_code}: {url[:60]}")
                    if retry < max_retries - 1:
                        time.sleep(3)
                        continue
                    return ""

                logger.debug(f"页面获取成功 {len(resp.text)} 字符: {url[:60]}")
                return resp.text

            except Exception as e:
                logger.warning(f"获取页面失败 ({retry+1}/{max_retries}): {e}")
                if retry < max_retries - 1:
                    time.sleep(3)
        return ""

    # ---------- attachment discovery ----------

    @staticmethod
    def _find_attachments(soup: BeautifulSoup, base_url: str) -> list:
        """Find attachment links (.pdf/.doc/.docx) in the parsed HTML.

        Fix: use urllib.parse.urljoin instead of hand-rolled string
        splicing so relative hrefs (including ``../`` paths and links on
        pages with query strings) resolve correctly; duplicate URLs are
        dropped while preserving document order.
        """
        from urllib.parse import urljoin  # stdlib; local import keeps file imports untouched

        attachments = []
        seen = set()
        for link in soup.find_all("a"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            if any(ext in href.lower() for ext in [".pdf", ".doc", ".docx"]):
                href = urljoin(base_url, href)
                if href in seen:
                    continue
                seen.add(href)
                attachments.append({
                    "name": text or href.split("/")[-1],
                    "url": href,
                })
        return attachments

    # ---------- attachment download & parsing ----------

    def _download_and_parse(self, url: str, filename: str,
                            max_retries: int = 3) -> str:
        """Download an attachment and parse it to text.

        The file is saved under a deterministic temp name derived from
        the URL, parsed by extension, then deleted. Returns "" on
        failure or when the size cap is exceeded.
        """
        import hashlib  # stdlib; local import keeps file imports untouched

        self._throttle()
        file_type = self._get_file_type(url)
        max_bytes = self.MAX_DOWNLOAD_MB * 1024 * 1024
        # Fix: builtin hash() is salted per process; md5 gives a stable
        # temp name across runs.
        digest = hashlib.md5(url.encode("utf-8")).hexdigest()
        for retry in range(max_retries):
            try:
                # Fix: log the actual attachment name (filename was unused).
                logger.debug(f"下载附件: {filename}")
                resp = requests.get(url, headers=self.headers,
                                    timeout=90, verify=False, stream=True)
                resp.raise_for_status()

                temp_path = os.path.join(
                    self.temp_dir, f"temp_{digest}.{file_type}")
                total = 0
                with open(temp_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            total += len(chunk)
                            if total > max_bytes:
                                logger.warning(
                                    f"附件超过 {self.MAX_DOWNLOAD_MB}MB 限制,跳过: {filename}")
                                break

                if total > max_bytes:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass
                    return ""

                logger.debug(f"附件已下载 {total/1024:.1f}KB: {filename}")

                try:
                    if file_type == "pdf":
                        return self._parse_pdf(temp_path)
                    elif file_type in ("doc", "docx"):
                        return self._parse_word(temp_path)
                    return ""
                finally:
                    # Always remove the temp file, parsed or not.
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass

            except Exception as e:
                logger.warning(f"附件处理失败 ({retry+1}/{max_retries}): {e}")
                if retry < max_retries - 1:
                    time.sleep(4)
        return ""

    # ---------- file parsing ----------

    @staticmethod
    def _parse_pdf(file_path: str) -> str:
        """Extract text from a PDF file page by page; "" on failure."""
        try:
            text = ""
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            return text
        except Exception as e:
            logger.warning(f"PDF解析失败: {e}")
            return ""

    @staticmethod
    def _parse_word(file_path: str) -> str:
        """Parse a Word file (.docx via python-docx, legacy .doc via a
        UTF-16LE byte-decode fallback); "" when neither yields enough text."""
        # Attempt python-docx first (valid .docx files).
        try:
            doc = Document(file_path)
            text = "\n".join(p.text for p in doc.paragraphs)
            # Heuristic: under 500 chars usually means extraction failed.
            if len(text) > 500:
                return text
        except Exception:
            pass

        # Fallback for legacy .doc: decode raw bytes as UTF-16LE and keep
        # only CJK ideographs and a whitelist of punctuation/alphanumerics.
        try:
            with open(file_path, "rb") as f:
                content = f.read()
            raw = content.decode("utf-16le", errors="ignore")
            allowed = (",。;:()《》【】、"
                       "\u201c\u201d\u2018\u2019"  # curly quotes “”‘’
                       "0123456789"
                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                       "abcdefghijklmnopqrstuvwxyz"
                       "%.=×+- \n□☑")
            readable = []
            for c in raw:
                if "\u4e00" <= c <= "\u9fff" or c in allowed:
                    readable.append(c)
                elif readable and readable[-1] != " ":
                    # Collapse runs of garbage into a single space.
                    readable.append(" ")
            text = re.sub(r" +", " ", "".join(readable))
            if len(text) > 500:
                return text
        except Exception:
            pass

        return ""

    @staticmethod
    def _get_file_type(filename: str) -> str:
        """Classify a filename/URL by extension substring.

        Note: ".docx" is checked before ".doc" so .docx URLs are not
        misclassified.
        """
        low = filename.lower()
        if ".pdf" in low:
            return "pdf"
        if ".docx" in low:
            return "docx"
        if ".doc" in low:
            return "doc"
        return "unknown"
||||
343
processors/deepseek.py
Normal file
343
processors/deepseek.py
Normal file
@@ -0,0 +1,343 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
DeepSeek AI 处理器 - 从招标文件内容中提取结构化字段
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib3
|
||||
|
||||
import requests
|
||||
|
||||
from config import DEEPSEEK_API_KEY, DEEPSEEK_PROMPTS, PROCESSING_CONFIG
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
class DeepSeekProcessor:
    """Extracts structured fields from tender-document text via the
    DeepSeek chat API, with a regex-based local fallback when the API
    call or its JSON response fails."""

    def __init__(self, api_key: str = None):
        self.api_key = api_key or DEEPSEEK_API_KEY
        self.api_url = "https://api.deepseek.com/chat/completions"
        self.model = "deepseek-chat"
        self.timeout = PROCESSING_CONFIG.get("request_timeout", 90)
        self.max_content = PROCESSING_CONFIG.get("max_content_length", 120000)

    def extract_fields(self, content: str, fields: list,
                       region_name: str = "") -> dict:
        """Extract the requested fields with DeepSeek.

        Args:
            content: merged page + attachment text.
            fields: names of the fields to extract.
            region_name: region label (logging only).

        Returns:
            {field name: extracted value}; falls back to
            :meth:`_local_extract` on any API or parsing failure.
        """
        if not content or not fields:
            return {}

        # Build the per-field prompt fragments from configured templates.
        field_prompts = []
        for field in fields:
            if field in DEEPSEEK_PROMPTS:
                field_prompts.append(f"【{field}】\n{DEEPSEEK_PROMPTS[field]}")
            else:
                field_prompts.append(
                    f'【{field}】请从文档中提取{field}信息。如果未找到,返回"文档未提及"。')

        # Trim/pre-filter content to fit the model's context budget.
        selected_content = self._prepare_content(content, fields)

        # Assemble the chat messages.
        system_prompt = (
            "你是一个专业的招标文件分析助手,擅长从招标文件中准确提取关键信息。"
            "请特别注意:1) 仔细检查PDF附件内容 2) 识别不同表述的同一概念 "
            "3) 提取详细完整的信息 4) 严格按照JSON格式返回结果。"
        )

        prompt = f"""请从以下招标文件内容中提取指定字段信息。

提取规则:
1. 只提取文档中明确存在的信息,严禁推测或编造
2. 如果某字段在文档中未提及,必须返回"文档未提及"
3. 对于价格信息,确保提取完整的价格数值和单位
4. 评标办法和评分说明必须来自文档正文而非目录页

需要提取的字段:
{chr(10).join(field_prompts)}

请以JSON格式返回结果:
{{
    "字段名1": "提取的内容",
    "字段名2": "提取的内容"
}}

招标文件内容:
{selected_content}
"""

        try:
            response = requests.post(
                self.api_url,
                headers={
                    "Authorization": f"Bearer {self.api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": prompt},
                    ],
                    # Low temperature: extraction should be deterministic.
                    "temperature": 0.1,
                    "max_tokens": 3000,
                    "top_p": 0.95,
                },
                timeout=self.timeout,
                verify=False,
            )
            response.raise_for_status()
            result = response.json()

            # Parse the model's JSON answer.
            content_text = result["choices"][0]["message"]["content"]
            extracted = self._parse_json_response(content_text)

            # Post-process: price sync, format cleanup.
            extracted = self._post_process(extracted, fields, content)

            return extracted

        except json.JSONDecodeError as e:
            logger.warning(f"DeepSeek 返回 JSON 解析失败: {e}")
            return self._local_extract(content, fields)
        except requests.RequestException as e:
            logger.warning(f"DeepSeek API 请求失败: {e}")
            return self._local_extract(content, fields)
        except Exception as e:
            logger.warning(f"DeepSeek 处理异常: {e}")
            return self._local_extract(content, fields)

    # ---------- content pre-processing ----------

    def _prepare_content(self, content: str, fields: list) -> str:
        """Shrink over-long content by keeping the document head plus
        keyword-centred windows relevant to the requested fields."""
        if len(content) <= self.max_content:
            return content

        logger.debug(f"内容过长({len(content)}字符),使用预筛选")
        # Always keep the document head.
        header = content[:10000]
        contexts = []

        # Per field group: (fields that trigger the group, search keywords).
        keyword_map = {
            "价格": (["最高限价", "最高投标限价", "预估金额", "预估合同金额"],
                     ["最高投标限价", "招标控制价", "最高限价", "控制价", "限价",
                      "投标须知", "万元"]),
            "评标": (["评标办法", "评分说明与资信评分标准"],
                     ["评标办法", "评分标准", "资信标", "技术标", "商务标",
                      "综合评估法", "评定分离"]),
            "资质": (["资质要求", "业绩要求"],
                     ["资质要求", "资格要求", "施工总承包", "资质等级",
                      "业绩要求", "业绩条件"]),
            "日期": (["投标截止日"],
                     ["投标截止", "截止时间", "开标时间", "递交截止"]),
            "付款": (["造价付款方式"],
                     ["付款方式", "工程款支付", "预付款", "进度款",
                      "结算款", "质保金", "合同条款"]),
        }

        for group, (target_fields, keywords) in keyword_map.items():
            if any(f in fields for f in target_fields):
                # Wider windows for sections that tend to be verbose.
                window = 800 if group in ("评标", "付款") else 500
                for kw in keywords:
                    for m in re.finditer(
                        r'.{0,' + str(window) + '}' + re.escape(kw) +
                        r'.{0,' + str(window) + '}', content, re.DOTALL
                    ):
                        contexts.append(m.group(0))

        # Specifically extract the bid-instructions front table when
        # qualification/track-record fields are requested.
        if "业绩要求" in fields or "资质要求" in fields:
            if "投标人须知前附表" in content:
                start_idx = content.find("投标人须知前附表")
                end_idx = min(len(content), start_idx + 10000)  # take a large slice of the table
                contexts.append("=== 投标人须知前附表 ===\n" + content[start_idx:end_idx])

        # Fix: dedupe with dict.fromkeys — list(set(...)) produced a
        # nondeterministic context order (and thus nondeterministic prompts).
        unique = list(dict.fromkeys(contexts))
        combined = "=== 文档头部信息 ===\n" + header + "\n\n" + "\n\n".join(unique)
        return combined[:self.max_content]

    # ---------- response parsing ----------

    @staticmethod
    def _parse_json_response(text: str) -> dict:
        """Extract and load the JSON object from a DeepSeek reply.

        Handles ```json fences, plain ``` fences, or a bare object
        embedded in prose. Raises json.JSONDecodeError when no valid
        JSON can be recovered.
        """
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        elif "{" in text:
            start = text.find("{")
            end = text.rfind("}") + 1
            if start != -1 and end > 0:
                text = text[start:end]
        return json.loads(text.strip())

    # ---------- post-processing ----------

    def _post_process(self, extracted: dict, fields: list,
                      content: str) -> dict:
        """Validate and normalise extracted values in place."""
        # Normalise the bid-deadline to YYYY-MM-DD.
        if "投标截止日" in extracted:
            val = extracted["投标截止日"]
            if val and val != "文档未提及":
                m = re.search(r'(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})', val)
                if m:
                    extracted["投标截止日"] = (
                        f"{m.group(1)}-{m.group(2).zfill(2)}-"
                        f"{m.group(3).zfill(2)}")

        # Clean price fields down to "<number><unit>".
        for pf in ("最高限价", "最高投标限价"):
            if pf in extracted and extracted[pf] != "文档未提及":
                pm = re.search(r'([\d,]+\.?\d*)\s*(万元|元)', extracted[pf])
                if pm:
                    extracted[pf] = pm.group(1).replace(",", "") + pm.group(2)

        # Keep 最高限价 ↔ 最高投标限价 in sync when only one was found.
        h1 = extracted.get("最高限价", "文档未提及")
        h2 = extracted.get("最高投标限价", "文档未提及")
        if h1 != "文档未提及" and h2 == "文档未提及" and "最高投标限价" in fields:
            extracted["最高投标限价"] = h1
        elif h2 != "文档未提及" and h1 == "文档未提及" and "最高限价" in fields:
            extracted["最高限价"] = h2

        # Reject implausibly short free-text answers.
        for tf in ("资质要求", "业绩要求", "项目概况", "造价付款方式"):
            if tf in extracted and extracted[tf] not in ("文档未提及", ""):
                if len(extracted[tf]) < 3:
                    extracted[tf] = "文档未提及"

        # Cross-field fallback: mine track-record requirements from the
        # scoring-criteria field when 业绩要求 itself came back empty.
        if "业绩要求" in extracted and extracted["业绩要求"] == "文档未提及":
            if "评分说明与资信评分标准" in extracted:
                score_info = extracted["评分说明与资信评分标准"]
                if "类似工程业绩" in score_info:
                    performance_pattern = r'类似工程业绩[::]\s*(.*?)(?:;|。|$)'
                    matches = re.findall(performance_pattern, score_info, re.DOTALL)
                    if matches:
                        performance_info = " ".join(matches).strip()
                        if performance_info:
                            extracted["业绩要求"] = performance_info

        return extracted

    # ---------- local fallback extraction ----------

    @staticmethod
    def _local_extract(content: str, fields: list) -> dict:
        """Regex-based fallback extraction used when the API fails.

        Covers a subset of fields; anything it cannot find is simply
        absent from the returned dict.
        """
        result = {}

        field_patterns = {
            "类型": None,  # handled specially below
            "投标截止日": [
                r'投标截止时间[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
                r'投标截止[::]\s*(\d{4}-\d{1,2}-\d{1,2})',
                r'开标时间[::]\s*(\d{4}年\d{1,2}月\d{1,2}日)',
            ],
            "招标人": [
                r'招标人[::]\s*([^\n]+)',
                r'招标单位[::]\s*([^\n]+)',
                r'建设单位[::]\s*([^\n]+)',
            ],
            "有无答辩": None,  # handled specially below
            "业绩要求": [
                r'业绩要求[::]\s*([^\n]+)',
                r'类似工程业绩[::]\s*([^\n]+)',
                r'投标人业绩[::]\s*([^\n]+)',
            ],
        }

        for field in fields:
            if field == "类型":
                # Classify by keyword groups; first match wins, with a
                # construction fallback.
                type_kw = {
                    "勘察": ["勘察", "地质", "岩土", "测量"],
                    "设计": ["设计", "规划", "施工图"],
                    "监理": ["监理", "监督"],
                    "EPC": ["EPC"],
                    "采购": ["采购", "设备"],
                    "咨询": ["咨询", "造价", "招标代理"],
                }
                matched = "其他"
                for tname, kws in type_kw.items():
                    if any(k in content[:5000] for k in kws):
                        matched = tname
                        break
                if matched == "其他" and any(
                    k in content[:5000]
                    for k in ["施工", "建筑", "安装", "市政"]
                ):
                    matched = "施工"
                result["类型"] = matched

            elif field == "有无答辩":
                result["有无答辩"] = (
                    "有" if any(k in content for k in ["答辩", "面试", "现场汇报"])
                    else "无"
                )

            elif field in field_patterns and field_patterns[field]:
                for pat in field_patterns[field]:
                    m = re.search(pat, content)
                    if m:
                        val = m.group(1).strip()
                        # Normalise Chinese-style dates to YYYY-MM-DD.
                        if field == "投标截止日" and "年" in val:
                            dm = re.search(
                                r'(\d{4})年(\d{1,2})月(\d{1,2})日', val)
                            if dm:
                                val = (f"{dm.group(1)}-"
                                       f"{dm.group(2).zfill(2)}-"
                                       f"{dm.group(3).zfill(2)}")
                        result[field] = val
                        break

            elif field in ("最高限价", "最高投标限价"):
                patterns = [
                    r'最高投标限价.*?(\d+(?:\.\d+)?)\s*(万元|元)',
                    r'招标控制价.*?(\d+(?:\.\d+)?)\s*(万元|元)',
                    r'最高限价.*?(\d+(?:\.\d+)?)\s*(万元|元)',
                    r'控制价.*?(\d+(?:\.\d+)?)\s*(万元|元)',
                ]
                for pat in patterns:
                    m = re.search(pat, content, re.DOTALL)
                    if m:
                        price = m.group(1).replace(",", "") + m.group(2)
                        # Both price aliases get the same value.
                        result["最高限价"] = price
                        result["最高投标限价"] = price
                        break

        # Last resort for 业绩要求: mine the scoring criteria.
        if "业绩要求" in fields and "业绩要求" not in result:
            score_pattern = r'类似工程业绩[::]\s*(.*?)(?:;|。|$)'
            m = re.search(score_pattern, content, re.DOTALL)
            if m:
                result["业绩要求"] = m.group(1).strip()

        # Drop empty values.
        return {k: v for k, v in result.items() if v}
||||
143
processors/jiandaoyun.py
Normal file
143
processors/jiandaoyun.py
Normal file
@@ -0,0 +1,143 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
简道云数据上传模块
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
from config import JDY_CONFIG
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
class JiandaoyunUploader:
|
||||
"""简道云 API 上传器"""
|
||||
|
||||
BASE_URL = "https://api.jiandaoyun.com/api/v5"
|
||||
|
||||
# 需要转换为数字的字段
|
||||
NUMERIC_FIELDS = {"最高限价", "最高投标限价", "预估金额"}
|
||||
|
||||
def __init__(self, api_key: str = None):
|
||||
self.api_key = api_key or JDY_CONFIG["api_key"]
|
||||
self.headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
|
||||
def upload_records(self, region_name: str, records: list) -> dict:
|
||||
"""
|
||||
上传记录到对应的简道云表单
|
||||
|
||||
Args:
|
||||
region_name: 区域名称(如 "浙江招标文件公示")
|
||||
records: 数据记录列表
|
||||
|
||||
Returns:
|
||||
{"total": N, "success": N, "failed": N}
|
||||
"""
|
||||
form_config = JDY_CONFIG["forms"].get(region_name)
|
||||
if not form_config:
|
||||
logger.warning(f"{region_name}: 未找到简道云表单配置,跳过上传")
|
||||
return {"total": len(records), "success": 0, "failed": len(records)}
|
||||
|
||||
app_id = form_config["app_id"]
|
||||
entry_id = form_config["entry_id"]
|
||||
field_mapping = form_config.get("field_mapping", {})
|
||||
|
||||
success = 0
|
||||
failed = 0
|
||||
|
||||
for i, record in enumerate(records):
|
||||
name = record.get("名称", f"记录{i+1}")
|
||||
try:
|
||||
jdy_data = self._convert(record, field_mapping)
|
||||
if not jdy_data:
|
||||
logger.debug(f"[{i+1}/{len(records)}] {name}: 无有效数据")
|
||||
failed += 1
|
||||
continue
|
||||
|
||||
result = self._create_record(app_id, entry_id, jdy_data)
|
||||
if result and result.get("success"):
|
||||
success += 1
|
||||
if (i + 1) % 10 == 0 or (i + 1) == len(records):
|
||||
logger.info(
|
||||
f" [{i+1}/{len(records)}] 上传进度: "
|
||||
f"成功{success} 失败{failed}")
|
||||
else:
|
||||
failed += 1
|
||||
err = result.get("error", "未知") if result else "无返回"
|
||||
logger.warning(
|
||||
f" [{i+1}/{len(records)}] {name[:25]}: "
|
||||
f"上传失败 - {err}")
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
logger.error(f" [{i+1}/{len(records)}] {name[:25]}: 异常 - {e}")
|
||||
|
||||
logger.info(f" {region_name} 上传完成: 成功 {success}, 失败 {failed}")
|
||||
return {"total": len(records), "success": success, "failed": failed}
|
||||
|
||||
# ---------- 内部方法 ----------
|
||||
|
||||
def _create_record(self, app_id: str, entry_id: str, data: dict) -> dict:
|
||||
"""调用简道云 API 创建单条记录"""
|
||||
url = f"{self.BASE_URL}/app/entry/data/create"
|
||||
payload = {"app_id": app_id, "entry_id": entry_id, "data": data}
|
||||
|
||||
try:
|
||||
resp = requests.post(url, headers=self.headers,
|
||||
json=payload, timeout=30, verify=False)
|
||||
if not resp.text:
|
||||
return {"success": False,
|
||||
"error": f"空响应, status={resp.status_code}"}
|
||||
|
||||
result = resp.json()
|
||||
if resp.status_code == 200 and result.get("data", {}).get("_id"):
|
||||
return {"success": True,
|
||||
"data_id": result["data"]["_id"]}
|
||||
return {"success": False,
|
||||
"error": result.get("msg", str(result))}
|
||||
except Exception as e:
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
def _convert(self, record: dict, field_mapping: dict) -> dict:
|
||||
"""将记录转换为简道云 API 格式"""
|
||||
jdy_data = {}
|
||||
for local_field, jdy_field in field_mapping.items():
|
||||
value = record.get(local_field)
|
||||
if not value or value in ("文档未提及", "详见公告"):
|
||||
continue
|
||||
|
||||
if local_field in self.NUMERIC_FIELDS:
|
||||
num = self._parse_price(value)
|
||||
if num is not None:
|
||||
jdy_data[jdy_field] = {"value": num}
|
||||
else:
|
||||
jdy_data[jdy_field] = {"value": value}
|
||||
return jdy_data
|
||||
|
||||
@staticmethod
|
||||
def _parse_price(price_str) -> int | None:
|
||||
"""将价格字符串转为纯数字(元)"""
|
||||
if not price_str or price_str in ("文档未提及", "详见公告"):
|
||||
return None
|
||||
|
||||
s = str(price_str).strip()
|
||||
s = re.sub(r'^[约≈大概]*', '', s)
|
||||
s = re.sub(r'[((].*?[))]', '', s)
|
||||
s = re.sub(r'[元人民币¥¥\s]', '', s)
|
||||
|
||||
try:
|
||||
if "亿" in s:
|
||||
return int(float(s.replace("亿", "")) * 100_000_000)
|
||||
elif "万" in s:
|
||||
return int(float(s.replace("万", "")) * 10_000)
|
||||
else:
|
||||
s = s.replace(",", "").replace(",", "")
|
||||
return int(float(s))
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
202
processors/pipeline.py
Normal file
202
processors/pipeline.py
Normal file
@@ -0,0 +1,202 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
处理管道 - 将爬虫结果经 DeepSeek AI 处理后上传简道云
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from config import REGION_CONFIGS, PROCESSING_CONFIG
|
||||
from .content_fetcher import ContentFetcher
|
||||
from .deepseek import DeepSeekProcessor
|
||||
from .jiandaoyun import JiandaoyunUploader
|
||||
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
class ProcessingPipeline:
    """Pipeline: crawler results → content fetch → AI extraction →
    optional Jiandaoyun upload."""

    def __init__(self):
        temp_dir = PROCESSING_CONFIG.get("temp_dir", "temp_files")
        self.fetcher = ContentFetcher(temp_dir=temp_dir)
        self.deepseek = DeepSeekProcessor()
        self.uploader = JiandaoyunUploader()
        self.api_delay = PROCESSING_CONFIG.get("api_delay", 1)
        self.output_dir = PROCESSING_CONFIG.get("output_dir", "data")

    def process_results(self, results: list, site: str,
                        notice_type: str, upload: bool = False) -> list:
        """Process crawler results: field mapping → content fetch →
        AI extraction → optional upload.

        Args:
            results: items returned by the crawler.
            site: site identifier ("zhejiang" / "taizhou").
            notice_type: notice category (e.g. "招标文件公示").
            upload: whether to push results to Jiandaoyun.

        Returns:
            The processed record list (the input mapped/enriched; records
            without a link or with too little content pass through as-is).
        """
        # Fix: hoisted out of the per-record loop where it was
        # re-executed every iteration.
        import re

        # Look up the region configuration.
        region_key = f"{site}:{notice_type}"
        region_cfg = REGION_CONFIGS.get(region_key)
        if not region_cfg:
            logger.warning(f"未找到区域配置: {region_key},跳过AI处理")
            return results

        region_name = region_cfg["region_name"]
        link_field = region_cfg["link_field"]
        ai_fields = region_cfg["ai_fields"]

        logger.info(f"开始AI处理: {region_name}, {len(results)} 条记录")
        logger.info(f"  需要提取的字段: {ai_fields}")

        processed = []
        success_count = 0
        fail_count = 0

        for i, item in enumerate(results):
            # 1. Map crawler fields onto processing fields.
            record = self._map_fields(item, link_field, notice_type)
            name = record.get("名称", "未知")[:35]
            logger.info(f"  [{i+1}/{len(results)}] {name}")

            # 2. Fetch the full page + attachment text.
            url = record.get(link_field, "")
            if not url:
                logger.warning("    无详情链接,跳过")
                processed.append(record)
                fail_count += 1
                continue

            content = self.fetcher.get_full_content(url)
            if not content or len(content) < 200:
                logger.warning(
                    f"    内容过少({len(content) if content else 0}字符),跳过")
                processed.append(record)
                fail_count += 1
                continue

            logger.info(f"    获取到 {len(content)} 字符内容")

            # 3. DeepSeek field extraction.
            extracted = self.deepseek.extract_fields(
                content, ai_fields, region_name)

            # 4. Publish time from the fetched content.
            # Fix: accept both half- and full-width colons — ContentFetcher
            # prepends the timestamp with an ASCII colon, which the old
            # full-width-only pattern never matched.
            publish_time_match = re.search(r'发布时间[::]\s*(.*?)\n', content)
            if publish_time_match:
                extracted_publish_time = publish_time_match.group(1).strip()
                # Prefer the fuller timestamp (may include H:M:S).
                if extracted_publish_time:
                    record["发布时间"] = extracted_publish_time
                    record["项目发布时间"] = extracted_publish_time  # keep both fields consistent
                    logger.info(f"    ✓ 发布时间: {extracted_publish_time}")

            # 5. Merge results (AI value wins; existing value as fallback).
            for field in ai_fields:
                # Never let AI overwrite these crawler-supplied fields.
                if field in ["项目名称", "项目批准文号", "批准文号"] and record.get(field):
                    logger.debug(f"    保留原始 {field}: {record[field][:50]}")
                    continue

                ai_val = extracted.get(field, "")
                if ai_val and ai_val != "文档未提及":
                    record[field] = ai_val
                    logger.info(f"    ✓ {field}: {ai_val[:50]}")
                elif not record.get(field):
                    record[field] = ai_val or "文档未提及"
                    logger.debug(f"    ○ {field}: {record[field]}")

            # Keep 最高限价 / 最高投标限价 in sync (prefer 最高投标限价).
            max_price = record.get("最高投标限价", "")
            if not max_price:
                max_price = record.get("最高限价", "")
            if max_price:
                record["最高投标限价"] = max_price
                record["最高限价"] = max_price

            processed.append(record)
            success_count += 1

            # Simple inter-record delay for API rate limiting.
            time.sleep(self.api_delay)

        logger.info(f"AI处理完成: 成功 {success_count}, 失败 {fail_count}")

        # Persist the AI-processed results.
        self._save_results(processed, region_name)

        # Optional Jiandaoyun upload.
        if upload:
            logger.info(f"开始上传 {region_name} 到简道云...")
            self.uploader.upload_records(region_name, processed)

        return processed

    # ---------- field mapping ----------

    @staticmethod
    def _map_fields(item: dict, link_field: str,
                    notice_type: str) -> dict:
        """Map crawler output fields onto the processing schema.

        Args:
            item: one crawler result.
            link_field: name under which the detail URL is stored.
            notice_type: fallback for the notice-stage field.

        Returns:
            A new record dict; the input item is not modified.
        """
        record = {}

        # Base field mapping.
        record["名称"] = item.get("标题", item.get("项目名称", ""))
        pub_date = item.get("发布日期", item.get("项目发布时间", ""))
        record["发布时间"] = pub_date
        # Mirror into 项目发布时间 so both names carry the same value
        # (the Taizhou tender-plan Jiandaoyun form uses this field name).
        record["项目发布时间"] = pub_date
        record["地区"] = item.get("地区", "")
        record["招标阶段"] = item.get("公告类型", notice_type)
        record["来源"] = item.get("来源", "")

        # Link field name depends on the notice type's configuration.
        record[link_field] = item.get("链接", "")

        # Carry over any extra fields the crawler already extracted.
        extra_fields = [
            "项目名称", "项目代码", "招标人", "招标代理",
            "项目批准文号", "项目类型", "预估合同金额(万元)",
            "计划招标时间", "联系电话", "招标估算金额",
        ]
        for f in extra_fields:
            if f in item and item[f]:
                record[f] = item[f]

        # Alias mapping (setdefault: never clobber an existing value).
        if "项目批准文号" in record:
            record.setdefault("批准文号", record["项目批准文号"])
        if "项目类型" in record:
            record.setdefault("类型", record["项目类型"])
        if "预估合同金额(万元)" in record:
            val = record["预估合同金额(万元)"]
            record.setdefault("预估金额", f"{val}万元" if val else "")
        if "计划招标时间" in record:
            record.setdefault("招标时间", record["计划招标时间"])

        return record

    # ---------- result persistence ----------

    def _save_results(self, records: list, region_name: str):
        """Save AI-processed records as a timestamped JSON file under
        ``self.output_dir``."""
        os.makedirs(self.output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = os.path.join(
            self.output_dir, f"{region_name}_AI处理_{timestamp}.json")

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump({
                "处理时间": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "总记录数": len(records),
                "data": records,
            }, f, ensure_ascii=False, indent=2)

        logger.info(f"AI处理结果已保存: {filepath}")
||||
Reference in New Issue
Block a user