清理已删除的测试文件,准备云端部署

This commit is contained in:
ztb-system
2026-02-25 18:17:00 +08:00
parent 5f93dbe5e4
commit 305f6b342c
29 changed files with 143 additions and 2536 deletions

View File

@@ -7,6 +7,7 @@ import logging
import os
import re
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from .base import BaseSpider
from utils.attachment import AttachmentHandler
@@ -95,8 +96,7 @@ class ZhejiangSpider(BaseSpider):
# ---------- 解析记录 ----------
@staticmethod
def _parse_record(record: dict, source: str) -> dict:
def _parse_record(self, record: dict, source: str) -> dict:
"""将 API 原始记录转换为结果字典"""
title = record.get("title", "").strip()
link = record.get("linkurl", "")
@@ -115,36 +115,8 @@ class ZhejiangSpider(BaseSpider):
"来源": source,
}
# 解析特定格式的标题:[招标文件]项目名称[批准文号]
import re
# 改进的正则表达式,确保正确匹配标题格式
title_pattern = r"\[(?:招标文件|招标公告)\]\s*(.*?)\s*\[([A-Z0-9]+)\]\s*$"
match = re.search(title_pattern, title)
if match:
project_name = match.group(1).strip()
# 删除结尾的"招标文件公示"、"招标文件预公示"等后缀
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
for suffix in suffixes:
if project_name.endswith(suffix):
project_name = project_name[:-len(suffix)].strip()
item["项目名称"] = project_name
item["项目批准文号"] = match.group(2).strip()
else:
# 如果正则匹配失败,直接使用标题作为项目名称
project_name = title
# 删除结尾的"招标文件公示"、"招标文件预公示"等后缀
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
for suffix in suffixes:
if project_name.endswith(suffix):
project_name = project_name[:-len(suffix)].strip()
# 尝试从标题中提取批准文号
number_pattern = r"\[([A-Z0-9]+)\]\s*$"
match = re.search(number_pattern, project_name)
if match:
item["项目批准文号"] = match.group(1).strip()
# 从项目名称中删除批准文号部分
project_name = project_name[:match.start()].strip()
item["项目名称"] = project_name
# 解析标题:提取项目名称批准文号(统一规则)
item.update(self._parse_title(title))
return item
@@ -176,6 +148,65 @@ class ZhejiangSpider(BaseSpider):
return fields
# ---------- 详情页补充 ----------
def parse_detail(self, url: str) -> dict:
    """Fetch a detail page and extract structured project fields from its tables.

    Two table layouts are read:

    * label/value rows, where a ``<tr>`` holds one or two (label, value)
      cell pairs; recognised labels are mapped to result keys;
    * the first table whose ``<th>`` headers include "计划招标时间", from
      which the planned tender time and estimated contract amount are
      taken out of its first data row.

    Args:
        url: Detail page address, passed to ``self.fetch``.

    Returns:
        A dict of the fields found on the page. Empty when ``self.fetch``
        returns ``None`` or when no recognised field is present.
    """
    resp = self.fetch(url)
    if resp is None:
        return {}

    # Page label -> result key. Several label spellings map to one key.
    field_map = {
        "项目名称": "项目名称",
        "项目批准文件及文号": "项目批准文号",
        "项目批准文号": "项目批准文号",
        "批准文号": "项目批准文号",
        "建设单位(招标人)": "招标人",
        "招标人": "招标人",
        "项目类型": "项目类型",
        "招标方式": "招标方式",
        "联系人": "联系人",
        "联系方式": "联系方式",
    }
    # Labels may be written with full-width parentheses; fold them to ASCII
    # so a single map entry covers both spellings.
    paren_fold = str.maketrans({"\uff08": "(", "\uff09": ")"})

    detail = {}
    soup = BeautifulSoup(resp.text, "html.parser")

    # Label/value rows: cells (0, 1) and, when present, cells (2, 3).
    for row in soup.select("table tr"):
        texts = [td.get_text(strip=True) for td in row.select("td")[:4]]
        for label, value in zip(texts[0::2], texts[1::2]):
            target = field_map.get(label.translate(paren_fold))
            if target and value:
                detail[target] = value

    # Tender plan table (planned tender time / estimated contract amount).
    for table in soup.select("table"):
        headers = [th.get_text(strip=True) for th in table.select("th")]
        if "计划招标时间" not in headers:
            continue
        data_rows = table.select("tbody tr") or [
            r for r in table.select("tr") if r.select("td")
        ]
        if data_rows:
            # Only the first data row is read; headers and cells are paired
            # by position, and surplus headers are ignored.
            for header, cell in zip(headers, data_rows[0].select("td")):
                val = cell.get_text(strip=True)
                if not val:
                    continue
                if header == "计划招标时间":
                    detail["计划招标时间"] = val
                elif "预估合同金额" in header:
                    detail["预估合同金额(万元)"] = val
        # NOTE(review): stops at the first table carrying the header; the
        # flattened source does not show the original nesting of this break.
        break

    return detail
# ---------- 附件 ----------
def _extract_attachments_from_detail(self, url: str) -> list:
@@ -275,9 +306,16 @@ class ZhejiangSpider(BaseSpider):
detail = self._parse_content_fields(rec.get("content", ""))
item.update(detail)
# 详情页补充:提取项目名称和批准文号等
self.detail_delay()
page_detail = self.parse_detail(link)
# 详情页字段仅补充,不覆盖已有值
for k, v in page_detail.items():
if not item.get(k):
item[k] = v
# 附件
if download_attachment and attachment_handler:
self.detail_delay()
atts = self._extract_attachments_from_detail(link)
if atts:
item["附件数量"] = len(atts)