清理已删除的测试文件,准备云端部署
This commit is contained in:
@@ -7,6 +7,7 @@ import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from bs4 import BeautifulSoup
|
||||
from .base import BaseSpider
|
||||
from utils.attachment import AttachmentHandler
|
||||
|
||||
@@ -95,8 +96,7 @@ class ZhejiangSpider(BaseSpider):
|
||||
|
||||
# ---------- 解析记录 ----------
|
||||
|
||||
@staticmethod
|
||||
def _parse_record(record: dict, source: str) -> dict:
|
||||
def _parse_record(self, record: dict, source: str) -> dict:
|
||||
"""将 API 原始记录转换为结果字典"""
|
||||
title = record.get("title", "").strip()
|
||||
link = record.get("linkurl", "")
|
||||
@@ -115,36 +115,8 @@ class ZhejiangSpider(BaseSpider):
|
||||
"来源": source,
|
||||
}
|
||||
|
||||
# 解析特定格式的标题:[招标文件]项目名称[批准文号]
|
||||
import re
|
||||
# 改进的正则表达式,确保正确匹配标题格式
|
||||
title_pattern = r"\[(?:招标文件|招标公告)\]\s*(.*?)\s*\[([A-Z0-9]+)\]\s*$"
|
||||
match = re.search(title_pattern, title)
|
||||
if match:
|
||||
project_name = match.group(1).strip()
|
||||
# 删除结尾的"招标文件公示"、"招标文件预公示"等后缀
|
||||
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
|
||||
for suffix in suffixes:
|
||||
if project_name.endswith(suffix):
|
||||
project_name = project_name[:-len(suffix)].strip()
|
||||
item["项目名称"] = project_name
|
||||
item["项目批准文号"] = match.group(2).strip()
|
||||
else:
|
||||
# 如果正则匹配失败,直接使用标题作为项目名称
|
||||
project_name = title
|
||||
# 删除结尾的"招标文件公示"、"招标文件预公示"等后缀
|
||||
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
|
||||
for suffix in suffixes:
|
||||
if project_name.endswith(suffix):
|
||||
project_name = project_name[:-len(suffix)].strip()
|
||||
# 尝试从标题中提取批准文号
|
||||
number_pattern = r"\[([A-Z0-9]+)\]\s*$"
|
||||
match = re.search(number_pattern, project_name)
|
||||
if match:
|
||||
item["项目批准文号"] = match.group(1).strip()
|
||||
# 从项目名称中删除批准文号部分
|
||||
project_name = project_name[:match.start()].strip()
|
||||
item["项目名称"] = project_name
|
||||
# 解析标题:提取项目名称和批准文号(统一规则)
|
||||
item.update(self._parse_title(title))
|
||||
|
||||
return item
|
||||
|
||||
@@ -176,6 +148,65 @@ class ZhejiangSpider(BaseSpider):
|
||||
|
||||
return fields
|
||||
|
||||
# ---------- 详情页补充 ----------
|
||||
|
||||
def parse_detail(self, url: str) -> dict:
    """Fetch a detail page and extract structured fields from its tables.

    Two kinds of tables are read:

    * Label/value tables: every ``table tr`` row is scanned as
      (label, value) cell pairs at ``<td>`` positions (0, 1) and (2, 3).
      Labels listed in ``field_map`` are stored under their canonical
      result key; empty values are ignored.
    * The tender-item table, recognised by a ``计划招标时间`` header cell:
      its first data row supplies the planned tender time and the
      estimated contract amount.

    Args:
        url: Address of the detail page, passed to ``self.fetch``.

    Returns:
        A dict holding the non-empty fields that were found. It is empty
        when ``self.fetch`` returns ``None`` or when nothing matches.
    """
    resp = self.fetch(url)
    if resp is None:
        return {}

    detail = {}
    soup = BeautifulSoup(resp.text, "html.parser")

    # Label as printed on the page -> canonical key used in the result.
    field_map = {
        "项目名称": "项目名称",
        "项目批准文件及文号": "项目批准文号",
        "项目批准文号": "项目批准文号",
        "批准文号": "项目批准文号",
        "建设单位(招标人)": "招标人",
        "招标人": "招标人",
        "项目类型": "项目类型",
        "招标方式": "招标方式",
        "联系人": "联系人",
        "联系方式": "联系方式",
    }
    # Fold full-width parentheses to ASCII before the lookup so that both
    # spellings of a label such as "建设单位(招标人)" hit the single entry
    # above instead of needing one dict key per spelling.
    paren_fold = str.maketrans("\uff08\uff09", "()")

    for row in soup.select("table tr"):
        cells = row.select("td")
        # Only the first two (label, value) pairs of a row are considered;
        # zip() drops a trailing label that has no value cell.
        for label_cell, value_cell in zip(cells[0:4:2], cells[1:4:2]):
            label = label_cell.get_text(strip=True).translate(paren_fold)
            value = value_cell.get_text(strip=True)
            if label in field_map and value:
                detail[field_map[label]] = value

    # Tender-item table (planned tender time / estimated contract amount).
    for table in soup.select("table"):
        headers = [th.get_text(strip=True) for th in table.select("th")]
        if "计划招标时间" not in headers:
            continue

        # Keep only rows that actually contain <td> cells. A header row made
        # of <th> cells can live inside <tbody>; taking it as the first data
        # row would yield no cells and silently drop both fields.
        candidate_rows = table.select("tbody tr") or table.select("tr")
        data_rows = [r for r in candidate_rows if r.select("td")]
        if data_rows:
            cells = data_rows[0].select("td")
            # Header i is matched with data cell i; zip() stops at the
            # shorter of the two sequences.
            for header, cell in zip(headers, cells):
                val = cell.get_text(strip=True)
                if not val:
                    continue
                if header == "计划招标时间":
                    detail["计划招标时间"] = val
                elif "预估合同金额" in header:
                    detail["预估合同金额(万元)"] = val
        # Only the first table carrying the header is used.
        break

    return detail
|
||||
|
||||
# ---------- 附件 ----------
|
||||
|
||||
def _extract_attachments_from_detail(self, url: str) -> list:
|
||||
@@ -275,9 +306,16 @@ class ZhejiangSpider(BaseSpider):
|
||||
detail = self._parse_content_fields(rec.get("content", ""))
|
||||
item.update(detail)
|
||||
|
||||
# 详情页补充:提取项目名称和批准文号等
|
||||
self.detail_delay()
|
||||
page_detail = self.parse_detail(link)
|
||||
# 详情页字段仅补充,不覆盖已有值
|
||||
for k, v in page_detail.items():
|
||||
if not item.get(k):
|
||||
item[k] = v
|
||||
|
||||
# 附件
|
||||
if download_attachment and attachment_handler:
|
||||
self.detail_delay()
|
||||
atts = self._extract_attachments_from_detail(link)
|
||||
if atts:
|
||||
item["附件数量"] = len(atts)
|
||||
|
||||
Reference in New Issue
Block a user