Initial commit: tender-information crawler and analysis system (招标信息爬虫与分析系统)

This commit is contained in:
ztb-system
2026-02-13 18:15:20 +08:00
commit d2fa06801f
38 changed files with 5415 additions and 0 deletions

spiders/__init__.py Normal file

@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
from .zhejiang import ZhejiangSpider
from .taizhou import TaizhouSpider
__all__ = ['ZhejiangSpider', 'TaizhouSpider']

spiders/base.py Normal file

@@ -0,0 +1,229 @@
# -*- coding: utf-8 -*-
"""
Spider base class, built on requests.
"""
import csv
import logging
import os
import random
import signal
import sys
import time
from datetime import datetime
from abc import ABC, abstractmethod
from logging.handlers import RotatingFileHandler
import requests
logger = logging.getLogger("ztb")
def setup_logging(log_dir: str = "logs", level: int = logging.INFO):
    """Configure logging: file + console output."""
    os.makedirs(log_dir, exist_ok=True)
    root = logging.getLogger("ztb")
    if root.handlers:  # avoid initializing twice
return root
root.setLevel(level)
fmt = logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
    # file log: auto-rotating, 5 MB per file, 5 backups kept
fh = RotatingFileHandler(
os.path.join(log_dir, "spider.log"),
maxBytes=5 * 1024 * 1024,
backupCount=5,
encoding="utf-8",
)
fh.setLevel(logging.DEBUG)
fh.setFormatter(fmt)
root.addHandler(fh)
    # console: INFO and above only
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(fmt)
root.addHandler(ch)
return root
class BaseSpider(ABC):
    """Spider base class."""
def __init__(self, config: dict, spider_config: dict, data_dir: str):
self.config = config
self.spider_config = spider_config
self.data_dir = data_dir
self.results = []
        self._seen_urls = set()  # URL-based deduplication
        # safety counters
        self._total_requests = 0
        self._consecutive_errors = 0
        self._stopped = False
        self._start_time = time.time()
        self._minute_requests = []  # timestamps of requests within the last minute
        # HTTP session
self.session = requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;"
"q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
})
        # register graceful-shutdown handlers
signal.signal(signal.SIGINT, self._handle_stop)
signal.signal(signal.SIGTERM, self._handle_stop)
    # ---------- Safety mechanisms ----------
    def _handle_stop(self, signum, frame):
        """Catch an interrupt signal, save collected data, then exit."""
        logger.warning("Interrupt signal received, saving collected data...")
self._stopped = True
self.save_to_csv()
sys.exit(0)
    def _check_limits(self) -> bool:
        """Check safety thresholds; return True if crawling should stop."""
max_req = self.spider_config.get("max_total_requests", 300)
if self._total_requests >= max_req:
            logger.warning(f"Reached the maximum request count ({max_req}); stopping")
return True
max_err = self.spider_config.get("max_consecutive_errors", 5)
if self._consecutive_errors >= max_err:
            logger.error(f"{max_err} consecutive failures; tripping the circuit breaker")
return True
return self._stopped
    # ---------- HTTP requests ----------
    def _throttle(self):
        """Enforce a requests-per-minute cap; sleep once it is reached."""
rpm_limit = self.spider_config.get("requests_per_minute", 10)
now = time.time()
        # drop timestamps older than 60 s
self._minute_requests = [t for t in self._minute_requests if now - t < 60]
if len(self._minute_requests) >= rpm_limit:
wait = 60 - (now - self._minute_requests[0]) + random.uniform(1, 3)
if wait > 0:
                logger.info(f"Rate limit reached ({rpm_limit} req/min), waiting {wait:.0f}s...")
time.sleep(wait)
self._minute_requests.append(time.time())
    def fetch(self, url: str, method: str = "GET", **kwargs) -> requests.Response | None:
        """
        HTTP request with retry, rate limiting, and safety checks.
        """
if self._check_limits():
return None
self._throttle()
timeout = kwargs.pop("timeout", self.spider_config.get("timeout", 30))
max_retries = self.spider_config.get("max_retries", 3)
for attempt in range(1, max_retries + 1):
try:
self._total_requests += 1
resp = self.session.request(method, url, timeout=timeout, **kwargs)
resp.raise_for_status()
                # detect blocked empty responses (anti-scraping defenses may return 200 with an empty body)
if len(resp.content) <= 10 and "json" not in resp.headers.get("Content-Type", ""):
self._consecutive_errors += 1
                    logger.warning(f"Empty response detected ({len(resp.content)} bytes); possibly blocked")
if attempt < max_retries:
wait = 10 * attempt + random.uniform(5, 10)
                        logger.info(f"Suspected anti-scraping block, retrying in {wait:.0f}s...")
time.sleep(wait)
continue
return None
self._consecutive_errors = 0
return resp
except requests.RequestException as e:
self._consecutive_errors += 1
                wait = 2 ** attempt + random.random()  # exponential backoff with jitter
                logger.warning(f"Request failed ({attempt}/{max_retries}): {e}, retrying in {wait:.1f}s")
if attempt < max_retries:
time.sleep(wait)
        logger.error(f"Request failed after the maximum number of retries: {url[:80]}")
return None
    def delay(self):
        """Random delay between list pages."""
lo = self.spider_config.get("delay_min", 3)
hi = self.spider_config.get("delay_max", 6)
time.sleep(random.uniform(lo, hi))
    def detail_delay(self):
        """Random delay before a detail-page request."""
lo = self.spider_config.get("detail_delay_min", 2)
hi = self.spider_config.get("detail_delay_max", 5)
time.sleep(random.uniform(lo, hi))
    def print_stats(self):
        """Log crawl statistics."""
elapsed = time.time() - self._start_time
rpm = self._total_requests / max(elapsed / 60, 0.1)
        logger.info(f"[stats] total requests: {self._total_requests}, "
                    f"elapsed: {elapsed:.0f}s, rate: {rpm:.1f} req/min")
    # ---------- Deduplication ----------
    def is_duplicate(self, url: str) -> bool:
        """Deduplicate by URL."""
if url in self._seen_urls:
return True
self._seen_urls.add(url)
return False
    # ---------- Persistence ----------
    def save_to_csv(self, filename: str | None = None):
        """Save collected results to a CSV file."""
        if not self.results:
            logger.info("No data to save")
return
if not filename:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{self.config['name']}_{timestamp}.csv"
filepath = os.path.join(self.data_dir, filename)
os.makedirs(self.data_dir, exist_ok=True)
        # collect all field names across rows, preserving first-seen order
all_keys = []
seen = set()
for row in self.results:
for k in row:
if k not in seen:
all_keys.append(k)
seen.add(k)
with open(filepath, "w", newline="", encoding="utf-8-sig") as f:
writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
writer.writeheader()
writer.writerows(self.results)
        logger.info(f"Data saved to: {filepath} ({len(self.results)} records)")
    # ---------- Abstract methods ----------
@abstractmethod
    def crawl(self, max_pages: int | None = None, **kwargs):
        """Run the crawl; implemented by subclasses."""
pass

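A note on usage: since BaseSpider is abstract, it does nothing until a subclass implements crawl(). Below is a minimal sketch of the intended contract, assuming a config dict with a "name" key and a spider_config dict carrying the threshold keys read above; the DemoSpider class, its URL, and its elided parsing step are hypothetical illustrations, not part of this commit:

# -*- coding: utf-8 -*-
from spiders.base import BaseSpider, setup_logging

class DemoSpider(BaseSpider):
    """Hypothetical subclass showing the BaseSpider contract."""
    def crawl(self, max_pages: int | None = None, **kwargs):
        max_pages = max_pages or self.spider_config.get("max_pages", 10)
        for page in range(1, max_pages + 1):
            if self._check_limits():  # request cap / circuit breaker / SIGINT
                break
            resp = self.fetch(f"https://example.invalid/list/{page}.html")  # placeholder URL
            if resp is None:
                break
            for url, title in []:  # real parsing of resp.text elided
                if not self.is_duplicate(url):
                    self.results.append({"标题": title, "链接": url})
            self.delay()  # polite gap between list pages
        self.print_stats()
        return self.results

setup_logging()
spider = DemoSpider(config={"name": "demo"},
                    spider_config={"max_pages": 2, "requests_per_minute": 10},
                    data_dir="data")
spider.crawl()
spider.save_to_csv()  # writes data/demo_<timestamp>.csv
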
spiders/taizhou.py Normal file

@@ -0,0 +1,360 @@
# -*- coding: utf-8 -*-
"""
Spider for the Taizhou Public Resource Trading Center, based on its API + requests.
"""
import logging
import os
import re
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from .base import BaseSpider
from utils.attachment import AttachmentHandler
logger = logging.getLogger("ztb")
class TaizhouSpider(BaseSpider):
    """Taizhou Public Resource Trading Center spider."""
    # ---------- List pages ----------
    def _build_list_url(self, category_code: str, notice_code: str, page_num: int) -> str:
        """Build a list-page URL (SSR pages, used for pages 1-7)."""
base = self.config["base_url"]
if notice_code:
if category_code:
path = f"/jyxx/{category_code}/{notice_code}"
else:
                # with only a notice_code, use /jyxx/{notice_code} directly
path = f"/jyxx/{notice_code}"
elif category_code:
path = f"/jyxx/{category_code}"
else:
path = "/jyxx"
if page_num <= 1:
return f"{base}{path}/trade_infor.html"
else:
return f"{base}{path}/{page_num}.html"
    def fetch_list_via_api(self, page_index: int, page_size: int,
                           category_num: str, start_date: str = "",
                           end_date: str = "") -> list:
        """Fetch a list page via the API (used from page 8 onward)."""
resp = self.fetch(
self.config["api_url"],
method="POST",
data={
"siteGuid": self.config["site_guid"],
"categoryNum": category_num,
"content": "",
"pageIndex": page_index,
"pageSize": page_size,
"YZM": "",
"ImgGuid": "",
"startdate": start_date,
"enddate": end_date,
"xiaqucode": "",
"projectjiaoyitype": "",
"jytype": "",
"zhuanzai": "",
},
)
if resp is None:
return []
try:
data = resp.json()
return data.get("custom", {}).get("infodata", [])
except Exception as e:
            logger.error(f"Failed to parse the API response: {e}")
return []
    def parse_html_list(self, html: str) -> list:
        """Parse an SSR list page's HTML."""
soup = BeautifulSoup(html, "html.parser")
items = []
for a in soup.select("a.public-list-item"):
title = a.get("title", "").strip()
href = a.get("href", "")
if href and not href.startswith("http"):
href = self.config["base_url"] + href
date_el = a.select_one("span.date")
date = date_el.text.strip() if date_el else ""
region_el = a.select_one("span.xiaquclass")
region = region_el.text.strip().strip("【】") if region_el else ""
item = {
"标题": title,
"发布日期": date,
"地区": region,
"链接": href,
"来源": self.config["name"],
}
            # parse titles of the form: [招标文件]<project name>[approval number]
            title_pattern = r"(?:\[招标文件\])?\s*(.*)\s*\[([A-Z0-9]+)\]\s*$"
match = re.search(title_pattern, title)
if match:
item["项目名称"] = match.group(1).strip()
item["项目批准文号"] = match.group(2).strip()
else:
                # if the pattern fails, fall back to the full title as the project name
project_name = title
                # try to extract the approval number from the title
number_pattern = r"\[([A-Z0-9]+)\]\s*$"
match = re.search(number_pattern, project_name)
if match:
item["项目批准文号"] = match.group(1).strip()
                    # strip the approval number from the project name
project_name = project_name[:match.start()].strip()
item["项目名称"] = project_name
if title and href:
items.append(item)
return items
    def parse_api_list(self, records: list) -> list:
        """Parse list records returned by the API."""
items = []
for rec in records:
title = rec.get("title2") or rec.get("title", "")
href = rec.get("infourl", "")
if href and not href.startswith("http"):
href = self.config["base_url"] + href
item = {
"标题": title.strip(),
"发布日期": rec.get("infodate", ""),
"地区": rec.get("xiaquname", "").strip("【】"),
"链接": href,
"来源": self.config["name"],
}
            # parse titles of the form: [招标文件]<project name>[approval number]
            title_pattern = r"(?:\[招标文件\])?\s*(.*)\s*\[([A-Z0-9]+)\]\s*$"
match = re.search(title_pattern, title)
if match:
item["项目名称"] = match.group(1).strip()
item["项目批准文号"] = match.group(2).strip()
else:
                # if the pattern fails, fall back to the full title as the project name
project_name = title
                # try to extract the approval number from the title
number_pattern = r"\[([A-Z0-9]+)\]\s*$"
match = re.search(number_pattern, project_name)
if match:
item["项目批准文号"] = match.group(1).strip()
                    # strip the approval number from the project name
project_name = project_name[:match.start()].strip()
item["项目名称"] = project_name
items.append(item)
return items
    # ---------- Detail pages ----------
    def parse_detail(self, url: str) -> dict:
        """Parse a detail page."""
resp = self.fetch(url)
if resp is None:
return {}
detail = {}
soup = BeautifulSoup(resp.text, "html.parser")
        # map detail-table labels to output fields
        field_map = {
            "项目名称": "项目名称",
            "联系人": "联系人",
            "联系方式": "联系方式",
            "建设单位(招标人)": "招标人",  # label with full-width parentheses
            "建设单位(招标人)": "招标人",    # variant with half-width parentheses
            "项目批准文件及文号": "项目批准文号",
            "项目类型": "项目类型",
            "招标方式": "招标方式",
            "主要建设内容": "主要建设内容",
        }
for row in soup.select("table tr"):
cells = row.select("td")
if len(cells) >= 2:
key = cells[0].get_text(strip=True)
value = cells[1].get_text(strip=True)
if key in field_map and value:
detail[field_map[key]] = value
if len(cells) >= 4:
key2 = cells[2].get_text(strip=True)
value2 = cells[3].get_text(strip=True)
if key2 == "联系方式" and value2:
detail["联系方式"] = value2
        # tender-project table (planned tender date / estimated contract amount)
for table in soup.select("table"):
headers = [th.get_text(strip=True) for th in table.select("th")]
if "计划招标时间" in headers:
data_rows = table.select("tbody tr") or [
r for r in table.select("tr") if r.select("td")
]
if data_rows:
cells = data_rows[0].select("td")
for i, h in enumerate(headers):
if i < len(cells):
val = cells[i].get_text(strip=True)
if h == "计划招标时间" and val:
detail["计划招标时间"] = val
elif "预估合同金额" in h and val:
detail["预估合同金额(万元)"] = val
break
return detail
    # ---------- Attachments ----------
    def _extract_attachments(self, url: str) -> list:
        """Extract attachment links from a detail page (fetches the page again)."""
resp = self.fetch(url)
if resp is None:
return []
attachments = []
for href in re.findall(r'href=["\']([^"\']*\.pdf[^"\']*)', resp.text):
if not href.startswith("http"):
href = self.config["base_url"] + href
attachments.append({"name": href.split("/")[-1], "url": href})
for href in re.findall(r'href=["\']([^"\']*\.docx?[^"\']*)', resp.text):
if not href.startswith("http"):
href = self.config["base_url"] + href
attachments.append({"name": href.split("/")[-1], "url": href})
return attachments
    # ---------- Main flow ----------
    def crawl(self, max_pages: int | None = None, category: str | None = None,
              notice_type: str | None = None, date_filter: str | None = None,
              download_attachment: bool = False, **kwargs):
        """
        Run the crawl.
        Args:
            max_pages: maximum number of pages to crawl
            category: trading category
            notice_type: notice type
            date_filter: date filter ("yesterday" or a "YYYY-MM-DD" date)
            download_attachment: whether to download attachments
        """
if max_pages is None:
max_pages = self.spider_config.get("max_pages", 10)
        page_size = 10  # the Taizhou site serves a fixed 10 items per page
        # date filtering
target_date = None
start_date = end_date = ""
if date_filter == "yesterday":
d = datetime.now() - timedelta(days=1)
target_date = d.strftime("%Y-%m-%d")
start_date = target_date + " 00:00:00"
end_date = target_date + " 23:59:59"
            logger.info(f"Filtering by date: {target_date} (yesterday)")
elif date_filter:
target_date = date_filter
start_date = target_date + " 00:00:00"
end_date = target_date + " 23:59:59"
            logger.info(f"Filtering by date: {target_date}")
category_code = self.config.get("categories", {}).get(category, "")
notice_code = self.config.get("notice_types", {}).get(notice_type, "")
category_num = notice_code or category_code or "002"
        # attachments
attachment_handler = None
if download_attachment:
attachment_dir = os.path.join(self.data_dir, "attachments")
attachment_handler = AttachmentHandler(attachment_dir)
            logger.info(f"Attachment download enabled, saving to: {attachment_dir}")
        logger.info(f"Starting crawl: {self.config['name']}")
if category:
            logger.info(f"Trading category: {category}")
if notice_type:
            logger.info(f"Notice type: {notice_type}")
for page_num in range(1, max_pages + 1):
if self._check_limits():
break
            logger.info(f"Crawling page {page_num}...")
            # pages 1-7 are server-rendered (SSR); page 8 onward goes through the API
if page_num <= 7:
url = self._build_list_url(category_code, notice_code, page_num)
resp = self.fetch(url)
if resp is None:
break
page_items = self.parse_html_list(resp.text)
else:
records = self.fetch_list_via_api(
page_num - 1, page_size, category_num,
start_date, end_date,
)
if not records:
                    logger.info("No more data")
break
page_items = self.parse_api_list(records)
if not page_items:
                logger.info("No more data")
break
            # date filtering + deduplication
            count = 0
            has_older = False  # whether any record predates the target date
            for item in page_items:
                if target_date and item["发布日期"] != target_date:
                    if item["发布日期"] < target_date:
                        has_older = True
                    continue
if self.is_duplicate(item["链接"]):
continue
                # detail page
self.detail_delay()
detail = self.parse_detail(item["链接"])
item.update(detail)
                # attachments
if download_attachment and attachment_handler:
atts = self._extract_attachments(item["链接"])
if atts:
item["附件数量"] = len(atts)
att_names = []
for att in atts:
att_names.append(att["name"])
result = attachment_handler.download_and_extract(att["url"])
if result["success"] and result["text"]:
item["附件内容摘要"] = result["text"][:2000]
item["附件名称"] = " | ".join(att_names)
self.results.append(item)
count += 1
            logger.info(f"  Collected {count} items")
            if count == 0:
                if not target_date or has_older:
                    # no date filter, or older records already seen -> stop paging
                    logger.info("No new data on this page; stopping pagination")
                    break
                else:
                    # the page holds only records newer than the target date; keep paging
                    logger.info("  Every record on this page is newer than the target date; continuing")
self.delay()
continue
self.delay()
self.print_stats()
        logger.info(f"Crawl finished: {len(self.results)} items collected")
return self.results

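Everything site-specific in TaizhouSpider comes from config. A sketch of the shape it expects, inferred from the keys accessed above (base_url, api_url, site_guid, categories, notice_types); the URLs, GUID, and category codes below are placeholders, not the real values:

from spiders import TaizhouSpider

taizhou_config = {
    "name": "台州公共资源交易中心",
    "base_url": "https://tz.example.invalid",     # placeholder
    "api_url": "https://tz.example.invalid/api",  # placeholder
    "site_guid": "<siteGuid>",                    # placeholder
    "categories": {"工程建设": "002001"},          # placeholder code
    "notice_types": {"招标公告": "002001001"},      # placeholder code
}
spider = TaizhouSpider(config=taizhou_config,
                       spider_config={"max_pages": 3, "requests_per_minute": 10},
                       data_dir="data")
spider.crawl(date_filter="yesterday")  # SSR for pages 1-7, API from page 8 onward
spider.save_to_csv()
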
spiders/zhejiang.py Normal file

@@ -0,0 +1,305 @@
# -*- coding: utf-8 -*-
"""
Spider for the Zhejiang Provincial Public Resource Trading Center, based on its API + requests.
"""
import html
import logging
import os
import re
from datetime import datetime, timedelta
from .base import BaseSpider
from utils.attachment import AttachmentHandler
logger = logging.getLogger("ztb")
class ZhejiangSpider(BaseSpider):
    """Zhejiang Provincial Public Resource Trading Center spider."""
    # ---------- API list ----------
    def _build_payload(self, page_index: int, page_size: int,
                       category_code: str, notice_code: str,
                       start_date: str, end_date: str) -> dict:
        """Build the request payload for the Zhejiang API."""
condition = []
if notice_code:
condition.append({
"fieldName": "categorynum",
"isLike": True,
"likeType": 2,
"equal": notice_code,
})
elif category_code:
condition.append({
"fieldName": "categorynum",
"isLike": True,
"likeType": 2,
"equal": category_code,
})
time_cond = []
if start_date and end_date:
time_cond.append({
"fieldName": "webdate",
"startTime": f"{start_date} 00:00:00",
"endTime": f"{end_date} 23:59:59",
})
return {
"token": "",
"pn": page_index * page_size,
"rn": page_size,
"sdt": "", "edt": "",
"wd": "", "inc_wd": "", "exc_wd": "",
"fields": "title",
"cnum": "001",
"sort": '{"webdate":"0"}',
"ssort": "title",
"cl": 5000,
"terminal": "",
"condition": condition or None,
"time": time_cond or None,
"highlights": "",
"statistics": None,
"unionCondition": None,
"accuracy": "",
"noParticiple": "0",
"searchRange": None,
"isBusiness": "1",
}
    def fetch_list_page(self, page_index: int, page_size: int,
                        category_code: str, notice_code: str,
                        start_date: str, end_date: str) -> list:
        """Fetch one page of list records via the API."""
payload = self._build_payload(
page_index, page_size, category_code, notice_code,
start_date, end_date,
)
resp = self.fetch(
self.config["api_url"],
method="POST",
json=payload,
headers={"Referer": self.config["base_url"] + "/jyxxgk/list.html"},
)
if resp is None:
return []
try:
data = resp.json()
return data.get("result", {}).get("records", [])
except Exception as e:
            logger.error(f"Failed to parse the API response: {e}")
return []
    # ---------- Record parsing ----------
    @staticmethod
    def _parse_record(record: dict, source: str) -> dict:
        """Convert a raw API record into a result dict."""
title = record.get("title", "").strip()
link = record.get("linkurl", "")
if link and not link.startswith("http"):
            link = "https://ggzy.zj.gov.cn" + link  # hardcoded: this staticmethod has no access to self.config
date_str = record.get("webdate", "")
date_short = date_str.split(" ")[0] if date_str else ""
item = {
"标题": title,
"发布日期": date_short,
"地区": record.get("infod", ""),
"公告类型": record.get("categoryname", ""),
"链接": link,
"来源": source,
}
        # parse titles of the form: [招标文件]<project name>[approval number]
        # (stricter than the Taizhou variant: the bracketed prefix is required)
        title_pattern = r"\[(?:招标文件|招标公告)\]\s*(.*?)\s*\[([A-Z0-9]+)\]\s*$"
match = re.search(title_pattern, title)
if match:
project_name = match.group(1).strip()
            # strip trailing suffixes such as "招标文件公示" / "招标文件预公示"
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
for suffix in suffixes:
if project_name.endswith(suffix):
project_name = project_name[:-len(suffix)].strip()
item["项目名称"] = project_name
item["项目批准文号"] = match.group(2).strip()
else:
            # if the pattern fails, fall back to the full title as the project name
project_name = title
            # strip trailing suffixes such as "招标文件公示" / "招标文件预公示"
suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
for suffix in suffixes:
if project_name.endswith(suffix):
project_name = project_name[:-len(suffix)].strip()
            # try to extract the approval number from the title
number_pattern = r"\[([A-Z0-9]+)\]\s*$"
match = re.search(number_pattern, project_name)
if match:
item["项目批准文号"] = match.group(1).strip()
                # strip the approval number from the project name
project_name = project_name[:match.start()].strip()
item["项目名称"] = project_name
return item
    @staticmethod
    def _parse_content_fields(content: str) -> dict:
        """Extract structured fields from the API's content blob."""
if not content:
return {}
        # Unescape HTML entities, then replace tags with two-space gaps so the
        # \s{2,} field boundaries in the patterns below can still match;
        # collapsing all whitespace to single spaces would make them unmatchable.
        text = html.unescape(content)
        text = re.sub(r"<[^>]+>", "  ", text)  # tags become field separators
        text = re.sub(r"\s{2,}", "  ", text).strip()
fields = {}
        patterns = {
            "项目名称": r"项目名称[::]\s*(.+?)\s{2,}",
            "项目代码": r"项目代码[::]\s*(.+?)\s{2,}",
            "招标人": r"招标人[::].*?名称[::]\s*(.+?)\s{2,}",
            "招标代理": r"代理机构[::].*?名称[::]\s*(.+?)\s{2,}",
            "联系电话": r"电\s*话[::]\s*([\d\-]+)",  # the label may be spaced as "电 话"
            "招标估算金额": r"招标估算金额[::]\s*([\d,\.]+\s*元)",
        }
for key, pat in patterns.items():
m = re.search(pat, text)
if m:
fields[key] = m.group(1).strip()
return fields
    # ---------- Attachments ----------
    def _extract_attachments_from_detail(self, url: str) -> list:
        """Visit the detail page and extract attachment links."""
resp = self.fetch(url)
if resp is None:
return []
attachments = []
# PDF
for href in re.findall(r'href=["\']([^"\']*\.pdf[^"\']*)', resp.text):
if not href.startswith("http"):
href = self.config["base_url"] + href
name = href.split("/")[-1]
attachments.append({"name": name, "url": href})
# Word
for href in re.findall(r'href=["\']([^"\']*\.docx?[^"\']*)', resp.text):
if not href.startswith("http"):
href = self.config["base_url"] + href
name = href.split("/")[-1]
attachments.append({"name": name, "url": href})
return attachments
    # ---------- Main flow ----------
    def crawl(self, max_pages: int | None = None, category: str | None = None,
              notice_type: str | None = None, date_filter: str | None = None,
              download_attachment: bool = False, **kwargs):
        """
        Run the crawl.
        Args:
            max_pages: maximum number of pages to crawl
            category: trading category (e.g. "工程建设")
            notice_type: notice type (e.g. "招标公告")
            date_filter: date filter ("yesterday" or a date such as "2026-02-03")
            download_attachment: whether to download attachments
        """
if max_pages is None:
max_pages = self.spider_config.get("max_pages", 10)
page_size = self.spider_config.get("page_size", 20)
        # date range
if date_filter == "yesterday":
d = datetime.now() - timedelta(days=1)
start_date = end_date = d.strftime("%Y-%m-%d")
            logger.info(f"Filtering by date: {start_date} (yesterday)")
elif date_filter:
start_date = end_date = date_filter
            logger.info(f"Filtering by date: {start_date}")
else:
            # default: roughly the last month
end_date = datetime.now().strftime("%Y-%m-%d")
start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
category_code = self.config.get("categories", {}).get(category, "")
notice_code = self.config.get("notice_types", {}).get(notice_type, "")
        # attachment handler
attachment_handler = None
if download_attachment:
attachment_dir = os.path.join(self.data_dir, "attachments")
attachment_handler = AttachmentHandler(attachment_dir)
            logger.info(f"Attachment download enabled, saving to: {attachment_dir}")
        logger.info(f"Starting crawl: {self.config['name']}")
if category:
            logger.info(f"Trading category: {category}")
if notice_type:
            logger.info(f"Notice type: {notice_type}")
for page_idx in range(max_pages):
if self._check_limits():
break
            logger.info(f"Crawling page {page_idx + 1}...")
records = self.fetch_list_page(
page_idx, page_size, category_code, notice_code,
start_date, end_date,
)
if not records:
                logger.info("No more data")
break
count = 0
for rec in records:
link = rec.get("linkurl", "")
if link and not link.startswith("http"):
link = self.config["base_url"] + link
if self.is_duplicate(link):
continue
item = self._parse_record(rec, self.config["name"])
            # extract detail fields from the content blob
detail = self._parse_content_fields(rec.get("content", ""))
item.update(detail)
            # attachments
if download_attachment and attachment_handler:
self.detail_delay()
atts = self._extract_attachments_from_detail(link)
if atts:
item["附件数量"] = len(atts)
att_names = []
for att in atts:
att_names.append(att["name"])
result = attachment_handler.download_and_extract(att["url"])
if result["success"] and result["text"]:
item["附件内容摘要"] = result["text"][:2000]
item["附件名称"] = " | ".join(att_names)
self.results.append(item)
count += 1
            logger.info(f"  Collected {count} items")
            if count == 0:
                logger.info("No new data on this page; stopping pagination")
break
self.delay()
self.print_stats()
        logger.info(f"Crawl finished: {len(self.results)} items collected")
return self.results
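
For a quick sanity check, _parse_content_fields is a staticmethod and can be exercised without instantiating the spider. The content string below is invented for illustration; real API payloads carry the full notice body:

from spiders import ZhejiangSpider

sample = ("<p>项目名称:演示项目</p>"
          "<p>招标人:名称:演示建设有限公司</p>"
          "<p>电 话:0571-12345678</p>")
print(ZhejiangSpider._parse_content_fields(sample))
# {'项目名称': '演示项目', '招标人': '演示建设有限公司', '联系电话': '0571-12345678'}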