Initial commit: 招标信息爬虫与分析系统
This commit is contained in:
5
spiders/__init__.py
Normal file
5
spiders/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
"""Spider package: exports the Zhejiang and Taizhou trading-center spiders."""
from .zhejiang import ZhejiangSpider
from .taizhou import TaizhouSpider

__all__ = ['ZhejiangSpider', 'TaizhouSpider']
|
||||
229
spiders/base.py
Normal file
229
spiders/base.py
Normal file
@@ -0,0 +1,229 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
爬虫基类 - 基于 requests
|
||||
"""
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from abc import ABC, abstractmethod
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
def setup_logging(log_dir: str = "logs", level: int = logging.INFO):
    """Configure the "ztb" logger tree: rotating file log plus console output.

    Idempotent — if the logger already has handlers attached, it is returned
    unchanged so repeated calls do not duplicate log lines.

    Args:
        log_dir: directory for the log file (created if missing).
        level: threshold applied to the logger itself.

    Returns:
        The configured ``logging.Logger`` named ``"ztb"``.
    """
    os.makedirs(log_dir, exist_ok=True)
    ztb_logger = logging.getLogger("ztb")
    if ztb_logger.handlers:  # guard against double initialisation
        return ztb_logger
    ztb_logger.setLevel(level)

    formatter = logging.Formatter(
        fmt="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # File sink: auto-rotating, 5 MB per file, keeping 5 backups.
    file_handler = RotatingFileHandler(
        os.path.join(log_dir, "spider.log"),
        maxBytes=5 * 1024 * 1024,
        backupCount=5,
        encoding="utf-8",
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    ztb_logger.addHandler(file_handler)

    # Console sink: INFO and above only.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    ztb_logger.addHandler(console_handler)

    return ztb_logger
|
||||
|
||||
|
||||
class BaseSpider(ABC):
    """Spider base class: shared HTTP session, throttling, retry, dedup and CSV output.

    Subclasses implement :meth:`crawl` and append result dicts to ``self.results``.
    """

    def __init__(self, config: dict, spider_config: dict, data_dir: str):
        # config: per-site settings (name, URLs, ...); spider_config: tunables
        # (rate limits, retries, delays); data_dir: CSV/attachment output root.
        self.config = config
        self.spider_config = spider_config
        self.data_dir = data_dir
        self.results = []
        self._seen_urls = set()  # URL-based de-duplication

        # Safety counters for the rate/error circuit breaker
        self._total_requests = 0
        self._consecutive_errors = 0
        self._stopped = False
        self._start_time = time.time()
        self._minute_requests = []  # timestamps of requests in the last minute

        # Shared HTTP session with browser-like headers
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;"
                      "q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        })

        # Register graceful-shutdown handlers (saves partial data on Ctrl-C / kill)
        signal.signal(signal.SIGINT, self._handle_stop)
        signal.signal(signal.SIGTERM, self._handle_stop)

    # ---------- safety mechanisms ----------

    def _handle_stop(self, signum, frame):
        """Signal handler: persist whatever was collected, then exit."""
        logger.warning("收到中断信号,正在保存已采集数据...")
        self._stopped = True
        self.save_to_csv()
        sys.exit(0)

    def _check_limits(self) -> bool:
        """Return True when crawling should stop (request cap, error fuse, or signal)."""
        max_req = self.spider_config.get("max_total_requests", 300)
        if self._total_requests >= max_req:
            logger.warning(f"达到最大请求数 ({max_req}),停止爬取")
            return True

        # Circuit breaker: too many consecutive failures suggests blocking/outage
        max_err = self.spider_config.get("max_consecutive_errors", 5)
        if self._consecutive_errors >= max_err:
            logger.error(f"连续失败 {max_err} 次,触发熔断")
            return True

        return self._stopped

    # ---------- networking ----------

    def _throttle(self):
        """Enforce a requests-per-minute cap using a sliding 60-second window."""
        rpm_limit = self.spider_config.get("requests_per_minute", 10)
        now = time.time()
        # Drop timestamps older than 60 s from the window
        self._minute_requests = [t for t in self._minute_requests if now - t < 60]
        if len(self._minute_requests) >= rpm_limit:
            # Wait until the oldest request ages out, plus random jitter
            wait = 60 - (now - self._minute_requests[0]) + random.uniform(1, 3)
            if wait > 0:
                logger.info(f"达到速率限制 ({rpm_limit}次/分钟),等待 {wait:.0f}s...")
                time.sleep(wait)
        self._minute_requests.append(time.time())

    def fetch(self, url: str, method: str = "GET", **kwargs) -> requests.Response | None:
        """HTTP request with throttling, safety checks, and exponential-backoff retry.

        Args:
            url: target URL.
            method: HTTP verb, default "GET".
            **kwargs: forwarded to ``requests.Session.request`` (``timeout``
                is popped and defaulted from ``spider_config``).

        Returns:
            The response on success, or ``None`` when limits are hit, retries
            are exhausted, or the body looks like an anti-bot empty response.
        """
        if self._check_limits():
            return None

        self._throttle()

        timeout = kwargs.pop("timeout", self.spider_config.get("timeout", 30))
        max_retries = self.spider_config.get("max_retries", 3)

        for attempt in range(1, max_retries + 1):
            try:
                self._total_requests += 1
                resp = self.session.request(method, url, timeout=timeout, **kwargs)
                resp.raise_for_status()

                # Detect blocked responses: HTTP 200 but a near-empty non-JSON body
                if len(resp.content) <= 10 and "json" not in resp.headers.get("Content-Type", ""):
                    self._consecutive_errors += 1
                    logger.warning(f"检测到空响应 ({len(resp.content)} bytes),可能被反爬")
                    if attempt < max_retries:
                        # Longer, growing wait — suspected anti-bot throttling
                        wait = 10 * attempt + random.uniform(5, 10)
                        logger.info(f"疑似被反爬拦截,等待 {wait:.0f}s 后重试...")
                        time.sleep(wait)
                        continue
                    return None

                self._consecutive_errors = 0  # success resets the error fuse
                return resp
            except requests.RequestException as e:
                self._consecutive_errors += 1
                # Exponential backoff with jitter: 2^attempt + [0, 1) seconds
                wait = 2 ** attempt + random.random()
                logger.warning(f"请求失败 ({attempt}/{max_retries}): {e},{wait:.1f}s 后重试")
                if attempt < max_retries:
                    time.sleep(wait)

        logger.error(f"请求失败,已达最大重试次数: {url[:80]}")
        return None

    def delay(self):
        """Random pause between list pages."""
        lo = self.spider_config.get("delay_min", 3)
        hi = self.spider_config.get("delay_max", 6)
        time.sleep(random.uniform(lo, hi))

    def detail_delay(self):
        """Random pause before each detail-page request."""
        lo = self.spider_config.get("detail_delay_min", 2)
        hi = self.spider_config.get("detail_delay_max", 5)
        time.sleep(random.uniform(lo, hi))

    def print_stats(self):
        """Log total requests, elapsed time and effective requests-per-minute."""
        elapsed = time.time() - self._start_time
        # max(..., 0.1) guards against division by zero right after start
        rpm = self._total_requests / max(elapsed / 60, 0.1)
        logger.info(f"[统计] 总请求: {self._total_requests}, "
                    f"耗时: {elapsed:.0f}s, 速率: {rpm:.1f}次/分钟")

    # ---------- de-duplication ----------

    def is_duplicate(self, url: str) -> bool:
        """URL-based dedup: returns True if seen before, otherwise records it."""
        if url in self._seen_urls:
            return True
        self._seen_urls.add(url)
        return False

    # ---------- persistence ----------

    def save_to_csv(self, filename: str | None = None):
        """Write ``self.results`` to a CSV file under ``self.data_dir``.

        Args:
            filename: optional explicit file name; defaults to
                ``{site name}_{timestamp}.csv``.
        """
        if not self.results:
            logger.info("没有数据可保存")
            return

        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{self.config['name']}_{timestamp}.csv"

        filepath = os.path.join(self.data_dir, filename)
        os.makedirs(self.data_dir, exist_ok=True)

        # Union of all keys across rows, preserving first-seen order
        all_keys = []
        seen = set()
        for row in self.results:
            for k in row:
                if k not in seen:
                    all_keys.append(k)
                    seen.add(k)

        # utf-8-sig BOM so Excel opens the Chinese text correctly
        with open(filepath, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(self.results)

        logger.info(f"数据已保存到: {filepath} (共 {len(self.results)} 条记录)")

    # ---------- abstract interface ----------

    @abstractmethod
    def crawl(self, max_pages: int | None = None, **kwargs):
        """Run the crawl; implemented by subclasses."""
        pass
|
||||
360
spiders/taizhou.py
Normal file
360
spiders/taizhou.py
Normal file
@@ -0,0 +1,360 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
台州公共资源交易中心爬虫 —— 基于 API + requests
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from bs4 import BeautifulSoup
|
||||
from .base import BaseSpider
|
||||
from utils.attachment import AttachmentHandler
|
||||
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
class TaizhouSpider(BaseSpider):
    """Spider for the Taizhou public resource trading center.

    List pages 1-7 are server-rendered HTML; deeper pages are fetched through
    a JSON POST API. Detail pages are parsed for tabular project fields.
    """

    # ---------- list data ----------

    def _build_list_url(self, category_code: str, notice_code: str, page_num: int) -> str:
        """Build the SSR list-page URL (the site only serves pages 1-6 this way)."""
        base = self.config["base_url"]
        if notice_code:
            if category_code:
                path = f"/jyxx/{category_code}/{notice_code}"
            else:
                # With only a notice code the site serves /jyxx/{notice_code}
                path = f"/jyxx/{notice_code}"
        elif category_code:
            path = f"/jyxx/{category_code}"
        else:
            path = "/jyxx"

        # Page 1 has a special landing filename; later pages are /{n}.html
        if page_num <= 1:
            return f"{base}{path}/trade_infor.html"
        else:
            return f"{base}{path}/{page_num}.html"

    def fetch_list_via_api(self, page_index: int, page_size: int,
                           category_num: str, start_date: str = "",
                           end_date: str = "") -> list:
        """Fetch one page of list records via the JSON API (used from page 7 on).

        Returns the raw record list, or ``[]`` on any failure.
        """
        resp = self.fetch(
            self.config["api_url"],
            method="POST",
            data={
                "siteGuid": self.config["site_guid"],
                "categoryNum": category_num,
                "content": "",
                "pageIndex": page_index,
                "pageSize": page_size,
                "YZM": "",
                "ImgGuid": "",
                "startdate": start_date,
                "enddate": end_date,
                "xiaqucode": "",
                "projectjiaoyitype": "",
                "jytype": "",
                "zhuanzai": "",
            },
        )
        if resp is None:
            return []

        try:
            data = resp.json()
            # Records live under custom.infodata in the API envelope
            return data.get("custom", {}).get("infodata", [])
        except Exception as e:
            logger.error(f"解析 API 响应失败: {e}")
            return []

    def parse_html_list(self, html: str) -> list:
        """Parse a server-rendered list page into item dicts."""
        soup = BeautifulSoup(html, "html.parser")
        items = []
        for a in soup.select("a.public-list-item"):
            title = a.get("title", "").strip()
            href = a.get("href", "")
            if href and not href.startswith("http"):
                href = self.config["base_url"] + href

            date_el = a.select_one("span.date")
            date = date_el.text.strip() if date_el else ""

            region_el = a.select_one("span.xiaquclass")
            # Region text is wrapped in 【】 brackets on the page
            region = region_el.text.strip().strip("【】") if region_el else ""

            item = {
                "标题": title,
                "发布日期": date,
                "地区": region,
                "链接": href,
                "来源": self.config["name"],
            }

            # Titles follow the pattern: [招标文件]<project name>[approval number]
            title_pattern = r"(?:\[招标文件\])?\s*(.*)\s*\[([A-Z0-9]+)\]\s*$"
            match = re.search(title_pattern, title)
            if match:
                item["项目名称"] = match.group(1).strip()
                item["项目批准文号"] = match.group(2).strip()
            else:
                # Fallback: use the whole title as the project name
                project_name = title
                # Still try to peel a trailing [approval number] off the title
                number_pattern = r"\[([A-Z0-9]+)\]\s*$"
                match = re.search(number_pattern, project_name)
                if match:
                    item["项目批准文号"] = match.group(1).strip()
                    # Remove the approval-number suffix from the project name
                    project_name = project_name[:match.start()].strip()
                item["项目名称"] = project_name

            if title and href:
                items.append(item)
        return items

    def parse_api_list(self, records: list) -> list:
        """Convert raw API records into the same item dict shape as the HTML parser."""
        items = []
        for rec in records:
            # title2 (plain title) is preferred over title (may contain markup)
            title = rec.get("title2") or rec.get("title", "")
            href = rec.get("infourl", "")
            if href and not href.startswith("http"):
                href = self.config["base_url"] + href

            item = {
                "标题": title.strip(),
                "发布日期": rec.get("infodate", ""),
                "地区": rec.get("xiaquname", "").strip("【】"),
                "链接": href,
                "来源": self.config["name"],
            }

            # Titles follow the pattern: [招标文件]<project name>[approval number]
            title_pattern = r"(?:\[招标文件\])?\s*(.*)\s*\[([A-Z0-9]+)\]\s*$"
            match = re.search(title_pattern, title)
            if match:
                item["项目名称"] = match.group(1).strip()
                item["项目批准文号"] = match.group(2).strip()
            else:
                # Fallback: use the whole title as the project name
                project_name = title
                # Still try to peel a trailing [approval number] off the title
                number_pattern = r"\[([A-Z0-9]+)\]\s*$"
                match = re.search(number_pattern, project_name)
                if match:
                    item["项目批准文号"] = match.group(1).strip()
                    # Remove the approval-number suffix from the project name
                    project_name = project_name[:match.start()].strip()
                item["项目名称"] = project_name

            items.append(item)
        return items

    # ---------- detail pages ----------

    def parse_detail(self, url: str) -> dict:
        """Fetch a detail page and extract project fields from its tables."""
        resp = self.fetch(url)
        if resp is None:
            return {}

        detail = {}
        soup = BeautifulSoup(resp.text, "html.parser")

        # Map of page label -> output column name
        # NOTE(review): the two 建设单位(招标人) keys below look identical; if they
        # are byte-identical the second silently overwrites the first — likely one
        # was meant to use half-width parentheses. Confirm against the page HTML.
        field_map = {
            "项目名称": "项目名称",
            "联系人": "联系人",
            "联系方式": "联系方式",
            "建设单位(招标人)": "招标人",
            "建设单位(招标人)": "招标人",
            "项目批准文件及文号": "项目批准文号",
            "项目类型": "项目类型",
            "招标方式": "招标方式",
            "主要建设内容": "主要建设内容",
        }

        for row in soup.select("table tr"):
            cells = row.select("td")
            if len(cells) >= 2:
                key = cells[0].get_text(strip=True)
                value = cells[1].get_text(strip=True)
                if key in field_map and value:
                    detail[field_map[key]] = value
                # Some rows carry a second key/value pair in columns 3-4
                if len(cells) >= 4:
                    key2 = cells[2].get_text(strip=True)
                    value2 = cells[3].get_text(strip=True)
                    if key2 == "联系方式" and value2:
                        detail["联系方式"] = value2

        # Tender-project table (planned tender date / estimated contract amount)
        for table in soup.select("table"):
            headers = [th.get_text(strip=True) for th in table.select("th")]
            if "计划招标时间" in headers:
                # Prefer tbody rows; fall back to any tr that has td cells
                data_rows = table.select("tbody tr") or [
                    r for r in table.select("tr") if r.select("td")
                ]
                if data_rows:
                    cells = data_rows[0].select("td")
                    # Align cells to headers positionally
                    for i, h in enumerate(headers):
                        if i < len(cells):
                            val = cells[i].get_text(strip=True)
                            if h == "计划招标时间" and val:
                                detail["计划招标时间"] = val
                            elif "预估合同金额" in h and val:
                                detail["预估合同金额(万元)"] = val
                break

        return detail

    # ---------- attachments ----------

    def _extract_attachments(self, url: str) -> list:
        """Scrape PDF and Word attachment links from a detail page."""
        resp = self.fetch(url)
        if resp is None:
            return []

        attachments = []
        for href in re.findall(r'href=["\']([^"\']*\.pdf[^"\']*)', resp.text):
            if not href.startswith("http"):
                href = self.config["base_url"] + href
            attachments.append({"name": href.split("/")[-1], "url": href})
        for href in re.findall(r'href=["\']([^"\']*\.docx?[^"\']*)', resp.text):
            if not href.startswith("http"):
                href = self.config["base_url"] + href
            attachments.append({"name": href.split("/")[-1], "url": href})
        return attachments

    # ---------- main flow ----------

    def crawl(self, max_pages: int = None, category: str = None,
              notice_type: str = None, date_filter: str = None,
              download_attachment: bool = False, **kwargs):
        """
        Run the crawl.

        Args:
            max_pages: maximum number of list pages to fetch.
            category: trade category name (looked up in config "categories").
            notice_type: notice type name (looked up in config "notice_types").
            date_filter: "yesterday" or an explicit "YYYY-MM-DD" date.
            download_attachment: whether to download and summarize attachments.

        Returns:
            The accumulated list of result dicts (also kept on ``self.results``).
        """
        if max_pages is None:
            max_pages = self.spider_config.get("max_pages", 10)
        page_size = 10  # the Taizhou site serves a fixed 10 items per page

        # Date filtering: translate the filter into an exact target date plus
        # an API datetime range covering that whole day
        target_date = None
        start_date = end_date = ""
        if date_filter == "yesterday":
            d = datetime.now() - timedelta(days=1)
            target_date = d.strftime("%Y-%m-%d")
            start_date = target_date + " 00:00:00"
            end_date = target_date + " 23:59:59"
            logger.info(f"过滤日期: {target_date}(昨天)")
        elif date_filter:
            target_date = date_filter
            start_date = target_date + " 00:00:00"
            end_date = target_date + " 23:59:59"
            logger.info(f"过滤日期: {target_date}")

        category_code = self.config.get("categories", {}).get(category, "")
        notice_code = self.config.get("notice_types", {}).get(notice_type, "")
        # API categoryNum: most specific code wins; "002" is the site default
        category_num = notice_code or category_code or "002"

        # Attachment handling (optional)
        attachment_handler = None
        if download_attachment:
            attachment_dir = os.path.join(self.data_dir, "attachments")
            attachment_handler = AttachmentHandler(attachment_dir)
            logger.info(f"启用附件下载,保存到: {attachment_dir}")

        logger.info(f"开始爬取: {self.config['name']}")
        if category:
            logger.info(f"交易领域: {category}")
        if notice_type:
            logger.info(f"公告类型: {notice_type}")

        for page_num in range(1, max_pages + 1):
            if self._check_limits():
                break

            logger.info(f"正在爬取第 {page_num} 页...")

            # Pages 1-7 come from server-rendered HTML; 8+ from the JSON API
            if page_num <= 7:
                url = self._build_list_url(category_code, notice_code, page_num)
                resp = self.fetch(url)
                if resp is None:
                    break
                page_items = self.parse_html_list(resp.text)
            else:
                records = self.fetch_list_via_api(
                    page_num - 1, page_size, category_num,
                    start_date, end_date,
                )
                if not records:
                    logger.info("没有更多数据")
                    break
                page_items = self.parse_api_list(records)

            if not page_items:
                logger.info("没有更多数据")
                break

            # Date filtering + de-duplication per item
            count = 0
            has_older = False  # saw a record older than the target date
            for item in page_items:
                if target_date and item["发布日期"] != target_date:
                    if target_date and item["发布日期"] < target_date:
                        has_older = True
                    continue
                if self.is_duplicate(item["链接"]):
                    continue

                # Enrich from the detail page
                self.detail_delay()
                detail = self.parse_detail(item["链接"])
                item.update(detail)

                # Attachments: download, extract text, record names/summary
                if download_attachment and attachment_handler:
                    atts = self._extract_attachments(item["链接"])
                    if atts:
                        item["附件数量"] = len(atts)
                        att_names = []
                        for att in atts:
                            att_names.append(att["name"])
                            result = attachment_handler.download_and_extract(att["url"])
                            if result["success"] and result["text"]:
                                item["附件内容摘要"] = result["text"][:2000]
                        item["附件名称"] = " | ".join(att_names)

                self.results.append(item)
                count += 1

            logger.info(f"  获取 {count} 条数据")

            if count == 0:
                if not target_date or has_older:
                    # No date filter, or older dates already reached → stop paging
                    logger.info("当前页无新数据,停止翻页")
                    break
                else:
                    # Page is entirely newer than the target date → keep paging
                    logger.info("  当前页均为更新日期的数据,继续翻页")
                    self.delay()
                    continue

            self.delay()

        self.print_stats()
        logger.info(f"爬取完成,共 {len(self.results)} 条数据")
        return self.results
|
||||
305
spiders/zhejiang.py
Normal file
305
spiders/zhejiang.py
Normal file
@@ -0,0 +1,305 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
浙江省公共资源交易中心爬虫 —— 基于 API + requests
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from .base import BaseSpider
|
||||
from utils.attachment import AttachmentHandler
|
||||
|
||||
logger = logging.getLogger("ztb")
|
||||
|
||||
|
||||
class ZhejiangSpider(BaseSpider):
    """Spider for the Zhejiang provincial public resource trading center.

    All listing goes through a JSON search API; detail fields are extracted
    from the HTML fragment the API returns in each record's ``content``.
    """

    # ---------- API listing ----------

    def _build_payload(self, page_index: int, page_size: int,
                       category_code: str, notice_code: str,
                       start_date: str, end_date: str) -> dict:
        """Build the search-API request body for one page of results."""
        condition = []
        # notice_code is more specific than category_code, so it wins
        if notice_code:
            condition.append({
                "fieldName": "categorynum",
                "isLike": True,
                "likeType": 2,
                "equal": notice_code,
            })
        elif category_code:
            condition.append({
                "fieldName": "categorynum",
                "isLike": True,
                "likeType": 2,
                "equal": category_code,
            })

        time_cond = []
        if start_date and end_date:
            time_cond.append({
                "fieldName": "webdate",
                "startTime": f"{start_date} 00:00:00",
                "endTime": f"{end_date} 23:59:59",
            })

        # pn is a record offset (not a page number); rn is the page size
        return {
            "token": "",
            "pn": page_index * page_size,
            "rn": page_size,
            "sdt": "", "edt": "",
            "wd": "", "inc_wd": "", "exc_wd": "",
            "fields": "title",
            "cnum": "001",
            "sort": '{"webdate":"0"}',
            "ssort": "title",
            "cl": 5000,
            "terminal": "",
            "condition": condition or None,
            "time": time_cond or None,
            "highlights": "",
            "statistics": None,
            "unionCondition": None,
            "accuracy": "",
            "noParticiple": "0",
            "searchRange": None,
            "isBusiness": "1",
        }

    def fetch_list_page(self, page_index: int, page_size: int,
                        category_code: str, notice_code: str,
                        start_date: str, end_date: str) -> list:
        """Fetch one page of list records via the API; returns ``[]`` on failure."""
        payload = self._build_payload(
            page_index, page_size, category_code, notice_code,
            start_date, end_date,
        )
        resp = self.fetch(
            self.config["api_url"],
            method="POST",
            json=payload,
            # The API checks the Referer header against the listing page
            headers={"Referer": self.config["base_url"] + "/jyxxgk/list.html"},
        )
        if resp is None:
            return []

        try:
            data = resp.json()
            return data.get("result", {}).get("records", [])
        except Exception as e:
            logger.error(f"解析 API 响应失败: {e}")
            return []

    # ---------- record parsing ----------

    @staticmethod
    def _parse_record(record: dict, source: str) -> dict:
        """Convert one raw API record into a result dict.

        Args:
            record: raw record from the API response.
            source: human-readable source name stored in the "来源" column.
        """
        title = record.get("title", "").strip()
        link = record.get("linkurl", "")
        if link and not link.startswith("http"):
            link = "https://ggzy.zj.gov.cn" + link

        date_str = record.get("webdate", "")
        # webdate is "YYYY-MM-DD HH:MM:SS"; keep the date part only
        date_short = date_str.split(" ")[0] if date_str else ""

        item = {
            "标题": title,
            "发布日期": date_short,
            "地区": record.get("infod", ""),
            "公告类型": record.get("categoryname", ""),
            "链接": link,
            "来源": source,
        }

        # Titles follow the pattern: [招标文件]<project name>[approval number]
        import re  # NOTE(review): redundant — re is already imported at module level
        # Stricter pattern so the leading tag and trailing number match correctly
        title_pattern = r"\[(?:招标文件|招标公告)\]\s*(.*?)\s*\[([A-Z0-9]+)\]\s*$"
        match = re.search(title_pattern, title)
        if match:
            project_name = match.group(1).strip()
            # Strip boilerplate suffixes like 招标文件公示 / 招标文件预公示
            suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
            for suffix in suffixes:
                if project_name.endswith(suffix):
                    project_name = project_name[:-len(suffix)].strip()
            item["项目名称"] = project_name
            item["项目批准文号"] = match.group(2).strip()
        else:
            # Fallback: treat the full title as the project name
            project_name = title
            # Strip boilerplate suffixes like 招标文件公示 / 招标文件预公示
            suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
            for suffix in suffixes:
                if project_name.endswith(suffix):
                    project_name = project_name[:-len(suffix)].strip()
            # Still try to peel a trailing [approval number] off the title
            number_pattern = r"\[([A-Z0-9]+)\]\s*$"
            match = re.search(number_pattern, project_name)
            if match:
                item["项目批准文号"] = match.group(1).strip()
                # Remove the approval-number suffix from the project name
                project_name = project_name[:match.start()].strip()
            item["项目名称"] = project_name

        return item

    @staticmethod
    def _parse_content_fields(content: str) -> dict:
        """Extract structured fields from the HTML snippet in the API's ``content``."""
        if not content:
            return {}

        # Unescape HTML entities, drop tags, and collapse whitespace
        import html as html_mod
        text = html_mod.unescape(content)
        text = re.sub(r"<[^>]+>", "", text)  # strip HTML tags
        text = re.sub(r"\s+", " ", text).strip()

        fields = {}
        # Label patterns accept full-width and half-width colons
        patterns = {
            "项目名称": r"项目名称[:::]\s*(.+?)\s{2,}",
            "项目代码": r"项目代码[:::]\s*(.+?)\s{2,}",
            "招标人": r"招标人[:::].*?名称[:::]\s*(.+?)\s{2,}",
            "招标代理": r"代理机构[:::].*?名称[:::]\s*(.+?)\s{2,}",
            "联系电话": r"电\s*话[:::]\s*([\d\-]+)",
            "招标估算金额": r"招标估算金额[:::]\s*([\d,\.]+\s*元)",
        }
        for key, pat in patterns.items():
            m = re.search(pat, text)
            if m:
                fields[key] = m.group(1).strip()

        return fields

    # ---------- attachments ----------

    def _extract_attachments_from_detail(self, url: str) -> list:
        """Visit a detail page and collect PDF / Word attachment links."""
        resp = self.fetch(url)
        if resp is None:
            return []

        attachments = []
        # PDF links
        for href in re.findall(r'href=["\']([^"\']*\.pdf[^"\']*)', resp.text):
            if not href.startswith("http"):
                href = self.config["base_url"] + href
            name = href.split("/")[-1]
            attachments.append({"name": name, "url": href})
        # Word links (.doc / .docx)
        for href in re.findall(r'href=["\']([^"\']*\.docx?[^"\']*)', resp.text):
            if not href.startswith("http"):
                href = self.config["base_url"] + href
            name = href.split("/")[-1]
            attachments.append({"name": name, "url": href})

        return attachments

    # ---------- main flow ----------

    def crawl(self, max_pages: int = None, category: str = None,
              notice_type: str = None, date_filter: str = None,
              download_attachment: bool = False, **kwargs):
        """
        Run the crawl.

        Args:
            max_pages: maximum number of API pages to fetch.
            category: trade category name (e.g. "工程建设").
            notice_type: notice type name (e.g. "招标公告").
            date_filter: "yesterday" or an explicit "YYYY-MM-DD" date.
            download_attachment: whether to download and summarize attachments.

        Returns:
            The accumulated list of result dicts (also kept on ``self.results``).
        """
        if max_pages is None:
            max_pages = self.spider_config.get("max_pages", 10)
        page_size = self.spider_config.get("page_size", 20)

        # Date range for the API query
        if date_filter == "yesterday":
            d = datetime.now() - timedelta(days=1)
            start_date = end_date = d.strftime("%Y-%m-%d")
            logger.info(f"过滤日期: {start_date}(昨天)")
        elif date_filter:
            start_date = end_date = date_filter
            logger.info(f"过滤日期: {start_date}")
        else:
            # Default window: the last 30 days
            end_date = datetime.now().strftime("%Y-%m-%d")
            start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")

        category_code = self.config.get("categories", {}).get(category, "")
        notice_code = self.config.get("notice_types", {}).get(notice_type, "")

        # Attachment handling (optional)
        attachment_handler = None
        if download_attachment:
            attachment_dir = os.path.join(self.data_dir, "attachments")
            attachment_handler = AttachmentHandler(attachment_dir)
            logger.info(f"启用附件下载,保存到: {attachment_dir}")

        logger.info(f"开始爬取: {self.config['name']}")
        if category:
            logger.info(f"交易领域: {category}")
        if notice_type:
            logger.info(f"公告类型: {notice_type}")

        for page_idx in range(max_pages):
            if self._check_limits():
                break

            logger.info(f"正在爬取第 {page_idx + 1} 页...")
            records = self.fetch_list_page(
                page_idx, page_size, category_code, notice_code,
                start_date, end_date,
            )

            if not records:
                logger.info("没有更多数据")
                break

            count = 0
            for rec in records:
                link = rec.get("linkurl", "")
                if link and not link.startswith("http"):
                    link = self.config["base_url"] + link
                if self.is_duplicate(link):
                    continue

                item = self._parse_record(rec, self.config["name"])
                # Structured fields come straight from the record's content blob
                item.update(detail := self._parse_content_fields(rec.get("content", ""))) if False else None
                detail = self._parse_content_fields(rec.get("content", ""))
                item.update(detail)

                # Attachments: download, extract text, record names/summary
                if download_attachment and attachment_handler:
                    self.detail_delay()
                    atts = self._extract_attachments_from_detail(link)
                    if atts:
                        item["附件数量"] = len(atts)
                        att_names = []
                        for att in atts:
                            att_names.append(att["name"])
                            result = attachment_handler.download_and_extract(att["url"])
                            if result["success"] and result["text"]:
                                item["附件内容摘要"] = result["text"][:2000]
                        item["附件名称"] = " | ".join(att_names)

                self.results.append(item)
                count += 1

            logger.info(f"  获取 {count} 条数据")

            if count == 0:
                logger.info("当前页无新数据,停止翻页")
                break

            self.delay()

        self.print_stats()
        logger.info(f"爬取完成,共 {len(self.results)} 条数据")
        return self.results
|
||||
Reference in New Issue
Block a user