# -*- coding: utf-8 -*-
"""Spider base class built on ``requests``.

Provides rate limiting, retry with backoff, circuit breaking,
URL de-duplication, graceful shutdown on signals and CSV export.
Concrete spiders subclass :class:`BaseSpider` and implement ``crawl``.
"""
import csv
import logging
import os
import random
import re
import signal
import sys
import time
from datetime import datetime
from abc import ABC, abstractmethod
from logging.handlers import RotatingFileHandler

import requests

logger = logging.getLogger("ztb")


def setup_logging(log_dir: str = "logs", level: int = logging.INFO):
    """Configure the "ztb" logger: rotating file handler plus console handler.

    Idempotent — returns the existing logger untouched if handlers are
    already attached, so repeated calls do not duplicate output.
    """
    os.makedirs(log_dir, exist_ok=True)
    root = logging.getLogger("ztb")
    if root.handlers:  # already initialised; avoid duplicate handlers
        return root
    root.setLevel(level)
    fmt = logging.Formatter(
        "%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    # File log: auto-rotation, 5 MB per file, keep 5 backups.
    fh = RotatingFileHandler(
        os.path.join(log_dir, "spider.log"),
        maxBytes=5 * 1024 * 1024,
        backupCount=5,
        encoding="utf-8",
    )
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(fmt)
    root.addHandler(fh)
    # Console: INFO and above only.
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    root.addHandler(ch)
    return root


class BaseSpider(ABC):
    """Abstract spider base class with built-in safety mechanisms."""

    def __init__(self, config: dict, spider_config: dict, data_dir: str):
        """
        Args:
            config: site-level configuration; must contain a ``name`` key
                (used for the CSV file name).
            spider_config: throttling / retry knobs, all optional
                (``max_total_requests``, ``max_consecutive_errors``,
                ``requests_per_minute``, ``timeout``, ``max_retries``,
                ``delay_min``/``delay_max``,
                ``detail_delay_min``/``detail_delay_max``).
            data_dir: directory where CSV output is written.
        """
        self.config = config
        self.spider_config = spider_config
        self.data_dir = data_dir
        self.results = []
        self._seen_urls = set()  # URL de-duplication

        # Safety counters.
        self._total_requests = 0
        self._consecutive_errors = 0
        self._stopped = False
        self._start_time = time.time()
        self._minute_requests = []  # timestamps of requests in the last minute

        # HTTP session with browser-like headers.
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;"
                      "q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        })

        # Graceful shutdown: save collected data before exiting.
        # NOTE(review): signal.signal only works on the main thread.
        signal.signal(signal.SIGINT, self._handle_stop)
        signal.signal(signal.SIGTERM, self._handle_stop)

    # ---------- safety mechanisms ----------
    def _handle_stop(self, signum, frame):
        """Signal handler: persist collected data, then exit."""
        logger.warning("收到中断信号,正在保存已采集数据...")
        self._stopped = True
        self.save_to_csv()
        sys.exit(0)

    def _check_limits(self) -> bool:
        """Return True when the spider should stop (budget spent,
        circuit breaker tripped, or a stop signal was received)."""
        max_req = self.spider_config.get("max_total_requests", 300)
        if self._total_requests >= max_req:
            logger.warning(f"达到最大请求数 ({max_req}),停止爬取")
            return True
        max_err = self.spider_config.get("max_consecutive_errors", 5)
        if self._consecutive_errors >= max_err:
            logger.error(f"连续失败 {max_err} 次,触发熔断")
            return True
        return self._stopped

    # ---------- networking ----------
    def _throttle(self):
        """Sliding-window requests-per-minute limiter; sleeps when the
        window is full."""
        rpm_limit = self.spider_config.get("requests_per_minute", 10)
        now = time.time()
        # Drop timestamps older than 60 s.
        self._minute_requests = [t for t in self._minute_requests if now - t < 60]
        if len(self._minute_requests) >= rpm_limit:
            # Wait until the oldest timestamp leaves the window, plus jitter.
            wait = 60 - (now - self._minute_requests[0]) + random.uniform(1, 3)
            if wait > 0:
                logger.info(f"达到速率限制 ({rpm_limit}次/分钟),等待 {wait:.0f}s...")
                time.sleep(wait)
        self._minute_requests.append(time.time())

    def fetch(self, url: str, method: str = "GET", **kwargs) -> requests.Response | None:
        """HTTP request with rate limiting, retry/backoff and safety checks.

        Returns the response on success, ``None`` when limits are hit or
        every retry failed. Retries count toward ``max_total_requests``.
        """
        if self._check_limits():
            return None
        self._throttle()
        timeout = kwargs.pop("timeout", self.spider_config.get("timeout", 30))
        max_retries = self.spider_config.get("max_retries", 3)
        for attempt in range(1, max_retries + 1):
            try:
                self._total_requests += 1
                resp = self.session.request(method, url, timeout=timeout, **kwargs)
                resp.raise_for_status()
                # Detect blocked responses: anti-bot systems may return
                # HTTP 200 with an (almost) empty body.
                if len(resp.content) <= 10 and "json" not in resp.headers.get("Content-Type", ""):
                    self._consecutive_errors += 1
                    logger.warning(f"检测到空响应 ({len(resp.content)} bytes),可能被反爬")
                    if attempt < max_retries:
                        # Back off longer than for plain errors — we are
                        # probably being rate-limited by the target site.
                        wait = 10 * attempt + random.uniform(5, 10)
                        logger.info(f"疑似被反爬拦截,等待 {wait:.0f}s 后重试...")
                        time.sleep(wait)
                        continue
                    return None
                self._consecutive_errors = 0
                return resp
            except requests.RequestException as e:
                self._consecutive_errors += 1
                # Only announce (and perform) a retry wait when a retry
                # will actually happen; the old code always logged
                # "…s 后重试" even on the final attempt.
                if attempt < max_retries:
                    wait = 2 ** attempt + random.random()  # exponential backoff + jitter
                    logger.warning(f"请求失败 ({attempt}/{max_retries}): {e},{wait:.1f}s 后重试")
                    time.sleep(wait)
                else:
                    logger.warning(f"请求失败 ({attempt}/{max_retries}): {e}")
        logger.error(f"请求失败,已达最大重试次数: {url[:80]}")
        return None

    def delay(self):
        """Random delay between list pages."""
        lo = self.spider_config.get("delay_min", 3)
        hi = self.spider_config.get("delay_max", 6)
        time.sleep(random.uniform(lo, hi))

    def detail_delay(self):
        """Random delay before a detail-page request."""
        lo = self.spider_config.get("detail_delay_min", 2)
        hi = self.spider_config.get("detail_delay_max", 5)
        time.sleep(random.uniform(lo, hi))

    def print_stats(self):
        """Log crawl statistics (request count, elapsed time, rate)."""
        elapsed = time.time() - self._start_time
        # Guard against division by ~0 right after start-up.
        rpm = self._total_requests / max(elapsed / 60, 0.1)
        logger.info(f"[统计] 总请求: {self._total_requests}, "
                    f"耗时: {elapsed:.0f}s, 速率: {rpm:.1f}次/分钟")

    # ---------- title parsing (shared rules) ----------
    @staticmethod
    def _parse_title(title: str) -> dict:
        """Extract the project name and approval number from a title.

        Returns a dict with key "项目名称" and, when found, "项目批准文号".
        """
        result = {}
        # Optional "[招标文件]"/"[招标公告]" prefix, greedy project name,
        # trailing approval number in square brackets.
        title_pattern = r"(?:\[(?:招标文件|招标公告)\])?\s*(.*)\s*\[([A-Z0-9]+)\]\s*$"
        match = re.search(title_pattern, title)
        if match:
            project_name = match.group(1).strip()
            result["项目批准文号"] = match.group(2).strip()
        else:
            project_name = title
            # Fallback: approval number at the very end of the title.
            number_pattern = r"\[([A-Z0-9]+)\]\s*$"
            match = re.search(number_pattern, project_name)
            if match:
                result["项目批准文号"] = match.group(1).strip()
                project_name = project_name[:match.start()].strip()
        # Strip well-known announcement-type suffixes from the name.
        suffixes = ["招标文件公示", "招标文件预公示", "招标公告", "招标预公告"]
        for suffix in suffixes:
            if project_name.endswith(suffix):
                project_name = project_name[:-len(suffix)].strip()
        result["项目名称"] = project_name
        return result

    # ---------- publish-time extraction ----------
    @staticmethod
    def _extract_publish_time(soup, page_text: str) -> str:
        """Extract the publish timestamp from a detail page.

        Tries labelled text patterns first (most specific first), then
        falls back to time/date-looking tags in *soup* (assumed to be a
        BeautifulSoup document). Returns "" when nothing matches.
        """
        patterns = [
            r'信息发布时间[::]\s*([\d-]+\s[\d:]+)',
            r'发布时间[::]\s*([\d-]+\s[\d:]+)',
            r'发布日期[::]\s*([\d-]+\s[\d:]+)',
            r'发布时间[::]\s*([\d-]+)',
            r'发布日期[::]\s*([\d-]+)',
        ]
        for pattern in patterns:
            match = re.search(pattern, page_text)
            if match:
                return match.group(1).strip()
        time_tags = soup.find_all(['time', 'span', 'div'],
                                  class_=re.compile(r'time|date|publish', re.I))
        for tag in time_tags:
            text = tag.get_text(strip=True)
            match = re.search(r'([\d-]+\s[\d:]+)', text)
            if match:
                return match.group(1).strip()
        return ""

    # ---------- de-duplication ----------
    def is_duplicate(self, url: str) -> bool:
        """URL-based de-duplication; records the URL as seen as a side effect."""
        if url in self._seen_urls:
            return True
        self._seen_urls.add(url)
        return False

    # ---------- persistence ----------
    def save_to_csv(self, filename: str | None = None):
        """Write collected results to a CSV file in ``data_dir``.

        The header is the union of all row keys, in first-seen order.
        A timestamped file name is generated when *filename* is None.
        """
        if not self.results:
            logger.info("没有数据可保存")
            return
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{self.config['name']}_{timestamp}.csv"
        filepath = os.path.join(self.data_dir, filename)
        os.makedirs(self.data_dir, exist_ok=True)
        # Collect every field name, preserving first-seen order.
        all_keys = []
        seen = set()
        for row in self.results:
            for k in row:
                if k not in seen:
                    all_keys.append(k)
                    seen.add(k)
        # utf-8-sig so Excel opens the file with correct encoding.
        with open(filepath, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(self.results)
        logger.info(f"数据已保存到: {filepath} (共 {len(self.results)} 条记录)")

    # ---------- abstract interface ----------
    @abstractmethod
    def crawl(self, max_pages: int | None = None, **kwargs):
        """Run the crawl; implemented by subclasses."""
        pass