Initial commit: tender information spider and analysis system
spiders/base.py (new file, 229 lines)
@@ -0,0 +1,229 @@
# -*- coding: utf-8 -*-
"""
Spider base class - built on requests
"""
import csv
import logging
import os
import random
import signal
import sys
import time
from datetime import datetime
from abc import ABC, abstractmethod
from logging.handlers import RotatingFileHandler

import requests

logger = logging.getLogger("ztb")


def setup_logging(log_dir: str = "logs", level: int = logging.INFO):
    """Configure logging: file + console."""
    os.makedirs(log_dir, exist_ok=True)
    root = logging.getLogger("ztb")
    if root.handlers:  # avoid duplicate initialization
        return root
    root.setLevel(level)

    fmt = logging.Formatter(
        "%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    # File log: rotating, 5 MB per file, keep 5 backups
    fh = RotatingFileHandler(
        os.path.join(log_dir, "spider.log"),
        maxBytes=5 * 1024 * 1024,
        backupCount=5,
        encoding="utf-8",
    )
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(fmt)
    root.addHandler(fh)

    # Console: INFO and above only
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(fmt)
    root.addHandler(ch)

    return root
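
# A minimal usage sketch (illustrative only; the call site and log message are
# assumptions, not part of this module):
#     log = setup_logging(log_dir="logs", level=logging.INFO)
#     log.info("spider starting")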


class BaseSpider(ABC):
    """Spider base class."""

    def __init__(self, config: dict, spider_config: dict, data_dir: str):
        self.config = config
        self.spider_config = spider_config
        self.data_dir = data_dir
        self.results = []
        self._seen_urls = set()  # deduplication

        # safety counters
        self._total_requests = 0
        self._consecutive_errors = 0
        self._stopped = False
        self._start_time = time.time()
        self._minute_requests = []  # timestamps of requests in the last minute

        # HTTP session
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;"
                      "q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        })

        # register graceful shutdown
        signal.signal(signal.SIGINT, self._handle_stop)
        signal.signal(signal.SIGTERM, self._handle_stop)
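
    # spider_config keys read by this class (defaults in parentheses):
    # max_total_requests (300), max_consecutive_errors (5), requests_per_minute (10),
    # timeout (30), max_retries (3), delay_min (3), delay_max (6),
    # detail_delay_min (2), detail_delay_max (5).
    # config is expected to provide at least "name", used for CSV filenames.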

    # ---------- Safety mechanisms ----------

    def _handle_stop(self, signum, frame):
        """Catch interrupt signals, save collected data, then exit."""
        logger.warning("Interrupt signal received, saving collected data...")
        self._stopped = True
        self.save_to_csv()
        sys.exit(0)

    def _check_limits(self) -> bool:
        """Check safety thresholds; returns True when crawling should stop."""
        max_req = self.spider_config.get("max_total_requests", 300)
        if self._total_requests >= max_req:
            logger.warning(f"Reached maximum request count ({max_req}), stopping crawl")
            return True

        max_err = self.spider_config.get("max_consecutive_errors", 5)
        if self._consecutive_errors >= max_err:
            logger.error(f"{max_err} consecutive failures, circuit breaker triggered")
            return True

        return self._stopped

    # ---------- HTTP requests ----------

    def _throttle(self):
        """Enforce the per-minute request limit; wait when it is exceeded."""
        rpm_limit = self.spider_config.get("requests_per_minute", 10)
        now = time.time()
        # drop timestamps older than 60s
        self._minute_requests = [t for t in self._minute_requests if now - t < 60]
        if len(self._minute_requests) >= rpm_limit:
            wait = 60 - (now - self._minute_requests[0]) + random.uniform(1, 3)
            if wait > 0:
                logger.info(f"Rate limit reached ({rpm_limit} req/min), waiting {wait:.0f}s...")
                time.sleep(wait)
        self._minute_requests.append(time.time())

    def fetch(self, url: str, method: str = "GET", **kwargs) -> requests.Response | None:
        """HTTP request with retries, rate limiting and safety checks."""
        if self._check_limits():
            return None

        self._throttle()

        timeout = kwargs.pop("timeout", self.spider_config.get("timeout", 30))
        max_retries = self.spider_config.get("max_retries", 3)

        for attempt in range(1, max_retries + 1):
            try:
                self._total_requests += 1
                resp = self.session.request(method, url, timeout=timeout, **kwargs)
                resp.raise_for_status()

                # Detect blocked empty responses (anti-bot systems may return 200 with an empty body)
                if len(resp.content) <= 10 and "json" not in resp.headers.get("Content-Type", ""):
                    self._consecutive_errors += 1
                    logger.warning(f"Empty response detected ({len(resp.content)} bytes), possibly blocked")
                    if attempt < max_retries:
                        wait = 10 * attempt + random.uniform(5, 10)
                        logger.info(f"Possible anti-bot block, retrying in {wait:.0f}s...")
                        time.sleep(wait)
                        continue
                    return None

                self._consecutive_errors = 0
                return resp
            except requests.RequestException as e:
                self._consecutive_errors += 1
                wait = 2 ** attempt + random.random()
                logger.warning(f"Request failed ({attempt}/{max_retries}): {e}, retrying in {wait:.1f}s")
                if attempt < max_retries:
                    time.sleep(wait)

        logger.error(f"Request failed, maximum retries reached: {url[:80]}")
        return None

    def delay(self):
        """Random delay between list pages."""
        lo = self.spider_config.get("delay_min", 3)
        hi = self.spider_config.get("delay_max", 6)
        time.sleep(random.uniform(lo, hi))

    def detail_delay(self):
        """Random delay before detail-page requests."""
        lo = self.spider_config.get("detail_delay_min", 2)
        hi = self.spider_config.get("detail_delay_max", 5)
        time.sleep(random.uniform(lo, hi))

    def print_stats(self):
        """Log crawl statistics."""
        elapsed = time.time() - self._start_time
        rpm = self._total_requests / max(elapsed / 60, 0.1)
        logger.info(f"[Stats] total requests: {self._total_requests}, "
                    f"elapsed: {elapsed:.0f}s, rate: {rpm:.1f} req/min")

    # ---------- Deduplication ----------

    def is_duplicate(self, url: str) -> bool:
        """URL-based deduplication."""
        if url in self._seen_urls:
            return True
        self._seen_urls.add(url)
        return False

    # ---------- Data persistence ----------

    def save_to_csv(self, filename: str | None = None):
        """Save results to CSV."""
        if not self.results:
            logger.info("No data to save")
            return

        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{self.config['name']}_{timestamp}.csv"

        filepath = os.path.join(self.data_dir, filename)
        os.makedirs(self.data_dir, exist_ok=True)

        # collect the union of all field names, preserving first-seen order
        all_keys = []
        seen = set()
        for row in self.results:
            for k in row:
                if k not in seen:
                    all_keys.append(k)
                    seen.add(k)

        with open(filepath, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
            writer.writeheader()
            writer.writerows(self.results)

        logger.info(f"Data saved to: {filepath} ({len(self.results)} records)")

    # ---------- Abstract methods ----------

    @abstractmethod
    def crawl(self, max_pages: int | None = None, **kwargs):
        """Run the crawl; implemented by subclasses."""
        pass
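

# A minimal sketch of a concrete spider built on BaseSpider. Everything below is
# illustrative: ExampleSpider, the "list_url" config key, and the parsing step are
# assumptions, not part of this module.
class ExampleSpider(BaseSpider):
    def crawl(self, max_pages: int | None = 1, **kwargs):
        list_url = self.config.get("list_url")  # assumed config key
        for page in range(1, (max_pages or 1) + 1):
            resp = self.fetch(f"{list_url}?page={page}")  # retries/throttling handled by BaseSpider
            if resp is None:  # safety limits hit or request ultimately failed
                break
            # ... parse resp.text here, skip URLs where self.is_duplicate(url) is True,
            # and append one dict per tender notice to self.results ...
            self.delay()  # polite pause between list pages
        self.save_to_csv()
        self.print_stats()


# Usage sketch (also illustrative):
#     spider = ExampleSpider(
#         config={"name": "example", "list_url": "https://example.com/notices"},
#         spider_config={"requests_per_minute": 10, "max_retries": 3},
#         data_dir="data",
#     )
#     spider.crawl(max_pages=2)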