# Base crawler class definitions
from typing import Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup
from loguru import logger
from pydantic import BaseModel, Field


class UrlConfig(BaseModel):
    """URL configuration model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Request headers")

    class Config:
        # Allow arbitrary (non-pydantic) field types
        arbitrary_types_allowed = True


class CrawlerConfig(BaseModel):
    """Crawler configuration model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="URL configuration map, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retry attempts")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary (non-pydantic) field types
        arbitrary_types_allowed = True


class NewsItem(BaseModel):
    """News item model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content rows")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")
    executeStatus: Optional[int] = Field(default=0, description="Execution status")
    executeMessage: Optional[str] = Field(default=None, description="Execution message")


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object.
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self, url: str, method: str = "GET",
              data: Optional[Dict[str, Any]] = None,
              headers: Optional[Dict[str, str]] = None,
              **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request.

        Args:
            url: Request URL.
            method: HTTP method.
            data: Request body data.
            headers: Extra request headers, merged with the defaults (extra headers win).
            **kwargs: Additional arguments passed through to requests.

        Returns:
            The response object, or None on failure.
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")

                # Merge default headers with the caller's headers (caller overrides defaults).
                request_headers = dict(self.config.headers or {})
                if headers:
                    request_headers.update(headers)

                # If kwargs unexpectedly contains headers, merge them and remove the key
                # to avoid passing the argument twice.
                extra_headers = kwargs.pop("headers", None)
                if extra_headers:
                    request_headers.update(extra_headers)

                response = self.session.request(
                    method=method,
                    url=url,
                    headers=request_headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy}
                    if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request ultimately failed: {url}")
                    return None
        return None

    def parse_html(self, html: Union[str, bytes], parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or raw bytes; BeautifulSoup detects the encoding automatically.
            parser: Parser type.

        Returns:
            A BeautifulSoup object, or None on failure.
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - if bytes are passed, the encoding is detected from the document itself
            #   (<meta> tags, BOM, byte patterns) via Unicode, Dammit
            # - if a str is passed, it is used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (subclasses must implement this).

        Args:
            url: URL of the news detail page.

        Returns:
            A NewsItem object.
        """
        pass

    def close(self):
        """Close the session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")
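

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module above).
# It shows one way a concrete crawler could subclass BaseCrawler and implement
# parse_news_detail(). The class name ExampleNewsCrawler, the base URL
# "https://news.example.com", the CSS selectors, and the contentRows row shape
# are all hypothetical assumptions, not anything defined by BaseCrawler.
# ---------------------------------------------------------------------------
class ExampleNewsCrawler(BaseCrawler):
    """Hypothetical subclass demonstrating parse_news_detail()."""

    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        response = self.fetch(url)
        if response is None:
            # Record the failure on the item instead of raising.
            return NewsItem(title="", contentRows=[], url=url,
                            executeStatus=1, executeMessage="request failed")

        soup = self.parse_html(response.content)
        if soup is None:
            return NewsItem(title="", contentRows=[], url=url,
                            executeStatus=1, executeMessage="HTML parsing failed")

        # The selectors below are placeholders for the target site's real layout.
        title_tag = soup.select_one("h1")
        paragraphs = soup.select("div.article-content p")
        return NewsItem(
            title=title_tag.get_text(strip=True) if title_tag else "",
            contentRows=[{"type": "text", "content": p.get_text(strip=True)}
                         for p in paragraphs],
            url=url,
        )


if __name__ == "__main__":
    # Wire up the crawler with a placeholder base URL and fetch one article.
    config = CrawlerConfig(base_url="https://news.example.com")
    crawler = ExampleNewsCrawler(config)
    try:
        item = crawler.parse_news_detail("https://news.example.com/article/1")
        if item:
            print(item.title, item.executeStatus)
    finally:
        crawler.close()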