# Define the base crawler class
from typing import Dict, Optional, List, Any
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup
from loguru import logger
from pydantic import BaseModel, Field


class UrlConfig(BaseModel):
    """URL configuration model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class CrawlerConfig(BaseModel):
    """Crawler configuration model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="URL configuration map, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retries")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class NewsItem(BaseModel):
    """News item model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content rows")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self,
              url: str,
              method: str = "GET",
              data: Optional[Dict[str, Any]] = None,
              **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request with retries.

        Args:
            url: Request URL
            method: HTTP method
            data: Request body data
            **kwargs: Extra arguments passed through to requests

        Returns:
            Response object, or None on failure
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")
                response = self.session.request(
                    method=method,
                    url=url,
                    headers=self.config.headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request ultimately failed: {url}")
                    return None
        return None

    def parse_html(self, html, parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or raw bytes; BeautifulSoup detects the encoding automatically
            parser: Parser backend

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - if bytes are passed, the encoding is detected from the <meta> charset
            #   declaration or byte-level heuristics
            # - if a str is passed, it is used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"Failed to parse HTML: {str(e)}")
            return None

    @abstractmethod
    def crawl(self) -> List[NewsItem]:
        """
        Crawl news items (must be implemented by subclasses).

        Returns:
            List of news items
        """
        pass

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (must be implemented by subclasses).

        Args:
            url: News detail page URL

        Returns:
            News item, or None on failure
        """
        pass

    def close(self):
        """Close the HTTP session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")
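

# --- Minimal usage sketch (not part of the base module) ---
# This assumes a hypothetical site with a list page at "/news"; the class name
# ExampleNewsCrawler and the CSS selectors ('a.news-link', 'h1.title',
# 'div.article-body p') are placeholders, not part of the original code.
# It only illustrates how a subclass might implement crawl() and
# parse_news_detail() on top of fetch() and parse_html().
class ExampleNewsCrawler(BaseCrawler):
    """Example subclass for a hypothetical news site."""

    def crawl(self) -> List[NewsItem]:
        items: List[NewsItem] = []
        list_url = f"{self.config.base_url}/news"  # hypothetical list page
        response = self.fetch(list_url)
        if response is None:
            return items
        soup = self.parse_html(response.text)
        if soup is None:
            return items
        for link in soup.select("a.news-link"):  # placeholder selector
            href = link.get("href")
            if not href:
                continue
            # Resolve relative links against the configured base URL
            detail_url = href if href.startswith("http") else f"{self.config.base_url}{href}"
            item = self.parse_news_detail(detail_url)
            if item is not None:
                items.append(item)
        return items

    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        response = self.fetch(url)
        if response is None:
            return None
        soup = self.parse_html(response.text)
        if soup is None:
            return None
        title = soup.select_one("h1.title")  # placeholder selector
        paragraphs = soup.select("div.article-body p")  # placeholder selector
        return NewsItem(
            title=title.get_text(strip=True) if title else "",
            contentRows=[{"type": "text", "content": p.get_text(strip=True)} for p in paragraphs],
            url=url,
        )


if __name__ == "__main__":
    # Hypothetical configuration; replace base_url with a real target site.
    config = CrawlerConfig(base_url="https://example.com")
    crawler = ExampleNewsCrawler(config)
    try:
        for news in crawler.crawl():
            logger.info(f"Got news item: {news.title}")
    finally:
        crawler.close()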