# Base crawler class definitions

from typing import Dict, Optional, List, Any
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl

class UrlConfig(BaseModel):
    """URL configuration data model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True

class CrawlerConfig(BaseModel):
    """Crawler configuration data model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="URL config mapping, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retry attempts")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True

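# Illustrative configuration (a sketch only; the site, URLs, and parameter values
# below are hypothetical and not defined anywhere in this module):
#
#   config = CrawlerConfig(
#       base_url="https://news.example.com",
#       urls={
#           "list": UrlConfig(url="https://news.example.com/news/list", params={"page": 1}),
#           "search": UrlConfig(url="https://news.example.com/search", method="GET"),
#       },
#       timeout=10,
#       retry_times=2,
#   )
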
class NewsItem(BaseModel):
    """News item data model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")

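# Illustrative item (a sketch only; contentRows is typed as List[Dict[str, Any]],
# so the row shape shown below is an assumption, not a schema defined by this module):
#
#   item = NewsItem(
#       title="Example headline",
#       contentRows=[{"type": "text", "content": "First paragraph..."}],
#       url="https://news.example.com/articles/1",
#       publishTime="2024-01-01 12:00:00",
#       source="Example News",
#   )
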
class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request.

        Args:
            url: Request URL
            method: HTTP method
            data: Request body data
            **kwargs: Additional request arguments

        Returns:
            Response object, or None on failure
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")

                response = self.session.request(
                    method=method,
                    url=url,
                    headers=self.config.headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response

            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request failed after all retries: {url}")
                    return None

        return None

    def parse_html(self, html, parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or bytes; BeautifulSoup detects the encoding automatically
            parser: Parser type

        Returns:
            BeautifulSoup object
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - bytes input: the encoding is detected from the <meta charset> tag or the response headers
            # - str input: used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def crawl(self) -> List[NewsItem]:
        """
        Crawl news (must be implemented by subclasses).

        Returns:
            List of news items
        """
        pass

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (must be implemented by subclasses).

        Args:
            url: News detail page URL

        Returns:
            News item
        """
        pass

    def close(self):
        """Close the session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")
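

# A minimal subclass sketch showing how the abstract methods could be implemented.
# Everything below is illustrative: the ExampleNewsCrawler class, the site URLs, and
# the CSS selectors are hypothetical and not part of this module.

class ExampleNewsCrawler(BaseCrawler):
    """Hypothetical crawler for an imaginary news site."""

    def crawl(self) -> List[NewsItem]:
        # Fetch the (assumed) list page configured under the 'list' key,
        # collect article links, and parse each detail page.
        list_config = self.config.urls["list"]
        response = self.fetch(list_config.url, method=list_config.method, params=list_config.params)
        if response is None:
            return []

        soup = self.parse_html(response.text)
        if soup is None:
            return []

        items: List[NewsItem] = []
        for link in soup.select("a.news-link"):  # hypothetical selector
            href = link.get("href")
            if not href:
                continue
            item = self.parse_news_detail(href)
            if item is not None:
                items.append(item)
        return items

    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        response = self.fetch(url)
        if response is None:
            return None

        soup = self.parse_html(response.text)
        if soup is None:
            return None

        title = soup.select_one("h1.title")  # hypothetical selector
        paragraphs = soup.select("div.article p")  # hypothetical selector
        return NewsItem(
            title=title.get_text(strip=True) if title else "",
            contentRows=[{"type": "text", "content": p.get_text(strip=True)} for p in paragraphs],
            url=url,
        )


if __name__ == "__main__":
    # Illustrative usage with a hypothetical configuration.
    crawler = ExampleNewsCrawler(CrawlerConfig(
        base_url="https://news.example.com",
        urls={"list": UrlConfig(url="https://news.example.com/news/list")},
    ))
    try:
        for news in crawler.crawl():
            print(news.title, news.url)
    finally:
        crawler.close()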