# Define the base crawler class
import json
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional, Union

import requests
from bs4 import BeautifulSoup, NavigableString
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl


class UrlConfig(BaseModel):
    """URL configuration data model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Request headers")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class CrawlerConfig(BaseModel):
    """Crawler configuration data model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="Mapping of named URL configs, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retry attempts")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class NewsItem(BaseModel):
    """News item data model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content rows")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")
    executeStatus: Optional[int] = Field(default=0, description="Execution status")
    executeMessage: Optional[str] = Field(default=None, description="Execution message")

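# Illustrative sketch only: one way a crawler config might be composed from
# named UrlConfig entries. The site and parameter values below are hypothetical
# placeholders, not part of this module.
#
#   config = CrawlerConfig(
#       base_url="https://news.example.com",
#       urls={
#           "list": UrlConfig(url="https://news.example.com/api/list", params={"page": 1}),
#           "search": UrlConfig(url="https://news.example.com/api/search", method="POST"),
#       },
#   )

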
class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initializing crawler: {self.__class__.__name__}")

    def fetch(
        self,
        url: str,
        method: str = "GET",
        data: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, str]] = None,
        **kwargs
    ) -> Optional[requests.Response]:
        """
        Send an HTTP request.

        Args:
            url: Request URL
            method: HTTP method
            data: Request body data
            headers: Extra request headers, merged with the defaults (extra headers take precedence)
            **kwargs: Additional request arguments

        Returns:
            Response object, or None on failure
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")

                # Merge default headers with caller-supplied headers (caller values override defaults)
                request_headers = dict(self.config.headers or {})
                if headers:
                    request_headers.update(headers)
                # If kwargs unexpectedly contains headers, merge them in and remove
                # the key to avoid passing the argument twice
                extra_headers = kwargs.pop("headers", None)
                if extra_headers:
                    request_headers.update(extra_headers)

                response = self.session.request(
                    method=method,
                    url=url,
                    headers=request_headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response

            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request failed after all retries: {url}")
                    return None

        return None

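    # Usage note (illustrative, hypothetical URL): headers passed to fetch() are
    # merged over the configured defaults, so a subclass can do e.g.
    #
    #   resp = self.fetch("https://news.example.com/api/list",
    #                     headers={"Referer": "https://news.example.com"})
    #   soup = self.parse_html(resp.content) if resp else None
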
    def parse_html(self, html: Union[str, bytes], parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or raw bytes; BeautifulSoup detects the encoding automatically
            parser: Parser type

        Returns:
            BeautifulSoup object
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - bytes input: the encoding is sniffed from the document itself (e.g. a <meta charset> declaration)
            # - str input: used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (must be implemented by subclasses).

        Args:
            url: URL of the news detail page

        Returns:
            News item object
        """
        pass

    def close(self):
        """Close the session."""
        self.session.close()
        logger.info(f"Closing crawler: {self.__class__.__name__}")
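

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). ExampleNewsCrawler, its selectors,
# and the news.example.com URLs are hypothetical placeholders showing how a
# concrete crawler is expected to subclass BaseCrawler and implement
# parse_news_detail(); they are not part of this module's actual consumers.
# ---------------------------------------------------------------------------
class ExampleNewsCrawler(BaseCrawler):
    """Hypothetical crawler for a fictional news site."""

    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        response = self.fetch(url)
        if response is None:
            return NewsItem(title="", contentRows=[], url=url,
                            executeStatus=1, executeMessage="request failed")
        soup = self.parse_html(response.content)
        if soup is None:
            return NewsItem(title="", contentRows=[], url=url,
                            executeStatus=1, executeMessage="parse failed")
        # Placeholder extraction rules; a real site needs its own selectors.
        title = soup.title.get_text(strip=True) if soup.title else ""
        paragraphs = [{"type": "text", "content": p.get_text(strip=True)}
                      for p in soup.find_all("p")]
        return NewsItem(title=title, contentRows=paragraphs, url=url)


if __name__ == "__main__":
    config = CrawlerConfig(
        base_url="https://news.example.com",
        urls={"detail": UrlConfig(url="https://news.example.com/news/1")},
    )
    crawler = ExampleNewsCrawler(config)
    try:
        item = crawler.parse_news_detail(config.urls["detail"].url)
        if item:
            print(json.dumps(item.dict(), ensure_ascii=False, indent=2))
    finally:
        crawler.close()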