schoolNews/schoolNewsCrawler/crawler/BaseCrawler.py

# Base crawler class definitions
from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl


class UrlConfig(BaseModel):
    """URL configuration model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Request headers")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class CrawlerConfig(BaseModel):
    """Crawler configuration model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="URL configuration map, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retry attempts")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class NewsItem(BaseModel):
    """News item model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content rows")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object.
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request with retries.

        Args:
            url: Request URL.
            method: HTTP method.
            data: Request body data.
            **kwargs: Extra arguments passed through to requests.

        Returns:
            The response object, or None if all attempts fail.
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")
                response = self.session.request(
                    method=method,
                    url=url,
                    headers=self.config.headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request ultimately failed: {url}")
                    return None
        return None

    def parse_html(self, html, parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML into a BeautifulSoup tree.

        Args:
            html: HTML as a str or bytes; BeautifulSoup detects the encoding of bytes automatically.
            parser: Parser backend to use.

        Returns:
            A BeautifulSoup object, or None on failure.
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - bytes input: the encoding is detected from the <meta charset> tag or the response headers
            # - str input: used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page; subclasses must implement this.

        Args:
            url: URL of the news detail page.

        Returns:
            A NewsItem, or None on failure.
        """
        pass

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")