schoolNews/schoolNewsCrawler/crawler/BaseCrawler.py

# Base crawler class definitions
from typing import Dict, Optional, List, Any
from abc import ABC, abstractmethod

import requests
from bs4 import BeautifulSoup
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl


class UrlConfig(BaseModel):
    """URL configuration model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")

    class Config:
        # Allow arbitrary field types
        arbitrary_types_allowed = True

class CrawlerConfig(BaseModel):
    """Crawler configuration model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="URL config mapping, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retry attempts")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary field types
        arbitrary_types_allowed = True

class NewsItem(BaseModel):
    """News item model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content rows")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")
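

# Illustrative sketch, not part of the original module: one way the models above
# might be assembled into a concrete configuration. The domain, URL keys and
# parameter names below are assumptions chosen purely for demonstration.
def _example_config() -> CrawlerConfig:
    return CrawlerConfig(
        base_url="https://news.example.edu",
        urls={
            "list": UrlConfig(url="https://news.example.edu/list", params={"page": 1}),
            "search": UrlConfig(url="https://news.example.edu/search", method="POST"),
        },
        timeout=15,
        retry_times=2,
    )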

class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request.

        Args:
            url: Request URL
            method: HTTP method
            data: Request body data
            **kwargs: Additional arguments passed to requests

        Returns:
            Response object, or None on failure
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")
                response = self.session.request(
                    method=method,
                    url=url,
                    headers=self.config.headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request ultimately failed: {url}")
                    return None
        return None
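
    # Illustrative sketch, not part of the original class: how a subclass might look up
    # a named UrlConfig entry and pass its params through fetch()'s **kwargs (requests
    # accepts query parameters via the `params` keyword). The helper name and the idea
    # of resolving by key are assumptions for demonstration only.
    def fetch_named(self, name: str, **kwargs) -> Optional[requests.Response]:
        url_cfg = self.config.urls.get(name)
        if url_cfg is None:
            logger.error(f"No URL configured under key: {name}")
            return None
        return self.fetch(url_cfg.url, method=url_cfg.method, params=url_cfg.params, **kwargs)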

    def parse_html(self, html, parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or bytes; BeautifulSoup detects the encoding of bytes automatically
            parser: Parser type

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - bytes: the encoding is detected from the <meta charset> tag or the response headers
            # - str: used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def crawl(self) -> List[NewsItem]:
        """
        Crawl news (must be implemented by subclasses).

        Returns:
            List of news items
        """
        pass

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (must be implemented by subclasses).

        Args:
            url: News detail page URL

        Returns:
            News item object
        """
        pass

    def close(self):
        """Close the session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")