People's Daily crawler
schoolNewsCrawler/crawler/BaseCrawler.py (new file, +152 lines)
@@ -0,0 +1,152 @@
# Defines the base crawler class
from typing import Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
from loguru import logger
from pydantic import BaseModel, Field


class UrlConfig(BaseModel):
    """Data model for a single URL configuration."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class CrawlerConfig(BaseModel):
    """Data model for crawler configuration."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="URL configuration map, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retries")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class NewsItem(BaseModel):
    """Data model for a news item."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content")
    url: str = Field(..., description="News URL")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")


class BaseCrawler(ABC):
    """Base crawler class."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request.

        Args:
            url: Request URL
            method: HTTP method
            data: Request body data
            **kwargs: Additional request arguments

        Returns:
            Response object, or None on failure
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")

                response = self.session.request(
                    method=method,
                    url=url,
                    headers=self.config.headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response

            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request ultimately failed: {url}")
                    return None

        return None

    def parse_html(self, html: Union[str, bytes], parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or bytes; BeautifulSoup detects the encoding automatically
            parser: Parser type

        Returns:
            BeautifulSoup object
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - for bytes, the encoding is detected from the <meta charset> tag or the response headers
            # - for str, the text is used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def crawl(self) -> List[NewsItem]:
        """
        Crawl news (must be implemented by subclasses).

        Returns:
            List of news items
        """
        pass

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (must be implemented by subclasses).

        Args:
            url: News detail page URL

        Returns:
            News item
        """
        pass

    def close(self):
        """Close the session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")