# Define the base crawler class
from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl
import json


class UrlConfig(BaseModel):
    """URL configuration data model."""
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="HTTP method")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Request headers")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


class CrawlerConfig(BaseModel):
    """Crawler configuration data model."""
    base_url: str = Field(..., description="Base URL")
    urls: Dict[str, UrlConfig] = Field(
        default_factory=dict,
        description="Mapping of named URL configs, e.g. {'search': UrlConfig(...), 'list': UrlConfig(...)}"
    )
    headers: Dict[str, str] = Field(
        default_factory=lambda: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        },
        description="HTTP request headers"
    )
    timeout: int = Field(default=30, description="Request timeout in seconds")
    retry_times: int = Field(default=3, description="Number of retry attempts")
    proxy: Optional[str] = Field(default=None, description="Proxy address")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True


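# A minimal sketch of how a CrawlerConfig might be assembled for a concrete
# crawler. The site URL, the 'search'/'detail' keys, and the parameter names
# below are hypothetical placeholders for illustration, not part of this
# module's contract.
def _example_crawler_config() -> CrawlerConfig:
    """Build an illustrative CrawlerConfig (documentation sketch only)."""
    return CrawlerConfig(
        base_url="https://news.example.com",
        urls={
            "search": UrlConfig(
                url="https://news.example.com/api/search",
                method="GET",
                params={"keyword": "placeholder", "page": 1},
            ),
            "detail": UrlConfig(url="https://news.example.com/api/detail"),
        },
        timeout=15,
        retry_times=2,
    )

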
class NewsItem(BaseModel):
    """News item data model."""
    title: str = Field(..., description="News title")
    contentRows: List[Dict[str, Any]] = Field(..., description="News content rows")
    url: str = Field(..., description="News URL")
    viewCount: Optional[int] = Field(default=None, description="View count")
    publishTime: Optional[str] = Field(default=None, description="Publish time")
    author: Optional[str] = Field(default=None, description="Author")
    source: Optional[str] = Field(default=None, description="Source")
    category: Optional[str] = Field(default=None, description="Category")
    executeStatus: Optional[int] = Field(default=0, description="Execution status")
    executeMessage: Optional[str] = Field(default=None, description="Execution message")


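# A minimal sketch of building a NewsItem and serializing it with the json
# module imported above. The contentRows row shape ({'type': ..., 'value': ...})
# is an assumed convention for illustration; concrete crawlers define their own.
def _example_news_item_json() -> str:
    """Serialize an illustrative NewsItem to a JSON string."""
    item = NewsItem(
        title="Example headline",
        contentRows=[{"type": "text", "value": "Example paragraph."}],
        url="https://news.example.com/articles/1",
        publishTime="2025-01-01 00:00:00",
        source="Example Source",
    )
    # Pydantic v1 style, matching the Config classes above: .dict() then json.dumps
    return json.dumps(item.dict(), ensure_ascii=False)

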
class BaseCrawler(ABC):
    """Abstract base crawler."""

    def __init__(self, config: CrawlerConfig):
        """
        Initialize the crawler.

        Args:
            config: Crawler configuration object
        """
        self.config = config
        self.session = requests.Session()
        self.session.headers.update(config.headers)
        logger.info(f"Initialized crawler: {self.__class__.__name__}")

    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, **kwargs) -> Optional[requests.Response]:
        """
        Send an HTTP request with retries.

        Args:
            url: Request URL
            method: HTTP method
            data: Request body data
            headers: Extra request headers, merged with the default headers (extra headers take precedence)
            **kwargs: Additional arguments passed through to requests

        Returns:
            The response object, or None on failure
        """
        for attempt in range(self.config.retry_times):
            try:
                logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")

                # Merge default headers with caller-supplied headers (caller overrides defaults)
                request_headers = dict(self.config.headers or {})
                if headers:
                    request_headers.update(headers)
                # If kwargs unexpectedly contains headers, merge and remove it to avoid passing it twice
                extra_headers = kwargs.pop("headers", None)
                if extra_headers:
                    request_headers.update(extra_headers)

                response = self.session.request(
                    method=method,
                    url=url,
                    headers=request_headers,
                    data=data,
                    timeout=self.config.timeout,
                    proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
                    **kwargs
                )
                response.raise_for_status()
                return response

            except requests.RequestException as e:
                logger.warning(f"Request failed (attempt {attempt + 1}/{self.config.retry_times}): {str(e)}")
                if attempt == self.config.retry_times - 1:
                    logger.error(f"Request ultimately failed: {url}")
                    return None

        return None

    def parse_html(self, html: Union[str, bytes], parser: str = "lxml") -> Optional[BeautifulSoup]:
        """
        Parse HTML.

        Args:
            html: HTML string or raw bytes; BeautifulSoup detects the encoding automatically
            parser: Parser type

        Returns:
            BeautifulSoup object, or None on failure
        """
        try:
            # BeautifulSoup handles encoding automatically:
            # - bytes input: the encoding is detected from the <meta charset> declaration or by sniffing the byte content
            # - str input: used as-is
            return BeautifulSoup(html, parser)
        except Exception as e:
            logger.error(f"HTML parsing failed: {str(e)}")
            return None

    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a news detail page (must be implemented by subclasses).

        Args:
            url: News detail page URL

        Returns:
            News item object, or None on failure
        """
        pass

    def close(self):
        """Close the HTTP session."""
        self.session.close()
        logger.info(f"Closed crawler: {self.__class__.__name__}")
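

# A minimal end-to-end sketch of how this base class is meant to be used:
# subclass BaseCrawler, implement parse_news_detail, then drive it with a
# CrawlerConfig. The _ExampleNewsCrawler class, the tag selectors, and the
# example.com URLs are hypothetical placeholders for illustration only.
class _ExampleNewsCrawler(BaseCrawler):
    """Illustrative subclass; the selectors are assumptions, not a real site's markup."""

    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        response = self.fetch(url)
        if response is None:
            return None
        # Pass raw bytes so BeautifulSoup can detect the page encoding itself
        soup = self.parse_html(response.content)
        if soup is None:
            return None
        title_tag = soup.find("h1")
        paragraphs = soup.find_all("p")
        return NewsItem(
            title=title_tag.get_text(strip=True) if title_tag else "",
            contentRows=[{"type": "text", "value": p.get_text(strip=True)} for p in paragraphs],
            url=url,
        )


if __name__ == "__main__":
    # Per-call headers passed to fetch() are merged over the defaults from CrawlerConfig.
    config = CrawlerConfig(base_url="https://news.example.com")
    crawler = _ExampleNewsCrawler(config)
    try:
        item = crawler.parse_news_detail("https://news.example.com/articles/1")
        if item:
            logger.info(f"Parsed: {item.title} ({len(item.contentRows)} content rows)")
    finally:
        crawler.close()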