Crawler
@@ -1,5 +1,5 @@
 # Define the base crawler class
-from typing import Dict, Optional, List, Any
+from typing import Callable, Dict, Optional, List, Any, Union
 from abc import ABC, abstractmethod
 import requests
 from bs4 import BeautifulSoup
@@ -12,7 +12,7 @@ class UrlConfig(BaseModel):
     url: str = Field(..., description="Request URL")
     params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
     method: str = Field(default="GET", description="Request method")
-
+    headers: Optional[Dict[str, str]] = Field(default=None, description="Request headers")
     class Config:
         # Allow arbitrary types
         arbitrary_types_allowed = True
@@ -123,15 +123,6 @@ class BaseCrawler(ABC):
             logger.error(f"HTML parsing failed: {str(e)}")
             return None
 
-    @abstractmethod
-    def crawl(self) -> List[NewsItem]:
-        """
-        Crawl news (must be implemented by subclasses)
-
-        Returns:
-            A list of news items
-        """
-        pass
 
     @abstractmethod
     def parse_news_detail(self, url: str) -> Optional[NewsItem]:
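
For reference, the UrlConfig model touched in the second hunk can be exercised on its own. A minimal sketch, assuming BaseModel and Field come from pydantic (as the Field(..., description=...) pattern and the arbitrary_types_allowed option suggest); the model is re-declared here only so the snippet is self-contained:

# Self-contained sketch of the UrlConfig model from the diff above (pydantic assumed).
from typing import Any, Dict, Optional
from pydantic import BaseModel, Field

class UrlConfig(BaseModel):
    url: str = Field(..., description="Request URL")
    params: Optional[Dict[str, Any]] = Field(default=None, description="Request parameters")
    method: str = Field(default="GET", description="Request method")
    headers: Optional[Dict[str, str]] = Field(default=None, description="Request headers")

    class Config:
        # Allow arbitrary types
        arbitrary_types_allowed = True

# Usage: only url is required; method falls back to its "GET" default.
config = UrlConfig(
    url="https://example.com/news",
    params={"page": 1},
    headers={"User-Agent": "news-crawler/0.1"},
)
print(config.method)  # GET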
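
Since the last hunk deletes the abstract crawl() hook while parse_news_detail() stays abstract, a concrete crawler now only has to implement the detail parser. A minimal sketch under stated assumptions: BaseCrawler's real constructor and fetch helpers are not visible in this diff, and the NewsItem fields used below (title, url) are hypothetical stand-ins:

# Hypothetical concrete crawler; BaseCrawler is reduced here to the one
# abstract method the diff still shows, and NewsItem to two assumed fields.
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional

import requests
from bs4 import BeautifulSoup

@dataclass
class NewsItem:
    title: str
    url: str

class BaseCrawler(ABC):
    @abstractmethod
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        ...

class ExampleNewsCrawler(BaseCrawler):
    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
        except requests.RequestException:
            # Mirror the return-None-on-failure pattern used in the base class
            return None
        soup = BeautifulSoup(resp.text, "html.parser")
        title_tag = soup.find("h1")
        if title_tag is None:
            return None
        return NewsItem(title=title_tag.get_text(strip=True), url=url)

Returning None on both network and parse failures keeps the caller's contract identical to the Optional[NewsItem] signature shown in the diff.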