Search keyword crawler

2025-11-12 16:10:34 +08:00
parent 7be02fe396
commit 675e6da7d7
37 changed files with 3382 additions and 572 deletions


@@ -66,7 +66,7 @@ class BaseCrawler(ABC):
         self.session.headers.update(config.headers)
         logger.info(f"Initializing crawler: {self.__class__.__name__}")
 
-    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
+    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, **kwargs) -> Optional[requests.Response]:
         """
         Send an HTTP request
@@ -74,6 +74,7 @@ class BaseCrawler(ABC):
             url: request URL
             method: request method
             data: request payload
+            headers: extra request headers, merged with the defaults (extras take precedence)
             **kwargs: other request parameters
 
         Returns:
@@ -82,11 +83,20 @@ class BaseCrawler(ABC):
         for attempt in range(self.config.retry_times):
             try:
                 logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")
+                # Merge default headers with caller-supplied headers; caller values override defaults
+                request_headers = dict(self.config.headers or {})
+                if headers:
+                    request_headers.update(headers)
+                # If kwargs unexpectedly contains headers, merge it in and remove it to avoid passing the argument twice
+                extra_headers = kwargs.pop("headers", None)
+                if extra_headers:
+                    request_headers.update(extra_headers)
                 response = self.session.request(
                     method=method,
                     url=url,
-                    headers=self.config.headers,
+                    headers=request_headers,
                     data=data,
                     timeout=self.config.timeout,
                     proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
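
For reference, a minimal standalone sketch of the precedence rule this hunk implements. The merge_headers helper and the literal header values below are illustrative only, not part of the commit; what it mirrors from the diff is the merge order: defaults from config.headers first, then the explicit headers argument, then any headers entry that slipped into the pass-through kwargs, with later sources winning on key collisions.

    from typing import Any, Dict, Optional


    def merge_headers(
        default_headers: Optional[Dict[str, str]],
        headers: Optional[Dict[str, str]] = None,
        kwargs: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, str]:
        # Start from the configured defaults (may be None in the real config)
        request_headers = dict(default_headers or {})
        # The explicit `headers` argument overrides defaults on key collisions
        if headers:
            request_headers.update(headers)
        # Defensive step from the diff: if a headers entry sits in the
        # pass-through kwargs, fold it in and remove it so the underlying
        # request call never receives the argument twice
        if kwargs:
            extra = kwargs.pop("headers", None)
            if extra:
                request_headers.update(extra)
        return request_headers


    defaults = {"User-Agent": "base-crawler/1.0", "Accept": "text/html"}
    merged = merge_headers(defaults, headers={"Accept": "application/json"})
    # Caller-supplied Accept wins; untouched defaults survive
    assert merged == {"User-Agent": "base-crawler/1.0", "Accept": "application/json"}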