Search keyword crawler
@@ -66,7 +66,7 @@ class BaseCrawler(ABC):
         self.session.headers.update(config.headers)
 
         logger.info(f"Initializing crawler: {self.__class__.__name__}")
 
-    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, **kwargs) -> Optional[requests.Response]:
+    def fetch(self, url: str, method: str = "GET", data: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, **kwargs) -> Optional[requests.Response]:
         """
         Send an HTTP request.
@@ -74,6 +74,7 @@ class BaseCrawler(ABC):
             url: Request URL
             method: Request method
             data: Request payload
+            headers: Extra request headers, merged with the defaults (extras take precedence)
             **kwargs: Other request parameters
 
         Returns:
@@ -82,11 +83,20 @@ class BaseCrawler(ABC):
         for attempt in range(self.config.retry_times):
             try:
                 logger.info(f"Requesting URL: {url} (attempt {attempt + 1}/{self.config.retry_times})")
 
+                # Merge default headers with caller-supplied headers (caller overrides defaults)
+                request_headers = dict(self.config.headers or {})
+                if headers:
+                    request_headers.update(headers)
+                # If kwargs unexpectedly contains headers, fold it in and remove it to avoid a duplicate keyword argument
+                extra_headers = kwargs.pop("headers", None)
+                if extra_headers:
+                    request_headers.update(extra_headers)
+
                 response = self.session.request(
                     method=method,
                     url=url,
-                    headers=self.config.headers,
+                    headers=request_headers,
                     data=data,
                     timeout=self.config.timeout,
                     proxies={'http': self.config.proxy, 'https': self.config.proxy} if self.config.proxy else None,
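
For reference, a minimal standalone sketch of the merge precedence this commit introduces: config defaults come first, the explicit headers argument overrides them, and a stray "headers" key in **kwargs is popped and folded in last so session.request never receives the keyword twice. The merge_headers helper and the sample header values below are illustrative, not part of the crawler; only the merge order mirrors the diff above.

# Standalone sketch of the header-merge precedence from the diff above.
# merge_headers is a hypothetical helper, not a method of BaseCrawler.
from typing import Any, Dict, Optional

def merge_headers(defaults: Optional[Dict[str, str]],
                  headers: Optional[Dict[str, str]],
                  kwargs: Dict[str, Any]) -> Dict[str, str]:
    merged = dict(defaults or {})        # start from the configured defaults
    if headers:
        merged.update(headers)           # explicit argument overrides defaults
    extra = kwargs.pop("headers", None)  # remove a stray headers entry from kwargs...
    if extra:
        merged.update(extra)             # ...but still honour its values
    return merged

# The explicit argument wins over the default User-Agent; the stray kwargs
# entry is consumed so it cannot reach session.request a second time.
kwargs: Dict[str, Any] = {"headers": {"X-Debug": "1"}}
merged = merge_headers({"User-Agent": "base/1.0", "Accept": "*/*"},
                       {"User-Agent": "custom/2.0"},
                       kwargs)
assert merged == {"User-Agent": "custom/2.0", "Accept": "*/*", "X-Debug": "1"}
assert "headers" not in kwargs

Popping "headers" out of kwargs, rather than merely ignoring it, is what prevents a TypeError from session.request(..., headers=request_headers, **kwargs) when a caller passes headers positionally inside kwargs by mistake.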