# 新华网 (Xinhuanet) crawler

import json
import re
from datetime import datetime, timedelta
from typing import List, Optional
from urllib.parse import urlparse, urlencode

import chardet
from bs4.element import NavigableString
from loguru import logger

from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig


class XhwCrawler(BaseCrawler):

    def __init__(self):
        """Initialize the Xinhuanet (新华网) crawler."""
        config = CrawlerConfig(
            base_url="https://xhsz.news.cn/",
            urls={
                "search": UrlConfig(
                    url="https://xhsz.news.cn/s",
                    method="GET",
                    params={
                        "k": "",
                        "action": "",
                        "page": 1
                    },
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
                        'Accept': 'application/json, text/plain, */*',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Content-Type': 'application/json;charset=UTF-8'
                    }
                ),
            },
        )
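        # Illustration: with a hypothetical keyword "数字经济", the first search
        # request built from this config (see search() below) would be:
        #   https://xhsz.news.cn/s?k=%E6%95%B0%E5%AD%97%E7%BB%8F%E6%B5%8E&action=news&page=1
        # urlencode() percent-encodes the keyword; action and page are filled in later.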
        super().__init__(config)
        # Maps the site's search-category labels to their API "action" values.
        self.search_action_map = {
            "全部": "index",    # "All"
            "热点发布": "news"  # "Hot releases"
        }
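        # Illustrative lookup (assumption: callers resolve a category label
        # before calling search()):
        #   action = self.search_action_map.get("热点发布", "news")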

    def search(self, key: str, total: int = 10, action: str = "news") -> ResultDomain:
        resultDomain = ResultDomain()
        news_list = []
        resultDomain.dataList = news_list

        # Look up the search URL configuration
        search_config = self.config.urls.get("search")
        if not search_config:
            logger.error("Search URL config not found")
            resultDomain.code = 0
            resultDomain.message = "Search URL config not found"
            resultDomain.success = False
            return resultDomain

        pagesize = 10
        # Prepare the search parameters
        search_data = search_config.params.copy()
        search_data["k"] = key
        search_data["action"] = action

        # Ceiling division so a partial last page (e.g. total=15) is still fetched
        num_pages = (total + pagesize - 1) // pagesize
        for page in range(1, num_pages + 1):
            search_data["page"] = page
            page_url = search_config.url + "?" + urlencode(search_data)
            # parse_html is assumed to fetch and parse the page, returning the
            # extracted items for this result page; collect them into dataList.
            items = self.parse_html(page_url)
            if items:
                news_list.extend(items)

        resultDomain.code = 0
        resultDomain.message = "Search succeeded"
        resultDomain.success = True
        return resultDomain
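

# Usage sketch (illustrative; assumes ResultDomain exposes the `success`,
# `message`, and `dataList` attributes set above).
if __name__ == "__main__":
    crawler = XhwCrawler()
    result = crawler.search("数字经济", total=20, action="news")
    if result.success:
        logger.info(f"Fetched {len(result.dataList)} items")
    else:
        logger.error(f"Search failed: {result.message}")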