# Xinhuanet (新华网) crawler

from typing import List, Optional
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse, urlencode
import json


class XhwCrawler(BaseCrawler):
    def __init__(self):
        """Initialize the Xinhuanet crawler."""
        config = CrawlerConfig(
            base_url="https://xhsz.news.cn/",
            urls={
                "search": UrlConfig(
                    url="https://xhsz.news.cn/s",
                    method="GET",
                    params={
                        "k": "",
                        "action": "",
                        "page": 1
                    },
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
                        'Accept': 'application/json, text/plain, */*',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Content-Type': 'application/json;charset=UTF-8'
                    }
                ),
            },
        )
        super().__init__(config)
        # Map human-readable channel names to the site's "action" query values.
        self.search_action_map = {
            "全部": "index",
            "热点发布": "news"
        }

    def search(self, key: str, total=10, action="news") -> ResultDomain:
        """Search Xinhuanet for `key`, fetching up to `total` results page by page."""
        resultDomain = ResultDomain()
        news_list = []
        resultDomain.dataList = news_list

        # Look up the search URL configuration.
        search_config = self.config.urls.get("search")
        if not search_config:
            logger.error("未找到搜索URL配置")
            resultDomain.code = 0
            resultDomain.message = "未找到搜索URL配置"
            resultDomain.success = False
            return resultDomain

        page_size = 10

        # Prepare the search parameters.
        search_data = search_config.params.copy()
        search_data["k"] = key
        search_data["action"] = action

        # Walk through the result pages; round up so a partial final page is still fetched.
        total_pages = (total + page_size - 1) // page_size
        for page in range(1, total_pages + 1):
            search_data["page"] = page
            page_url = search_config.url + "?" + urlencode(search_data)
            # parse_html (inherited from BaseCrawler) is expected to fetch and parse this results page.
            self.parse_html(page_url)

        resultDomain.code = 0
        resultDomain.message = "搜索成功"
        resultDomain.success = True
        return resultDomain
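

# --- Usage sketch (illustrative only, not part of the crawler) ---------------
# A minimal example of how XhwCrawler.search might be invoked. Assumptions:
# ResultDomain exposes success/message/dataList as used above, and
# BaseCrawler.parse_html handles fetching and parsing; the keyword below is
# only a placeholder.
if __name__ == "__main__":
    crawler = XhwCrawler()
    result = crawler.search("数字经济", total=20, action="news")
    if result.success:
        # Print whatever items the parsing step collected into dataList.
        for item in result.dataList:
            print(item)
    else:
        print(result.message)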