# Xinhua Net (新华网) crawler.
from typing import List, Optional

from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse
import json


class XhwCrawler(BaseCrawler):
    """Crawler for the Xinhua Net site at xhsz.news.cn.

    Registers the site's search endpoint with :class:`BaseCrawler` via a
    :class:`CrawlerConfig`; the actual fetching/parsing is expected to be
    driven through the base class.
    """

    def __init__(self):
        """Initialize the Xinhua Net crawler.

        Configures the POST search endpoint (``/s``) with browser-like
        headers so the site treats requests as regular browser traffic.
        NOTE: the original docstring said "People's Daily crawler" — that
        was a copy-paste error; this class targets xhsz.news.cn (Xinhua).
        """
        config = CrawlerConfig(
            base_url="https://xhsz.news.cn/",
            urls={
                "search": UrlConfig(
                    url="https://xhsz.news.cn/s",
                    method="POST",
                    params={
                        # "k" is the search keyword; left empty here and
                        # presumably filled in per query — TODO confirm
                        # against BaseCrawler's request logic.
                        "k": "",
                        "action": "index",
                    },
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
                        'Accept': 'application/json, text/plain, */*',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Content-Type': 'application/json;charset=UTF-8'
                    }
                ),
            },
        )
        super().__init__(config)

    def search(self, key: str, total: int) -> ResultDomain:
        """Search news matching *key*, collecting up to *total* items.

        TODO(review): not implemented yet — this currently returns ``None``,
        which contradicts the declared ``ResultDomain`` return type; callers
        should not rely on it until a real implementation lands.
        """
        pass