This commit is contained in:
2025-11-19 16:04:50 +08:00
parent 6537ebeacb
commit 0e436e31f3
27 changed files with 56 additions and 2170 deletions

View File

@@ -0,0 +1,41 @@
# 新华网爬虫
from typing import List, Optional
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse
import json
class XhwCrawler(BaseCrawler):
    """Crawler for the Xinhua site rooted at ``https://xhsz.news.cn/``.

    The constructor only assembles static request settings (base URL plus a
    single ``"search"`` endpoint) and passes them to ``BaseCrawler``.
    """

    def __init__(self):
        """Build the Xinhua crawler configuration and initialise the base class."""
        # Headers attached to the search endpoint; the Content-Type header
        # declares a JSON body.
        search_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json;charset=UTF-8',
        }

        # Default parameters for the search endpoint. "k" is left empty here;
        # presumably it is the keyword slot filled in per query -- TODO confirm.
        search_params = {
            "k": "",
            "action": "index",
        }

        search_endpoint = UrlConfig(
            url="https://xhsz.news.cn/s",
            method="POST",
            params=search_params,
            headers=search_headers,
        )

        config = CrawlerConfig(
            base_url="https://xhsz.news.cn/",
            urls={"search": search_endpoint},
        )
        super().__init__(config)

    def search(self, key: str, total: int) -> ResultDomain:
        """Placeholder for keyword search; not implemented yet.

        Args:
            key: Presumably the search keyword -- TODO confirm once implemented.
            total: Presumably the number of results wanted -- TODO confirm.

        Returns:
            Currently always ``None``: the body contains no search logic, so
            the ``ResultDomain`` annotation is not honoured yet.
        """
        return None