爬虫
This commit is contained in:
41
schoolNewsCrawler/crawler/xhw/XhwCrawler.py
Normal file
41
schoolNewsCrawler/crawler/xhw/XhwCrawler.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# 新华网爬虫
|
||||
from typing import List, Optional
|
||||
from core.ResultDomain import ResultDomain
|
||||
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
|
||||
from loguru import logger
|
||||
import re
|
||||
import chardet
|
||||
from datetime import datetime, timedelta
|
||||
from bs4.element import NavigableString
|
||||
from urllib.parse import urlparse
|
||||
import json
|
||||
|
||||
class XhwCrawler(BaseCrawler):
    """Crawler for the Xinhua site served from ``xhsz.news.cn``.

    The class only assembles a ``CrawlerConfig`` describing the site's
    search endpoint and passes it to ``BaseCrawler``; the actual search
    logic is not implemented yet.
    """

    def __init__(self):
        """Build the Xinhua crawler configuration and initialise the base class."""
        # Browser-like request headers sent with the search request.
        search_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json;charset=UTF-8',
        }

        # Default search parameters. "k" starts out empty; presumably it is
        # filled with the search keyword before the request is sent -- confirm
        # against BaseCrawler / the search implementation.
        search_params = {
            "k": "",
            "action": "index",
        }

        search_endpoint = UrlConfig(
            url="https://xhsz.news.cn/s",
            method="POST",
            params=search_params,
            headers=search_headers,
        )

        site_config = CrawlerConfig(
            base_url="https://xhsz.news.cn/",
            urls={"search": search_endpoint},
        )
        super().__init__(site_config)

    def search(self, key: str, total: int) -> ResultDomain:
        """Search the site for ``key``.

        Args:
            key: Search keyword.
            total: Presumably the number of results wanted -- confirm once
                the method is implemented.

        Returns:
            Annotated as ``ResultDomain``, but the method is currently a stub
            and returns ``None``.
        """
        # TODO: not implemented yet; callers currently receive None.
        return None
|
||||
|
||||
Reference in New Issue
Block a user