爬虫

2025-11-19 16:04:50 +08:00
parent 6537ebeacb
commit 0e436e31f3
27 changed files with 56 additions and 2170 deletions
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -0,0 +1,41 @@
+# 新华网爬虫
+from typing import List, Optional
+from core.ResultDomain import ResultDomain
+from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
+from loguru import logger
+import re
+import chardet
+from datetime import datetime, timedelta
+from bs4.element import NavigableString
+from urllib.parse import urlparse
+import json
+
+class XhwCrawler(BaseCrawler):  # Crawler for the Xinhua site (xhsz.news.cn), built on BaseCrawler
+    def __init__(self):
+
+        """Initialize the Xinhua crawler: build its CrawlerConfig and hand it to BaseCrawler."""
+        config = CrawlerConfig(
+            base_url="https://xhsz.news.cn/",
+            urls={
+                "search": UrlConfig(
+                    url="https://xhsz.news.cn/s",
+                    method="POST",
+                    params={
+                        "k": "",  # left empty here; presumably the search keyword, filled in per request - TODO confirm
+                        "action": "index",
+                    },
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
+                        'Accept': 'application/json, text/plain, */*',
+                        'Accept-Language': 'zh-CN,zh;q=0.9',
+                        'Content-Type': 'application/json;charset=UTF-8'
+                    }
+                ),
+
+            },            
+        )
+        super().__init__(config)
+
+    def search(self, key:str, total: int) -> ResultDomain:  # NOTE(review): unimplemented stub
+        pass  # TODO: implement; currently returns None despite the ResultDomain annotation
+