人民日报增加域

2025-11-19 16:41:41 +08:00
parent 0e436e31f3
commit 1ad118b0d3
7 changed files with 56 additions and 28 deletions
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -7,22 +7,22 @@ import re
 import chardet
 from datetime import datetime, timedelta
 from bs4.element import NavigableString
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlencode
 import json

 class XhwCrawler(BaseCrawler):
    def __init__(self):
-
        """初始化人民日报爬虫"""
        config = CrawlerConfig(
            base_url="https://xhsz.news.cn/",
            urls={
                "search": UrlConfig(
                    url="https://xhsz.news.cn/s",
-                    method="POST",
+                    method="GET",
                    params={
                        "k": "",
-                        "action": "index",
+                        "action": "",
+                        "page": 1
                    },
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
@@ -35,7 +35,34 @@ class XhwCrawler(BaseCrawler):
            },            
        )
        super().__init__(config)
+        self.search_action_map = {
+          "全部": "index",
+          "热点发布": "news"
+        }

-    def search(self, key:str, total: int) -> ResultDomain:
-        pass
-    
+    def search(self, key:str, total=10, action="news") -> ResultDomain:
+        """Request ceil(total / pagesize) search pages for `key` with the given `action`; return a ResultDomain."""
+        resultDomain = ResultDomain()
+        resultDomain.dataList = []
+        # Look up the search endpoint configuration; report failure if it is absent
+        search_config = self.config.urls.get("search")
+        if not search_config:
+            logger.error("未找到搜索URL配置")
+            resultDomain.code = 0
+            resultDomain.message = "未找到搜索URL配置"
+            resultDomain.success = False
+            return resultDomain
+        pagesize = 10  # assumes the site returns 10 hits per page — TODO confirm
+        # Start from the configured default params, then override per call
+        search_data = search_config.params.copy()
+        search_data["k"] = key
+        search_data["action"] = action
+        # Ceiling division so a partial last page (e.g. total=5 or total=15) is still requested
+        for page in range(1, -(-total // pagesize) + 1):
+            search_data["page"] = page
+            page_url = search_config.url + "?" + urlencode(search_data)
+            self.parse_html(page_url)  # NOTE(review): result is discarded, so dataList stays empty — confirm
+        resultDomain.code = 0
+        resultDomain.message = "搜索成功"
+        resultDomain.success = True
+        return resultDomain