From 1ad118b0d3a1419134e13f61dce28a860359cca5 Mon Sep 17 00:00:00 2001
From: wangys <3401275564@qq.com>
Date: Wed, 19 Nov 2025 16:41:41 +0800
Subject: [PATCH] People's Daily: add domain
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .vscode/settings.json                         |  3 ++
 schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py | 13 +++---
 .../crawler/rmrb/RmrbHotPoint.py              |  2 +-
 schoolNewsCrawler/crawler/rmrb/RmrbSearch.py  | 23 +++++------
 .../crawler/rmrb/RmrbTrending.py              |  2 +-
 schoolNewsCrawler/crawler/rmrb/__init__.py    |  0
 schoolNewsCrawler/crawler/xhw/XhwCrawler.py   | 41 +++++++++++++++----
 7 files changed, 56 insertions(+), 28 deletions(-)
 create mode 100644 schoolNewsCrawler/crawler/rmrb/__init__.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 31ef6d3..2b0ebce 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -8,6 +8,9 @@
     "[java]":{
         "editor.tabSize": 4
     },
+    "[python]":{
+        "editor.tabSize": 4
+    },
     "maven.view": "hierarchical",
     "java.compile.nullAnalysis.mode": "automatic",
     // Terminal encoding settings
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py b/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
index 6db7d40..26db54a 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
@@ -6,7 +6,7 @@ from loguru import logger
 import re
 import chardet
 from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
 from urllib.parse import urlparse
 import json
 
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
             "politics": self.parse_base_news_detail,
             "finance": self.parse_base_news_detail,
             "cpc": self.parse_cpc_news_detail,
+            "theory": self.parse_cpc_news_detail,
         }
 
     def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
@@ -518,13 +519,11 @@
 
         # Iterate over all direct children of show_text (preserving order)
         for child in content_div.children:
-            # Skip pure text nodes (e.g. newlines, whitespace)
-            if isinstance(child, NavigableString):
+            # Only process Tag nodes; skip text nodes, comments, etc.
+            if not isinstance(child, Tag):
                 continue
-
+
             tag_name = child.name
-            if tag_name is None:
-                continue
 
             # Case 1: detect a video container (by id or inner structure)
             video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@
                 continue
 
             # Case 2: check for People's Daily Online's showPlayer script (dynamic video)
-            script_tags = child.find_all('script', string=True)
+            script_tags = child.find_all('script')
 
             video_src = None
             poster_url = None
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py b/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
index c2811f3..ed5cb8f 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
@@ -11,7 +11,7 @@ import sys
 from pathlib import Path
 
 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py b/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
index 5aa86f5..a3d2ab7 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
@@ -10,8 +10,8 @@ import json
 import sys
 from pathlib import Path
 
-# Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
@@ -89,11 +89,13 @@
             "data": None,
             "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
         }
-        result = None
-        with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
-            result = json.load(f)
-        print(result)
-        output = result
+        # result = None
+        # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+        #     result = json.load(f)
+        # print(result)
+        # output = result
+
+
         if output_file:
             output_path = Path(output_file)
             output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -101,13 +103,10 @@
                 json.dump(output, f, ensure_ascii=False, indent=2)
             logger.info(f"Results saved to: {output_file}")
 
-        crawler.close()
-        # sys.exit(0 if result.success else 1)
+        sys.exit(0 if result.success else 1)
 
         # print(json.dumps(output, ensure_ascii=False, indent=2))
-
-        sys.exit(0 if result["success"] else 1)
-
+        # sys.exit(0 if result["success"] else 1)
     except Exception as e:
         logger.error(f"Execution failed: {str(e)}")
         error_output = {
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py b/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
index 7e8e88d..32dc690 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 from pathlib import Path
 
 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
diff --git a/schoolNewsCrawler/crawler/rmrb/__init__.py b/schoolNewsCrawler/crawler/rmrb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
index c73b3d6..b6b64e4 100644
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -7,22 +7,22 @@ import re
 import chardet
 from datetime import datetime, timedelta
 from bs4.element import NavigableString
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlencode
 import json
 
 
 class XhwCrawler(BaseCrawler):
     def __init__(self):
-        """Initialize the People's Daily crawler"""
         config = CrawlerConfig(
             base_url="https://xhsz.news.cn/",
             urls={
                 "search": UrlConfig(
                     url="https://xhsz.news.cn/s",
-                    method="POST",
+                    method="GET",
                     params={
                         "k": "",
-                        "action": "index",
+                        "action": "",
+                        "page": 1
                     },
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
@@ -35,7 +35,34 @@
                 },
             )
         super().__init__(config)
+        self.search_action_map = {
+            "全部": "index",   # "All" tab
+            "热点发布": "news"  # "Hot releases" tab
+        }
 
-    def search(self, key:str, total: int) -> ResultDomain:
-        pass
-        
\ No newline at end of file
+    def search(self, key: str, total: int = 10, action: str = "news") -> ResultDomain:
+        resultDomain = ResultDomain()
+        news_list = []
+        resultDomain.dataList = news_list
+        # Fetch the search URL config
+        search_config = self.config.urls.get("search")
+        if not search_config:
+            logger.error("Search URL config not found")
+            resultDomain.code = 0
+            resultDomain.message = "Search URL config not found"
+            resultDomain.success = False
+            return resultDomain
+        pagesize = 10
+        # Prepare search parameters
+        search_data = search_config.params.copy()
+        search_data["k"] = key
+        search_data["action"] = action
+
+        for page in range(1, -(-total // pagesize) + 1):  # ceil(total/pagesize) pages
+            search_data["page"] = page
+            page_url = search_config.url + "?" + urlencode(search_data)
+            self.parse_html(page_url)
+        resultDomain.code = 0
+        resultDomain.message = "Search succeeded"
+        resultDomain.success = True
+        return resultDomain
\ No newline at end of file