From 1ad118b0d3a1419134e13f61dce28a860359cca5 Mon Sep 17 00:00:00 2001
From: wangys <3401275564@qq.com>
Date: Wed, 19 Nov 2025 16:41:41 +0800
Subject: [PATCH] People's Daily: add domain
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .vscode/settings.json                         |  3 ++
 schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py | 13 +++---
 .../crawler/rmrb/RmrbHotPoint.py              |  2 +-
 schoolNewsCrawler/crawler/rmrb/RmrbSearch.py  | 23 +++++------
 .../crawler/rmrb/RmrbTrending.py              |  2 +-
 schoolNewsCrawler/crawler/rmrb/__init__.py    |  0
 schoolNewsCrawler/crawler/xhw/XhwCrawler.py   | 41 +++++++++++++++----
 7 files changed, 56 insertions(+), 28 deletions(-)
 create mode 100644 schoolNewsCrawler/crawler/rmrb/__init__.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 31ef6d3..2b0ebce 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -8,6 +8,9 @@
     "[java]":{
         "editor.tabSize": 4
     },
+    "[python]":{
+        "editor.tabSize": 4
+    },
     "maven.view": "hierarchical",
     "java.compile.nullAnalysis.mode": "automatic",
     // Terminal encoding settings
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py b/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
index 6db7d40..26db54a 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
@@ -6,7 +6,7 @@ from loguru import logger
 import re
 import chardet
 from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
 from urllib.parse import urlparse
 import json
 
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
             "politics": self.parse_base_news_detail,
             "finance": self.parse_base_news_detail,
             "cpc": self.parse_cpc_news_detail,
+            "theory": self.parse_cpc_news_detail,
         }
 
     def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
@@ -518,13 +519,11 @@
 
         # Iterate over all direct children of show_text (preserving order)
         for child in content_div.children:
-            # Skip pure text nodes (e.g. newlines, whitespace)
-            if isinstance(child, NavigableString):
+            # Only process Tag nodes; skip text nodes, comments, etc.
+            if not isinstance(child, Tag):
                 continue
-
+
             tag_name = child.name
-            if tag_name is None:
-                continue
 
             # Case 1: detect a video container (by id or inner structure)
             video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@
                 continue
 
             # Case 2: check for People's Daily Online's showPlayer script (dynamic video)
-            script_tags = child.find_all('script', string=True)
+            script_tags = child.find_all('script')
 
             video_src = None
             poster_url = None
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py b/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
index c2811f3..ed5cb8f 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbHotPoint.py
@@ -11,7 +11,7 @@ import sys
 from pathlib import Path
 
 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py b/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
index 5aa86f5..a3d2ab7 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbSearch.py
@@ -10,8 +10,8 @@ import json
 import sys
 from pathlib import Path
 
-# Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
@@ -89,11 +89,13 @@
             "data": None,
             "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
         }
-        result = None
-        with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
-            result = json.load(f)
-        print(result)
-        output = result
+        # result = None
+        # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+        #     result = json.load(f)
+        # print(result)
+        # output = result
+
+
         if output_file:
             output_path = Path(output_file)
             output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -101,13 +103,10 @@
                 json.dump(output, f, ensure_ascii=False, indent=2)
             logger.info(f"Results saved to: {output_file}")
 
-        crawler.close()
-        # sys.exit(0 if result.success else 1)
+        sys.exit(0 if result.success else 1)
 
         # print(json.dumps(output, ensure_ascii=False, indent=2))
-
-        sys.exit(0 if result["success"] else 1)
-
+        # sys.exit(0 if result["success"] else 1)
     except Exception as e:
         logger.error(f"Execution failed: {str(e)}")
         error_output = {
diff --git a/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py b/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
index 7e8e88d..32dc690 100644
--- a/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
+++ b/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 from pathlib import Path
 
 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
diff --git a/schoolNewsCrawler/crawler/rmrb/__init__.py b/schoolNewsCrawler/crawler/rmrb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
index c73b3d6..b6b64e4 100644
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -7,22 +7,22 @@ import re
 import chardet
 from datetime import datetime, timedelta
 from bs4.element import NavigableString
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlencode
 import json
 
 
 class XhwCrawler(BaseCrawler):
     def __init__(self):
-        """Initialize the People's Daily crawler"""
         config = CrawlerConfig(
             base_url="https://xhsz.news.cn/",
             urls={
                 "search": UrlConfig(
                     url="https://xhsz.news.cn/s",
-                    method="POST",
+                    method="GET",
                     params={
                         "k": "",
-                        "action": "index",
+                        "action": "",
+                        "page": 1
                     },
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
@@ -35,7 +35,34 @@
                 },
             )
         super().__init__(config)
+        self.search_action_map = {
+            "全部": "index",   # "All" tab
+            "热点发布": "news"  # "Hot releases" tab
+        }
 
-    def search(self, key:str, total: int) -> ResultDomain:
-        pass
-        
\ No newline at end of file
+    def search(self, key: str, total: int = 10, action: str = "news") -> ResultDomain:
+        resultDomain = ResultDomain()
+        news_list = []
+        resultDomain.dataList = news_list
+        # Fetch the search URL config
+        search_config = self.config.urls.get("search")
+        if not search_config:
+            logger.error("Search URL config not found")
+            resultDomain.code = 0
+            resultDomain.message = "Search URL config not found"
+            resultDomain.success = False
+            return resultDomain
+        pagesize = 10
+        # Prepare search parameters
+        search_data = search_config.params.copy()
+        search_data["k"] = key
+        search_data["action"] = action
+
+        for page in range(1, -(-total // pagesize) + 1):  # ceil(total/pagesize) pages
+            search_data["page"] = page
+            page_url = search_config.url + "?" + urlencode(search_data)
+            self.parse_html(page_url)
+        resultDomain.code = 0
+        resultDomain.message = "Search succeeded"
+        resultDomain.success = True
+        return resultDomain
\ No newline at end of file