People's Daily crawler: add domains

2025-11-19 16:41:41 +08:00
parent 0e436e31f3
commit 1ad118b0d3
7 changed files with 56 additions and 28 deletions

View File

@@ -8,6 +8,9 @@
"[java]":{ "[java]":{
"editor.tabSize": 4 "editor.tabSize": 4
}, },
"[python]":{
"editor.tabSize": 4
},
"maven.view": "hierarchical", "maven.view": "hierarchical",
"java.compile.nullAnalysis.mode": "automatic", "java.compile.nullAnalysis.mode": "automatic",
// 终端编码设置 // 终端编码设置

View File

@@ -6,7 +6,7 @@ from loguru import logger
 import re
 import chardet
 from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
 from urllib.parse import urlparse
 import json
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
"politics": self.parse_base_news_detail, "politics": self.parse_base_news_detail,
"finance": self.parse_base_news_detail, "finance": self.parse_base_news_detail,
"cpc": self.parse_cpc_news_detail, "cpc": self.parse_cpc_news_detail,
"theory": self.parse_cpc_news_detail,
} }
def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain: def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
@@ -518,13 +519,11 @@ class RmrbCrawler(BaseCrawler):
         # Iterate over all direct children of show_text (preserving order)
         for child in content_div.children:
-            # Skip text nodes (e.g. line breaks, spaces)
-            if isinstance(child, NavigableString):
+            # Only handle Tag nodes; skip text nodes, comments, etc.
+            if not isinstance(child, Tag):
                 continue

             tag_name = child.name
-            if tag_name is None:
-                continue

             # Case 1: detect whether this is a video container (by id pattern or inner structure)
             video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@ class RmrbCrawler(BaseCrawler):
                 continue

             # Case 2: check whether it contains a people.cn showPlayer script (dynamic video)
-            script_tags = child.find_all('script', string=True)
+            script_tags = child.find_all('script')
             video_src = None
             poster_url = None
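
Reviewer note on the child-filtering change above: a minimal standalone sketch (not the crawler itself) of why the single isinstance(child, Tag) check replaces both the NavigableString test and the later "if tag_name is None" guard. The sample HTML below is illustrative only.

# Sketch only: keep element nodes, drop everything else in one check.
from bs4 import BeautifulSoup
from bs4.element import Tag

sample_html = "<div id='show_text'><p>text</p>\n<!-- comment --><video src='v.mp4'></video></div>"
content_div = BeautifulSoup(sample_html, "html.parser").find("div", id="show_text")

for child in content_div.children:
    if not isinstance(child, Tag):   # skips the "\n" text node and the HTML comment
        continue
    print(child.name)                # every surviving child is a Tag, so .name and .find() are safe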

View File

@@ -11,7 +11,7 @@ import sys
 from pathlib import Path

 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
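
The same one-level-deeper path adjustment is repeated in the two test scripts below, so all three scripts now resolve the project root from three directory levels up. A minimal sketch of the idea, assuming a layout such as <project root>/tests/rmrb/<script>.py (the exact directory names are assumptions):

# Sketch only: make the assumed project root importable from a nested test script.
import sys
from pathlib import Path

project_root = Path(__file__).resolve().parents[2]    # equivalent to parent.parent.parent
sys.path.insert(0, str(project_root))

from crawler.rmrb.RmrbCrawler import RmrbCrawler       # resolvable once the root is on sys.path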

View File

@@ -10,8 +10,8 @@ import json
 import sys
 from pathlib import Path

-# Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
@@ -89,11 +89,13 @@ def main():
"data": None, "data": None,
"dataList": [item.model_dump() for item in result.dataList] if result.dataList else [] "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
} }
result = None # result = None
with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f: # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
result = json.load(f) # result = json.load(f)
print(result) # print(result)
output = result # output = result
if output_file: if output_file:
output_path = Path(output_file) output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -101,13 +103,10 @@ def main():
                 json.dump(output, f, ensure_ascii=False, indent=2)
             logger.info(f"结果已保存到: {output_file}")

         crawler.close()
-        # sys.exit(0 if result.success else 1)
+        sys.exit(0 if result.success else 1)
         # print(json.dumps(output, ensure_ascii=False, indent=2))
-        # sys.exit(0 if result["success"] else 1)
-        sys.exit(0 if result["success"] else 1)
     except Exception as e:
         logger.error(f"执行失败: {str(e)}")
         error_output = {
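
Reviewer note: with the debug reload from output.json commented out, result is once again the ResultDomain returned by the crawler, so the exit code comes from result.success (attribute access) rather than result["success"]. A hedged sketch of the intended flow; the ResultDomain fields used here (success, dataList) are inferred from this diff, and run() is a hypothetical helper, not the real main().

# Sketch of the intended control flow, not the actual script.
import json
import sys

def run(crawler, key, output_file=None):
    result = crawler.search(key, total=10)               # returns a ResultDomain
    output = {
        "data": None,
        "dataList": [item.model_dump() for item in result.dataList] if result.dataList else [],
    }
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(output, f, ensure_ascii=False, indent=2)
    crawler.close()
    sys.exit(0 if result.success else 1)                  # result is an object here, not a dict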

View File

@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 from pathlib import Path

 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger

View File

@@ -7,22 +7,22 @@ import re
 import chardet
 from datetime import datetime, timedelta
 from bs4.element import NavigableString
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlencode
 import json

 class XhwCrawler(BaseCrawler):
     def __init__(self):
         """初始化人民日报爬虫"""
         config = CrawlerConfig(
             base_url="https://xhsz.news.cn/",
             urls={
                 "search": UrlConfig(
                     url="https://xhsz.news.cn/s",
-                    method="POST",
+                    method="GET",
                     params={
                         "k": "",
-                        "action": "index",
+                        "action": "",
+                        "page": 1
                     },
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
@@ -35,7 +35,34 @@ class XhwCrawler(BaseCrawler):
                 },
             )
         super().__init__(config)
+        self.search_action_map = {
+            "全部": "index",
+            "热点发布": "news"
+        }

-    def search(self, key:str, total: int) -> ResultDomain:
-        pass
+    def search(self, key:str, total=10, action="news") -> ResultDomain:
+        resultDomain = ResultDomain()
+        news_list = []
+        resultDomain.dataList = news_list
+        # Get the search URL configuration
+        search_config = self.config.urls.get("search")
+        if not search_config:
+            logger.error("未找到搜索URL配置")
+            resultDomain.code = 0
+            resultDomain.message = "未找到搜索URL配置"
+            resultDomain.success = False
+            return resultDomain
+        pagesize = 10
+        # Prepare the search parameters
+        search_data = search_config.params.copy()
+        search_data["k"] = key
+        search_data["action"] = action
+        for page in range(1, total//pagesize+1):
+            search_data["page"] = page
+            pageHtml = search_config.url + "?" + urlencode(search_data)
+            self.parse_html(pageHtml)
+        resultDomain.code = 0
+        resultDomain.message = "搜索成功"
+        resultDomain.success = True
+        return resultDomain
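
Reviewer note on the new XhwCrawler.search: it builds one GET URL per result page by url-encoding the copied params, which matches the switch from method="POST" to method="GET" in the config above. The committed loop uses total//pagesize+1 as its upper bound; the standalone sketch below is a small variation that rounds the page count up instead, and build_page_urls is a hypothetical helper, not part of the crawler.

# Sketch only: build one GET URL per result page by url-encoding the params.
from urllib.parse import urlencode

def build_page_urls(base_url, key, action="news", total=10, pagesize=10):
    pages = max(1, -(-total // pagesize))              # ceil(total / pagesize), at least one page
    urls = []
    for page in range(1, pages + 1):
        urls.append(base_url + "?" + urlencode({"k": key, "action": action, "page": page}))
    return urls

# e.g. two page URLs for 20 requested results
print(build_page_urls("https://xhsz.news.cn/s", "校园", total=20))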