人民日报 (People's Daily) crawler: add domain

2025-11-19 16:41:41 +08:00
parent 0e436e31f3
commit 1ad118b0d3
7 changed files with 56 additions and 28 deletions

View File

@@ -6,7 +6,7 @@ from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from bs4.element import NavigableString, Tag
from urllib.parse import urlparse
import json
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
"politics": self.parse_base_news_detail,
"finance": self.parse_base_news_detail,
"cpc": self.parse_cpc_news_detail,
"theory": self.parse_cpc_news_detail,
}
def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
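The new "theory" entry routes articles from that channel to the same parser as "cpc". A hypothetical sketch of how such a channel-to-parser map can be consulted; the function name, the map argument, and the hostname-based lookup are assumptions for illustration, not code taken from RmrbCrawler:

from urllib.parse import urlparse

def pick_detail_parser(url: str, parser_map: dict, default):
    # e.g. "http://theory.people.com.cn/n1/..." -> channel key "theory"
    channel = urlparse(url).hostname.split(".")[0]
    return parser_map.get(channel, default)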
@@ -518,13 +519,11 @@ class RmrbCrawler(BaseCrawler):
# Iterate over all direct children of show_text (preserving order)
for child in content_div.children:
# Skip text nodes (e.g. newlines, spaces)
if isinstance(child, NavigableString):
# Only process Tag nodes; skip text nodes, comments, etc.
if not isinstance(child, Tag):
continue
tag_name = child.name
if tag_name is None:
continue
# Case 1: check whether this is a video container (based on id pattern or inner structure)
video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@ class RmrbCrawler(BaseCrawler):
continue
# Case 2: check for people.cn's showPlayer script (dynamically loaded video)
script_tags = child.find_all('script', string=True)
script_tags = child.find_all('script')
video_src = None
poster_url = None
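A minimal, self-contained sketch of what the two changes above do: isinstance(child, Tag) keeps only element nodes (so the separate tag_name-is-None guard becomes unnecessary), and dropping string=True from find_all('script') also catches script tags that have a src attribute but no inline body. The HTML here is illustrative, not taken from the site:

from bs4 import BeautifulSoup
from bs4.element import Tag

html = """
<div class="show_text">
  <!-- layout comment -->
  <p>正文段落</p>
  <div id="playerVideo1"><video src="a.mp4"></video></div>
  <div><script src="player.js"></script><script>showPlayer();</script></div>
</div>
"""
content_div = BeautifulSoup(html, "html.parser").find("div", class_="show_text")

for child in content_div.children:
    # Keeps only element nodes; the whitespace strings and the comment are
    # skipped, and a Tag always has a .name, so no extra None check is needed.
    if not isinstance(child, Tag):
        continue
    video_tag = child.find("video") if child.name != "video" else child
    # Without string=True this also returns <script src=...> tags whose
    # body is empty, which the old string=True filter would have dropped.
    scripts = child.find_all("script")
    print(child.name, video_tag is not None, len(scripts))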

View File

@@ -11,7 +11,7 @@ import sys
from pathlib import Path
# Add project root directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from crawler.rmrb.RmrbCrawler import RmrbCrawler
from loguru import logger
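The path change in this and the following scripts points sys.path at the project root instead of the scripts' parent directory. A sketch of how the parent chain resolves, assuming the crawler package sits at the repository root and the script lives two directories below it (the exact layout and script name are assumptions):

import sys
from pathlib import Path

# Hypothetical layout:
#   schoolNewsCrawler/                      <- project root, contains crawler/
#       crawler/rmrb/RmrbCrawler.py
#       test/rmrb/run_search.py             <- this script
# Path(__file__).parent                -> .../test/rmrb
# Path(__file__).parent.parent         -> .../test
# Path(__file__).parent.parent.parent  -> .../schoolNewsCrawler (project root)
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from crawler.rmrb.RmrbCrawler import RmrbCrawler  # resolvable once the root is on sys.path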

View File

@@ -10,8 +10,8 @@ import json
import sys
from pathlib import Path
# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))
# Add project root directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from crawler.rmrb.RmrbCrawler import RmrbCrawler
from loguru import logger
@@ -89,11 +89,13 @@ def main():
"data": None,
"dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
}
result = None
with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
result = json.load(f)
print(result)
output = result
# result = None
# with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
# result = json.load(f)
# print(result)
# output = result
if output_file:
output_path = Path(output_file)
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -101,13 +103,10 @@ def main():
json.dump(output, f, ensure_ascii=False, indent=2)
logger.info(f"结果已保存到: {output_file}")
crawler.close()
# sys.exit(0 if result.success else 1)
sys.exit(0 if result.success else 1)
# print(json.dumps(output, ensure_ascii=False, indent=2))
sys.exit(0 if result["success"] else 1)
# sys.exit(0 if result["success"] else 1)
except Exception as e:
logger.error(f"执行失败: {str(e)}")
error_output = {
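The toggle between sys.exit(0 if result.success else 1) and the commented-out dict form exists because result is either the live ResultDomain or, when re-loaded from output.json, a plain dict. A small sketch of the difference, assuming ResultDomain is a Pydantic v2 model (consistent with model_dump above); the simplified model is illustrative:

import json
from pydantic import BaseModel

class ResultDomain(BaseModel):  # simplified stand-in for the real model
    success: bool = True
    message: str = ""

live = ResultDomain(success=True)
print(live.success)                 # attribute access on the model instance

# After a round-trip through JSON the data is a plain dict, so only the
# key-based form result["success"] works on it.
cached = json.loads(live.model_dump_json())
print(cached["success"])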

View File

@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
from pathlib import Path
# Add project root directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from crawler.rmrb.RmrbCrawler import RmrbCrawler
from loguru import logger

View File

@@ -7,22 +7,22 @@ import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse
from urllib.parse import urlparse, urlencode
import json
class XhwCrawler(BaseCrawler):
def __init__(self):
"""初始化人民日报爬虫"""
config = CrawlerConfig(
base_url="https://xhsz.news.cn/",
urls={
"search": UrlConfig(
url="https://xhsz.news.cn/s",
method="POST",
method="GET",
params={
"k": "",
"action": "index",
"action": "",
"page": 1
},
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
@@ -35,7 +35,34 @@ class XhwCrawler(BaseCrawler):
},
)
super().__init__(config)
self.search_action_map = {
"全部": "index",
"热点发布": "news"
}
def search(self, key:str, total: int) -> ResultDomain:
pass
def search(self, key:str, total=10, action="news") -> ResultDomain:
resultDomain = ResultDomain()
news_list = []
resultDomain.dataList = news_list
# Get the search URL configuration
search_config = self.config.urls.get("search")
if not search_config:
logger.error("未找到搜索URL配置")
resultDomain.code = 0
resultDomain.message = "未找到搜索URL配置"
resultDomain.success = False
return resultDomain
pagesize = 10
# Prepare the search parameters
search_data = search_config.params.copy()
search_data["k"] = key
search_data["action"] = action
for page in range(1, total//pagesize+1):
search_data["page"] = page
pageHtml = search_config.url + "?" + urlencode(search_data)
self.parse_html(pageHtml)
resultDomain.code = 0
resultDomain.message = "搜索成功"
resultDomain.success = True
return resultDomain
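An illustrative call into the new search path; the keyword is made up, and the channel label is resolved through search_action_map as defined above:

crawler = XhwCrawler()
# "热点发布" -> action "news"; "全部" -> action "index"
action = crawler.search_action_map["热点发布"]
result = crawler.search("教育", total=30, action=action)
print(result.success, result.message)

Note that range(1, total // pagesize + 1) only walks full pages: with pagesize = 10 and total = 25 it requests pages 1 and 2 and skips the partial third page; ceiling division, e.g. -(-total // pagesize), would cover it if that is the intent. The pages fetched via parse_html are also not yet appended to news_list, so dataList stays empty in this version.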