人民日报增加域

This commit is contained in:
2025-11-19 16:41:41 +08:00
parent 0e436e31f3
commit 1ad118b0d3
7 changed files with 56 additions and 28 deletions

View File

@@ -6,7 +6,7 @@ from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
from urllib.parse import urlparse
import json
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
"politics": self.parse_base_news_detail,
"finance": self.parse_base_news_detail,
"cpc": self.parse_cpc_news_detail,
+"theory": self.parse_cpc_news_detail,
}
def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
@@ -518,13 +519,11 @@ class RmrbCrawler(BaseCrawler):
# 遍历 show_text 下的所有直接子节点(保持顺序)
for child in content_div.children:
-# 跳过文本节点(如换行、空格)
-if isinstance(child, NavigableString):
+# 只处理 Tag 类型的节点,跳过文本节点、注释等
+if not isinstance(child, Tag):
continue
tag_name = child.name
-if tag_name is None:
-continue
# 情况1检测是否是视频容器根据 id 特征或内部结构)
video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@ class RmrbCrawler(BaseCrawler):
continue
# 情况2检查是否包含人民网的 showPlayer 脚本(动态视频)
-script_tags = child.find_all('script', string=True)
+script_tags = child.find_all('script')
video_src = None
poster_url = None