People's Daily (rmrb) crawler: add domains
@@ -6,7 +6,7 @@ from loguru import logger
 import re
 import chardet
 from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
 from urllib.parse import urlparse
 import json
 
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
             "politics": self.parse_base_news_detail,
             "finance": self.parse_base_news_detail,
             "cpc": self.parse_cpc_news_detail,
+            "theory": self.parse_cpc_news_detail,
         }
 
     def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
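The hunk above extends the channel-to-parser dispatch table: the new "theory" domain reuses the existing CPC detail parser. A minimal sketch of how such a table can route a people.com.cn URL to its parser (the DETAIL_PARSERS name and the URL-to-channel rule are assumptions; only the parser method names come from the commit):

from urllib.parse import urlparse

# Assumed module-level stand-in for the crawler's handler dict.
DETAIL_PARSERS = {
    "politics": "parse_base_news_detail",
    "finance": "parse_base_news_detail",
    "cpc": "parse_cpc_news_detail",
    "theory": "parse_cpc_news_detail",
}

def pick_parser(url: str) -> str:
    host = urlparse(url).netloc      # e.g. "theory.people.com.cn"
    channel = host.split(".")[0]     # leading label names the channel
    return DETAIL_PARSERS.get(channel, "parse_base_news_detail")

print(pick_parser("http://theory.people.com.cn/some/article.html"))  # parse_cpc_news_detail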
@@ -518,13 +519,11 @@ class RmrbCrawler(BaseCrawler):
 
         # Iterate over all direct children of show_text (preserving order)
         for child in content_div.children:
-            # Skip plain text nodes (e.g. newlines, spaces)
-            if isinstance(child, NavigableString):
+            # Only process Tag-type nodes; skip text nodes, comments, etc.
+            if not isinstance(child, Tag):
                 continue
 
             tag_name = child.name
-            if tag_name is None:
-                continue
 
             # Case 1: detect whether this is a video container (by id pattern or inner structure)
             video_tag = child.find('video') if tag_name != 'video' else child
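The positive isinstance(child, Tag) test is the idiomatic bs4 filter: everything in a parse tree that is not a Tag (text, whitespace, comments) is rejected in one check, and a Tag is guaranteed to have a non-None .name, which is why the old tag_name-is-None guard could be dropped. A self-contained sketch (the HTML is made up for illustration; only the show_text id comes from the commit's comments):

from bs4 import BeautifulSoup
from bs4.element import Tag

html = "<div id='show_text'><!-- ad slot -->\n text\n<p>paragraph</p><video src='v.mp4'></video></div>"
content_div = BeautifulSoup(html, "html.parser").find("div", id="show_text")

for child in content_div.children:
    # Text nodes, bare whitespace, and comments all fail this test.
    if not isinstance(child, Tag):
        continue
    print(child.name)  # -> p, video; .name is never None on a Tag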
@@ -578,7 +577,7 @@ class RmrbCrawler(BaseCrawler):
                 continue
 
             # Case 2: check for People's Daily Online's showPlayer script (dynamically loaded video)
-            script_tags = child.find_all('script', string=True)
+            script_tags = child.find_all('script')
             video_src = None
             poster_url = None
 
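find_all('script', string=True) only matches script tags whose .string is a single non-empty text node, so an empty tag or one whose contents bs4 splits into several nodes is silently skipped; the relaxed find_all('script') plus get_text() sees them all. A sketch of scanning script text for a player URL (the showPlayer argument format here is invented; the real people.com.cn markup may differ):

import re
from bs4 import BeautifulSoup

html = """<div><script>
showPlayer({url: "https://video.example.com/clip.mp4", poster: "cover.jpg"});
</script></div>"""
child = BeautifulSoup(html, "html.parser").div

video_src = None
for script in child.find_all("script"):          # no string=True: empty/split scripts included
    text = script.get_text()                     # works even when .string is None
    m = re.search(r'url\s*:\s*"([^"]+)"', text)  # hypothetical showPlayer parameter
    if m:
        video_src = m.group(1)
        break
print(video_src)  # https://video.example.com/clip.mp4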
@@ -11,7 +11,7 @@ import sys
 from pathlib import Path
 
 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
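All three scripts in this commit get the same fix: Path(__file__).parent.parent resolved one level short of the project root, so import crawler failed; climbing three parents reaches the directory that actually contains the crawler package. The layout this implies (only crawler/rmrb is confirmed by the commit; the test directory name is an assumption):

import sys
from pathlib import Path

# Assumed layout:
#   schoolNewsCrawler/                <- project root, must be on sys.path
#   |- crawler/rmrb/RmrbCrawler.py
#   |- tests/rmrb/run_search.py       <- this script, two levels below root
project_root = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(project_root))

from crawler.rmrb.RmrbCrawler import RmrbCrawler  # resolvable once root is on sys.path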
@@ -10,8 +10,8 @@ import json
 import sys
 from pathlib import Path
 
-# Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
@@ -89,11 +89,13 @@ def main():
             "data": None,
             "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
         }
-        result = None
-        with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
-            result = json.load(f)
-        print(result)
-        output = result
+        # result = None
+        # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+        #     result = json.load(f)
+        # print(result)
+        # output = result
+
+
         if output_file:
             output_path = Path(output_file)
             output_path.parent.mkdir(parents=True, exist_ok=True)
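This hunk disables a debug shortcut that re-read a previously saved output.json (turning result into a plain dict), so the script once again serializes the live crawl result; result.dataList holds Pydantic models, hence the model_dump() calls. A self-contained sketch of that serialization step (the NewsItem fields are placeholders; only the model_dump usage mirrors the script):

import json
from pydantic import BaseModel

class NewsItem(BaseModel):   # placeholder model; the real fields are not shown in the diff
    title: str
    url: str

data_list = [NewsItem(title="demo", url="https://example.com/news/1")]
output = {
    "success": True,
    "data": None,
    "dataList": [item.model_dump() for item in data_list],  # Pydantic v2 API
}
print(json.dumps(output, ensure_ascii=False, indent=2))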
@@ -101,13 +103,10 @@ def main():
                 json.dump(output, f, ensure_ascii=False, indent=2)
             logger.info(f"结果已保存到: {output_file}")
 
         crawler.close()
-        # sys.exit(0 if result.success else 1)
+        sys.exit(0 if result.success else 1)
         # print(json.dumps(output, ensure_ascii=False, indent=2))
-
-        sys.exit(0 if result["success"] else 1)
-
-        # sys.exit(0 if result["success"] else 1)
+        # sys.exit(0 if result["success"] else 1)
     except Exception as e:
         logger.error(f"执行失败: {str(e)}")
         error_output = {
@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 from pathlib import Path
 
 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
 
 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
schoolNewsCrawler/crawler/rmrb/__init__.py (new empty file)