People's Daily: add domain
.vscode/settings.json (vendored, 3 lines changed)
@@ -8,6 +8,9 @@
     "[java]":{
         "editor.tabSize": 4
     },
+    "[python]":{
+        "editor.tabSize": 4
+    },
     "maven.view": "hierarchical",
     "java.compile.nullAnalysis.mode": "automatic",
     // Terminal encoding settings

schoolNewsCrawler/crawler/rmrb/RmrbCrawler.py
@@ -6,7 +6,7 @@ from loguru import logger
 import re
 import chardet
 from datetime import datetime, timedelta
-from bs4.element import NavigableString
+from bs4.element import NavigableString, Tag
 from urllib.parse import urlparse
 import json
@@ -70,6 +70,7 @@ class RmrbCrawler(BaseCrawler):
             "politics": self.parse_base_news_detail,
             "finance": self.parse_base_news_detail,
             "cpc": self.parse_cpc_news_detail,
+            "theory": self.parse_cpc_news_detail,
         }

     def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
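For context, the hunk above extends RmrbCrawler's channel-to-parser dispatch table so "theory" articles reuse the CPC parser. A minimal standalone sketch of the pattern, with hypothetical stub functions standing in for the crawler's real methods:

    def parse_base_news_detail(html: str) -> dict:
        # Hypothetical stub for RmrbCrawler.parse_base_news_detail.
        return {"parser": "base"}

    def parse_cpc_news_detail(html: str) -> dict:
        # Hypothetical stub for RmrbCrawler.parse_cpc_news_detail.
        return {"parser": "cpc"}

    detail_parsers = {
        "politics": parse_base_news_detail,
        "finance": parse_base_news_detail,
        "cpc": parse_cpc_news_detail,
        "theory": parse_cpc_news_detail,  # the new mapping: theory shares the cpc parser
    }

    # Dispatch by channel, falling back to the base parser for unknown channels.
    parser = detail_parsers.get("theory", parse_base_news_detail)
    print(parser("<html>...</html>"))  # {'parser': 'cpc'}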
@@ -518,13 +519,11 @@ class RmrbCrawler(BaseCrawler):
         # Iterate over all direct children of show_text (preserving order)
         for child in content_div.children:
-            # Skip plain text nodes (e.g. newlines, spaces)
-            if isinstance(child, NavigableString):
+            # Only handle Tag nodes; skip text nodes, comments, etc.
+            if not isinstance(child, Tag):
                 continue

             tag_name = child.name
-            if tag_name is None:
-                continue

             # Case 1: detect whether this is a video container (by id pattern or inner structure)
             video_tag = child.find('video') if tag_name != 'video' else child
@@ -578,7 +577,7 @@ class RmrbCrawler(BaseCrawler):
                 continue

             # Case 2: check whether it contains people.cn's showPlayer script (dynamic video)
-            script_tags = child.find_all('script', string=True)
+            script_tags = child.find_all('script')
             video_src = None
             poster_url = None
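The two hunks above tighten node filtering in the article-body walk: keeping only bs4 Tag children replaces both the old NavigableString test and the tag_name-is-None guard, and also skips HTML comments; likewise, dropping string=True from find_all('script') widens the match to script tags whose contents are not a single string. A minimal sketch of the Tag filter, assuming BeautifulSoup 4 is installed:

    from bs4 import BeautifulSoup
    from bs4.element import Tag

    html = "<div id='show_text'>loose text<!-- note --><p>body</p><video src='v.mp4'></video></div>"
    content_div = BeautifulSoup(html, "html.parser").find("div", id="show_text")

    for child in content_div.children:
        # Text nodes and comments are not Tag instances, so one check skips them all.
        if not isinstance(child, Tag):
            continue
        print(child.name)  # prints: p, then video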
@@ -11,7 +11,7 @@ import sys
 from pathlib import Path

 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
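The same one-level fix recurs in all three test-script hunks in this commit (here and twice below): sys.path must point at the project root, the directory that contains crawler/, not its test subpackage. A sketch of what each expression resolves to, assuming a hypothetical layout schoolNewsCrawler/test/rmrb/test_search.py:

    from pathlib import Path

    # Hypothetical location; the diff does not show the real test-file path.
    test_file = Path("schoolNewsCrawler/test/rmrb/test_search.py")

    print(test_file.parent.parent)         # schoolNewsCrawler/test  (old: 'crawler' not importable)
    print(test_file.parent.parent.parent)  # schoolNewsCrawler       (new: project root, so
                                           # 'from crawler.rmrb.RmrbCrawler import ...' resolves)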
@@ -10,8 +10,8 @@ import json
 import sys
 from pathlib import Path

-# Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
@@ -89,11 +89,13 @@ def main():
             "data": None,
             "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
         }
-        result = None
-        with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
-            result = json.load(f)
-        print(result)
-        output = result
+        # result = None
+        # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+        #     result = json.load(f)
+        # print(result)
+        # output = result

         if output_file:
             output_path = Path(output_file)
             output_path.parent.mkdir(parents=True, exist_ok=True)
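An aside on the debug block commented out above: the hard-coded Windows path relies on Python passing unrecognized escapes such as \P and \s through literally, which newer interpreters flag with a SyntaxWarning. If the block is ever re-enabled, a raw string is the safer spelling; a minimal sketch:

    from pathlib import Path

    # r"..." keeps the backslashes literal, silencing the invalid-escape warning.
    output_json = Path(r"F:\Project\schoolNews\schoolNewsCrawler\output\output.json")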
@@ -101,13 +103,10 @@ def main():
             json.dump(output, f, ensure_ascii=False, indent=2)
         logger.info(f"结果已保存到: {output_file}")

         crawler.close()
-        # sys.exit(0 if result.success else 1)
+        sys.exit(0 if result.success else 1)
         # print(json.dumps(output, ensure_ascii=False, indent=2))
-        sys.exit(0 if result["success"] else 1)
+        # sys.exit(0 if result["success"] else 1)

     except Exception as e:
         logger.error(f"执行失败: {str(e)}")
         error_output = {
@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 from pathlib import Path

 # Add parent directory to path to import crawler
-sys.path.insert(0, str(Path(__file__).parent.parent))
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

 from crawler.rmrb.RmrbCrawler import RmrbCrawler
 from loguru import logger
schoolNewsCrawler/crawler/rmrb/__init__.py (new, empty file)

XhwCrawler.py
@@ -7,22 +7,22 @@ import re
 import chardet
 from datetime import datetime, timedelta
 from bs4.element import NavigableString
-from urllib.parse import urlparse
+from urllib.parse import urlparse, urlencode
 import json

 class XhwCrawler(BaseCrawler):
     def __init__(self):

         """Initialize the People's Daily crawler"""
         config = CrawlerConfig(
             base_url="https://xhsz.news.cn/",
             urls={
                 "search": UrlConfig(
                     url="https://xhsz.news.cn/s",
-                    method="POST",
+                    method="GET",
                     params={
                         "k": "",
-                        "action": "index",
+                        "action": "",
+                        "page": 1
                     },
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
@@ -35,7 +35,34 @@ class XhwCrawler(BaseCrawler):
             },
         )
         super().__init__(config)
+        self.search_action_map = {
+            "全部": "index",
+            "热点发布": "news"
+        }

-    def search(self, key:str, total: int) -> ResultDomain:
-        pass
+    def search(self, key:str, total=10, action="news") -> ResultDomain:
+        resultDomain = ResultDomain()
+        news_list = []
+        resultDomain.dataList = news_list
+        # Get the search URL config
+        search_config = self.config.urls.get("search")
+        if not search_config:
+            logger.error("未找到搜索URL配置")
+            resultDomain.code = 0
+            resultDomain.message = "未找到搜索URL配置"
+            resultDomain.success = False
+            return resultDomain
+        pagesize = 10
+        # Prepare the search parameters
+        search_data = search_config.params.copy()
+        search_data["k"] = key
+        search_data["action"] = action
+
+        for page in range(1, total//pagesize+1):
+            search_data["page"] = page
+            pageHtml = search_config.url + "?" + urlencode(search_data)
+            self.parse_html(pageHtml)
+        resultDomain.code = 0
+        resultDomain.message = "搜索成功"
+        resultDomain.success = True
+        return resultDomain
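To illustrate the GET search flow added above, here is a standalone sketch of the URL construction; the keyword is illustrative and the helper name is hypothetical. Note that total // pagesize floors, so a total smaller than pagesize yields no pages at all:

    from urllib.parse import urlencode

    SEARCH_URL = "https://xhsz.news.cn/s"

    def build_page_urls(key: str, action: str, total: int, pagesize: int = 10) -> list:
        # Mirrors the loop in XhwCrawler.search: one URL per full page of results.
        data = {"k": key, "action": action, "page": 1}
        urls = []
        for page in range(1, total // pagesize + 1):
            data["page"] = page
            urls.append(SEARCH_URL + "?" + urlencode(data))
        return urls

    print(build_page_urls("教育", "news", 20))
    # ['https://xhsz.news.cn/s?k=%E6%95%99%E8%82%B2&action=news&page=1',
    #  'https://xhsz.news.cn/s?k=%E6%95%99%E8%82%B2&action=news&page=2']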