Crawler
@@ -2,7 +2,7 @@ from pydantic import BaseModel, Field, HttpUrl
 from typing import Any, List, Optional

 class ResultDomain(BaseModel):
-    code: int = Field(..., description="状态码")
+    code: int = Field(..., description="状态码",)
     message: str = Field(..., description="消息")
     success: bool = Field(..., description="是否成功")
     data: Optional[Any] = Field(default=None, description="数据")
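Note on the ResultDomain() calls changed further down: code, message and success are declared with Field(...) and no default, so the model cannot be constructed empty. A standalone sketch restating the model from this hunk (the dataList field is assumed from the crawler code below, it is not part of this hunk):

    from typing import Any, List, Optional
    from pydantic import BaseModel, Field

    class ResultDomain(BaseModel):
        code: int = Field(..., description="状态码")
        message: str = Field(..., description="消息")
        success: bool = Field(..., description="是否成功")
        data: Optional[Any] = Field(default=None, description="数据")
        dataList: Optional[List[Any]] = Field(default=None, description="数据列表")  # assumed, not shown in this hunk

    # ResultDomain() raises a ValidationError because the three required fields are missing;
    # the RmrbCrawler hunks below therefore pass explicit values:
    result = ResultDomain(code=0, message="", success=True)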
schoolNewsCrawler/core/__init__.py (new file, 0 lines)
@@ -1,5 +1,5 @@
 # 定义基础爬虫类
-from typing import Dict, Optional, List, Any
+from typing import Callable, Dict, Optional, List, Any, Union
 from abc import ABC, abstractmethod
 import requests
 from bs4 import BeautifulSoup
@@ -12,7 +12,7 @@ class UrlConfig(BaseModel):
     url: str = Field(..., description="请求URL")
     params: Optional[Dict[str, Any]] = Field(default=None, description="请求参数")
     method: str = Field(default="GET", description="请求方法")
-
+    headers: Optional[Dict[str, str]] = Field(default=None, description="请求头")
     class Config:
         # 允许任意类型
         arbitrary_types_allowed = True
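With headers promoted to a declared field, request headers are validated together with url, params and method instead of being passed around separately. A minimal sketch restating UrlConfig from this hunk (the CrawlerConfig wrapper used in RmrbCrawler is not shown here, and the header value is only illustrative):

    from typing import Any, Dict, Optional
    from pydantic import BaseModel, Field

    class UrlConfig(BaseModel):
        url: str = Field(..., description="请求URL")
        params: Optional[Dict[str, Any]] = Field(default=None, description="请求参数")
        method: str = Field(default="GET", description="请求方法")
        headers: Optional[Dict[str, str]] = Field(default=None, description="请求头")

        class Config:
            # 允许任意类型
            arbitrary_types_allowed = True

    one_day = UrlConfig(
        url="http://www.people.com.cn/GB/59476/review/{date}.html",  # template, filled in per request
        headers={"User-Agent": "Mozilla/5.0"},
    )
    print(one_day.method)  # "GET" by default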
@@ -123,15 +123,6 @@ class BaseCrawler(ABC):
             logger.error(f"HTML解析失败: {str(e)}")
             return None

-    @abstractmethod
-    def crawl(self) -> List[NewsItem]:
-        """
-        爬取新闻(子类必须实现)
-
-        Returns:
-            新闻列表
-        """
-        pass

     @abstractmethod
     def parse_news_detail(self, url: str) -> Optional[NewsItem]:
@@ -1,11 +1,11 @@
 # 人民日报爬虫
 from typing import List, Optional
-from core import ResultDomain
+from core.ResultDomain import ResultDomain
 from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
 from loguru import logger
 import re
 import chardet
-from datetime import datetime
+from datetime import datetime, timedelta


 class RmrbCrawler(BaseCrawler):
@@ -50,7 +50,7 @@ class RmrbCrawler(BaseCrawler):
                 }
             ),
             "one_day_trending_news": UrlConfig(
-                url=lambda date: f"http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
+                url="http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
                 method="GET",
                 params={},
                 headers={
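The lambda was dropped because UrlConfig.url is declared as a plain str, which a callable presumably fails to validate against; the URL is now stored as a format template and filled in at request time. A standalone sketch (the date is only illustrative, and the keyword must match the {date} placeholder):

    from datetime import datetime

    url_template = "http://www.people.com.cn/GB/59476/review/{date}.html"
    date_str = datetime(2025, 1, 10).strftime("%Y%m%d")
    url = url_template.format(date=date_str)  # .format(date_str) without the keyword raises KeyError: 'date'
    print(url)  # http://www.people.com.cn/GB/59476/review/20250110.html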
@@ -63,7 +63,7 @@ class RmrbCrawler(BaseCrawler):
         )
         super().__init__(config)

-    def search(self, key: str, total: int = 10, news_type: int = 0) -> ResultDomain:
+    def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
         """
         搜索人民日报新闻

@@ -76,7 +76,7 @@ class RmrbCrawler(BaseCrawler):
             新闻列表
         """
         try:
-            resultDomain = ResultDomain()
+            resultDomain = ResultDomain(code=0, message="", success=True)
             news_list = []
             resultDomain.dataList = news_list
             # 获取搜索配置
@@ -98,7 +98,7 @@ class RmrbCrawler(BaseCrawler):

             while len(news_list) < total:
                 search_data["page"] = page
-                response = self.fetch(search_config.url, method=search_config.method, data=search_data, headers=search_config.headers)
+                response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
                 response_json = response.json()
                 if response_json.get("code") == 0:
                     records = response_json.get("data", {}).get("records", [])
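The switch from data= to json= changes how the search payload is encoded: data= form-encodes it, while json= serializes it as a JSON body with a Content-Type of application/json, which is what a JSON search API expects. A standalone sketch of the difference, assuming self.fetch forwards these keyword arguments to requests (httpbin.org is used here only as an echo endpoint):

    import requests

    payload = {"key": "教育", "page": 1, "limit": 10}

    # data=: form-encoded body, Content-Type: application/x-www-form-urlencoded
    r1 = requests.post("https://httpbin.org/post", data=payload)
    print(r1.json()["headers"]["Content-Type"])

    # json=: JSON body, Content-Type: application/json
    r2 = requests.post("https://httpbin.org/post", json=payload)
    print(r2.json()["headers"]["Content-Type"])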
@@ -130,7 +130,7 @@ class RmrbCrawler(BaseCrawler):
         """
         try:
             hot_point_rank_config = self.config.urls.get("hot_point_rank")
-            resultDomain = ResultDomain()
+            resultDomain = ResultDomain(code=0, message="", success=True)
             news_list = []
             resultDomain.dataList = news_list

@@ -169,7 +169,7 @@ class RmrbCrawler(BaseCrawler):
         获取人民日报一天内的热点新闻
         """
         try:
-            resultDomain = ResultDomain()
+            resultDomain = ResultDomain(code=0, message="", success=True)
             news_list = []
             resultDomain.dataList = news_list
             resultDomain.success = True
@@ -177,7 +177,8 @@ class RmrbCrawler(BaseCrawler):
             logger.info(f"获取人民日报一天内的热点新闻成功")
             date_str = date.strftime("%Y%m%d")
             one_day_trending_news_config = self.config.urls.get("one_day_trending_news")
-            one_day_trending_news_config.url = one_day_trending_news_config.url(date_str)
+
+            one_day_trending_news_config.url = one_day_trending_news_config.url.format(date=date_str)
             response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
             if not response:
                 logger.error(f"获取响应失败: {one_day_trending_news_config.url}")
@@ -194,12 +195,12 @@ class RmrbCrawler(BaseCrawler):
                 return resultDomain

             all_doc_urls = []
-            all_doc_urls.extend(a_tags)

             bg01 = soup.find('td', class_="bg01")
             indexfont13 = bg01.find('td', class_='indexfont13')
             # 获取该 td 下的所有 a 标签
             a_tags = indexfont13.find_all('a')
+            all_doc_urls.extend(a_tags)

             bg02 = soup.find('td', class_="bg02")
             p6 = bg02.find('td', class_='p6')
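Moving all_doc_urls.extend(a_tags) below the a_tags assignment matters because, at its old position, a_tags is not defined yet and the method would raise NameError on the first run. The isolated pattern (the HTML snippet is only a stand-in; the class names come from this hunk):

    from bs4 import BeautifulSoup

    html = """
    <table><tr><td class="bg01">
      <table><tr><td class="indexfont13">
        <a href="/GB/1.html">新闻一</a>
        <a href="/GB/2.html">新闻二</a>
      </td></tr></table>
    </td></tr></table>
    """

    soup = BeautifulSoup(html, "html.parser")
    all_doc_urls = []
    bg01 = soup.find('td', class_="bg01")
    indexfont13 = bg01.find('td', class_='indexfont13')
    a_tags = indexfont13.find_all('a')   # must be assigned before it can be extended from
    all_doc_urls.extend(a_tags)
    print([a.get('href') for a in all_doc_urls])  # ['/GB/1.html', '/GB/2.html']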
@@ -223,19 +224,21 @@ class RmrbCrawler(BaseCrawler):
         获取人民日报多天内的热点新闻
         """
         try:
-            resultDomain = ResultDomain()
+            resultDomain = ResultDomain(code=0, message="", success=True)
             news_list = []
             resultDomain.dataList = news_list
             resultDomain.success = True
             resultDomain.code = 0
             resultDomain.message = "获取人民日报多天内的热点新闻成功"
-            for date in range(start_date, end_date):
-                resultDomain = self.getOneDayTrendingNews(date)
-                if not resultDomain.success:
-                    continue
-                news_list.extend(resultDomain.dataList)

-            logger.info(f"获取人民日报多天内的热点新闻成功")
+            current_date = start_date
+            while current_date <= end_date:
+                day_result = self.getOneDayTrendingNews(current_date)
+                if day_result.success and day_result.dataList:
+                    news_list.extend(day_result.dataList)
+                current_date += timedelta(days=1)

+            logger.info(f"获取人民日报多天内的热点新闻成功,共 {len(news_list)} 条")
             return resultDomain
         except Exception as e:
             logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}")
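The loop had to be rewritten because range() only accepts integers, so range(start_date, end_date) raises TypeError when the arguments are datetime objects; stepping with timedelta is the usual replacement and, as written, includes end_date. A standalone sketch:

    from datetime import datetime, timedelta

    start_date = datetime(2025, 1, 1)
    end_date = datetime(2025, 1, 3)

    current_date = start_date
    while current_date <= end_date:              # inclusive of end_date, matching the new loop
        print(current_date.strftime("%Y%m%d"))   # 20250101, 20250102, 20250103
        current_date += timedelta(days=1)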
@@ -322,15 +325,19 @@ class RmrbCrawler(BaseCrawler):
             if p.find('img'):
                 tag = "img"
                 src = p.find('img').get('src')
-                if not src.startswith("http") and src:
-                    src = self.config.base_url + src
+                if src:
+                    src = str(src)  # 转换为字符串
+                    if not src.startswith("http"):
+                        src = self.config.base_url + src
                 content = f"<img style='{p_style}' src='{src}' />"

             elif p.find('video'):
                 tag = "video"
                 src = p.find('video').get('src')
-                if not src.startswith("http") and src:
-                    src = self.config.base_url + src
+                if src:
+                    src = str(src)  # 转换为字符串
+                    if not src.startswith("http"):
+                        src = self.config.base_url + src
                 content = f"<video style='align-items: center;' src='{src}' />"
             else:
                 content = str(p)
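The reordering fixes an evaluation-order problem: in the old condition, src.startswith("http") runs before the truthiness test, so an <img> or <video> without a src attribute (src is None) raises AttributeError. The new version guards first, then normalizes. A small sketch (the base URL value is an assumed placeholder for self.config.base_url):

    src = None                              # e.g. a tag with no src attribute
    base_url = "http://www.people.com.cn"   # assumed placeholder for self.config.base_url

    # old order: `if not src.startswith("http") and src:` -> AttributeError when src is None
    # new order: truthiness first, then convert and prefix relative paths
    if src:
        src = str(src)
        if not src.startswith("http"):
            src = base_url + src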
@@ -356,60 +363,3 @@ class RmrbCrawler(BaseCrawler):
         except Exception as e:
             logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")
             return None
-        """
-        解析人民日报新闻详情并保存为HTML文件(UTF-8编码)
-
-        Args:
-            url: 新闻详情页URL
-            output_file: 输出文件路径,默认为 "crawler/response.html"
-        """
-        try:
-            response = self.fetch(url)
-            if not response:
-                logger.error(f"获取响应失败: {url}")
-                return
-
-            # BeautifulSoup 可以自动检测并解码编码,直接传入字节数据即可
-            # 它会从 HTML 的 <meta charset> 标签或响应头自动检测编码
-            soup = self.parse_html(response.content)
-            if not soup:
-                logger.error("解析HTML失败")
-                return
-
-            # 保存为UTF-8编码的文件(BeautifulSoup 已经自动解码为 Unicode 字符串)
-            with open(output_file, "w", encoding="utf-8") as f:
-                f.write(soup.prettify())
-
-            logger.info(f"成功保存HTML文件: {output_file}")
-
-        except Exception as e:
-            logger.error(f"解析并保存新闻详情失败 [{url}]: {str(e)}")
-            import traceback
-            logger.error(traceback.format_exc())
-        """
-        解析时间字符串
-
-        Args:
-            time_text: 时间文本
-
-        Returns:
-            标准化的时间字符串
-        """
-        try:
-            # 尝试匹配常见的时间格式
-            patterns = [
-                r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
-                r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
-                r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
-            ]
-
-            for pattern in patterns:
-                match = re.search(pattern, time_text)
-                if match:
-                    return time_text
-
-            return time_text
-
-        except Exception as e:
-            logger.warning(f"时间解析失败: {str(e)}")
-            return None
schoolNewsCrawler/crawler/RmrbHotPoint.py (new file, 72 lines)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
人民日报热点排行爬虫命令行工具
用法: python RmrbHotPoint.py
"""

import argparse
import json
import sys
from pathlib import Path

# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))

from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger


def main():
    """主函数"""
    parser = argparse.ArgumentParser(
        description='人民日报热点排行获取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python RmrbHotPoint.py
        """
    )

    args = parser.parse_args()

    try:
        # 创建爬虫实例
        logger.info("开始获取人民日报热点排行")
        crawler = RmrbCrawler()

        # 执行获取热点排行
        result = crawler.hotPointRank()

        # 输出JSON结果
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.dict() for item in result.dataList] if result.dataList else []
        }

        print(json.dumps(output, ensure_ascii=False, indent=2))

        # 关闭爬虫
        crawler.close()

        # 退出码: 成功=0, 失败=1
        sys.exit(0 if result.success else 1)

    except Exception as e:
        logger.error(f"执行失败: {str(e)}")
        error_output = {
            "code": 500,
            "message": f"执行失败: {str(e)}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()
schoolNewsCrawler/crawler/RmrbSearch.py (new file, 103 lines)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
人民日报搜索爬虫命令行工具
用法: python RmrbSearch.py --key "关键词" --total 10 --type 0
"""

import argparse
import json
import sys
from pathlib import Path

# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))

from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger


def main():
    """主函数"""
    parser = argparse.ArgumentParser(
        description='人民日报新闻搜索工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  python RmrbSearch.py --key "教育改革" --total 20
  python RmrbSearch.py -k "科技创新" -t 15 -n 1

新闻类型说明:
  0 - 所有类型 (默认)
  1 - 新闻
  2 - 互动
  3 - 报刊
  4 - 图片
  5 - 视频
        """
    )

    parser.add_argument(
        '--key', '-k',
        type=str,
        required=True,
        help='搜索关键词 (必需)'
    )

    parser.add_argument(
        '--total', '-t',
        type=int,
        default=10,
        help='获取新闻总数 (默认: 10)'
    )

    parser.add_argument(
        '--type', '-n',
        type=int,
        default=0,
        choices=[0, 1, 2, 3, 4, 5],
        help='新闻类型: 0=全部, 1=新闻, 2=互动, 3=报刊, 4=图片, 5=视频 (默认: 0)'
    )

    args = parser.parse_args()

    try:
        # 创建爬虫实例
        logger.info(f"开始搜索: 关键词='{args.key}', 数量={args.total}, 类型={args.type}")
        crawler = RmrbCrawler()

        # 执行搜索
        result = crawler.search(key=args.key, total=args.total, news_type=args.type)

        # 输出JSON结果
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.dict() for item in result.dataList] if result.dataList else []
        }

        print(json.dumps(output, ensure_ascii=False, indent=2))

        # 关闭爬虫
        crawler.close()

        # 退出码: 成功=0, 失败=1
        sys.exit(0 if result.success else 1)

    except Exception as e:
        logger.error(f"执行失败: {str(e)}")
        error_output = {
            "code": 500,
            "message": f"执行失败: {str(e)}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()
schoolNewsCrawler/crawler/RmrbTrending.py (new file, 158 lines)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
人民日报热点新闻爬虫命令行工具
用法:
    python RmrbTrending.py --date 20250110
    python RmrbTrending.py --start-date 20250101 --end-date 20250110
"""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path

# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))

from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger


def parse_date(date_str: str) -> datetime:
    """
    解析日期字符串为datetime对象

    Args:
        date_str: 日期字符串,格式为YYYYMMDD

    Returns:
        datetime对象
    """
    try:
        return datetime.strptime(date_str, "%Y%m%d")
    except ValueError:
        raise ValueError(f"日期格式错误: {date_str},正确格式为YYYYMMDD,例如: 20250110")


def main():
    """主函数"""
    parser = argparse.ArgumentParser(
        description='人民日报热点新闻获取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 获取单日热点新闻
  python RmrbTrending.py --date 20250110
  python RmrbTrending.py -d 20250110

  # 获取日期范围内的热点新闻
  python RmrbTrending.py --start-date 20250101 --end-date 20250110
  python RmrbTrending.py -s 20250101 -e 20250110

  # 不指定日期则获取今天的热点新闻
  python RmrbTrending.py
        """
    )

    parser.add_argument(
        '--date', '-d',
        type=str,
        help='指定日期 (格式: YYYYMMDD,例如: 20250110)'
    )

    parser.add_argument(
        '--start-date', '-s',
        type=str,
        help='开始日期 (格式: YYYYMMDD,需与--end-date一起使用)'
    )

    parser.add_argument(
        '--end-date', '-e',
        type=str,
        help='结束日期 (格式: YYYYMMDD,需与--start-date一起使用)'
    )

    args = parser.parse_args()

    try:
        # 创建爬虫实例
        crawler = RmrbCrawler()

        # 判断使用哪种模式
        if args.date:
            # 单日模式
            if args.start_date or args.end_date:
                raise ValueError("不能同时使用--date和--start-date/--end-date参数")

            target_date = parse_date(args.date)
            logger.info(f"获取单日热点新闻: {args.date}")
            result = crawler.getOneDayTrendingNews(target_date)

        elif args.start_date and args.end_date:
            # 日期范围模式
            start_date = parse_date(args.start_date)
            end_date = parse_date(args.end_date)

            if start_date > end_date:
                raise ValueError("开始日期不能晚于结束日期")

            logger.info(f"获取日期范围热点新闻: {args.start_date} 至 {args.end_date}")
            result = crawler.getDaysTrendingNews(start_date, end_date)

        elif args.start_date or args.end_date:
            # 只指定了一个日期
            raise ValueError("--start-date和--end-date必须同时使用")

        else:
            # 默认使用今天的日期
            today = datetime.now()
            today_str = today.strftime("%Y%m%d")
            logger.info(f"获取今日热点新闻: {today_str}")
            result = crawler.getOneDayTrendingNews(today)

        # 输出JSON结果
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.dict() for item in result.dataList] if result.dataList else []
        }

        print(json.dumps(output, ensure_ascii=False, indent=2))

        # 关闭爬虫
        crawler.close()

        # 退出码: 成功=0, 失败=1
        sys.exit(0 if result.success else 1)

    except ValueError as e:
        logger.error(f"参数错误: {str(e)}")
        error_output = {
            "code": 400,
            "message": f"参数错误: {str(e)}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)

    except Exception as e:
        logger.error(f"执行失败: {str(e)}")
        error_output = {
            "code": 500,
            "message": f"执行失败: {str(e)}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()
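All three CLI tools above serialize the result the same way: dataList holds pydantic models, so each item is converted with .dict() before json.dumps, and ensure_ascii=False keeps the Chinese text readable instead of printing \u escapes. A standalone sketch (this NewsItem is only an illustrative stand-in; the real model in crawler.BaseCrawler is not shown in this diff):

    import json
    from pydantic import BaseModel

    class NewsItem(BaseModel):   # illustrative stand-in for crawler.BaseCrawler.NewsItem
        title: str
        url: str

    items = [NewsItem(title="示例新闻", url="http://www.people.com.cn/n1/xxx.html")]
    output = {"dataList": [item.dict() for item in items]}

    print(json.dumps(output, ensure_ascii=True))   # titles appear as \u escapes
    print(json.dumps(output, ensure_ascii=False))  # titles stay readable, which is what the tools print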