Files
schoolNews/schoolNewsCrawler/crawler/rmrb/RmrbTrending.py
2025-11-19 16:41:41 +08:00

160 lines
5.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
人民日报热点新闻爬虫命令行工具
用法:
python RmrbTrending.py --date 20250110
python RmrbTrending.py --start-date 20250101 --end-date 20250110
"""
import argparse
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path
# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from crawler.rmrb.RmrbCrawler import RmrbCrawler
from loguru import logger
from core.ResultDomain import ResultDomain
def parse_date(date_str) -> datetime:
    """Parse a ``YYYYMMDD`` date given as a string or an integer.

    Args:
        date_str: The date to parse, e.g. ``"20250110"`` or ``20250110``.
            The value is converted with ``str()`` and surrounding
            whitespace is stripped before validation.

    Returns:
        The ``datetime`` produced by ``datetime.strptime(..., "%Y%m%d")``.

    Raises:
        ValueError: If ``date_str`` is ``None``, is not exactly eight
            digits, or does not name a real calendar date
            (for example ``20250230``).
    """
    if date_str is None:
        raise ValueError("日期不能为空")

    text = str(date_str).strip()
    # Built once so both failure paths report exactly the same message.
    format_error = f"日期格式错误: '{text}'正确格式为YYYYMMDD例如: '20250110'"

    # Cheap shape check first so obviously malformed input never reaches strptime.
    if len(text) != 8 or not text.isdigit():
        raise ValueError(format_error)

    try:
        return datetime.strptime(text, "%Y%m%d")
    except ValueError as err:
        # Chain the parser's own error so the root cause stays visible in tracebacks.
        raise ValueError(format_error) from err
# Help epilog shown verbatim by RawDescriptionHelpFormatter.
_EPILOG = """
示例:
# 获取单日热点新闻
python RmrbTrending.py --date 20250110
python RmrbTrending.py -d 20250110
# 获取日期范围内的热点新闻
python RmrbTrending.py --start-date 20250101 --end-date 20250110
python RmrbTrending.py -s 20250101 -e 20250110
# 不指定日期则根据 isYesterday 决定(默认昨日)
python RmrbTrending.py
"""


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the trending-news tool."""
    parser = argparse.ArgumentParser(
        description='人民日报热点新闻获取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_EPILOG,
    )
    # --date/-d and the hyphenated spellings are the forms shown in the help
    # text; the camelCase spellings are kept so existing invocations still work.
    parser.add_argument('--date', '-d', type=str, help='单日日期 (格式: YYYYMMDD)')
    parser.add_argument('--startDate', '--start-date', '-s', dest='startDate', type=str,
                        help='开始日期 (需与--end-date一起使用)')
    parser.add_argument('--endDate', '--end-date', '-e', dest='endDate', type=str,
                        help='结束日期 (需与--start-date一起使用)')
    parser.add_argument('--yesterday', '-y', action='store_true', help='查询昨日 (默认行为)')
    parser.add_argument('--output', '-o', type=str, help='输出文件路径')
    return parser


def _clean_arg(value):
    """Return *value* stripped, or ``None`` when it is missing or blank."""
    if isinstance(value, str) and value.strip():
        return value.strip()
    return None


def _resolve_request(args):
    """Translate parsed CLI arguments into the day(s) to crawl.

    Returns:
        A ``(first_day, last_day)`` tuple of ``datetime`` objects;
        ``last_day`` is ``None`` for a single-day request.

    Raises:
        ValueError: If the date arguments are malformed, incomplete,
            conflicting, or the start date is after the end date.
    """
    single_date = _clean_arg(args.date)
    start_date = _clean_arg(args.startDate)
    end_date = _clean_arg(args.endDate)

    # Yesterday is used when requested explicitly, and as the fallback when
    # no date argument was supplied at all.
    if args.yesterday or not (single_date or start_date or end_date):
        target_date = datetime.now() - timedelta(days=1)
        date_str = target_date.strftime('%Y%m%d')
        logger.info(f"获取昨日热点新闻: {target_date.strftime('%Y-%m-%d')} (参数格式: {date_str})")
        return target_date, None

    if single_date:
        if start_date or end_date:
            raise ValueError("--date 不能与 --startDate/--endDate 同时使用")
        target_date = parse_date(single_date)
        logger.info(f"获取单日热点新闻: {target_date.strftime('%Y-%m-%d')}")
        return target_date, None

    if start_date and end_date:
        start_dt = parse_date(start_date)
        end_dt = parse_date(end_date)
        if start_dt > end_dt:
            raise ValueError("开始日期不能晚于结束日期")
        logger.info(f"获取日期范围热点新闻: {start_dt.strftime('%Y-%m-%d')} 至 {end_dt.strftime('%Y-%m-%d')}")
        return start_dt, end_dt

    # Exactly one of the two range bounds was supplied.
    raise ValueError("--startDate 和 --endDate 必须同时指定")


def _print_error(code: int, message: str) -> None:
    """Print a failure payload with the same JSON shape as a successful result."""
    error_output = {
        "code": code,
        "message": message,
        "success": False,
        "data": None,
        "dataList": [],
    }
    print(json.dumps(error_output, ensure_ascii=False, indent=2))


def main():
    """Run the command-line tool.

    Parses the command line, fetches trending news for the requested day or
    date range through ``RmrbCrawler``, prints the result as JSON on stdout
    and, when ``--output`` is given, also writes it to that file.

    The process exits with status 0 when the crawler reports success and 1
    otherwise. Errors are reported as a JSON payload with code 400 for
    ``ValueError`` and 500 for any other exception.
    """
    args = _build_parser().parse_args()
    output_file = args.output
    logger.info("使用直接参数模式")

    crawler = None
    exit_code = 1
    try:
        # Validate the arguments before creating the crawler so that bad
        # input never allocates crawler resources.
        first_day, last_day = _resolve_request(args)

        crawler = RmrbCrawler()
        if last_day is None:
            result = crawler.getOneDayTrendingNews(first_day)
        else:
            result = crawler.getDaysTrendingNews(first_day, last_day)

        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.model_dump() for item in result.dataList] if result.dataList else [],
        }

        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"结果已保存到: {output_file}")

        print(json.dumps(output, ensure_ascii=False, indent=2))
        exit_code = 0 if result.success else 1
    except ValueError as e:
        logger.error(f"参数错误: {str(e)}")
        _print_error(400, f"参数错误: {str(e)}")
    except Exception as e:
        # Top-level boundary: report any failure as JSON instead of a traceback.
        logger.error(f"执行失败: {str(e)}")
        _print_error(500, f"执行失败: {str(e)}")
    finally:
        # Release the crawler on every path, not only after a successful run.
        if crawler is not None:
            try:
                crawler.close()
            except Exception as close_err:
                # The result has already been printed; a failed cleanup must
                # not emit a second JSON document or change the exit status.
                logger.warning(f"关闭爬虫失败: {str(close_err)}")

    sys.exit(exit_code)
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()