#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Command-line tool for fetching People's Daily (Rmrb) trending news.

Usage:
    python RmrbTrending.py --date 20250110
    python RmrbTrending.py --start-date 20250101 --end-date 20250110
    python RmrbTrending.py            # no date given: query yesterday
"""

import argparse
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Add parent directory to path to import crawler
sys.path.insert(0, str(Path(__file__).parent.parent))

from crawler.RmrbCrawler import RmrbCrawler
from loguru import logger
from core.ResultDomain import ResultDomain


def parse_date(date_str) -> datetime:
    """
    Parse a ``YYYYMMDD`` date given as a string or an integer.

    Args:
        date_str: A string or integer such as "20250110" or 20250110.

    Returns:
        The corresponding datetime object (time part is midnight).

    Raises:
        ValueError: If the value is None, is not exactly eight digits, or
            does not denote a real calendar date.
    """
    if date_str is None:
        raise ValueError("日期不能为空")

    # Normalize to a stripped string so that integers are accepted too.
    date_str = str(date_str).strip()
    error_message = f"日期格式错误: '{date_str}',正确格式为YYYYMMDD,例如: '20250110'"

    if len(date_str) != 8 or not date_str.isdigit():
        raise ValueError(error_message)

    try:
        return datetime.strptime(date_str, "%Y%m%d")
    except ValueError as err:
        # Eight digits that are not a valid date (e.g. 20251340) end up here.
        raise ValueError(error_message) from err


def _clean(value):
    """Return *value* stripped of whitespace, or None if missing or blank."""
    if isinstance(value, str):
        return value.strip() or None
    return None


def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the command-line interface."""
    parser = argparse.ArgumentParser(
        description='人民日报热点新闻获取工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 获取单日热点新闻
  python RmrbTrending.py --date 20250110
  python RmrbTrending.py -d 20250110

  # 获取日期范围内的热点新闻
  python RmrbTrending.py --start-date 20250101 --end-date 20250110
  python RmrbTrending.py -s 20250101 -e 20250110

  # 不指定日期则根据 isYesterday 决定(默认昨日)
  python RmrbTrending.py
        """
    )
    parser.add_argument('--date', '-d', type=str, dest='date',
                        help='指定日期 (格式: YYYYMMDD)')
    # Both spellings are accepted: the camelCase flags that were originally
    # implemented and the hyphenated flags that the help text advertises.
    parser.add_argument('--startDate', '--start-date', '-s', type=str,
                        dest='startDate',
                        help='开始日期 (需与--end-date一起使用)')
    parser.add_argument('--endDate', '--end-date', '-e', type=str,
                        dest='endDate',
                        help='结束日期 (需与--start-date一起使用)')
    parser.add_argument('--yesterday', '-y', action='store_true',
                        help='查询昨日 (默认行为)')
    parser.add_argument('--output', '-o', type=str, help='输出文件路径')
    return parser


def _resolve_query(single_date, start_date, end_date, force_yesterday):
    """
    Turn the raw CLI values into the dates to query.

    Returns:
        A ``(start_dt, end_dt)`` pair. ``end_dt`` is None for a single-day
        query and a datetime for a date-range query.

    Raises:
        ValueError: If the combination of arguments or a date is invalid.
    """
    has_date_args = bool(single_date or start_date or end_date)

    # Yesterday is the default only when no date was requested; an explicit
    # --yesterday flag takes precedence over any other date argument.
    if force_yesterday or not has_date_args:
        if force_yesterday and has_date_args:
            logger.warning("已指定 --yesterday,忽略其他日期参数")
        target_date = datetime.now() - timedelta(days=1)
        date_str = target_date.strftime('%Y%m%d')
        logger.info(f"获取昨日热点新闻: {target_date.strftime('%Y-%m-%d')} (参数格式: {date_str})")
        return target_date, None

    if single_date:
        if start_date or end_date:
            raise ValueError("--date 不能与 --startDate/--endDate 同时使用")
        target_date = parse_date(single_date)
        logger.info(f"获取单日热点新闻: {target_date.strftime('%Y-%m-%d')}")
        return target_date, None

    # Only range arguments remain; both bounds are required.
    if not (start_date and end_date):
        raise ValueError("--startDate 和 --endDate 必须同时指定")

    start_dt = parse_date(start_date)
    end_dt = parse_date(end_date)
    if start_dt > end_dt:
        raise ValueError("开始日期不能晚于结束日期")
    logger.info(f"获取日期范围热点新闻: {start_dt.strftime('%Y-%m-%d')} 至 {end_dt.strftime('%Y-%m-%d')}")
    return start_dt, end_dt


def _success_payload(result: ResultDomain) -> dict:
    """Convert a crawler result into the JSON-serializable output dict."""
    # NOTE(review): "data" is always emitted as None, as in the original
    # implementation; only dataList is forwarded. Confirm this is intended.
    return {
        "code": result.code,
        "message": result.message,
        "success": result.success,
        "data": None,
        "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
    }


def _error_payload(code: int, message: str) -> dict:
    """Build the JSON-serializable output dict for a failed run."""
    return {
        "code": code,
        "message": message,
        "success": False,
        "data": None,
        "dataList": []
    }


def main():
    """
    Entry point: parse arguments, run the crawler and print the JSON result.

    The result (or an error payload) is printed to stdout as JSON and, when
    --output is given, a successful result is also written to that file.
    The process exits with status 0 if the crawler reports success and 1
    otherwise.
    """
    args = _build_parser().parse_args()

    output_file = args.output
    single_date = _clean(args.date)
    start_date = _clean(args.startDate)
    end_date = _clean(args.endDate)

    logger.info("使用直接参数模式")

    crawler = None
    exit_code = 1
    try:
        # Validate the arguments before creating the crawler so that a bad
        # invocation never allocates crawler resources.
        start_dt, end_dt = _resolve_query(
            single_date, start_date, end_date, args.yesterday
        )

        crawler = RmrbCrawler()
        if end_dt is None:
            result = crawler.getOneDayTrendingNews(start_dt)
        else:
            result = crawler.getDaysTrendingNews(start_dt, end_dt)

        output = _success_payload(result)

        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"结果已保存到: {output_file}")

        print(json.dumps(output, ensure_ascii=False, indent=2))
        exit_code = 0 if result.success else 1

    except ValueError as e:
        logger.error(f"参数错误: {str(e)}")
        error_output = _error_payload(400, f"参数错误: {str(e)}")
        print(json.dumps(error_output, ensure_ascii=False, indent=2))

    except Exception as e:
        # Top-level boundary: report any unexpected failure as JSON.
        logger.error(f"执行失败: {str(e)}")
        error_output = _error_payload(500, f"执行失败: {str(e)}")
        print(json.dumps(error_output, ensure_ascii=False, indent=2))

    finally:
        # Release the crawler on every path, not only on success.
        if crawler is not None:
            try:
                crawler.close()
            except Exception as close_err:
                logger.warning(f"关闭爬虫失败: {str(close_err)}")

    sys.exit(exit_code)


if __name__ == "__main__":
    main()