""" 新闻爬虫主程序 用于被定时任务调用或独立运行 """ import sys import json from typing import List from loguru import logger from crawler.RmrbCrawler import RmrbCrawler from crawler.BaseCrawler import NewsItem # 配置日志 logger.remove() # 移除默认处理器 logger.add( sys.stdout, format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function} - {message}", level="INFO" ) logger.add( "logs/crawler_{time:YYYY-MM-DD}.log", rotation="00:00", retention="30 days", encoding="utf-8", level="DEBUG" ) def crawl_rmrb_news(category: str = "politics", limit: int = 20) -> List[dict]: """ 爬取人民日报新闻 Args: category: 新闻分类 limit: 爬取数量 Returns: 新闻列表(字典格式) """ logger.info(f"开始爬取人民日报新闻 - 分类: {category}, 数量: {limit}") crawler = None try: crawler = RmrbCrawler() news_list = crawler.crawl(category=category, limit=limit) # 转换为字典列表 result = [news.model_dump() for news in news_list] logger.info(f"爬取完成,共获取 {len(result)} 条新闻") return result except Exception as e: logger.error(f"爬取失败: {str(e)}") return [] finally: if crawler: crawler.close() def save_to_json(news_list: List[dict], output_file: str = "output/news.json"): """ 保存新闻到JSON文件 Args: news_list: 新闻列表 output_file: 输出文件路径 """ try: import os os.makedirs(os.path.dirname(output_file), exist_ok=True) with open(output_file, 'w', encoding='utf-8') as f: json.dump(news_list, f, ensure_ascii=False, indent=2) logger.info(f"新闻已保存到: {output_file}") except Exception as e: logger.error(f"保存文件失败: {str(e)}") def main(): """主函数""" # 解析命令行参数 category = "politics" limit = 20 output_file = "output/news.json" if len(sys.argv) > 1: category = sys.argv[1] if len(sys.argv) > 2: limit = int(sys.argv[2]) if len(sys.argv) > 3: output_file = sys.argv[3] logger.info("=" * 60) logger.info("新闻爬虫程序启动") logger.info("=" * 60) # 爬取新闻 news_list = crawl_rmrb_news(category=category, limit=limit) # 保存结果 if news_list: save_to_json(news_list, output_file) # 输出统计信息 logger.info(f"爬取统计:") logger.info(f" - 成功: {len(news_list)} 条") logger.info(f" - 失败: {limit - len(news_list)} 条") else: logger.warning("未获取到任何新闻") logger.info("=" * 60) logger.info("新闻爬虫程序结束") logger.info("=" * 60) if __name__ == "__main__": main()