"""
使用 Crawl4AI 爬取人民网新闻的主程序
"""
import sys
import json
import asyncio
import traceback
from pathlib import Path
from typing import List, Optional

from loguru import logger

from crawl4ai.PeopleNetCrewer import PeopleNetCrewer

# Logging configuration
logger.remove()  # drop loguru's default stderr handler
logger.add(
    sys.stdout,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function} - {message}",
    level="INFO"
)
logger.add(
    "logs/crewer_{time:YYYY-MM-DD}.log",
    rotation="00:00",
    retention="30 days",
    encoding="utf-8",
    level="DEBUG"
)


async def crawl_people_net_news(
    category: str = "politics",
    limit: int = 20,
    chrome_path: Optional[str] = None
) -> List[dict]:
    """
    Crawl People's Daily Online (人民网) news using Crawl4AI.

    Args:
        category: News category to crawl.
        limit: Maximum number of articles to fetch.
        chrome_path: Optional path to the Chrome executable.

    Returns:
        News items as plain dicts; an empty list if crawling fails.
    """
    logger.info(f"开始爬取人民网新闻 - 分类: {category}, 数量: {limit}")

    crewer = None
    try:
        crewer = PeopleNetCrewer(chrome_path=chrome_path)
        news_list = await crewer.crawl(category=category, limit=limit)

        # Convert Pydantic models to plain dicts for JSON serialization.
        result = [news.model_dump() for news in news_list]
        logger.info(f"爬取完成,共获取 {len(result)} 条新闻")
        return result
    except Exception as e:
        # Best-effort: log full traceback and return an empty result
        # instead of propagating, so the caller can still finish cleanly.
        logger.error(f"爬取失败: {str(e)}")
        logger.error(traceback.format_exc())
        return []
    finally:
        # Always release browser resources, even on failure.
        if crewer:
            await crewer.close()


def save_to_json(news_list: List[dict], output_file: str = "output/news.json") -> None:
    """
    Save the news list to a JSON file.

    Args:
        news_list: News items to serialize.
        output_file: Destination path; parent directories are created as needed.
    """
    try:
        path = Path(output_file)
        # Fix: os.makedirs(os.path.dirname(output_file)) raised
        # FileNotFoundError for bare filenames (dirname == "");
        # Path.parent is "." in that case and mkdir handles it.
        path.parent.mkdir(parents=True, exist_ok=True)

        with path.open('w', encoding='utf-8') as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)

        logger.info(f"新闻已保存到: {output_file}")
    except Exception as e:
        logger.error(f"保存文件失败: {str(e)}")


async def main_async() -> None:
    """Async entry point: parse argv, crawl, save, and report statistics."""
    # Positional CLI arguments: category, limit, output file, chrome path.
    category = "politics"
    limit = 20
    output_file = "output/news.json"
    chrome_path = None

    if len(sys.argv) > 1:
        category = sys.argv[1]
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    if len(sys.argv) > 3:
        output_file = sys.argv[3]
    if len(sys.argv) > 4:
        chrome_path = sys.argv[4]

    logger.info("=" * 60)
    logger.info("人民网新闻爬虫程序启动 (Crawl4AI)")
    logger.info("=" * 60)

    # Crawl the news.
    news_list = await crawl_people_net_news(
        category=category,
        limit=limit,
        chrome_path=chrome_path
    )

    # Persist results and report statistics.
    if news_list:
        save_to_json(news_list, output_file)

        logger.info("爬取统计:")
        logger.info(f" - 成功: {len(news_list)} 条")
        # Clamp at zero so a crawler returning more than `limit`
        # items never reports a negative failure count.
        logger.info(f" - 失败: {max(limit - len(news_list), 0)} 条")
    else:
        logger.warning("未获取到任何新闻")

    logger.info("=" * 60)
    logger.info("人民网新闻爬虫程序结束")
    logger.info("=" * 60)


def main() -> None:
    """Synchronous entry point; handles Ctrl-C and top-level errors."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        logger.info("程序被用户中断")
    except Exception as e:
        logger.error(f"程序运行出错: {str(e)}")
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    main()