#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
People's Daily (Rmrb) search crawler CLI tool.

Usage:
    python RmrbSearch.py --key "keyword" --total 10 --type 0
"""
import argparse
import json
import sys
from pathlib import Path

# Add the project root directory to sys.path so the crawler package can be imported.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from loguru import logger

from crawler.rmrb.RmrbCrawler import RmrbCrawler


def main():
    """Entry point: parse arguments, run the search, and emit the result as JSON."""
    parser = argparse.ArgumentParser(
        description="People's Daily news search tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python RmrbSearch.py --key "教育改革" --total 20
    python RmrbSearch.py -k "科技创新" -t 15 --type 1
    python RmrbSearch.py --key "AI" --total 5 --output "out.json"

News types:
    0 - all types (default)
    1 - news
    2 - interactive
    3 - newspaper
    4 - images
    5 - video
"""
    )
    parser.add_argument(
        '--key', '-k',
        type=str,
        required=True,
        help='search keyword'
    )
    parser.add_argument(
        '--total', '-t',
        type=int,
        default=10,
        help='number of items to fetch (default: 10)'
    )
    parser.add_argument(
        '--type', '-n',
        type=int,
        default=0,
        help='news type (default: 0 = all types)'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        help='output file path'
    )

    args = parser.parse_args()

    key = args.key
    total = args.total
    news_type = args.type
    output_file = args.output

    logger.info("Using direct argument mode")

    # Critical validation: the keyword must be non-empty after stripping whitespace.
    if not key or not key.strip():
        parser.error("search keyword must not be empty!")

    try:
        logger.info(f"Starting search: key='{key}', total={total}, type={news_type}")

        crawler = RmrbCrawler()
        try:
            result = crawler.search(key=key.strip(), total=total, news_type=news_type)

            output = {
                "code": result.code,
                "message": result.message,
                "success": result.success,
                "data": None,
                "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
            }

            if output_file:
                output_path = Path(output_file)
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(output, f, ensure_ascii=False, indent=2)
                logger.info(f"Results saved to: {output_file}")
            else:
                # No output file given: print the JSON result to stdout.
                print(json.dumps(output, ensure_ascii=False, indent=2))
        finally:
            # Release crawler resources even if the search raises.
            crawler.close()

        sys.exit(0 if result.success else 1)

    except Exception as e:
        logger.error(f"Execution failed: {str(e)}")
        error_output = {
            "code": 500,
            "message": f"Execution failed: {str(e)}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()
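
# ---------------------------------------------------------------------------
# Reference only: a minimal sketch of the result object this script assumes
# RmrbCrawler.search() returns. It is inferred from the attribute accesses in
# main() (result.code, result.message, result.success, result.dataList, and
# item.model_dump(), which suggests pydantic models). The real definitions
# live in the crawler package; the item fields below are placeholders, not
# the actual schema.
#
#     from typing import List, Optional
#     from pydantic import BaseModel
#
#     class NewsItem(BaseModel):          # hypothetical item model
#         title: str = ""                 # assumed field
#         url: str = ""                   # assumed field
#
#     class SearchResult(BaseModel):      # shape consumed by main()
#         code: int
#         message: str
#         success: bool
#         dataList: Optional[List[NewsItem]] = None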