#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
人民日报搜索爬虫命令行工具
用法: python RmrbSearch.py --key "关键词" --total 10 --type 0
"""
import argparse
import json
import sys
from pathlib import Path
# Add the package root (two levels above this file) to sys.path so that
# `crawler.rmrb` is importable when the script is run directly.
sys.path.insert(0, str(Path(__file__).parents[2]))
from crawler.rmrb.RmrbCrawler import RmrbCrawler
from loguru import logger
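# Note: loguru's default sink is stderr, so the log lines below never mix with
# the JSON payload this script prints to stdout.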


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description="People's Daily news search tool",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python RmrbSearch.py --key "education reform" --total 20
  python RmrbSearch.py -k "tech innovation" -t 15 --type 1
  python RmrbSearch.py --key "AI" --total 5 --output "out.json"

News types:
  0 - all types (default)
  1 - news
  2 - interactive
  3 - newspapers
  4 - pictures
  5 - video
"""
    )
    parser.add_argument(
        '--key', '-k',
        type=str,
        required=True,
        help='search keyword'
    )
    parser.add_argument(
        '--total', '-t',
        type=int,
        default=10,
        help='number of items to fetch (default: 10)'
    )
    parser.add_argument(
        '--type', '-n',
        type=int,
        default=0,
        help='news type (default: 0 = all types)'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        help='output file path'
    )
    args = parser.parse_args()
    # Collect the parsed arguments
    key = args.key
    total = args.total
    news_type = args.type
    output_file = args.output

    logger.info("Running in direct-argument mode")

    # Required check: the search keyword must be non-empty
    if not key or not key.strip():
        parser.error("Search keyword must not be empty!")
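    # parser.error() above prints the usage text to stderr and exits with
    # status 2, so an empty keyword never reaches the crawler.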
    try:
        logger.info(f"Starting search: key='{key}', total={total}, type={news_type}")
        crawler = RmrbCrawler()
        result = crawler.search(key=key.strip(), total=total, news_type=news_type)

        # Build the JSON envelope; dataList items are pydantic models,
        # hence model_dump()
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
        }

        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"Results saved to: {output_file}")

        crawler.close()

        # Print the envelope to stdout and exit 0 on success, 1 otherwise
        print(json.dumps(output, ensure_ascii=False, indent=2))
        sys.exit(0 if result.success else 1)
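    # On failure, the handler below emits the same JSON envelope shape with
    # success=False, so callers can always json.loads() stdout regardless of
    # the exit code.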
    except Exception as e:
        logger.error(f"Execution failed: {e}")
        error_output = {
            "code": 500,
            "message": f"Execution failed: {e}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)
if __name__ == "__main__":
    main()
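
# A minimal sketch of driving this tool from another Python process, assuming
# it is launched from the schoolNewsCrawler directory; the keyword and count
# below are illustrative only:
#
#   import json, subprocess, sys
#
#   proc = subprocess.run(
#       [sys.executable, "crawler/rmrb/RmrbSearch.py",
#        "--key", "education reform", "--total", "5"],
#       capture_output=True, text=True,
#   )
#   payload = json.loads(proc.stdout)  # same envelope as `output` above
#   print(proc.returncode, len(payload["dataList"]))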