#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
学习强国栏目爬虫命令行工具
用法: python RmrbSearch.py --key "关键词" --total 10 --type 0
"""
import argparse
import json
import sys
import time
from pathlib import Path

# Add the project root directory to the path so the crawler package can be imported
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from crawler.xxqg.XxqgCrawler import XxqgCrawler
from loguru import logger


def main():
    """Entry point: parse CLI arguments and run the column crawler."""
    parser = argparse.ArgumentParser(
        description='Xuexi Qiangguo (学习强国) news column crawler',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '--column', '-c',
        type=str,
        required=True,
        help='Column name: important (重要新闻), xuexishiping (学习时评), '
             'zonghexinwen (综合新闻), zhongxuanbu (中宣部)'
    )
    parser.add_argument(
        '--yesterday', '-y',
        type=str,
        default="True",
        help="Whether to crawl yesterday's data (default: True)"
    )
    parser.add_argument(
        '--start', '-s',
        type=str,
        default=None,
        help='Start date (format: YYYY-MM-DD)'
    )
    parser.add_argument(
        '--end', '-e',
        type=str,
        default=None,
        help='End date (format: YYYY-MM-DD)'
    )
    parser.add_argument(
        '--output', '-o',
        type=str,
        help='Output file path (JSON); prints to stdout if omitted'
    )
    args = parser.parse_args()
    # Extract and normalize arguments
    column = args.column
    yesterday_raw = str(args.yesterday)
    if yesterday_raw.upper() == "FALSE":
        yesterday = False
    elif yesterday_raw.upper() == "TRUE":
        yesterday = True
    else:
        parser.error("--yesterday must be True or False")
    start = args.start
    end = args.end
    output_file = args.output
    logger.info("Using direct argument mode")
    # --column must be non-empty
    if not column or not column.strip():
        parser.error("Column name must not be empty!")
    try:
        logger.info(f"Starting crawl: column='{column}', yesterday={yesterday}, start={start}, end={end}")
        crawler = XxqgCrawler()
        url_config = crawler.config.urls[column]
        time.sleep(5)  # brief pause before requesting, to avoid hitting the site too quickly
        result = crawler.crawl_base(url_config, yesterday=yesterday, start=start, end=end)
        # Wrap the result in the JSON envelope expected by the caller
        output = {
            "code": result.code,
            "message": result.message,
            "success": result.success,
            "data": None,
            "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
        }
        if output_file:
            output_path = Path(output_file)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"Results saved to: {output_file}")
        else:
            # No output file given: print the JSON result to stdout
            print(json.dumps(output, ensure_ascii=False, indent=2))
        crawler.close()
        sys.exit(0 if result.success else 1)
    except Exception as e:
        logger.error(f"Execution failed: {str(e)}")
        error_output = {
            "code": 500,
            "message": f"Execution failed: {str(e)}",
            "success": False,
            "data": None,
            "dataList": []
        }
        print(json.dumps(error_output, ensure_ascii=False, indent=2))
        sys.exit(1)


if __name__ == "__main__":
    main()
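

# Example invocation (illustrative values; valid column keys and paths depend on your
# crawler config, so treat the specifics below as assumptions):
#   python XxqgColumn.py --column important --yesterday False \
#       --start 2025-11-01 --end 2025-11-07 --output output/important.json
# On success, the output file (or stdout) holds a JSON envelope of the form:
#   {"code": ..., "message": ..., "success": true, "data": null, "dataList": [...]}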