"""
|
|
新闻爬虫主程序
|
|
用于被定时任务调用或独立运行
|
|
"""

import sys
import json
import argparse
from typing import List
from pathlib import Path

from loguru import logger

from crawler.RmrbCrawler import RmrbCrawler
from crawler.BaseCrawler import NewsItem


# Configure logging
logger.remove()  # remove the default handler
logger.add(
    sys.stdout,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> - <level>{message}</level>",
    level="INFO"
)
logger.add(
    "logs/crawler_{time:YYYY-MM-DD}.log",
    rotation="00:00",       # rotate at midnight
    retention="30 days",    # keep 30 days of log files
    encoding="utf-8",
    level="DEBUG"
)


def crawl_rmrb_news(category: str = "politics", limit: int = 20) -> List[dict]:
    """
    Crawl People's Daily (Renmin Ribao) news.

    Args:
        category: news category
        limit: number of articles to fetch

    Returns:
        List of news items as dictionaries.
    """
    logger.info(f"Crawling People's Daily news - category: {category}, limit: {limit}")

    crawler = None
    try:
        crawler = RmrbCrawler()
        news_list = crawler.crawl(category=category, limit=limit)

        # Convert NewsItem models to plain dictionaries
        result = [news.model_dump() for news in news_list]

        logger.info(f"Crawl finished, fetched {len(result)} articles")
        return result

    except Exception as e:
        logger.error(f"Crawl failed: {e}")
        return []

    finally:
        # Release crawler resources even when the crawl fails
        if crawler:
            crawler.close()


def save_to_json(news_list: List[dict], output_file: str = "output/news.json"):
    """
    Save news items to a JSON file.

    Args:
        news_list: list of news dictionaries
        output_file: output file path
    """
    try:
        # Ensure the output directory exists. Path.parent handles bare
        # filenames, where os.makedirs(os.path.dirname(...)) would raise
        # on the empty string.
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)

        logger.info(f"News saved to: {output_file}")

    except Exception as e:
        logger.error(f"Failed to save file: {e}")


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description="People's Daily news crawler",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    # Positional arguments (kept optional for backward compatibility)
    parser.add_argument(
        'category',
        nargs='?',
        default='politics',
        help='news category (default: politics)'
    )

    parser.add_argument(
        'limit',
        nargs='?',
        type=int,
        default=20,
        help='number of articles to fetch (default: 20)'
    )

    parser.add_argument(
        'output_file',
        nargs='?',
        default='output/news.json',
        help='output file path (default: output/news.json)'
    )

    # JSON parameter support
    parser.add_argument(
        '--json', '-j',
        type=str,
        help='parameters as a JSON string (takes precedence over positional arguments)'
    )

    args = parser.parse_args()
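
    # Expected --json payload shape (inferred from the parsing below):
    #   {"params": {"category": "politics", "limit": 20},
    #    "outputFile": "output/news.json"}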
    # Resolve parameters; the JSON argument takes precedence
    if args.json:
        try:
            json_data = json.loads(args.json)
            params = json_data.get('params', {})
            category = params.get('category', 'politics')
            limit = params.get('limit', 20)
            output_file = json_data.get('outputFile', 'output/news.json')
            logger.info("Using JSON parameter mode")
        except Exception as e:
            logger.error(f"Failed to parse JSON parameters: {e}")
            sys.exit(1)
    else:
        # Fall back to the command-line arguments
        category = args.category
        limit = args.limit
        output_file = args.output_file
        logger.info("Using command-line parameter mode")

    logger.info("=" * 60)
    logger.info("News crawler starting")
    logger.info("=" * 60)

    # Crawl the news
    news_list = crawl_rmrb_news(category=category, limit=limit)

    # Save the results
    if news_list:
        save_to_json(news_list, output_file)

        # Report run statistics
        logger.info("Crawl statistics:")
        logger.info(f"  - succeeded: {len(news_list)} articles")
        logger.info(f"  - failed: {limit - len(news_list)} articles")
    else:
        logger.warning("No news articles were fetched")

    logger.info("=" * 60)
    logger.info("News crawler finished")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()