People's Daily crawler (人民日报爬虫)

This commit is contained in:
2025-11-10 15:22:44 +08:00
parent 08df5f1e8a
commit e8b76278e9
36 changed files with 4241 additions and 0 deletions

schoolNewsCrawler/main.py Normal file

@@ -0,0 +1,123 @@
"""
新闻爬虫主程序
用于被定时任务调用或独立运行
"""
import sys
import json
from typing import List
from loguru import logger
from crawler.RmrbCrawler import RmrbCrawler
from crawler.BaseCrawler import NewsItem
# 配置日志
logger.remove() # 移除默认处理器
logger.add(
    sys.stdout,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> - <level>{message}</level>",
    level="INFO"
)
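# File sink: rotate at midnight, keep 30 days of DEBUG-level logs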
logger.add(
    "logs/crawler_{time:YYYY-MM-DD}.log",
    rotation="00:00",
    retention="30 days",
    encoding="utf-8",
    level="DEBUG"
)


def crawl_rmrb_news(category: str = "politics", limit: int = 20) -> List[dict]:
    """
    Crawl People's Daily news.

    Args:
        category: news category
        limit: number of items to fetch

    Returns:
        List of news items as dictionaries.
    """
    logger.info(f"Starting People's Daily crawl - category: {category}, limit: {limit}")
    crawler = None
    try:
        crawler = RmrbCrawler()
        news_list = crawler.crawl(category=category, limit=limit)
        # Convert to a list of dicts
        result = [news.model_dump() for news in news_list]
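        # model_dump() is the Pydantic v2 serialization API (Pydantic v1 used .dict())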
        logger.info(f"Crawl finished, fetched {len(result)} items")
        return result
    except Exception as e:
        logger.error(f"Crawl failed: {str(e)}")
        return []
    finally:
        if crawler:
            crawler.close()


def save_to_json(news_list: List[dict], output_file: str = "output/news.json"):
    """
    Save news items to a JSON file.

    Args:
        news_list: list of news items
        output_file: output file path
    """
    try:
        # Fall back to "." so a bare filename (no directory part) doesn't crash makedirs
        os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)
        logger.info(f"News saved to: {output_file}")
    except Exception as e:
        logger.error(f"Failed to save file: {str(e)}")


def main():
    """Entry point."""
    # Parse positional command-line arguments: category, limit, output file
    category = "politics"
    limit = 20
    output_file = "output/news.json"
    if len(sys.argv) > 1:
        category = sys.argv[1]
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    if len(sys.argv) > 3:
        output_file = sys.argv[3]

    logger.info("=" * 60)
    logger.info("News crawler starting")
    logger.info("=" * 60)

    # Crawl the news
    news_list = crawl_rmrb_news(category=category, limit=limit)

    # Save results
    if news_list:
        save_to_json(news_list, output_file)
        # Summary statistics
        logger.info("Crawl statistics:")
        logger.info(f"  - succeeded: {len(news_list)}")
        logger.info(f"  - failed: {limit - len(news_list)}")
    else:
        logger.warning("No news fetched")

    logger.info("=" * 60)
    logger.info("News crawler finished")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()
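
The crawler classes imported above live elsewhere in this commit (crawler/RmrbCrawler.py, crawler/BaseCrawler.py) and are not shown here. As a reading aid, below is a minimal sketch of the interface main.py relies on, inferred only from the calls visible above: crawl(category=..., limit=...), close(), and NewsItem.model_dump() (which implies a Pydantic v2 model). The NewsItem field names are hypothetical placeholders, not the actual schema.

# Hedged sketch of the interfaces main.py assumes; NewsItem fields are hypothetical.
from abc import ABC, abstractmethod
from typing import List, Optional
from pydantic import BaseModel


class NewsItem(BaseModel):
    """One crawled article; model_dump() in main.py is the Pydantic v2 API."""
    title: str                           # hypothetical field
    url: str                             # hypothetical field
    publish_time: Optional[str] = None   # hypothetical field
    content: Optional[str] = None        # hypothetical field


class BaseCrawler(ABC):
    @abstractmethod
    def crawl(self, category: str, limit: int) -> List[NewsItem]:
        """Fetch up to `limit` items for `category`."""

    def close(self) -> None:
        """Release any network resources held by the crawler."""


class RmrbCrawler(BaseCrawler):
    def crawl(self, category: str, limit: int) -> List[NewsItem]:
        raise NotImplementedError  # real scraping logic is in crawler/RmrbCrawler.py

Given the positional-argument parsing in main(), a typical invocation would be: python schoolNewsCrawler/main.py politics 20 output/news.json (all three arguments are optional, with the defaults shown in the code).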