From 3d742bf3227b305ee4877cbdfa109ace5ff9e03c Mon Sep 17 00:00:00 2001
From: wangys <3401275564@qq.com>
Date: Mon, 10 Nov 2025 19:13:54 +0800
Subject: [PATCH] Crawler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
schoolNewsCrawler/core/ResultDomain.py | 2 +-
schoolNewsCrawler/core/__init__.py | 0
schoolNewsCrawler/crawler/BaseCrawler.py | 13 +-
schoolNewsCrawler/crawler/RmrbCrawler.py | 106 ++++-----------
schoolNewsCrawler/crawler/RmrbHotPoint.py | 72 ++++++++++
schoolNewsCrawler/crawler/RmrbSearch.py | 103 ++++++++++++++
schoolNewsCrawler/crawler/RmrbTrending.py | 158 ++++++++++++++++++++++
7 files changed, 364 insertions(+), 90 deletions(-)
create mode 100644 schoolNewsCrawler/core/__init__.py
create mode 100644 schoolNewsCrawler/crawler/RmrbHotPoint.py
create mode 100644 schoolNewsCrawler/crawler/RmrbSearch.py
create mode 100644 schoolNewsCrawler/crawler/RmrbTrending.py
diff --git a/schoolNewsCrawler/core/ResultDomain.py b/schoolNewsCrawler/core/ResultDomain.py
index 7ce9c1b..abf2ea7 100644
--- a/schoolNewsCrawler/core/ResultDomain.py
+++ b/schoolNewsCrawler/core/ResultDomain.py
@@ -2,7 +2,7 @@ from pydantic import BaseModel, Field, HttpUrl
from typing import Any, List, Optional
class ResultDomain(BaseModel):
- code: int = Field(..., description="状态码")
+ code: int = Field(..., description="状态码",)
message: str = Field(..., description="消息")
success: bool = Field(..., description="是否成功")
data: Optional[Any] = Field(default=None, description="数据")
diff --git a/schoolNewsCrawler/core/__init__.py b/schoolNewsCrawler/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/schoolNewsCrawler/crawler/BaseCrawler.py b/schoolNewsCrawler/crawler/BaseCrawler.py
index 427d6af..6046c15 100644
--- a/schoolNewsCrawler/crawler/BaseCrawler.py
+++ b/schoolNewsCrawler/crawler/BaseCrawler.py
@@ -1,5 +1,5 @@
# 定义基础爬虫类
-from typing import Dict, Optional, List, Any
+from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
from bs4 import BeautifulSoup
@@ -12,7 +12,7 @@ class UrlConfig(BaseModel):
url: str = Field(..., description="请求URL")
params: Optional[Dict[str, Any]] = Field(default=None, description="请求参数")
method: str = Field(default="GET", description="请求方法")
-
+ headers: Optional[Dict[str, str]] = Field(default=None, description="请求头")
class Config:
# 允许任意类型
arbitrary_types_allowed = True
@@ -123,15 +123,6 @@ class BaseCrawler(ABC):
logger.error(f"HTML解析失败: {str(e)}")
return None
- @abstractmethod
- def crawl(self) -> List[NewsItem]:
- """
- 爬取新闻(子类必须实现)
-
- Returns:
- 新闻列表
- """
- pass
@abstractmethod
def parse_news_detail(self, url: str) -> Optional[NewsItem]:
diff --git a/schoolNewsCrawler/crawler/RmrbCrawler.py b/schoolNewsCrawler/crawler/RmrbCrawler.py
index 03475c7..bb15c19 100644
--- a/schoolNewsCrawler/crawler/RmrbCrawler.py
+++ b/schoolNewsCrawler/crawler/RmrbCrawler.py
@@ -1,11 +1,11 @@
# 人民日报爬虫
from typing import List, Optional
-from core import ResultDomain
+from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
-from datetime import datetime
+from datetime import datetime, timedelta
class RmrbCrawler(BaseCrawler):
@@ -50,7 +50,7 @@ class RmrbCrawler(BaseCrawler):
}
),
"one_day_trending_news": UrlConfig(
- url=lambda date: f"http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
+ url= "http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
method="GET",
params={},
headers={
@@ -63,7 +63,7 @@ class RmrbCrawler(BaseCrawler):
)
super().__init__(config)
- def search(self, key: str, total: int = 10, news_type: int = 0) -> ResultDomain:
+ def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
"""
搜索人民日报新闻
@@ -76,7 +76,7 @@ class RmrbCrawler(BaseCrawler):
新闻列表
"""
try:
- resultDomain = ResultDomain()
+ resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
# 获取搜索配置
@@ -98,7 +98,7 @@ class RmrbCrawler(BaseCrawler):
while len(news_list) < total:
search_data["page"] = page
- response = self.fetch(search_config.url, method=search_config.method, data=search_data, headers=search_config.headers)
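+ # send the search payload as a JSON request body; the endpoint replies with JSON carrying data.records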
+ response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
response_json = response.json()
if response_json.get("code") == 0:
records = response_json.get("data", {}).get("records", [])
@@ -130,7 +130,7 @@ class RmrbCrawler(BaseCrawler):
"""
try:
hot_point_rank_config = self.config.urls.get("hot_point_rank")
- resultDomain = ResultDomain()
+ resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
@@ -169,7 +169,7 @@ class RmrbCrawler(BaseCrawler):
获取人民日报一天内的热点新闻
"""
try:
- resultDomain = ResultDomain()
+ resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
resultDomain.success = True
@@ -177,7 +177,8 @@ class RmrbCrawler(BaseCrawler):
logger.info(f"获取人民日报一天内的热点新闻成功")
date_str = date.strftime("%Y%m%d")
one_day_trending_news_config = self.config.urls.get("one_day_trending_news")
- one_day_trending_news_config.url = one_day_trending_news_config.url(date_str)
+
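+ # fill the {date} placeholder in the URL template (format: YYYYMMDD); note this overwrites the template stored on the config object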
+ one_day_trending_news_config.url = one_day_trending_news_config.url.format(date=date_str)
response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
if not response:
logger.error(f"获取响应失败: {one_day_trending_news_config.url}")
@@ -194,12 +195,12 @@ class RmrbCrawler(BaseCrawler):
return resultDomain
all_doc_urls = []
- all_doc_urls.extend(a_tags)
bg01 = soup.find('td', class_="bg01")
indexfont13 = bg01.find('td', class_='indexfont13')
# 获取该 td 下的所有 a 标签
a_tags = indexfont13.find_all('a')
+ all_doc_urls.extend(a_tags)
bg02 = soup.find('td', class_="bg02")
p6 = bg02.find('td', class_='p6')
@@ -223,19 +224,21 @@ class RmrbCrawler(BaseCrawler):
获取人民日报多天内的热点新闻
"""
try:
- resultDomain = ResultDomain()
+ resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
resultDomain.success = True
resultDomain.code = 0
resultDomain.message = "获取人民日报多天内的热点新闻成功"
- for date in range(start_date, end_date):
- resultDomain = self.getOneDayTrendingNews(date)
- if not resultDomain.success:
- continue
- news_list.extend(resultDomain.dataList)
- logger.info(f"获取人民日报多天内的热点新闻成功")
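+ # walk the range one day at a time, inclusive of both start_date and end_date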
+ current_date = start_date
+ while current_date <= end_date:
+ day_result = self.getOneDayTrendingNews(current_date)
+ if day_result.success and day_result.dataList:
+ news_list.extend(day_result.dataList)
+ current_date += timedelta(days=1)
+
+ logger.info(f"获取人民日报多天内的热点新闻成功,共 {len(news_list)} 条")
return resultDomain
except Exception as e:
logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}")
@@ -322,15 +325,19 @@ class RmrbCrawler(BaseCrawler):
if p.find('img'):
tag = "img"
src = p.find('img').get('src')
- if not src.startswith("http") and src:
- src = self.config.base_url + src
+ if src:
+ src = str(src) # 转换为字符串
+ if not src.startswith("http"):
+ src = self.config.base_url + src
content = f"
"
elif p.find('video'):
tag = "video"
src = p.find('video').get('src')
- if not src.startswith("http") and src:
- src = self.config.base_url + src
+ if src:
+ src = str(src) # 转换为字符串
+ if not src.startswith("http"):
+ src = self.config.base_url + src
content = f""
else:
content = str(p)
@@ -355,61 +362,4 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")
- return None
- """
- 解析人民日报新闻详情并保存为HTML文件(UTF-8编码)
-
- Args:
- url: 新闻详情页URL
- output_file: 输出文件路径,默认为 "crawler/response.html"
- """
- try:
- response = self.fetch(url)
- if not response:
- logger.error(f"获取响应失败: {url}")
- return
-
- # BeautifulSoup 可以自动检测并解码编码,直接传入字节数据即可
- # 它会从 HTML 的 标签或响应头自动检测编码
- soup = self.parse_html(response.content)
- if not soup:
- logger.error("解析HTML失败")
- return
-
- # 保存为UTF-8编码的文件(BeautifulSoup 已经自动解码为 Unicode 字符串)
- with open(output_file, "w", encoding="utf-8") as f:
- f.write(soup.prettify())
-
- logger.info(f"成功保存HTML文件: {output_file}")
-
- except Exception as e:
- logger.error(f"解析并保存新闻详情失败 [{url}]: {str(e)}")
- import traceback
- logger.error(traceback.format_exc())
- """
- 解析时间字符串
-
- Args:
- time_text: 时间文本
-
- Returns:
- 标准化的时间字符串
- """
- try:
- # 尝试匹配常见的时间格式
- patterns = [
- r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
- r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
- r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
- ]
-
- for pattern in patterns:
- match = re.search(pattern, time_text)
- if match:
- return time_text
-
- return time_text
-
- except Exception as e:
- logger.warning(f"时间解析失败: {str(e)}")
return None
\ No newline at end of file
diff --git a/schoolNewsCrawler/crawler/RmrbHotPoint.py b/schoolNewsCrawler/crawler/RmrbHotPoint.py
new file mode 100644
index 0000000..326383a
--- /dev/null
+++ b/schoolNewsCrawler/crawler/RmrbHotPoint.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+人民日报热点排行爬虫命令行工具
+用法: python RmrbHotPoint.py
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Add parent directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from crawler.RmrbCrawler import RmrbCrawler
+from loguru import logger
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(
+ description='人民日报热点排行获取工具',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+示例:
+ python RmrbHotPoint.py
+ """
+ )
+
+ args = parser.parse_args()
+
+ try:
+ # 创建爬虫实例
+ logger.info("开始获取人民日报热点排行")
+ crawler = RmrbCrawler()
+
+ # 执行获取热点排行
+ result = crawler.hotPointRank()
+
+ # 输出JSON结果
+ output = {
+ "code": result.code,
+ "message": result.message,
+ "success": result.success,
+ "data": None,
+ "dataList": [item.dict() for item in result.dataList] if result.dataList else []
+ }
+
+ print(json.dumps(output, ensure_ascii=False, indent=2))
+
+ # 关闭爬虫
+ crawler.close()
+
+ # 退出码: 成功=0, 失败=1
+ sys.exit(0 if result.success else 1)
+
+ except Exception as e:
+ logger.error(f"执行失败: {str(e)}")
+ error_output = {
+ "code": 500,
+ "message": f"执行失败: {str(e)}",
+ "success": False,
+ "data": None,
+ "dataList": []
+ }
+ print(json.dumps(error_output, ensure_ascii=False, indent=2))
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/schoolNewsCrawler/crawler/RmrbSearch.py b/schoolNewsCrawler/crawler/RmrbSearch.py
new file mode 100644
index 0000000..6e4fbfe
--- /dev/null
+++ b/schoolNewsCrawler/crawler/RmrbSearch.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+人民日报搜索爬虫命令行工具
+用法: python RmrbSearch.py --key "关键词" --total 10 --type 0
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Add parent directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from crawler.RmrbCrawler import RmrbCrawler
+from loguru import logger
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(
+ description='人民日报新闻搜索工具',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+示例:
+ python RmrbSearch.py --key "教育改革" --total 20
+ python RmrbSearch.py -k "科技创新" -t 15 -n 1
+
+新闻类型说明:
+ 0 - 所有类型 (默认)
+ 1 - 新闻
+ 2 - 互动
+ 3 - 报刊
+ 4 - 图片
+ 5 - 视频
+ """
+ )
+
+ parser.add_argument(
+ '--key', '-k',
+ type=str,
+ required=True,
+ help='搜索关键词 (必需)'
+ )
+
+ parser.add_argument(
+ '--total', '-t',
+ type=int,
+ default=10,
+ help='获取新闻总数 (默认: 10)'
+ )
+
+ parser.add_argument(
+ '--type', '-n',
+ type=int,
+ default=0,
+ choices=[0, 1, 2, 3, 4, 5],
+ help='新闻类型: 0=全部, 1=新闻, 2=互动, 3=报刊, 4=图片, 5=视频 (默认: 0)'
+ )
+
+ args = parser.parse_args()
+
+ try:
+ # 创建爬虫实例
+ logger.info(f"开始搜索: 关键词='{args.key}', 数量={args.total}, 类型={args.type}")
+ crawler = RmrbCrawler()
+
+ # 执行搜索
+ result = crawler.search(key=args.key, total=args.total, news_type=args.type)
+
+ # 输出JSON结果
+ output = {
+ "code": result.code,
+ "message": result.message,
+ "success": result.success,
+ "data": None,
+ "dataList": [item.dict() for item in result.dataList] if result.dataList else []
+ }
+
+ print(json.dumps(output, ensure_ascii=False, indent=2))
+
+ # 关闭爬虫
+ crawler.close()
+
+ # 退出码: 成功=0, 失败=1
+ sys.exit(0 if result.success else 1)
+
+ except Exception as e:
+ logger.error(f"执行失败: {str(e)}")
+ error_output = {
+ "code": 500,
+ "message": f"执行失败: {str(e)}",
+ "success": False,
+ "data": None,
+ "dataList": []
+ }
+ print(json.dumps(error_output, ensure_ascii=False, indent=2))
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/schoolNewsCrawler/crawler/RmrbTrending.py b/schoolNewsCrawler/crawler/RmrbTrending.py
new file mode 100644
index 0000000..98c0658
--- /dev/null
+++ b/schoolNewsCrawler/crawler/RmrbTrending.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+人民日报热点新闻爬虫命令行工具
+用法:
+ python RmrbTrending.py --date 20250110
+ python RmrbTrending.py --start-date 20250101 --end-date 20250110
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Add parent directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from crawler.RmrbCrawler import RmrbCrawler
+from loguru import logger
+
+
+def parse_date(date_str: str) -> datetime:
+ """
+ 解析日期字符串为datetime对象
+
+ Args:
+ date_str: 日期字符串,格式为YYYYMMDD
+
+ Returns:
+ datetime对象
+ """
+ try:
+ return datetime.strptime(date_str, "%Y%m%d")
+ except ValueError:
+ raise ValueError(f"日期格式错误: {date_str},正确格式为YYYYMMDD,例如: 20250110")
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(
+ description='人民日报热点新闻获取工具',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+示例:
+ # 获取单日热点新闻
+ python RmrbTrending.py --date 20250110
+ python RmrbTrending.py -d 20250110
+
+ # 获取日期范围内的热点新闻
+ python RmrbTrending.py --start-date 20250101 --end-date 20250110
+ python RmrbTrending.py -s 20250101 -e 20250110
+
+ # 不指定日期则获取今天的热点新闻
+ python RmrbTrending.py
+ """
+ )
+
+ parser.add_argument(
+ '--date', '-d',
+ type=str,
+ help='指定日期 (格式: YYYYMMDD,例如: 20250110)'
+ )
+
+ parser.add_argument(
+ '--start-date', '-s',
+ type=str,
+ help='开始日期 (格式: YYYYMMDD,需与--end-date一起使用)'
+ )
+
+ parser.add_argument(
+ '--end-date', '-e',
+ type=str,
+ help='结束日期 (格式: YYYYMMDD,需与--start-date一起使用)'
+ )
+
+ args = parser.parse_args()
+
+ try:
+ # 创建爬虫实例
+ crawler = RmrbCrawler()
+
+ # 判断使用哪种模式
+ if args.date:
+ # 单日模式
+ if args.start_date or args.end_date:
+ raise ValueError("不能同时使用--date和--start-date/--end-date参数")
+
+ target_date = parse_date(args.date)
+ logger.info(f"获取单日热点新闻: {args.date}")
+ result = crawler.getOneDayTrendingNews(target_date)
+
+ elif args.start_date and args.end_date:
+ # 日期范围模式
+ start_date = parse_date(args.start_date)
+ end_date = parse_date(args.end_date)
+
+ if start_date > end_date:
+ raise ValueError("开始日期不能晚于结束日期")
+
+ logger.info(f"获取日期范围热点新闻: {args.start_date} 至 {args.end_date}")
+ result = crawler.getDaysTrendingNews(start_date, end_date)
+
+ elif args.start_date or args.end_date:
+ # 只指定了一个日期
+ raise ValueError("--start-date和--end-date必须同时使用")
+
+ else:
+ # 默认使用今天的日期
+ today = datetime.now()
+ today_str = today.strftime("%Y%m%d")
+ logger.info(f"获取今日热点新闻: {today_str}")
+ result = crawler.getOneDayTrendingNews(today)
+
+ # 输出JSON结果
+ output = {
+ "code": result.code,
+ "message": result.message,
+ "success": result.success,
+ "data": None,
+ "dataList": [item.dict() for item in result.dataList] if result.dataList else []
+ }
+
+ print(json.dumps(output, ensure_ascii=False, indent=2))
+
+ # 关闭爬虫
+ crawler.close()
+
+ # 退出码: 成功=0, 失败=1
+ sys.exit(0 if result.success else 1)
+
+ except ValueError as e:
+ logger.error(f"参数错误: {str(e)}")
+ error_output = {
+ "code": 400,
+ "message": f"参数错误: {str(e)}",
+ "success": False,
+ "data": None,
+ "dataList": []
+ }
+ print(json.dumps(error_output, ensure_ascii=False, indent=2))
+ sys.exit(1)
+
+ except Exception as e:
+ logger.error(f"执行失败: {str(e)}")
+ error_output = {
+ "code": 500,
+ "message": f"执行失败: {str(e)}",
+ "success": False,
+ "data": None,
+ "dataList": []
+ }
+ print(json.dumps(error_output, ensure_ascii=False, indent=2))
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()