From 3d742bf3227b305ee4877cbdfa109ace5ff9e03c Mon Sep 17 00:00:00 2001 From: wangys <3401275564@qq.com> Date: Mon, 10 Nov 2025 19:13:54 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- schoolNewsCrawler/core/ResultDomain.py | 2 +- schoolNewsCrawler/core/__init__.py | 0 schoolNewsCrawler/crawler/BaseCrawler.py | 13 +- schoolNewsCrawler/crawler/RmrbCrawler.py | 106 ++++----------- schoolNewsCrawler/crawler/RmrbHotPoint.py | 72 ++++++++++ schoolNewsCrawler/crawler/RmrbSearch.py | 103 ++++++++++++++ schoolNewsCrawler/crawler/RmrbTrending.py | 158 ++++++++++++++++++++++ 7 files changed, 364 insertions(+), 90 deletions(-) create mode 100644 schoolNewsCrawler/core/__init__.py create mode 100644 schoolNewsCrawler/crawler/RmrbHotPoint.py create mode 100644 schoolNewsCrawler/crawler/RmrbSearch.py create mode 100644 schoolNewsCrawler/crawler/RmrbTrending.py diff --git a/schoolNewsCrawler/core/ResultDomain.py b/schoolNewsCrawler/core/ResultDomain.py index 7ce9c1b..abf2ea7 100644 --- a/schoolNewsCrawler/core/ResultDomain.py +++ b/schoolNewsCrawler/core/ResultDomain.py @@ -2,7 +2,7 @@ from pydantic import BaseModel, Field, HttpUrl from typing import Any, List, Optional class ResultDomain(BaseModel): - code: int = Field(..., description="状态码") + code: int = Field(..., description="状态码",) message: str = Field(..., description="消息") success: bool = Field(..., description="是否成功") data: Optional[Any] = Field(default=None, description="数据") diff --git a/schoolNewsCrawler/core/__init__.py b/schoolNewsCrawler/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/schoolNewsCrawler/crawler/BaseCrawler.py b/schoolNewsCrawler/crawler/BaseCrawler.py index 427d6af..6046c15 100644 --- a/schoolNewsCrawler/crawler/BaseCrawler.py +++ b/schoolNewsCrawler/crawler/BaseCrawler.py @@ -1,5 +1,5 @@ # 定义基础爬虫类 -from typing import Dict, Optional, List, Any +from typing import Callable, 
Dict, Optional, List, Any, Union from abc import ABC, abstractmethod import requests from bs4 import BeautifulSoup @@ -12,7 +12,7 @@ class UrlConfig(BaseModel): url: str = Field(..., description="请求URL") params: Optional[Dict[str, Any]] = Field(default=None, description="请求参数") method: str = Field(default="GET", description="请求方法") - + headers: Optional[Dict[str, str]] = Field(default=None, description="请求头") class Config: # 允许任意类型 arbitrary_types_allowed = True @@ -123,15 +123,6 @@ class BaseCrawler(ABC): logger.error(f"HTML解析失败: {str(e)}") return None - @abstractmethod - def crawl(self) -> List[NewsItem]: - """ - 爬取新闻(子类必须实现) - - Returns: - 新闻列表 - """ - pass @abstractmethod def parse_news_detail(self, url: str) -> Optional[NewsItem]: diff --git a/schoolNewsCrawler/crawler/RmrbCrawler.py b/schoolNewsCrawler/crawler/RmrbCrawler.py index 03475c7..bb15c19 100644 --- a/schoolNewsCrawler/crawler/RmrbCrawler.py +++ b/schoolNewsCrawler/crawler/RmrbCrawler.py @@ -1,11 +1,11 @@ # 人民日报爬虫 from typing import List, Optional -from core import ResultDomain +from core.ResultDomain import ResultDomain from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig from loguru import logger import re import chardet -from datetime import datetime +from datetime import datetime, timedelta class RmrbCrawler(BaseCrawler): @@ -50,7 +50,7 @@ class RmrbCrawler(BaseCrawler): } ), "one_day_trending_news": UrlConfig( - url=lambda date: f"http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd + url= "http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd method="GET", params={}, headers={ @@ -63,7 +63,7 @@ class RmrbCrawler(BaseCrawler): ) super().__init__(config) - def search(self, key: str, total: int = 10, news_type: int = 0) -> ResultDomain: + def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain: """ 搜索人民日报新闻 @@ -76,7 +76,7 @@ class RmrbCrawler(BaseCrawler): 新闻列表 """ try: - resultDomain = ResultDomain() + resultDomain = 
ResultDomain(code=0, message="", success=True) news_list = [] resultDomain.dataList = news_list # 获取搜索配置 @@ -98,7 +98,7 @@ class RmrbCrawler(BaseCrawler): while len(news_list) < total: search_data["page"] = page - response = self.fetch(search_config.url, method=search_config.method, data=search_data, headers=search_config.headers) + response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers) response_json = response.json() if response_json.get("code") == 0: records = response_json.get("data", {}).get("records", []) @@ -130,7 +130,7 @@ class RmrbCrawler(BaseCrawler): """ try: hot_point_rank_config = self.config.urls.get("hot_point_rank") - resultDomain = ResultDomain() + resultDomain = ResultDomain(code=0, message="", success=True) news_list = [] resultDomain.dataList = news_list @@ -169,7 +169,7 @@ class RmrbCrawler(BaseCrawler): 获取人民日报一天内的热点新闻 """ try: - resultDomain = ResultDomain() + resultDomain = ResultDomain(code=0, message="", success=True) news_list = [] resultDomain.dataList = news_list resultDomain.success = True @@ -177,7 +177,8 @@ class RmrbCrawler(BaseCrawler): logger.info(f"获取人民日报一天内的热点新闻成功") date_str = date.strftime("%Y%m%d") one_day_trending_news_config = self.config.urls.get("one_day_trending_news") - one_day_trending_news_config.url = one_day_trending_news_config.url(date_str) + # Fill the named {date} placeholder on a per-call copy so the shared URL template in self.config is never overwritten + one_day_trending_news_config = one_day_trending_news_config.copy(update={"url": one_day_trending_news_config.url.format(date=date_str)}) response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers) if not response: logger.error(f"获取响应失败: {one_day_trending_news_config.url}") @@ -194,12 +195,12 @@ class RmrbCrawler(BaseCrawler): return resultDomain all_doc_urls = [] - all_doc_urls.extend(a_tags) bg01 = soup.find('td', class_="bg01") indexfont13 = bg01.find('td', class_='indexfont13') # 获取该 td 下的所有 a 标签 a_tags = indexfont13.find_all('a') + all_doc_urls.extend(a_tags) bg02 = 
soup.find('td', class_="bg02") p6 = bg02.find('td', class_='p6') @@ -223,19 +224,21 @@ class RmrbCrawler(BaseCrawler): 获取人民日报多天内的热点新闻 """ try: - resultDomain = ResultDomain() + resultDomain = ResultDomain(code=0,message="", success=True) news_list = [] resultDomain.dataList = news_list resultDomain.success = True resultDomain.code = 0 resultDomain.message = "获取人民日报多天内的热点新闻成功" - for date in range(start_date, end_date): - resultDomain = self.getOneDayTrendingNews(date) - if not resultDomain.success: - continue - news_list.extend(resultDomain.dataList) - logger.info(f"获取人民日报多天内的热点新闻成功") + current_date = start_date + while current_date <= end_date: + day_result = self.getOneDayTrendingNews(current_date) + if day_result.success and day_result.dataList: + news_list.extend(day_result.dataList) + current_date += timedelta(days=1) + + logger.info(f"获取人民日报多天内的热点新闻成功,共 {len(news_list)} 条") return resultDomain except Exception as e: logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}") @@ -322,15 +325,19 @@ class RmrbCrawler(BaseCrawler): if p.find('img'): tag = "img" src = p.find('img').get('src') - if not src.startswith("http") and src: - src = self.config.base_url + src + if src: + src = str(src) # 转换为字符串 + if not src.startswith("http"): + src = self.config.base_url + src content = f"" elif p.find('video'): tag = "video" src = p.find('video').get('src') - if not src.startswith("http") and src: - src = self.config.base_url + src + if src: + src = str(src) # 转换为字符串 + if not src.startswith("http"): + src = self.config.base_url + src content = f"