2025-11-10 19:13:54 +08:00
parent 81ec0f0fc9
commit 3d742bf322
7 changed files with 364 additions and 90 deletions


@@ -1,11 +1,11 @@
# People's Daily (Renmin Ribao) crawler
from typing import List, Optional
from core import ResultDomain
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime
from datetime import datetime, timedelta
class RmrbCrawler(BaseCrawler):
@@ -50,7 +50,7 @@ class RmrbCrawler(BaseCrawler):
}
),
"one_day_trending_news": UrlConfig(
url=lambda date: f"http://www.people.com.cn/GB/59476/review/{date}.html", # date:YYYYMMdd
url="http://www.people.com.cn/GB/59476/review/{date}.html",  # date: YYYYMMDD
method="GET",
params={},
headers={
@@ -63,7 +63,7 @@ class RmrbCrawler(BaseCrawler):
)
super().__init__(config)
def search(self, key: str, total: int = 10, news_type: int = 0) -> ResultDomain:
def search(self, key: str, total: int, news_type: int = 0) -> ResultDomain:
"""
Search People's Daily news
@@ -76,7 +76,7 @@ class RmrbCrawler(BaseCrawler):
List of news items
"""
try:
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
# Get the search config
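The hunks in this commit replace the bare `ResultDomain()` calls with explicit `code`, `message`, and `success` arguments. The diff does not show `core.ResultDomain` itself; a minimal sketch of a result wrapper that would require those fields (and so explain the change) might look like this:

```python
from dataclasses import dataclass
from typing import Any, List, Optional

@dataclass
class ResultDomain:
    # Hypothetical shape: with no defaults on these fields, a bare
    # ResultDomain() raises TypeError, which is why the crawler now
    # passes code/message/success explicitly.
    code: int
    message: str
    success: bool
    dataList: Optional[List[Any]] = None
```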
@@ -98,7 +98,7 @@ class RmrbCrawler(BaseCrawler):
while len(news_list) < total:
search_data["page"] = page
response = self.fetch(search_config.url, method=search_config.method, data=search_data, headers=search_config.headers)
response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
response_json = response.json()
if response_json.get("code") == 0:
records = response_json.get("data", {}).get("records", [])
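Switching the search request from `data=search_data` to `json=search_data` changes how the request body is encoded. A standalone sketch with the `requests` library, assuming `self.fetch` forwards these keyword arguments to a `requests`-style call (the endpoint and payload below are placeholders, not the crawler's real config):

```python
import requests

url = "https://example.com/api/search"   # placeholder endpoint
search_data = {"key": "keyword", "page": 1}

# data=... sends a form-encoded body ("key=keyword&page=1") with
# Content-Type: application/x-www-form-urlencoded.
resp_form = requests.post(url, data=search_data, timeout=10)

# json=... serializes the dict to a JSON body and sets
# Content-Type: application/json, which is what an API returning
# {"code": 0, "data": {"records": [...]}} typically expects.
resp_json = requests.post(url, json=search_data, timeout=10)
```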
@@ -130,7 +130,7 @@ class RmrbCrawler(BaseCrawler):
"""
try:
hot_point_rank_config = self.config.urls.get("hot_point_rank")
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
@@ -169,7 +169,7 @@ class RmrbCrawler(BaseCrawler):
Fetch People's Daily trending news within a single day
"""
try:
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
resultDomain.success = True
@@ -177,7 +177,8 @@ class RmrbCrawler(BaseCrawler):
logger.info(f"Fetched People's Daily one-day trending news successfully")
date_str = date.strftime("%Y%m%d")
one_day_trending_news_config = self.config.urls.get("one_day_trending_news")
one_day_trending_news_config.url = one_day_trending_news_config.url(date_str)
one_day_trending_news_config.url = one_day_trending_news_config.url.format(date=date_str)
response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
if not response:
logger.error(f"Failed to fetch response: {one_day_trending_news_config.url}")
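The `{date}` placeholder in the trending-news URL is named, so `format()` must be called with a matching keyword (a positional `format(date_str)` raises `KeyError: 'date'`). Note also that writing the formatted URL back onto the shared config overwrites the template; a sketch of one way to keep the template intact (illustrative helper, not the crawler's actual code):

```python
from datetime import datetime

URL_TEMPLATE = "http://www.people.com.cn/GB/59476/review/{date}.html"

def build_trending_url(day: datetime) -> str:
    # Named placeholder, so format() needs the date= keyword.
    return URL_TEMPLATE.format(date=day.strftime("%Y%m%d"))

# Formatting into a local value keeps URL_TEMPLATE reusable for other dates,
# unlike assigning the result back onto the config object.
print(build_trending_url(datetime(2025, 11, 10)))
# http://www.people.com.cn/GB/59476/review/20251110.html
```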
@@ -194,12 +195,12 @@ class RmrbCrawler(BaseCrawler):
return resultDomain
all_doc_urls = []
all_doc_urls.extend(a_tags)
bg01 = soup.find('td', class_="bg01")
indexfont13 = bg01.find('td', class_='indexfont13')
# Collect all <a> tags under this td
a_tags = indexfont13.find_all('a')
all_doc_urls.extend(a_tags)
bg02 = soup.find('td', class_="bg02")
p6 = bg02.find('td', class_='p6')
@@ -223,19 +224,21 @@ class RmrbCrawler(BaseCrawler):
Fetch People's Daily trending news across multiple days
"""
try:
resultDomain = ResultDomain()
resultDomain = ResultDomain(code=0, message="", success=True)
news_list = []
resultDomain.dataList = news_list
resultDomain.success = True
resultDomain.code = 0
resultDomain.message = "Fetched People's Daily multi-day trending news successfully"
for date in range(start_date, end_date):
resultDomain = self.getOneDayTrendingNews(date)
if not resultDomain.success:
continue
news_list.extend(resultDomain.dataList)
logger.info(f"Fetched People's Daily multi-day trending news successfully")
current_date = start_date
while current_date <= end_date:
day_result = self.getOneDayTrendingNews(current_date)
if day_result.success and day_result.dataList:
news_list.extend(day_result.dataList)
current_date += timedelta(days=1)
logger.info(f"Fetched People's Daily multi-day trending news successfully, {len(news_list)} items in total")
return resultDomain
except Exception as e:
logger.error(f"Failed to fetch People's Daily multi-day trending news: {str(e)}")
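The rewritten loop replaces `range(start_date, end_date)`, which raises `TypeError` for `datetime` objects, with a `timedelta` step that also includes the end date. The same pattern as a standalone sketch:

```python
from datetime import datetime, timedelta

def iter_days(start_date: datetime, end_date: datetime):
    # Step one day at a time; unlike range(), this includes end_date itself.
    current = start_date
    while current <= end_date:
        yield current
        current += timedelta(days=1)

days = [d.strftime("%Y%m%d") for d in iter_days(datetime(2025, 11, 8), datetime(2025, 11, 10))]
print(days)  # ['20251108', '20251109', '20251110']
```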
@@ -322,15 +325,19 @@ class RmrbCrawler(BaseCrawler):
if p.find('img'):
tag = "img"
src = p.find('img').get('src')
if not src.startswith("http") and src:
src = self.config.base_url + src
if src:
src = str(src)  # coerce to string
if not src.startswith("http"):
src = self.config.base_url + src
content = f"<img style='{p_style}' src='{src}' />"
elif p.find('video'):
tag = "video"
src = p.find('video').get('src')
if not src.startswith("http") and src:
src = self.config.base_url + src
if src:
src = str(src)  # coerce to string
if not src.startswith("http"):
src = self.config.base_url + src
content = f"<video style='align-items: center;' src='{src}' />"
else:
content = str(p)
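The media-source fix above coerces `src` to `str` and prefixes `base_url` only when the value is not already absolute. An alternative sketch using `urllib.parse.urljoin`, which additionally handles protocol-relative (`//host/...`) and `./relative` paths (the base URL below stands in for `config.base_url`):

```python
from urllib.parse import urljoin

BASE_URL = "http://www.people.com.cn"  # placeholder for self.config.base_url

def absolutize(src) -> str:
    # BeautifulSoup attribute values are not always plain str, so coerce first,
    # mirroring the str(src) call added in the hunk above.
    src = str(src)
    # urljoin leaves absolute URLs untouched and resolves relative ones against the base.
    return urljoin(BASE_URL + "/", src)

print(absolutize("/img/photo.jpg"))            # http://www.people.com.cn/img/photo.jpg
print(absolutize("http://cdn.example/x.png"))  # unchanged
```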
@@ -355,61 +362,4 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"Failed to parse news detail [{url}]: {str(e)}")
return None
"""
Parse a People's Daily news detail page and save it as an HTML file (UTF-8 encoded)
Args:
url: URL of the news detail page
output_file: output file path, defaults to "crawler/response.html"
"""
try:
response = self.fetch(url)
if not response:
logger.error(f"Failed to fetch response: {url}")
return
# BeautifulSoup can detect and decode the encoding automatically, so the raw bytes can be passed in directly
# It detects the encoding from the HTML <meta charset> tag or the response headers
soup = self.parse_html(response.content)
if not soup:
logger.error("Failed to parse HTML")
return
# Save as a UTF-8 encoded file (BeautifulSoup has already decoded the content to a Unicode string)
with open(output_file, "w", encoding="utf-8") as f:
f.write(soup.prettify())
logger.info(f"Saved HTML file successfully: {output_file}")
except Exception as e:
logger.error(f"Failed to parse and save news detail [{url}]: {str(e)}")
import traceback
logger.error(traceback.format_exc())
"""
Parse a time string
Args:
time_text: time text
Returns:
Normalized time string
"""
try:
# Try to match common time formats
patterns = [
r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
]
for pattern in patterns:
match = re.search(pattern, time_text)
if match:
return time_text
return time_text
except Exception as e:
logger.warning(f"Failed to parse time: {str(e)}")
return None
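The deleted time-parsing helper matched several timestamp styles but returned `time_text` unchanged on a match. If normalization is ever reinstated, a sketch of how the same patterns could be converted to one canonical format (illustrative only, not part of this commit):

```python
import re
from datetime import datetime
from typing import Optional

# Pattern / strptime-format pairs for the styles the deleted helper matched.
PATTERNS = [
    (r"\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}", "%Y-%m-%d %H:%M:%S"),
    (r"\d{4}年\d{2}月\d{2}日\s+\d{2}:\d{2}", "%Y年%m月%d日 %H:%M"),
    (r"\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}", "%Y/%m/%d %H:%M"),
]

def normalize_time(time_text: str) -> Optional[str]:
    """Return the first matched timestamp as 'YYYY-MM-DD HH:MM:SS', else None."""
    for pattern, fmt in PATTERNS:
        match = re.search(pattern, time_text)
        if match:
            # Collapse runs of whitespace so the single-space formats accept it.
            raw = re.sub(r"\s+", " ", match.group(0))
            return datetime.strptime(raw, fmt).strftime("%Y-%m-%d %H:%M:%S")
    return None

print(normalize_time("2025年11月10日 19:13"))  # 2025-11-10 19:13:00
```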