Search keyword crawler

2025-11-12 16:10:34 +08:00
parent 7be02fe396
commit 675e6da7d7
37 changed files with 3382 additions and 572 deletions


@@ -100,14 +100,25 @@ class RmrbCrawler(BaseCrawler):
search_data["page"] = page
response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
response_json = response.json()
if response_json.get("code") == 0:
if response_json.get("code") == '0':
records = response_json.get("data", {}).get("records", [])
for record in records:
news = self.parse_news_detail(record.get("url"))
if news['title'] == '':
news['title'] = record.get("title")
if news['contentRows'] == []:
news['contentRows'] = record.get("contentOriginal")
if news['publishTime'] == '':
news['publishTime'] = datetime.datetime.fromtimestamp(record.get("displayTime") / 1000).date()
if news['author'] == '':
news['author'] = record.get("author")
if news['source'] == '':
news['source'] = record.get("originName")
news_list.append(news)
else:
resultDomain.code = response_json.get("code")
resultDomain.message = "获取搜索结果失败" + response_json.get("message")
resultDomain.message = f"获取搜索结果失败{response_json.get('message') or ''}"
resultDomain.success = False
return resultDomain
page += 1
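
The first hunk changes the success check from the integer 0 to the string '0' (the API evidently returns the code as a string) and adds per-field fallbacks: whenever parse_news_detail comes back with an empty field, the crawler now fills it from the search record itself. A compact, hypothetical equivalent of that backfill step, assuming the record keys shown above ("title", "contentOriginal", "displayTime", "author", "originName") and item-style access on the news object:

```python
import datetime

# Hypothetical helper, not part of the commit: maps empty NewsItem fields
# to the search-record keys they fall back to.
FALLBACKS = {
    "title": "title",
    "contentRows": "contentOriginal",
    "author": "author",
    "source": "originName",
}

def backfill_from_record(news, record):
    for news_field, record_key in FALLBACKS.items():
        if not news[news_field]:
            news[news_field] = record.get(record_key)
    # displayTime appears to be a millisecond epoch value
    if not news["publishTime"] and record.get("displayTime"):
        news["publishTime"] = datetime.datetime.fromtimestamp(
            record["displayTime"] / 1000
        ).date()
    return news
```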
@@ -143,14 +154,14 @@ class RmrbCrawler(BaseCrawler):
response = self.fetch(hot_point_rank_config.url, method=hot_point_rank_config.method, headers=hot_point_rank_config.headers)
response_json = response.json()
if response_json.get("code") == 0:
if response_json.get("code") == '0':
records = response_json.get("data", [])
for record in records:
news = self.parse_news_detail(record.get("url"))
news_list.append(news)
else:
resultDomain.code = response_json.get("code")
resultDomain.message = "获取人民日报热点排行失败" + response_json.get("message")
resultDomain.message = f"获取人民日报热点排行失败{response_json.get('message') or ''}"
resultDomain.success = False
return resultDomain
resultDomain.success = True
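
Every error path also moves from string concatenation to an f-string guarded with `or ''`. The difference matters when the API omits "message": concatenating None raises a TypeError inside the error handler itself, while the f-string form simply drops the suffix. A small illustration:

```python
response_json = {"code": "500"}  # failing response with no "message" key

# old style: None cannot be concatenated to a str
try:
    msg = "获取搜索结果失败" + response_json.get("message")
except TypeError as exc:
    print(f"concatenation raised: {exc}")

# new style: degrades to an empty suffix instead of raising
msg = f"获取搜索结果失败{response_json.get('message') or ''}"
print(msg)
```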
@@ -160,7 +171,7 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"获取人民日报热点排行失败: {str(e)}")
resultDomain.code = 0
resultDomain.message = "获取人民日报热点排行失败" + str(e)
resultDomain.message = f"获取人民日报热点排行失败{str(e)}"
resultDomain.success = False
return resultDomain
@@ -178,19 +189,19 @@ class RmrbCrawler(BaseCrawler):
date_str = date.strftime("%Y%m%d")
one_day_trending_news_config = self.config.urls.get("one_day_trending_news")
one_day_trending_news_config.url = one_day_trending_news_config.url.format(date_str)
one_day_trending_news_config.url = one_day_trending_news_config.url.format(date=date_str)
response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
if not response:
logger.error(f"获取响应失败: {one_day_trending_news_config.url}")
resultDomain.code = 0
resultDomain.message = "获取响应失败" + one_day_trending_news_config.url
resultDomain.message = f"获取响应失败{one_day_trending_news_config.url or ''}"
resultDomain.success = False
return resultDomain
soup = self.parse_html(response.content)
if not soup:
logger.error(f"解析HTML失败: {one_day_trending_news_config.url}")
resultDomain.code = 0
resultDomain.message = "解析HTML失败" + one_day_trending_news_config.url
resultDomain.message = f"解析HTML失败{one_day_trending_news_config.url or ''}"
resultDomain.success = False
return resultDomain
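
The one_day_trending_news URL is now formatted with a keyword argument, which implies the template in the config uses a named {date} placeholder rather than a bare {}. A quick sketch with a made-up template (the real one lives in self.config.urls):

```python
url_template = "https://example.invalid/trending/{date}.html"  # hypothetical template
date_str = "20251112"

# url_template.format(date_str) would raise KeyError: 'date' on a named placeholder;
# passing it by keyword matches the template, which is what the diff switches to.
url = url_template.format(date=date_str)
print(url)  # https://example.invalid/trending/20251112.html
```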
@@ -215,7 +226,7 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"获取人民日报一天内的热点新闻失败: {str(e)}")
resultDomain.code = 0
resultDomain.message = "获取人民日报一天内的热点新闻失败" + str(e)
resultDomain.message = f"获取人民日报一天内的热点新闻失败{str(e)}"
resultDomain.success = False
return resultDomain
@@ -243,7 +254,7 @@ class RmrbCrawler(BaseCrawler):
except Exception as e:
logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}")
resultDomain.code = 0
resultDomain.message = "获取人民日报多天内的热点新闻失败" + str(e)
resultDomain.message = f"获取人民日报多天内的热点新闻失败{str(e)}"
resultDomain.success = False
return resultDomain
@@ -259,29 +270,37 @@ class RmrbCrawler(BaseCrawler):
"""
try:
response = self.fetch(url)
news = NewsItem(
title="",
contentRows=[], # fix: use contents instead of content
url=url,
publishTime="",
author="",
source="人民网",
category=""
)
if not response:
logger.error(f"获取响应失败: {url}")
return None
return news
# BeautifulSoup detects and decodes the encoding automatically, so the raw bytes can be passed in directly
# It picks up the encoding from the HTML <meta charset> tag or the response headers
soup = self.parse_html(response.content)
if not soup:
logger.error("解析HTML失败")
return None
return news
# Extract the main content area
main_div = soup.find("div", class_="layout rm_txt cf")
if not main_div:
logger.error("未找到主内容区域")
return None
return news
# Extract the article section
article_div = main_div.find("div", class_="col col-1")
if not article_div:
logger.error("未找到文章区域")
return None
return news
# Extract the title
title_tag = article_div.select_one("h1")
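
parse_news_detail now builds a NewsItem with empty defaults up front and returns it on every early exit instead of None, so the search loop above can backfill the blanks without a None check. A minimal, self-contained sketch of that contract, with a hypothetical dataclass standing in for the project's NewsItem model:

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class NewsItem:  # simplified stand-in, not the project's real model
    title: str = ""
    contentRows: List[dict] = field(default_factory=list)
    url: str = ""
    publishTime: str = ""
    author: str = ""
    source: str = "人民网"
    category: str = ""

def parse_news_detail(url: str, html: Optional[str]) -> NewsItem:
    """Always return a NewsItem; empty fields mean 'could not parse'."""
    news = NewsItem(url=url)
    if not html:  # fetch or parse failure: hand back the sparse item
        return news
    # ... real parsing would fill news.title, news.contentRows, etc. ...
    return news

item = parse_news_detail("https://example.invalid/article", html=None)
print(item.title == "", item.source)  # True 人民网
```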
@@ -347,15 +366,14 @@ class RmrbCrawler(BaseCrawler):
"content": content
})
news = NewsItem(
title=title,
contentRows=contents, # fix: use contents instead of content
url=url,
publishTime=publish_time,
author=author,
source=source or "人民网",
category=""
)
news.title = title
news.contentRows = contents  # fix: use contents instead of content
news.url = url
news.publishTime = publish_time
news.author = author
news.source = source or "人民网"
news.category = ""
logger.info(f"成功解析新闻: {title}")
return news
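
Assigning the parsed fields onto the pre-built news object, instead of constructing the NewsItem at the end, is what makes every early `return news` hand back a usable, if sparse, result. The apparent trade-off is that a failed parse and an article with genuinely empty fields look the same, so callers such as the search loop treat empty strings as "unknown and worth backfilling" rather than as authoritative values.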