Search keyword crawler
@@ -100,14 +100,25 @@ class RmrbCrawler(BaseCrawler):
             search_data["page"] = page
             response = self.fetch(search_config.url, method=search_config.method, json=search_data, headers=search_config.headers)
             response_json = response.json()
-            if response_json.get("code") == 0:
+            if response_json.get("code") == '0':
                 records = response_json.get("data", {}).get("records", [])
                 for record in records:
                     news = self.parse_news_detail(record.get("url"))
+                    if news['title'] == '':
+                        news['title'] = record.get("title")
+                    if news['contentRows'] == []:
+                        news['contentRows'] = record.get("contentOriginal")
+                    if news['publishTime'] == '':
+                        news['publishTime'] = datetime.datetime.fromtimestamp(record.get("displayTime") / 1000).date()
+                    if news['author'] == '':
+                        news['author'] = record.get("author")
+                    if news['source'] == '':
+                        news['source'] = record.get("originName")
+
                     news_list.append(news)
             else:
                 resultDomain.code = response_json.get("code")
-                resultDomain.message = "获取搜索结果失败" + response_json.get("message")
+                resultDomain.message = f"获取搜索结果失败{response_json.get('message') or ''}"
                 resultDomain.success = False
                 return resultDomain
             page += 1
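Two independent fixes in this hunk: the search endpoint evidently returns its status code as the string '0' rather than the integer 0, and fields that parse_news_detail left empty are now back-filled from the search record itself. If the code's type is at risk of changing again, a coercing check is a defensive alternative (a sketch, not part of the commit):

    def is_ok(response_json: dict) -> bool:
        # Accept either 0 or "0": the API currently sends the string form
        return str(response_json.get("code")) == "0"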
@@ -143,14 +154,14 @@ class RmrbCrawler(BaseCrawler):
             response = self.fetch(hot_point_rank_config.url, method=hot_point_rank_config.method, headers=hot_point_rank_config.headers)
             response_json = response.json()

-            if response_json.get("code") == 0:
+            if response_json.get("code") == '0':
                 records = response_json.get("data", [])
                 for record in records:
                     news = self.parse_news_detail(record.get("url"))
                     news_list.append(news)
             else:
                 resultDomain.code = response_json.get("code")
-                resultDomain.message = "获取人民日报热点排行失败" + response_json.get("message")
+                resultDomain.message = f"获取人民日报热点排行失败{response_json.get('message') or ''}"
                 resultDomain.success = False
                 return resultDomain
             resultDomain.success = True
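The message rewrites throughout this commit are not just cosmetic. When the API omits message, response_json.get("message") returns None, and concatenating a str with None raises TypeError, masking the real error. The f-string plus `or ''` guard degrades to the bare prefix instead. A minimal standalone illustration:

    detail = None  # what .get("message") yields when the key is absent

    # old pattern: raises TypeError (can only concatenate str, not NoneType)
    # msg = "获取搜索结果失败" + detail

    # new pattern: falls back to the bare prefix
    msg = f"获取搜索结果失败{detail or ''}"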
@@ -160,7 +171,7 @@ class RmrbCrawler(BaseCrawler):
         except Exception as e:
             logger.error(f"获取人民日报热点排行失败: {str(e)}")
             resultDomain.code = 0
-            resultDomain.message = "获取人民日报热点排行失败" + str(e)
+            resultDomain.message = f"获取人民日报热点排行失败{str(e)}"
             resultDomain.success = False
             return resultDomain

@@ -178,19 +189,19 @@ class RmrbCrawler(BaseCrawler):
             date_str = date.strftime("%Y%m%d")
             one_day_trending_news_config = self.config.urls.get("one_day_trending_news")

-            one_day_trending_news_config.url = one_day_trending_news_config.url.format(date_str)
+            one_day_trending_news_config.url = one_day_trending_news_config.url.format(date=date_str)
             response = self.fetch(one_day_trending_news_config.url, method=one_day_trending_news_config.method, headers=one_day_trending_news_config.headers)
             if not response:
                 logger.error(f"获取响应失败: {one_day_trending_news_config.url}")
                 resultDomain.code = 0
-                resultDomain.message = "获取响应失败" + one_day_trending_news_config.url
+                resultDomain.message = f"获取响应失败{one_day_trending_news_config.url or ''}"
                 resultDomain.success = False
                 return resultDomain
             soup = self.parse_html(response.content)
             if not soup:
                 logger.error(f"解析HTML失败: {one_day_trending_news_config.url}")
                 resultDomain.code = 0
-                resultDomain.message = "解析HTML失败" + one_day_trending_news_config.url
+                resultDomain.message = f"解析HTML失败{one_day_trending_news_config.url or ''}"
                 resultDomain.success = False
                 return resultDomain

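Switching .format(date_str) to .format(date=date_str) implies the configured URL template carries a named placeholder such as {date}; a positional argument against a named field raises KeyError. For illustration, with a hypothetical template of that shape:

    template = "http://news.example.cn/trending/{date}.json"  # hypothetical URL

    # template.format("20240101")          # would raise KeyError: 'date'
    url = template.format(date="20240101")  # named placeholder needs a named argument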
@@ -215,7 +226,7 @@ class RmrbCrawler(BaseCrawler):
         except Exception as e:
             logger.error(f"获取人民日报一天内的热点新闻失败: {str(e)}")
             resultDomain.code = 0
-            resultDomain.message = "获取人民日报一天内的热点新闻失败" + str(e)
+            resultDomain.message = f"获取人民日报一天内的热点新闻失败{str(e)}"
             resultDomain.success = False
             return resultDomain

@@ -243,7 +254,7 @@ class RmrbCrawler(BaseCrawler):
         except Exception as e:
             logger.error(f"获取人民日报多天内的热点新闻失败: {str(e)}")
             resultDomain.code = 0
-            resultDomain.message = "获取人民日报多天内的热点新闻失败" + str(e)
+            resultDomain.message = f"获取人民日报多天内的热点新闻失败{str(e)}"
             resultDomain.success = False
             return resultDomain

@@ -259,29 +270,37 @@ class RmrbCrawler(BaseCrawler):
         """
         try:
             response = self.fetch(url)
-
+            news = NewsItem(
+                title="",
+                contentRows=[],  # fix: use contents rather than content
+                url=url,
+                publishTime="",
+                author="",
+                source="人民网",
+                category=""
+            )
             if not response:
                 logger.error(f"获取响应失败: {url}")
-                return None
+                return news

             # BeautifulSoup can auto-detect and decode the encoding, so the raw bytes can be passed in directly
             # It detects the encoding from the HTML <meta charset> tag or the response headers
             soup = self.parse_html(response.content)
             if not soup:
                 logger.error("解析HTML失败")
-                return None
+                return news

             # Extract the main content area
             main_div = soup.find("div", class_="layout rm_txt cf")
             if not main_div:
                 logger.error("未找到主内容区域")
-                return None
+                return news

             # Extract the article area
             article_div = main_div.find("div", class_="col col-1")
             if not article_div:
                 logger.error("未找到文章区域")
-                return None
+                return news

             # Extract the title
             title_tag = article_div.select_one("h1")
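The structural change in parse_news_detail: a default-initialized NewsItem is built up front, and every early-exit path now returns it instead of None, so callers such as the search loop above (which immediately indexes news['title']) never have to guard against None. The shape of the pattern in isolation, with a stand-in dataclass since NewsItem's definition is not in this diff:

    from dataclasses import dataclass, field

    @dataclass
    class NewsItem:  # stand-in; fields mirror the diff
        title: str = ""
        contentRows: list = field(default_factory=list)
        url: str = ""
        publishTime: str = ""
        author: str = ""
        source: str = "人民网"
        category: str = ""

    def parse_detail(url, fetch):
        news = NewsItem(url=url)  # fallback object created before any I/O
        response = fetch(url)
        if not response:
            return news           # empty-but-valid item instead of None
        # ... parse the page and assign onto news ...
        return news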
@@ -347,15 +366,14 @@ class RmrbCrawler(BaseCrawler):
                     "content": content
                 })

-            news = NewsItem(
-                title=title,
-                contentRows=contents,  # fix: use contents rather than content
-                url=url,
-                publishTime=publish_time,
-                author=author,
-                source=source or "人民网",
-                category=""
-            )
+            news.title = title
+            news.contentRows = contents  # fix: use contents rather than content
+            news.url = url
+            news.publishTime = publish_time
+            news.author = author
+            news.source = source or "人民网"
+            news.category = ""

             logger.info(f"成功解析新闻: {title}")
             return news

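With the item created at the top of the method, the success path assigns onto it rather than constructing a second NewsItem, so any field the page did not yield keeps its empty default. From the caller's side no None check is needed (a sketch; crawler and the URL are placeholders):

    url = "http://politics.people.com.cn/..."  # placeholder article URL
    news = crawler.parse_news_detail(url)      # crawler: an RmrbCrawler instance
    # Always a NewsItem: empty fields now signal a partial parse, not a crash
    if not news.title:
        print(f"detail parse fell back to defaults for {news.url}")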