Xinhuanet crawler update

This commit is contained in:
2026-01-09 14:03:14 +08:00
parent 30e3d86c9f
commit 8042675d71
2 changed files with 205 additions and 31 deletions


@@ -106,6 +106,16 @@ class XhwCrawler(BaseCrawler):
"热点发布": "news"
}
# Detail-page parser map; the parse method is chosen by URL domain
# xhsz.news.cn -> parse_xhsz_news_detail
# www.news.cn -> parse_xh_news_detail
# www.xinhuanet.com -> parse_xinhuanet_news_detail
self.detail_map = {
"xhsz.news.cn": self.parse_xhsz_news_detail, # 新华时政
"www.news.cn": self.parse_xh_news_detail, # 新华网主站
"www.xinhuanet.com": self.parse_xinhuanet_news_detail, # 新华网旧站
}
# Create the WebDriver at init time
self.driver = self._init_driver()
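The dispatch below relies on urllib.parse.urlparse to pull the domain out of each article URL; a minimal sketch of the lookup (assuming the import already sits at the top of the module, with a hypothetical article URL):

from urllib.parse import urlparse

# The netloc is the key into detail_map (URL here is hypothetical)
netloc = urlparse("https://xhsz.news.cn/a/b.html").netloc  # -> "xhsz.news.cn"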
@@ -192,7 +202,20 @@ class XhwCrawler(BaseCrawler):
return driver
def parse_news_detail(self, url: str) -> Optional[NewsItem]:
"""
Pick the parser for a detail page based on the URL's domain.
"""
# Extract the full domain from the URL
netloc = urlparse(url).netloc
# Look up the matching parse function in detail_map
parser_func = self.detail_map.get(netloc)
if parser_func is None:
logger.warning(f"未找到对应解析器netloc={netloc}, url={url},使用默认解析器")
parser_func = self.parse_xhsz_news_detail
return parser_func(url)
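Usage sketch (the crawler variable name and both URLs are hypothetical):

item = crawler.parse_news_detail("https://xhsz.news.cn/20260101/article.html")  # -> parse_xhsz_news_detail
item = crawler.parse_news_detail("https://unknown.example.com/article.html")    # -> default parse_xhsz_news_detail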
def parse_xhsz_news_detail(self, url: str) -> NewsItem:
"""
@@ -231,7 +254,11 @@ class XhwCrawler(BaseCrawler):
final_url = self.driver.current_url
if final_url != url:
# The URL was redirected; pick the parser matching the new domain
netloc = urlparse(final_url).netloc
parser_func = self.detail_map.get(netloc, self.parse_xhsz_news_detail)
news_item = parser_func(final_url)
news_item.url = url
return news_item
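Note that news_item.url is reset to the originally requested URL after the redirected page is parsed, presumably so stored items keep the URL the crawler was asked to fetch rather than the redirect target.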
@@ -422,7 +449,13 @@ class XhwCrawler(BaseCrawler):
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
parse_content(content_div)
except Exception as e:
logger.warning(f"新闻内容解析失败: {url}, {e}")
# Pagination handling (separate try-except so a failure here does not discard already-parsed content)
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
page_urls = []
if page_div:
@@ -437,9 +470,152 @@ class XhwCrawler(BaseCrawler):
time.sleep(2)
content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
parse_content(content_div)
except Exception:
# Missing pagination is the normal case; no warning is needed
pass
return news_item
def parse_xinhuanet_news_detail(self, url: str) -> NewsItem:
"""
Parse a news detail page from the legacy site (xinhuanet.com) with Selenium.
Exceptions are caught locally so a NewsItem is always returned, even if part of the content fails to parse.
"""
news_item = NewsItem(title="", contentRows=[], url=url)
if not self.driver:
logger.error("WebDriver未初始化无法获取新闻详情")
return news_item
try:
self.driver.get(url)
time.sleep(2)
except Exception as e:
logger.warning(f"访问新闻详情页失败: {url}, {e}")
return news_item
# Slider-captcha handling
try:
sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
if sliders:
slider = sliders[0]
action_chain = ActionChains(self.driver)
action_chain.click_and_hold(slider).perform()
distance = 1000
tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
for track in tracks:
action_chain.move_by_offset(int(track), 0).pause(1)
action_chain.perform()
action_chain.release().perform()
time.sleep(2)
except Exception as e:
logger.info(f"滑块验证处理失败或未出现: {e}")
# Header - xinhuanet format: <span class="year"><em> 2021</em></span><span class="day">01/21</span><span class="time"> 00:28:02</span>
try:
head_div = self.driver.find_element(By.CSS_SELECTOR, "div.header.domPC")
time_div = head_div.find_element(By.CSS_SELECTOR, "div.header-time.left")
year = time_div.find_element(By.CSS_SELECTOR, "span.year").text.strip()
day = time_div.find_element(By.CSS_SELECTOR, "span.day").text.strip()
time_str = time_div.find_element(By.CSS_SELECTOR, "span.time").text.strip()
datetimes = f"{year}/{day} {time_str}"
news_item.publishTime = str(datetime.strptime(datetimes, "%Y/%m/%d %H:%M:%S"))
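# e.g. year="2021", day="01/21", time_str="00:28:02"
#      -> "2021/01/21 00:28:02" -> datetime(2021, 1, 21, 0, 28, 2)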
source_div = head_div.find_element(By.CSS_SELECTOR, "div.source")
source_text = source_div.text.strip()
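# e.g. source_text = "来源:新华网" -> source = "新华网" (assumes a full-width colon delimiter)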
if "" in source_text:
news_item.source = source_text.split("")[1]
else:
news_item.source = source_text
# The title lives in h1 > span.title
title = head_div.find_element(By.CSS_SELECTOR, "h1 span.title").text.strip()
news_item.title = title
except Exception as e:
logger.warning(f"解析头部信息失败: {url}, {e}")
def parse_content(content_div):
children = content_div.find_elements(By.XPATH, "./*")
for child in children:
try:
tag_name = child.tag_name.lower()
if tag_name == "p" or tag_name == "div":
text = child.text.strip().replace("\xa0", "")
if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
continue
# Video
try:
video = child.find_element(By.TAG_NAME, "video")
src = video.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
style = video.get_attribute("style") or ""
news_item.contentRows.append({"tag": "video", "content": f"<video style='{style}' src='{src}' />"})
continue
except Exception:
pass
# Image
try:
img = child.find_element(By.TAG_NAME, "img")
src = img.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
style = img.get_attribute("style") or ""
news_item.contentRows.append({"tag": "img", "content": f"<img style='{style}' src='{src}' />"})
continue
except Exception:
pass
# Plain paragraph
news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
elif tag_name == "img":
src = child.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
style = child.get_attribute("style") or ""
news_item.contentRows.append({"tag": "img", "content": f"<img style='{style}' src='{src}' />"})
elif tag_name == "video":
src = child.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
style = child.get_attribute("style") or ""
news_item.contentRows.append({"tag": "video", "content": f"<video style='{style}' src='{src}' />"})
except Exception as e:
logger.warning(f"解析段落失败: {e}")
continue
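# _normalize_url is assumed to resolve relative asset paths against the current page,
# roughly like the following (sketch only; the real helper lives elsewhere in the crawler):
#   from urllib.parse import urljoin
#   def _normalize_url(self, src):
#       return urljoin(self.driver.current_url, src)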
# Body content
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
parse_content(content_div)
except Exception as e:
logger.warning(f"新闻内容解析失败: {url}, {e}")
# Pagination handling (separate try-except so a failure here does not discard already-parsed content)
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
page_urls = []
if page_div:
page_as = page_div.find_elements(By.CSS_SELECTOR, "span#xinhuaPagerBox > a")
for page_a in page_as:
page_url = page_a.get_attribute("href")
if page_url and not page_url.startswith("http"):
page_url = self._normalize_url(page_url)
page_urls.append(page_url)
for page_url in page_urls:
self.driver.get(page_url)
time.sleep(2)
content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
parse_content(content_div)
except Exception:
# Missing pagination is the normal case; no warning is needed
pass
return news_item
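The pagination selectors above imply pager markup roughly like this (reconstructed from the selectors; the href values are hypothetical):

# <center class="xinhuaPager">
#   <span id="xinhuaPagerBox">
#     <a href="c_xxx_2.htm">2</a><a href="c_xxx_3.htm">3</a>
#   </span>
# </center>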