# Crawler for Xinhua Net (新华网)
from itertools import count
from typing import List, Optional
from bs4 import Tag
from pydantic import InstanceOf
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse, urlencode
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
import time
import random
import os
class XhwCrawler(BaseCrawler):
def __init__(self):
"""初始化人民日报爬虫"""
config = CrawlerConfig(
base_url="https://xhsz.news.cn/",
urls={
"search": UrlConfig(
url="https://xhsz.news.cn/s",
method="GET",
params={
"k": "",
"action": "",
"page": 1
},
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0',
'Referer': 'https://xhsz.news.cn/',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
),
"hot_point": UrlConfig(
url="https://xhsz.news.cn/focus_news",
method="GET",
params={},
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0',
'Referer': 'https://xhsz.news.cn/',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
),
"commend": UrlConfig(
url="https://xhsz.news.cn/focus_news",
method="GET",
params={},
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0',
'Referer': 'https://xhsz.news.cn/',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
)
},
)
super().__init__(config)
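        # Maps the site's search-tab labels to the `action` query parameter of the
        # search URL ("index" = all results, "news" = hot-news results).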
self.search_action_map = {
"全部": "index",
"热点发布": "news"
}
        # Create the WebDriver when the crawler is constructed
self.driver = self._init_driver()
def _init_driver(self):
"""初始化并返回Chrome WebDriver实例"""
chrome_options = Options()
        # Run with a visible (non-headless) browser window; headless mode is intentionally not enabled
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        # Keep the browser window maximized and visible
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-web-security')
chrome_options.add_argument('--allow-running-insecure-content')
chrome_options.add_argument('--disable-features=VizDisplayCompositor')
service = Service(executable_path=r"chromedriver.exe")
driver = None
try:
driver = webdriver.Chrome(service=service, options=chrome_options)
logger.info("Chrome浏览器初始化成功")
except Exception as e:
logger.error(f"Chrome浏览器初始化失败: {str(e)}")
return None
        # Implicit wait is intentionally disabled; explicit waits are used where needed
        # driver.implicitly_wait(10)
        # Visit the home page first to obtain the initial cookies
logger.info("访问主页获取初始Cookie")
try:
driver.get(self.config.base_url)
except Exception as e:
logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}")
return None
time.sleep(random.uniform(2, 4))
        # Check whether a verification / captcha page is shown
page_source = driver.page_source
if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
logger.warning("检测到验证页面,尝试手动处理验证")
            # Give the user a chance to complete the verification manually
logger.info("请在30秒内手动完成验证...")
time.sleep(30)
            # Refresh and re-check whether the verification has been completed
driver.refresh()
time.sleep(random.uniform(2, 4))
            # Check the verification state again
page_source = driver.page_source
if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
logger.error("验证未完成,无法继续爬取")
# self.driver.quit()
# self.driver = None
return None
return driver
def parse_news_detail(self, url: str) -> Optional[NewsItem]:
return self.parse_xhsz_news_detail(url)
def parse_xhsz_news_detail(self, url: str) -> NewsItem:
"""
使用Selenium解析新华网新闻详情页
异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败
"""
news_item = NewsItem(title="", contentRows=[], url=url)
if not self.driver:
logger.error("WebDriver未初始化无法获取新闻详情")
return news_item
try:
self.driver.get(url)
time.sleep(2)
except Exception as e:
logger.warning(f"访问新闻详情页失败: {url}, {e}")
return news_item
        # Handle the slider verification, if present
try:
sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
if sliders:
slider = sliders[0]
action_chain = ActionChains(self.driver)
action_chain.click_and_hold(slider).perform()
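                # Drag the slider to the right in a few uneven segments with pauses,
                # to roughly imitate a human swipe instead of one instant jump.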
distance = 1000
tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
for track in tracks:
action_chain.move_by_offset(int(track), 0).pause(1)
action_chain.perform()
action_chain.release().perform()
time.sleep(2)
except Exception as e:
logger.info(f"滑块验证处理失败或未出现: {e}")
final_url = self.driver.current_url
if final_url != url:
news_item = self.parse_xh_news_detail(final_url)
news_item.url = url
return news_item
        # Main news container
try:
main_div = WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "div.page-news-detail"))
)
except Exception as e:
logger.warning(f"未找到新闻主体: {url}, {e}")
return news_item
try:
article_div = main_div.find_element(By.CSS_SELECTOR, "div.page-news-l")
        except Exception as e:
            logger.warning(f"未找到文章主体: {url}, {e}")
return news_item
        # Title
try:
title_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-title")
news_item.title = title_div.text.strip()
except:
pass
        # News metadata (source, publish time, view count)
try:
channal_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-note")
channal_items = channal_div.find_elements(By.CSS_SELECTOR, "div.item")
for item in channal_items:
text = item.text.strip()
if "来源" in text:
news_item.source = text.split("", 1)[-1].strip()
elif "发布时间" in text:
news_item.publishTime = text.split("", 1)[-1].strip()
elif "浏览人数" in text:
try:
news_item.viewCount = int(text.split("", 1)[-1].strip())
except:
pass
except:
pass
        # Article body content
try:
content_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-content")
children = content_div.find_elements(By.XPATH, "./*")
for child in children:
try:
tag_name = child.tag_name.lower()
if tag_name == "p":
text = child.text.strip().replace("\xa0", "")
if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
continue
                        # Image inside the paragraph
try:
img = child.find_element(By.TAG_NAME, "img")
src = img.get_attribute("src")
if src and not src.startswith("http"):
src = self.config.base_url + src
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
continue
except:
pass
                        # Video inside the paragraph
try:
video = child.find_element(By.TAG_NAME, "video")
src = video.get_attribute("src")
if src and not src.startswith("http"):
src = self.config.base_url + src
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
continue
except:
pass
                        # Plain text paragraph
news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
elif tag_name in ["img", "video"]:
news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
except Exception as e:
logger.warning(f"解析段落失败: {e}")
continue
        except Exception as e:
            logger.warning(f"新闻内容解析失败: {url}, {e}")
return news_item
def parse_xh_news_detail(self, url: str) -> NewsItem:
"""
使用Selenium解析新华网新闻详情页
异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败
"""
news_item = NewsItem(title="", contentRows=[], url=url)
if not self.driver:
logger.error("WebDriver未初始化无法获取新闻详情")
return news_item
try:
self.driver.get(url)
time.sleep(2)
except Exception as e:
logger.warning(f"访问新闻详情页失败: {url}, {e}")
return news_item
        # Handle the slider verification, if present
try:
sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
if sliders:
slider = sliders[0]
action_chain = ActionChains(self.driver)
action_chain.click_and_hold(slider).perform()
distance = 1000
tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
for track in tracks:
action_chain.move_by_offset(int(track), 0).pause(1)
action_chain.perform()
action_chain.release().perform()
time.sleep(2)
except Exception as e:
logger.info(f"滑块验证处理失败或未出现: {e}")
        # Header: publish time, source and title
        try:
            head_div = self.driver.find_element(By.CSS_SELECTOR, "div.header.domPC")
            time_div = head_div.find_element(By.CSS_SELECTOR, "div.header-time.left")
            datetimes = (time_div.find_element(By.CSS_SELECTOR, "span.year").text + "/"
                         + time_div.find_element(By.CSS_SELECTOR, "span.day").text + " "
                         + time_div.find_element(By.CSS_SELECTOR, "span.time").text)
            news_item.publishTime = str(datetime.strptime(datetimes, "%Y/%m/%d %H:%M:%S"))
            # The source line reads like "来源:xxx"; the separator is assumed to be a fullwidth colon
            source = head_div.find_element(By.CSS_SELECTOR, "div.source").text.split(":")[1]
            news_item.source = source
            title = head_div.find_element(By.CSS_SELECTOR, "h1").text
            news_item.title = title
        except Exception as e:
            logger.warning(f"解析新闻头部失败: {url}, {e}")
        # Article body content
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
children = content_div.find_elements(By.XPATH, "./*")
for child in children:
try:
tag_name = child.tag_name.lower()
if tag_name == "p" or tag_name == "div":
text = child.text.strip().replace("\xa0", "")
if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
continue
                        # Video inside the paragraph
try:
video = child.find_element(By.TAG_NAME, "video")
src = video.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
continue
except:
pass
                        # Image inside the paragraph
try:
img = child.find_element(By.TAG_NAME, "img")
src = img.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
continue
except:
pass
                        # Plain text paragraph
news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
elif tag_name == "img":
src = child.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
elif tag_name == "video":
src = child.get_attribute("src")
if src and not src.startswith("http"):
src = self._normalize_url(src)
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
except Exception as e:
logger.warning(f"解析段落失败: {e}")
continue
        except Exception as e:
            logger.warning(f"新闻内容解析失败: {url}, {e}")
return news_item
def _normalize_url(self, url: str) -> str:
"""
规范化 URL补全协议和域名
Args:
url: 原始 URL
Returns:
完整的 URL
"""
if not url:
return url
        # Already an absolute URL
if url.startswith("http://") or url.startswith("https://"):
return url
        # Protocol-relative URL: prepend "https:"
if url.startswith("//"):
return "https:" + url
        # Relative path: prepend the configured base URL
return self.config.base_url + url
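    # Illustrative examples (hypothetical inputs, following the rules above):
    #   _normalize_url("//vod.news.cn/a.mp4")   -> "https://vod.news.cn/a.mp4"
    #   _normalize_url("images/pic.jpg")        -> "https://xhsz.news.cn/images/pic.jpg"
    #   _normalize_url("https://news.cn/a.htm") -> unchanged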
    def search(self, key: str, total=10, action="news") -> ResultDomain:
        """Search xhsz.news.cn for `key` and return up to `total` parsed news items."""
        # Bail out early if the WebDriver failed to initialize
if not self.driver:
logger.error("WebDriver未初始化无法继续爬取")
return ResultDomain(code=1, message="WebDriver未初始化无法继续爬取", success=False)
news_urls = []
news_list = []
resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
        # Look up the search URL configuration
search_config = self.config.urls.get("search")
if not search_config:
logger.error("未找到搜索URL配置")
            resultDomain.code = 1
resultDomain.message = "未找到搜索URL配置"
resultDomain.success = False
return resultDomain
pagesize = 10
search_data = search_config.params.copy()
search_data["k"] = key
search_data["action"] = action
try:
            # Collect news detail URLs from the search result pages
url_base_map = {}
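            # Ceiling division: number of result pages needed to cover `total` items (10 per page)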
total_pages = (total + pagesize - 1) // pagesize
for page in range(1, total_pages + 1):
search_data["page"] = page
pageHtml = search_config.url + "?" + urlencode(search_data)
logger.info(f"请求URL: {pageHtml}")
                # Load the page with Selenium
try:
self.driver.get(pageHtml)
time.sleep(2)
except Exception as e:
logger.warning(f"访问搜索页失败: {pageHtml}, {e}")
continue
                # Handle the slider verification, if present
try:
sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
if sliders:
slider = sliders[0]
action_chain = ActionChains(self.driver)
action_chain.click_and_hold(slider).perform()
distance = 1000
tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
for track in tracks:
action_chain.move_by_offset(int(track), 0).pause(1)
action_chain.release().perform()
time.sleep(5)
except Exception as e:
logger.info(f"滑动验证处理失败或未出现: {e}")
                # Extract the news items on this results page
try:
search_main = self.driver.find_element(By.CSS_SELECTOR, "div.page-search-main")
search_group = search_main.find_element(By.CSS_SELECTOR, "div.page-search-group")
news_div = search_group.find_element(By.CSS_SELECTOR, "div.page-search-news")
news_items = news_div.find_elements(By.CSS_SELECTOR, "div.group")
for news in news_items:
try:
head = news.find_element(By.CSS_SELECTOR, "div.head")
title_div = head.find_element(By.CSS_SELECTOR, "div.title")
date_div = head.find_element(By.CSS_SELECTOR, "div.date")
a_tag = title_div.find_element(By.TAG_NAME, "a")
news_url = a_tag.get_attribute("href")
news_title = a_tag.text.strip()
news_date = date_div.text.strip()
url_base_map[news_url] = {"title": news_title, "date": news_date}
news_urls.append(news_url)
except Exception as e:
logger.warning(f"提取单条新闻URL失败: {e}")
except Exception as e:
logger.warning(f"提取新闻列表失败: {e}")
continue
            # Fetch and parse each collected news URL
count = 0
for news_url in news_urls:
try:
news = self.parse_news_detail(news_url)
if news:
news.title = url_base_map.get(news_url, {}).get("title") or news.title
news.publishTime = url_base_map.get(news_url, {}).get("date") or news.publishTime
news_list.append(news)
count += 1
if count >= total:
break
except Exception as e:
logger.warning(f"解析新闻失败: {news_url}, {e}")
continue
except Exception as e:
logger.error(f"搜索过程整体异常: {e}")
resultDomain.success = False
            resultDomain.code = 1
resultDomain.message = "爬取失败"
        # Always return whatever was collected; success reflects whether anything was parsed
        resultDomain.dataList = news_list
        resultDomain.success = bool(news_list)
return resultDomain
    def hot_point(self) -> ResultDomain:
        """Crawl the focus_news page and return up to 5 parsed news items."""
        # Bail out early if the WebDriver failed to initialize
if not self.driver:
logger.error("WebDriver未初始化无法继续爬取")
return ResultDomain(code=1, message="WebDriver未初始化无法继续爬取", success=False)
news_urls = []
news_list = []
resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
        # Look up the hot_point URL configuration
        hot_point_config = self.config.urls.get("hot_point")
        if not hot_point_config:
            logger.error("未找到hot_point URL配置")
            resultDomain.code = 1
            resultDomain.message = "未找到hot_point URL配置"
            resultDomain.success = False
            return resultDomain
        # Open the focus news page
try:
self.driver.get(hot_point_config.url)
time.sleep(2)
except Exception as e:
logger.warning(f"访问搜索页失败: {hot_point_config.url}, {e}")
return resultDomain
try:
            # Collect news detail URLs from the focus news list
url_base_map = {}
news_div = self.driver.find_element(By.CSS_SELECTOR, "section.wrapper > div.page-news.center-1200")
hot_news_div = news_div.find_element(By.CSS_SELECTOR, "div.page-news-l")
news_items_div = hot_news_div.find_element(By.CSS_SELECTOR, "div.page-news-list")
news_items = news_items_div.find_elements(By.CSS_SELECTOR, "div.item")
for news in news_items:
a_tag = news.find_element(By.TAG_NAME, "a")
news_url = a_tag.get_attribute("href")
news_title = a_tag.text.strip()
url_base_map[news_url] = {"title": news_title}
news_urls.append(news_url)
            # Fetch and parse each collected news URL
count = 0
for news_url in news_urls:
try:
news = self.parse_news_detail(news_url)
if news:
news.title = url_base_map.get(news_url, {}).get("title") or news.title
news_list.append(news)
count += 1
if count >= 5:
break
except Exception as e:
logger.warning(f"解析新闻失败: {news_url}, {e}")
continue
except Exception as e:
logger.error(f"搜索过程整体异常: {e}")
resultDomain.success = False
            resultDomain.code = 1
resultDomain.message = "爬取失败"
        # Always return whatever was collected; success reflects whether anything was parsed
        resultDomain.dataList = news_list
        resultDomain.success = bool(news_list)
return resultDomain
    # "Special recommendation" section
    def commend(self) -> ResultDomain:
        """Crawl the recommended news linked from the focus_news page (up to 5 items)."""
        # Bail out early if the WebDriver failed to initialize
if not self.driver:
logger.error("WebDriver未初始化无法继续爬取")
return ResultDomain(code=1, message="WebDriver未初始化无法继续爬取", success=False)
news_urls = []
news_list = []
resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
        # Look up the hot_point URL configuration (the "commend" config entry points at the same page)
        hot_point_config = self.config.urls.get("hot_point")
        if not hot_point_config:
            logger.error("未找到hot_point URL配置")
            resultDomain.code = 1
            resultDomain.message = "未找到hot_point URL配置"
            resultDomain.success = False
            return resultDomain
        # Open the focus news page (its right-hand column links to the recommended topic pages)
try:
self.driver.get(hot_point_config.url)
time.sleep(2)
except Exception as e:
logger.warning(f"访问搜索页失败: {hot_point_config.url}, {e}")
return resultDomain
try:
            # Collect the recommendation jump links and the article URLs behind them
url_base_map = {}
news_div = self.driver.find_element(By.CSS_SELECTOR, "section.wrapper > div.page-news.center-1200")
page_r_div = news_div.find_element(By.CSS_SELECTOR, "div.page-news-r")
commend_jump_divs = page_r_div.find_elements(By.CSS_SELECTOR, "div.page-news-recommend > div.item")
jump_urls = []
for commend_jump_div in commend_jump_divs:
a = commend_jump_div.find_element(By.CSS_SELECTOR, "div.txt > a")
jump_url = self._normalize_url(a.get_attribute("href") or '')
jump_urls.append(jump_url)
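            # Each recommended item links to a topic landing page; visit each one and
            # collect article links from its carousel (part01) and focus list (part02).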
for jump_url in jump_urls:
self.driver.get(jump_url)
conent_div = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.content")))
                # Carousel (swiper) area
swiper_wrapper_div = conent_div.find_element(By.CSS_SELECTOR, "div.part01 > div.swiper-container > div.swiper-wrapper")
if swiper_wrapper_div:
swiper_slides = swiper_wrapper_div.find_elements(By.CSS_SELECTOR, "div.swiper-slide")
# swiper_news_urls = []
for swiper_slide in swiper_slides:
a = swiper_slide.find_element(By.CSS_SELECTOR, "div.tit > a")
news_url = self._normalize_url(a.get_attribute("href") or '')
news_urls.append(news_url)
# swiper_news_urls.append(news_url)
                # Focus list area
news_ul_div = conent_div.find_element(By.CSS_SELECTOR, "div.part02 > div.part02_con > ul")
if news_ul_div:
news_li_divs = news_ul_div.find_elements(By.CSS_SELECTOR, "li")
# focus_news_urls = []
for news_li_div in news_li_divs:
a = news_li_div.find_element(By.CSS_SELECTOR, "h3.h3Tit > a")
news_url = self._normalize_url(a.get_attribute("href") or '')
news_urls.append(news_url)
# focus_news_urls.append(news_url)
            # Fetch and parse each collected news URL
count = 0
for news_url in news_urls:
try:
news = self.parse_news_detail(news_url)
if news:
news.title = url_base_map.get(news_url, {}).get("title") or news.title
news_list.append(news)
count += 1
if count >= 5:
break
except Exception as e:
logger.warning(f"解析新闻失败: {news_url}, {e}")
continue
except Exception as e:
logger.error(f"搜索过程整体异常: {e}")
resultDomain.success = False
            resultDomain.code = 1
resultDomain.message = "爬取失败"
        # Always return whatever was collected; success reflects whether anything was parsed
        resultDomain.dataList = news_list
        resultDomain.success = bool(news_list)
return resultDomain
    def close(self):
        """Quit the browser and release the WebDriver."""
        if hasattr(self, 'driver') and self.driver:
try:
self.driver.quit()
logger.info("浏览器已关闭")
except Exception as e:
logger.warning(f"关闭浏览器失败: {str(e)}")
self.driver = None
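    # Minimal usage sketch (illustrative only; assumes the module is run standalone
    # and that ResultDomain exposes `dataList` as used above):
    #
    #     crawler = XhwCrawler()
    #     try:
    #         result = crawler.search("教育", total=5, action="news")
    #         for item in (result.dataList or []):
    #             print(item.title, item.publishTime, item.url)
    #     finally:
    #         crawler.close()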