# 新华网爬虫 from itertools import count from typing import List, Optional from bs4 import Tag from pydantic import InstanceOf from core.ResultDomain import ResultDomain from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig from loguru import logger import re import chardet from datetime import datetime, timedelta from bs4.element import NavigableString from urllib.parse import urlparse, urlencode import json from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.action_chains import ActionChains from selenium.webdriver.chrome.service import Service import time import random import os class XhwCrawler(BaseCrawler): def __init__(self): """初始化人民日报爬虫""" config = CrawlerConfig( base_url="https://xhsz.news.cn/", urls={ "search": UrlConfig( url="https://xhsz.news.cn/s", method="GET", params={ "k": "", "action": "", "page": 1 }, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Cache-Control': 'max-age=0', 'Referer': 'https://xhsz.news.cn/', 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"' } ), "hot_point": UrlConfig( url="https://xhsz.news.cn/focus_news", method="GET", params={}, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'none', 'Cache-Control': 'max-age=0', 'Referer': 'https://xhsz.news.cn/', 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"', 'sec-ch-ua-mobile': '?0', 'sec-ch-ua-platform': '"Windows"' } ) }, ) super().__init__(config) self.search_action_map = { "全部": "index", "热点发布": "news" } # 初始化时创建driver self.driver = self._init_driver() def _init_driver(self): """初始化并返回Chrome WebDriver实例""" chrome_options = Options() # 确保浏览器可见,不使用无头模式 # 或者完全删除这行,因为默认就是有界面模式 chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--disable-blink-features=AutomationControlled') chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) chrome_options.add_experimental_option('useAutomationExtension', False) chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') # 确保浏览器可见 chrome_options.add_argument('--start-maximized') chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--disable-web-security') chrome_options.add_argument('--allow-running-insecure-content') chrome_options.add_argument('--disable-features=VizDisplayCompositor') service = Service(executable_path=r"chromedriver.exe") driver = None try: driver = webdriver.Chrome(service=service, options=chrome_options) logger.info("Chrome浏览器初始化成功") except Exception as e: logger.error(f"Chrome浏览器初始化失败: {str(e)}") return None # 设置隐式等待时间 # driver.implicitly_wait(10) # 访问主页获取初始Cookie logger.info("访问主页获取初始Cookie") logger.info(f"准备访问URL: {self.config.base_url}") try: driver.get(self.config.base_url) logger.info(f"成功访问URL: {self.config.base_url}") except Exception as e: logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}") return None time.sleep(random.uniform(2, 4)) # 检查是否有验证页面 page_source = driver.page_source if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source: logger.warning("检测到验证页面,尝试手动处理验证") # 尝试等待用户手动处理验证 logger.info("请在30秒内手动完成验证...") time.sleep(30) # 刷新页面,检查验证是否完成 driver.refresh() time.sleep(random.uniform(2, 4)) # 再次检查验证状态 page_source = driver.page_source if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source: logger.error("验证未完成,无法继续爬取") # self.driver.quit() # self.driver = None return None return driver def parse_news_detail(self, url: str) -> Optional[NewsItem]: return self.parse_xhsz_news_detail(url) def parse_xhsz_news_detail(self, url: str) -> NewsItem: """ 使用Selenium解析新华网新闻详情页 异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败 """ news_item = NewsItem(title="", contentRows=[], url=url) if not self.driver: logger.error("WebDriver未初始化,无法获取新闻详情") return news_item try: self.driver.get(url) time.sleep(2) except Exception as e: logger.warning(f"访问新闻详情页失败: {url}, {e}") return news_item # 滑动验证处理 try: sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg") if sliders: slider = sliders[0] action_chain = ActionChains(self.driver) action_chain.click_and_hold(slider).perform() distance = 1000 tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25] for track in tracks: action_chain.move_by_offset(int(track), 0).pause(1) action_chain.perform() action_chain.release().perform() time.sleep(2) except Exception as e: logger.info(f"滑块验证处理失败或未出现: {e}") final_url = self.driver.current_url if final_url != url: news_item = self.parse_xh_news_detail(final_url) news_item.url = url return news_item # 新闻主体 try: main_div = WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.CSS_SELECTOR, "div.page-news-detail")) ) except Exception as e: logger.warning(f"未找到新闻主体: {url}, {e}") return news_item try: article_div = main_div.find_element(By.CSS_SELECTOR, "div.page-news-l") except: logger.warning(f"未找到文章主体: {url}") return news_item # 标题 try: title_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-title") news_item.title = title_div.text.strip() except: pass # 新闻元信息 try: channal_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-note") channal_items = channal_div.find_elements(By.CSS_SELECTOR, "div.item") for item in channal_items: text = item.text.strip() if "来源" in text: news_item.source = text.split(":", 1)[-1].strip() elif "发布时间" in text: news_item.publishTime = text.split(":", 1)[-1].strip() elif "浏览人数" in text: try: news_item.viewCount = int(text.split(":", 1)[-1].strip()) except: pass except: pass # 内容 try: content_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-content") children = content_div.find_elements(By.XPATH, "./*") for child in children: try: tag_name = child.tag_name.lower() if tag_name == "p": text = child.text.strip().replace("\xa0", "") if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0: continue # 图片 try: img = child.find_element(By.TAG_NAME, "img") src = img.get_attribute("src") if src and not src.startswith("http"): src = self.config.base_url + src news_item.contentRows.append({"tag": "img", "content": f""}) continue except: pass # 视频 try: video = child.find_element(By.TAG_NAME, "video") src = video.get_attribute("src") if src and not src.startswith("http"): src = self.config.base_url + src news_item.contentRows.append({"tag": "video", "content": f"