推荐爬虫

2025-11-20 15:46:53 +08:00
parent 9f56f4fd24
commit 078d86db6e
2 changed files with 148 additions and 15 deletions
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -76,6 +76,27 @@ class XhwCrawler(BaseCrawler):
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"'
                    }
                ),
                "commend": UrlConfig(
                    url="https://xhsz.news.cn/focus_news",
                    method="GET",
                    params={},
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Connection': 'keep-alive',
                        'Upgrade-Insecure-Requests': '1',
                        'Sec-Fetch-Dest': 'document',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-Site': 'none',
                        'Cache-Control': 'max-age=0',
                        'Referer': 'https://xhsz.news.cn/',
                        'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"'
                    }
                )
            },            
        )
@@ -585,6 +606,99 @@ class XhwCrawler(BaseCrawler):
        resultDomain.success = bool(news_list)
        return resultDomain
    # 特别推荐
    def commend(self) -> ResultDomain:
        """Crawl the "special recommendation" sidebar and return parsed news.

        Steps:
          1. Open the listing page and read the topic links found in the
             recommendation sidebar (``div.page-news-recommend``).
          2. Visit every topic page and gather article URLs from its carousel
             (``div.part01``) and its focus list (``div.part02``).
          3. Parse article details via ``parse_news_detail`` until five
             articles have been parsed successfully.

        Returns:
            ResultDomain: ``dataList`` holds the parsed news items (possibly
            empty) and ``success`` is True only when at least one item was
            parsed. ``code`` is 1 when the WebDriver is missing and 0 in every
            other case.
        """
        # Nothing can be crawled without a live WebDriver.
        if not self.driver:
            logger.error("WebDriver未初始化，无法继续爬取")
            return ResultDomain(code=1, message="WebDriver未初始化，无法继续爬取", success=False)

        max_news = 5  # upper bound on the number of parsed articles returned
        news_list = []
        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)

        # Prefer the dedicated "commend" URL entry; keep "hot_point" as a
        # fallback because that is the entry this method read originally.
        commend_config = self.config.urls.get("commend") or self.config.urls.get("hot_point")
        if not commend_config:
            logger.error("未找到搜索URL配置")
            resultDomain.code = 0
            resultDomain.message = "未找到搜索URL配置"
            resultDomain.success = False
            return resultDomain

        # Open the listing page; a load failure must not be reported as success.
        try:
            self.driver.get(commend_config.url)
            time.sleep(2)
        except Exception as e:
            logger.warning(f"访问搜索页失败: {commend_config.url}, {e}")
            resultDomain.message = "访问搜索页失败"
            resultDomain.success = False
            return resultDomain

        # Step 1: topic links from the recommendation sidebar. Without the
        # sidebar there is nothing to follow, so any error here is fatal.
        try:
            news_div = self.driver.find_element(By.CSS_SELECTOR, "section.wrapper > div.page-news.center-1200")
            page_r_div = news_div.find_element(By.CSS_SELECTOR, "div.page-news-r")
            commend_items = page_r_div.find_elements(By.CSS_SELECTOR, "div.page-news-recommend > div.item")
            # Resolve hrefs to plain strings now: the elements go stale as soon
            # as the driver navigates away from this page.
            jump_urls = self._extract_commend_hrefs(commend_items, "div.txt > a")
        except Exception as e:
            logger.error(f"搜索过程整体异常: {e}")
            resultDomain.code = 0
            resultDomain.message = "爬取失败"
            resultDomain.success = False
            return resultDomain

        # Step 2: article URLs from each topic page. A broken topic page is
        # skipped so that URLs gathered from the other pages are not lost.
        news_urls = []
        for jump_url in jump_urls:
            try:
                self.driver.get(jump_url)
                content_div = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.content"))
                )
                # Carousel section. find_elements yields an empty list when the
                # section is absent, whereas find_element would raise.
                swiper_wrappers = content_div.find_elements(
                    By.CSS_SELECTOR, "div.part01 > div.swiper-container > div.swiper-wrapper"
                )
                if swiper_wrappers:
                    swiper_slides = swiper_wrappers[0].find_elements(By.CSS_SELECTOR, "div.swiper-slide")
                    news_urls.extend(self._extract_commend_hrefs(swiper_slides, "div.tit > a"))
                # Focus-list section.
                news_uls = content_div.find_elements(By.CSS_SELECTOR, "div.part02 > div.part02_con > ul")
                if news_uls:
                    news_lis = news_uls[0].find_elements(By.CSS_SELECTOR, "li")
                    news_urls.extend(self._extract_commend_hrefs(news_lis, "h3.h3Tit > a"))
            except Exception as e:
                logger.warning(f"解析新闻失败: {jump_url}, {e}")
                continue

        # The same article may appear in both sections or on several topic
        # pages; drop repeats while keeping first-seen order.
        news_urls = list(dict.fromkeys(news_urls))

        # Step 3: parse article details until enough have succeeded.
        for news_url in news_urls:
            if len(news_list) >= max_news:
                break
            try:
                news = self.parse_news_detail(news_url)
            except Exception as e:
                logger.warning(f"解析新闻失败: {news_url}, {e}")
                continue
            if news:
                news_list.append(news)

        # Always hand back dataList; success reflects whether anything was parsed.
        resultDomain.dataList = news_list
        resultDomain.success = bool(news_list)
        return resultDomain

    def _extract_commend_hrefs(self, items, link_selector: str) -> list:
        """Return the normalized link URL of each element in ``items``.

        Args:
            items: WebElements that each may contain one link.
            link_selector: CSS selector locating the ``<a>`` inside an item.

        Returns:
            list: Normalized URLs in item order. Items without a matching link
            or with an empty ``href`` are skipped.
        """
        urls = []
        for item in items:
            links = item.find_elements(By.CSS_SELECTOR, link_selector)
            if not links:
                continue
            href = links[0].get_attribute("href")
            if not href:
                continue
            url = self._normalize_url(href)
            if url:
                urls.append(url)
        return urls
    def close(self):
        if hasattr(self, 'driver') and self.driver:
            try:
--- a/schoolNewsCrawler/test.ipynb
+++ b/schoolNewsCrawler/test.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
   "id": "948be230",
   "metadata": {},
   "outputs": [
@@ -41,7 +41,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 10,
   "id": "31a8a0dd",
   "metadata": {},
   "outputs": [
@@ -49,11 +49,9 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "\u001b[32m2025-11-20 14:48:38.587\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
+      "\u001b[32m2025-11-20 15:39:21.410\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
-      "\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
+      "\u001b[32m2025-11-20 15:39:22.502\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
-      "\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
+      "\u001b[32m2025-11-20 15:39:22.502\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n"
      "\u001b[32m2025-11-20 14:48:39.616\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
      "\u001b[32m2025-11-20 14:48:41.227\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m126\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
     ]
    }
   ],
@@ -63,7 +61,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 11,
   "id": "e5a6e91c",
   "metadata": {},
   "outputs": [],
@@ -78,7 +76,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 12,
   "id": "7e0f56fa",
   "metadata": {},
   "outputs": [],
@@ -88,30 +86,51 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
   "id": "47327ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
    "# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "fa359d5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-20 15:45:21.322\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m44\u001b[0m - \u001b[1m轮播图新闻url: 5\u001b[0m\n",
      "\u001b[32m2025-11-20 15:45:21.483\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m54\u001b[0m - \u001b[1m聚焦新闻url: 21\u001b[0m\n",
      "\u001b[32m2025-11-20 15:45:22.214\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m44\u001b[0m - \u001b[1m轮播图新闻url: 7\u001b[0m\n",
      "\u001b[32m2025-11-20 15:45:23.134\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m54\u001b[0m - \u001b[1m聚焦新闻url: 124\u001b[0m\n",
      "\u001b[32m2025-11-20 15:45:23.135\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m55\u001b[0m - \u001b[1m获取到新闻url:157\u001b[0m\n"
     ]
    },
    {
     "data": {
      "text/plain": [
-       "NewsItem(title='微纪录片｜习近平的“三农”情', contentRows=[{'tag': 'video', 'content': \"<video src='https://vodpub6.v.news.cn/yqfbzx-original/20240207/202402072819fe60663140eab9599581dcae8c1e_73db5fe0318c44469be3ea83adfc730d.mp4' />\"}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">习近平对农民有着深厚的感情</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">对“三农”问题有深入的思考</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">一路走来</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">他经常和农民在一起</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">从小小山村的党支部书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">到党的总书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">寸寸光阴</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">见证着他的“三农”情</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003总策划：刘健</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003策划：李拯宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003监制：孙志平</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003制片：樊华</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003统筹：韩珅、王志斌</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003编导：陈晓宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003记者：陈晓宇、范世辉、岳文婷、邹尚伯、张晨俊、王怿文、李涛、胡友松、朱晓光</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003报道员：刘鹏飞、李树锋、张伟、朱海亮、徐涛、王盟、王静</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003海报：韩彤（实习）</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003鸣谢：中共延川县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共石家庄市委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共曹县县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社音视频部制作</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华通讯社出品</p>'}], url='https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html', viewCount=None, publishTime='2024-02-07 12:43:29', author=None, source='新华社', category=None, executeStatus=0, executeMessage=None)"
+       "ResultDomain(code=0, message='', success=True, data=None, dataList=[])"
      ]
     },
-     "execution_count": 7,
+     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
+    "crawler.commend()"
    "crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "fa359d5b",
+   "id": "ebabbd5b",
   "metadata": {},
   "outputs": [],
   "source": []