diff --git a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
index d391006..707d864 100644
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -76,6 +76,27 @@ class XhwCrawler(BaseCrawler):
                         'sec-ch-ua-mobile': '?0',
                         'sec-ch-ua-platform': '"Windows"'
                     }
+                ),
+                "commend": UrlConfig(
+                    url="https://xhsz.news.cn/focus_news",
+                    method="GET",
+                    params={},
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                        'Accept-Encoding': 'gzip, deflate, br',
+                        'Connection': 'keep-alive',
+                        'Upgrade-Insecure-Requests': '1',
+                        'Sec-Fetch-Dest': 'document',
+                        'Sec-Fetch-Mode': 'navigate',
+                        'Sec-Fetch-Site': 'none',
+                        'Cache-Control': 'max-age=0',
+                        'Referer': 'https://xhsz.news.cn/',
+                        'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                        'sec-ch-ua-mobile': '?0',
+                        'sec-ch-ua-platform': '"Windows"'
+                    }
                 )
             },
         )
@@ -585,6 +606,125 @@ class XhwCrawler(BaseCrawler):
         resultDomain.success = bool(news_list)
         return resultDomain
 
+    # Special recommendations
+    def commend(self) -> ResultDomain:
+        """Crawl articles linked from the "special recommendation" sidebar.
+
+        Opens the page configured under the ``"commend"`` URL key, follows every
+        link in its ``div.page-news-recommend`` sidebar, gathers article links
+        from each linked page's carousel (``div.part01``) and focus list
+        (``div.part02``), and parses at most five of them with
+        ``parse_news_detail``.
+
+        Returns:
+            ResultDomain: ``dataList`` holds the parsed news items and
+            ``success`` is True only when at least one item was parsed.
+            ``code`` is 1 with a ``message`` when the driver or URL config is
+            missing, the landing page cannot be loaded, or reading the
+            sidebar raises.
+        """
+        # Nothing can be crawled without an initialised WebDriver.
+        if not self.driver:
+            logger.error("WebDriver未初始化,无法继续爬取")
+            return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)
+
+        max_items = 5  # upper bound on parsed articles per call
+        news_urls = []
+        news_list = []
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+        # Use this method's own URL entry, not the "hot_point" one.
+        commend_config = self.config.urls.get("commend")
+        if not commend_config:
+            logger.error("未找到搜索URL配置")
+            resultDomain.code = 1
+            resultDomain.message = "未找到搜索URL配置"
+            resultDomain.success = False
+            return resultDomain
+
+        # Open the landing page; a failed load is reported as a failure.
+        try:
+            self.driver.get(commend_config.url)
+            time.sleep(2)
+        except Exception as e:
+            logger.warning(f"访问搜索页失败: {commend_config.url}, {e}")
+            resultDomain.code = 1
+            resultDomain.message = "爬取失败"
+            resultDomain.success = False
+            return resultDomain
+
+        try:
+            # Read every sidebar href before navigating: element references
+            # do not survive loading another page.
+            news_div = self.driver.find_element(By.CSS_SELECTOR, "section.wrapper > div.page-news.center-1200")
+            page_r_div = news_div.find_element(By.CSS_SELECTOR, "div.page-news-r")
+            commend_jump_divs = page_r_div.find_elements(By.CSS_SELECTOR, "div.page-news-recommend > div.item")
+            jump_urls = self._first_link_urls(commend_jump_divs, "div.txt > a")
+
+            for jump_url in jump_urls:
+                # One broken recommendation page must not abort the others.
+                try:
+                    self.driver.get(jump_url)
+                    content_div = WebDriverWait(self.driver, 10).until(
+                        EC.presence_of_element_located((By.CSS_SELECTOR, "div.content")))
+                    # Carousel section. find_elements yields [] when the section is
+                    # missing, whereas find_element would raise and end the crawl.
+                    swiper_wrappers = content_div.find_elements(
+                        By.CSS_SELECTOR, "div.part01 > div.swiper-container > div.swiper-wrapper")
+                    if swiper_wrappers:
+                        swiper_slides = swiper_wrappers[0].find_elements(By.CSS_SELECTOR, "div.swiper-slide")
+                        news_urls.extend(self._first_link_urls(swiper_slides, "div.tit > a"))
+                    # Focus-list section
+                    news_uls = content_div.find_elements(By.CSS_SELECTOR, "div.part02 > div.part02_con > ul")
+                    if news_uls:
+                        news_lis = news_uls[0].find_elements(By.CSS_SELECTOR, "li")
+                        news_urls.extend(self._first_link_urls(news_lis, "h3.h3Tit > a"))
+                except Exception as e:
+                    logger.warning(f"访问推荐页失败: {jump_url}, {e}")
+                    continue
+
+            # Drop repeated links (order kept) so they do not use up the limit.
+            news_urls = list(dict.fromkeys(news_urls))
+
+            # Parse article details until the limit is reached.
+            for news_url in news_urls:
+                if len(news_list) >= max_items:
+                    break
+                try:
+                    news = self.parse_news_detail(news_url)
+                except Exception as e:
+                    logger.warning(f"解析新闻失败: {news_url}, {e}")
+                    continue
+                if news:
+                    news_list.append(news)
+
+        except Exception as e:
+            logger.error(f"搜索过程整体异常: {e}")
+            resultDomain.code = 1
+            resultDomain.message = "爬取失败"
+
+        # dataList is always returned; success reflects whether anything was parsed.
+        resultDomain.dataList = news_list
+        resultDomain.success = bool(news_list)
+        return resultDomain
+
+    def _first_link_urls(self, containers, link_selector: str) -> list:
+        """Return the normalized href of the first matching link in each container.
+
+        Containers without a matching link, or whose link has no href, are
+        skipped.
+        """
+        urls = []
+        for container in containers:
+            anchors = container.find_elements(By.CSS_SELECTOR, link_selector)
+            href = anchors[0].get_attribute("href") if anchors else None
+            if not href:
+                continue
+            url = self._normalize_url(href)
+            if url:
+                urls.append(url)
+        return urls
+
     def close(self):
         if hasattr(self, 'driver') and self.driver:
             try:
diff --git a/schoolNewsCrawler/test.ipynb b/schoolNewsCrawler/test.ipynb
index 3b9ebcd..cd447f3 100644
--- a/schoolNewsCrawler/test.ipynb
+++ b/schoolNewsCrawler/test.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 9,
    "id": "948be230",
    "metadata": {},
    "outputs": [
@@ -41,7 +41,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 10,
    "id": "31a8a0dd",
    "metadata": {},
    "outputs": [
@@ -49,11 +49,9 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "\u001b[32m2025-11-20 14:48:38.587\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
-     "\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
-     "\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
-     "\u001b[32m2025-11-20 14:48:39.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
-     "\u001b[32m2025-11-20 14:48:41.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m126\u001b[0m - \u001b[1m成功访问URL: 
https://xhsz.news.cn/\u001b[0m\n" + "\u001b[32m2025-11-20 15:39:21.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n", + "\u001b[32m2025-11-20 15:39:22.502\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n", + "\u001b[32m2025-11-20 15:39:22.502\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n" ] } ], @@ -63,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "id": "e5a6e91c", "metadata": {}, "outputs": [], @@ -78,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "7e0f56fa", "metadata": {}, "outputs": [], @@ -88,30 +86,51 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "47327ebf", "metadata": {}, + "outputs": [], + "source": [ + "#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n", + "# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fa359d5b", + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2025-11-20 15:45:21.322\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m44\u001b[0m - \u001b[1m轮播图新闻url: 5\u001b[0m\n", + "\u001b[32m2025-11-20 15:45:21.483\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m54\u001b[0m - \u001b[1m聚焦新闻url: 21\u001b[0m\n", + "\u001b[32m2025-11-20 15:45:22.214\u001b[0m | 
\u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m44\u001b[0m - \u001b[1m轮播图新闻url: 7\u001b[0m\n", + "\u001b[32m2025-11-20 15:45:23.134\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m54\u001b[0m - \u001b[1m聚焦新闻url: 124\u001b[0m\n", + "\u001b[32m2025-11-20 15:45:23.135\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m55\u001b[0m - \u001b[1m获取到新闻url:157\u001b[0m\n" + ] + }, { "data": { "text/plain": [ - "NewsItem(title='微纪录片|习近平的“三农”情', contentRows=[{'tag': 'video', 'content': \"\"}, {'tag': 'p', 'content': '
习近平对农民有着深厚的感情
'}, {'tag': 'p', 'content': '对“三农”问题有深入的思考
'}, {'tag': 'p', 'content': '一路走来
'}, {'tag': 'p', 'content': '他经常和农民在一起
'}, {'tag': 'p', 'content': '从小小山村的党支部书记
'}, {'tag': 'p', 'content': '到党的总书记
'}, {'tag': 'p', 'content': '寸寸光阴
'}, {'tag': 'p', 'content': '见证着他的“三农”情
'}, {'tag': 'p', 'content': '\\u2003\\u2003总策划:刘健
'}, {'tag': 'p', 'content': '\\u2003\\u2003策划:李拯宇
'}, {'tag': 'p', 'content': '\\u2003\\u2003监制:孙志平
'}, {'tag': 'p', 'content': '\\u2003\\u2003制片:樊华
'}, {'tag': 'p', 'content': '\\u2003\\u2003统筹:韩珅、王志斌
'}, {'tag': 'p', 'content': '\\u2003\\u2003编导:陈晓宇
'}, {'tag': 'p', 'content': '\\u2003\\u2003记者:陈晓宇、范世辉、岳文婷、邹尚伯、张晨俊、王怿文、李涛、胡友松、朱晓光
'}, {'tag': 'p', 'content': '\\u2003\\u2003报道员:刘鹏飞、李树锋、张伟、朱海亮、徐涛、王盟、王静
'}, {'tag': 'p', 'content': '\\u2003\\u2003海报:韩彤(实习)
'}, {'tag': 'p', 'content': '\\u2003\\u2003鸣谢:中共延川县委宣传部
'}, {'tag': 'p', 'content': '\\u2003\\u2003中共石家庄市委宣传部
'}, {'tag': 'p', 'content': '\\u2003\\u2003中共曹县县委宣传部
'}, {'tag': 'p', 'content': '\\u2003\\u2003新华社音视频部制作
'}, {'tag': 'p', 'content': '\\u2003\\u2003新华通讯社出品
'}], url='https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html', viewCount=None, publishTime='2024-02-07 12:43:29', author=None, source='新华社', category=None, executeStatus=0, executeMessage=None)" + "ResultDomain(code=0, message='', success=True, data=None, dataList=[])" ] }, - "execution_count": 7, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n", - "crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频" + "crawler.commend()" ] }, { "cell_type": "code", "execution_count": null, - "id": "fa359d5b", + "id": "ebabbd5b", "metadata": {}, "outputs": [], "source": []