From cd4e1b88fac670f71e497a5063a4c783b98ccf36 Mon Sep 17 00:00:00 2001
From: wangys <3401275564@qq.com>
Date: Thu, 20 Nov 2025 14:42:15 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=8D=8E=E7=BD=91=E6=90=9C=E7=B4=A2?=
=?UTF-8?q?=E7=88=AC=E8=99=AB+=E6=96=B0=E9=97=BB=E5=86=85=E5=AE=B9?=
=?UTF-8?q?=E6=8F=90=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
schoolNewsCrawler/crawler/xhw/XhwCrawler.py | 623 +++++++++++---------
schoolNewsCrawler/crawler/xhw/XhwSearch.py | 106 ++++
schoolNewsCrawler/test.ipynb | 101 +++-
3 files changed, 510 insertions(+), 320 deletions(-)
create mode 100644 schoolNewsCrawler/crawler/xhw/XhwSearch.py
diff --git a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
index 86e600b..629fcf3 100644
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -1,4 +1,5 @@
# 新华网爬虫
+from itertools import count
from typing import List, Optional
from bs4 import Tag
@@ -84,24 +85,24 @@ class XhwCrawler(BaseCrawler):
chrome_options.add_argument('--disable-web-security')
chrome_options.add_argument('--allow-running-insecure-content')
chrome_options.add_argument('--disable-features=VizDisplayCompositor')
- chrome_options.add_argument('--remote-debugging-port=9222') # 添加调试端口
service = Service(executable_path=r"chromedriver.exe")
+ driver = None
try:
- self.driver = webdriver.Chrome(service=service, options=chrome_options)
+ driver = webdriver.Chrome(service=service, options=chrome_options)
logger.info("Chrome浏览器初始化成功")
except Exception as e:
logger.error(f"Chrome浏览器初始化失败: {str(e)}")
return None
# 设置隐式等待时间
- self.driver.implicitly_wait(10)
+ # driver.implicitly_wait(10)
# 访问主页获取初始Cookie
logger.info("访问主页获取初始Cookie")
logger.info(f"准备访问URL: {self.config.base_url}")
try:
- self.driver.get(self.config.base_url)
+ driver.get(self.config.base_url)
logger.info(f"成功访问URL: {self.config.base_url}")
except Exception as e:
logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}")
@@ -109,7 +110,7 @@ class XhwCrawler(BaseCrawler):
time.sleep(random.uniform(2, 4))
# 检查是否有验证页面
- page_source = self.driver.page_source
+ page_source = driver.page_source
if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
logger.warning("检测到验证页面,尝试手动处理验证")
@@ -118,318 +119,244 @@ class XhwCrawler(BaseCrawler):
time.sleep(30)
# 刷新页面,检查验证是否完成
- self.driver.refresh()
+ driver.refresh()
time.sleep(random.uniform(2, 4))
# 再次检查验证状态
- page_source = self.driver.page_source
+ page_source = driver.page_source
if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
logger.error("验证未完成,无法继续爬取")
# self.driver.quit()
# self.driver = None
return None
- return self.driver
+ return driver
- def __del__(self):
- """析构函数,确保关闭浏览器"""
- if hasattr(self, 'driver') and self.driver:
- self.driver.quit()
- logger.info("浏览器已关闭")
-
- def search(self, key:str, total=10, action="news") -> ResultDomain:
- # 检查driver是否已初始化
- if not self.driver:
- logger.error("WebDriver未初始化,无法继续爬取")
- resultDomain = ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)
- return resultDomain
-
- # 直接使用self.driver
- news_urls = []
- news_list = []
- resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
- # 获取搜索配置
- search_config = self.config.urls.get("search")
- if not search_config:
- logger.error("未找到搜索URL配置")
- resultDomain.code = 0
- resultDomain.message = "未找到搜索URL配置"
- resultDomain.success = False
- return resultDomain
- pagesize = 10
- # 准备搜索参数
- search_data = search_config.params.copy()
- search_data["k"] = key
- search_data["action"] = action
-
- # 获取新闻url
- url_base_map = {}
- # 向上取整计算需要的页数
- total_pages = (total + pagesize - 1) // pagesize
- for page in range(1, total_pages + 1):
- search_data["page"] = page
- pageHtml = search_config.url + "?" + urlencode(search_data)
- # 分页的html
- logger.info(f"请求URL: {pageHtml}")
-
- # 使用Selenium访问页面
- self.driver.get(pageHtml)
- time.sleep(random.uniform(2, 4))
-
- # 检查是否有验证页面
- if not self.driver:
- logger.error("WebDriver已失效,无法继续爬取")
- resultDomain = ResultDomain(code=1, message="WebDriver已失效,无法继续爬取", success=False)
- return resultDomain
-
- page_source = self.driver.page_source
- if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
- logger.warning("检测到验证页面,尝试手动处理验证")
- logger.info("请在30秒内手动完成验证...")
- time.sleep(30)
-
- # 检查driver是否仍然有效
- if not self.driver:
- logger.error("WebDriver已失效,无法继续爬取")
- resultDomain = ResultDomain(code=1, message="WebDriver已失效,无法继续爬取", success=False)
- return resultDomain
-
- self.driver.refresh()
- time.sleep(random.uniform(2, 4))
-
- # 再次检查验证状态
- if not self.driver:
- logger.error("WebDriver已失效,无法继续爬取")
- resultDomain = ResultDomain(code=1, message="WebDriver已失效,无法继续爬取", success=False)
- return resultDomain
-
- page_source = self.driver.page_source
- if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
- logger.error("验证未完成,无法继续爬取")
- resultDomain = ResultDomain(code=1, message="验证未完成,无法继续爬取", success=False)
- return resultDomain
-
- # 解析页面内容
- pageSoup = self.parse_html(page_source)
- logger.info(f"解析后的HTML内容: {str(pageSoup)[:500]}...") # 只输出前500个字符
- # 从分页中获取新闻url
- searchMainDiv = pageSoup.find("div", class_="page-search-main")
- if not searchMainDiv:
- logger.error("未找到搜索主体部分")
- resultDomain.code = 0
- resultDomain.message = "未找到搜索主体部分"
- resultDomain.success = False
- return resultDomain
- searchGroupDiv = searchMainDiv.find("div", class_="page-search-group")
- if not searchGroupDiv:
- logger.error("未找到搜索组")
- resultDomain.code = 0
- resultDomain.message = "未找到搜索组"
- resultDomain.success = False
- return resultDomain
- newsDiv = searchGroupDiv.find("div", class_="page-search-news")
- if not newsDiv:
- logger.error("未找到新闻列表")
- resultDomain.code = 0
- resultDomain.message = "未找到新闻列表"
- resultDomain.success = False
- return resultDomain
- newsList = newsDiv.find_all("div", class_="group")
- for news in newsList:
- news_info = news.find("div.head")
- news_title = news_info.find("div.title")
- news_date = news_info.find("div.date").text.strip()
- url = news_title.find("a").get("href")
- url_base_map[url] = {"title": news_title.get_text(strip=True), "date": news_date}
- news_urls.append(url)
- # 临时保存url到url.json
- with open("url.json", "w", encoding="utf-8") as f:
- json.dump(url_base_map, f, ensure_ascii=False, indent=4)
- # 从新闻url中获取新闻详情
- for news_url in news_urls:
- news = self.parse_news_detail(news_url)
- news.title = url_base_map.get(news_url, {}).get("title")
- news.publishTime = url_base_map.get(news_url, {}).get("date")
- news_list.append(news)
-
- # 临时保存新闻到news.json
- with open("news.json", "w", encoding="utf-8") as f:
- json.dump(news_list, f, ensure_ascii=False, indent=4)
-
- # 关闭浏览器
- if self.driver:
- self.driver.quit()
- logger.info("浏览器已关闭")
-
- return resultDomain
-
def parse_news_detail(self, url: str) -> Optional[NewsItem]:
- return self.parse_xhsz_news_detail_selenium(url)
+ return self.parse_xhsz_news_detail(url)
+
+ def parse_xhsz_news_detail(self, url: str) -> NewsItem:
+ """
+        使用Selenium解析新华思政(xhsz.news.cn)新闻详情页;若页面跳转到正文站点,则转交 parse_xh_news_detail 处理
+        异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败
+ """
+ news_item = NewsItem(title="", contentRows=[], url=url)
- def parse_xhsz_news_detail_selenium(self, url: str) -> Optional[NewsItem]:
- # 检查driver是否已初始化
if not self.driver:
logger.error("WebDriver未初始化,无法获取新闻详情")
- return None
-
- newsItem = NewsItem(title="", contentRows=[], url=url)
+ return news_item
- # 使用Selenium访问新闻详情页
- self.driver.get(url)
- time.sleep(random.uniform(2, 4))
+ try:
+ self.driver.get(url)
+ time.sleep(2)
+ except Exception as e:
+ logger.warning(f"访问新闻详情页失败: {url}, {e}")
+ return news_item
+
+ # 滑动验证处理
+ try:
+ sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+ if sliders:
+ slider = sliders[0]
+ action_chain = ActionChains(self.driver)
+ action_chain.click_and_hold(slider).perform()
+ distance = 1000
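+                # 将拖动距离拆成多段、逐段移动并停顿,模拟人工滑动轨迹后再释放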
+ tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
+ for track in tracks:
+ action_chain.move_by_offset(int(track), 0).pause(1)
+ action_chain.perform()
+ action_chain.release().perform()
+ time.sleep(2)
+ except Exception as e:
+ logger.info(f"滑块验证处理失败或未出现: {e}")
- # 检查是否有验证页面
- if not self.driver:
- logger.error("WebDriver已失效,无法获取新闻详情")
- return None
-
- page_source = self.driver.page_source
- if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
- logger.warning("检测到验证页面,尝试手动处理验证")
- logger.info("请在30秒内手动完成验证...")
- time.sleep(30)
-
- # 检查driver是否仍然有效
- if not self.driver:
- logger.error("WebDriver已失效,无法获取新闻详情")
- return None
-
- self.driver.refresh()
- time.sleep(random.uniform(2, 4))
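+        # 若访问后发生跳转(例如跳到 www.news.cn 正文页),改用 parse_xh_news_detail 解析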
+ final_url = self.driver.current_url
+ if final_url != url:
+ news_item = self.parse_xh_news_detail(final_url)
+ news_item.url = url
+ return news_item
- # 再次检查验证状态
- if not self.driver:
- logger.error("WebDriver已失效,无法获取新闻详情")
- return None
-
- page_source = self.driver.page_source
- if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
- logger.error("验证未完成,无法获取新闻详情")
- return None
+ # 新闻主体
+ try:
+ main_div = WebDriverWait(self.driver, 10).until(
+ EC.presence_of_element_located((By.CSS_SELECTOR, "div.page-news-detail"))
+ )
+ except Exception as e:
+ logger.warning(f"未找到新闻主体: {url}, {e}")
+ return news_item
- # 解析页面内容
- newsDetailSoup = self.parse_html(page_source)
+ try:
+ article_div = main_div.find_element(By.CSS_SELECTOR, "div.page-news-l")
+ except:
+ logger.warning(f"未找到文章主体: {url}")
+ return news_item
- # 查找新闻主体部分
- main_div = newsDetailSoup.find("div.page-news-detail")
- if not main_div:
- logger.error(f"未找到新闻主体部分: {url}")
- return None
+ # 标题
+ try:
+ title_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-title")
+ news_item.title = title_div.text.strip()
+ except:
+ pass
- article_div = main_div.find("div.page-news-l")
- if not article_div:
- logger.error(f"未找到新闻文章部分: {url}")
- return None
-
- # 获取标题
- title_div = article_div.find("div.page-news-detail-title")
- if title_div:
- newsItem.title = title_div.text.strip()
-
- # 获取新闻元信息
- channal_div = article_div.find("div.page-news-detail-note")
- if channal_div:
- channal_items_div = channal_div.find_all("div.item")
- for item in channal_items_div:
+ # 新闻元信息
+ try:
+ channal_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-note")
+ channal_items = channal_div.find_elements(By.CSS_SELECTOR, "div.item")
+ for item in channal_items:
text = item.text.strip()
if "来源" in text:
- parts = text.split(":", 1)
- if len(parts) > 1:
- newsItem.source = parts[1].strip()
+ news_item.source = text.split(":", 1)[-1].strip()
elif "发布时间" in text:
- parts = text.split(":", 1)
- if len(parts) > 1:
- newsItem.publishTime = parts[1].strip()
+ news_item.publishTime = text.split(":", 1)[-1].strip()
elif "浏览人数" in text:
- parts = text.split(":", 1)
- if len(parts) > 1:
- newsItem.viewCount = parts[1].strip()
+ try:
+ news_item.viewCount = int(text.split(":", 1)[-1].strip())
+ except:
+ pass
+ except:
+ pass
- # 获取新闻内容
- content_div = article_div.find("div.page-news-detail-content")
- if content_div:
- # 遍历内容区域中的所有元素
- for child in content_div.children:
- if not isinstance(child, Tag):
+ # 内容
+ try:
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-content")
+ children = content_div.find_elements(By.XPATH, "./*")
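+            # 遍历内容区的直接子元素,按段落/图片/视频分别提取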
+ for child in children:
+ try:
+ tag_name = child.tag_name.lower()
+ if tag_name == "p":
+ text = child.text.strip().replace("\xa0", "")
+ if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
+ continue
+
+ # 图片
+ try:
+ img = child.find_element(By.TAG_NAME, "img")
+ src = img.get_attribute("src")
+ if src and not src.startswith("http"):
+ src = self.config.base_url + src
+                            news_item.contentRows.append({"tag": "img", "content": f'<img src="{src}"/>'})
+ continue
+ except:
+ pass
+
+ # 视频
+ try:
+ video = child.find_element(By.TAG_NAME, "video")
+ src = video.get_attribute("src")
+ if src and not src.startswith("http"):
+ src = self.config.base_url + src
+                            news_item.contentRows.append({"tag": "video", "content": f'<video src="{src}"></video>'})
+ continue
+ except:
+ pass
+
+ # 普通段落
+ news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
+ elif tag_name in ["img", "video"]:
+ news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
+ except Exception as e:
+ logger.warning(f"解析段落失败: {e}")
continue
+ except:
+ logger.warning(f"新闻内容解析失败: {url}")
- # 处理图片
- if child.name == "p" and child.find("img"):
- img_tag = child.find("img")
- if img_tag:
- src = str(img_tag.get("src"))
- img_tag["src"] = self._normalize_url(src)
- newsItem.contentRows.append({
- "tag": "img",
- "content": str(img_tag)
- })
- # 处理视频
- elif child.find("video"):
- video_tag = child.find("video")
- if video_tag:
- src = str(video_tag.get("src"))
- video_tag["src"] = self._normalize_url(src)
- newsItem.contentRows.append({
- "tag": "video",
- "content": str(video_tag)
- })
- # 处理普通段落
- elif child.name == "p" and child.get_text(strip=True):
- newsItem.contentRows.append({
- "tag": "p",
- "content": child.get_text(strip=True)
- })
-
- return newsItem
-
- def parse_xhsz_news_detail(self, url: str) -> Optional[NewsItem]:
- newsItem = NewsItem(title="", contentRows=[], url=url)
- response = self.fetch(url)
- newsDetailSoup = self.parse_html(response.content)
-
- main_div = newsDetailSoup.find("div.page-news-detail")
- article_div = main_div.find("div.page-news-l")
- title_div = article_div.find("div.page-news-detail-title")
- channal_div = article_div.find("div.page-news-detail-note")
- content_div = article_div.find("div.page-news-detail-content")
-
- # 获取新闻标题
- newsItem.title = title_div.text.strip()
-
- # 获取新闻来源、发布时间、浏览人数
- channal_items_div = channal_div.find_all("div.item")
- if("来源" in channal_items_div[0].text):
- newsItem.source = channal_items_div[0].text.strip().split(":")[1]
- if("发布时间" in channal_items_div[1].text):
- newsItem.publishTime = channal_items_div[1].text.strip().split(":")[1]
- if("浏览人数" in channal_items_div[2].text):
- newsItem.viewCount = channal_items_div[2].text.strip().split(":")[1]
-
- for child in content_div.children:
- if not isinstance(child, Tag):
- continue
-
- img_tag = child.find("img")
- video_tag = child.find("video")
- tag = "p"
- content = str(child)
- if img_tag: # 是图片
- tag = "img"
- src = str(img_tag.get("src"))
- if src:
- img_tag["src"] = self._normalize_url(src)
- content = str(img_tag)
- elif video_tag: # 是视频
- tag = "video"
- src = str(video_tag.get("src"))
- if src:
- video_tag["src"] = self._normalize_url(src)
- content = str(video_tag)
-
- newsItem.contentRows.append({"tag": tag, "content": content})
-
- return newsItem
+ return news_item
+
+
+ def parse_xh_news_detail(self, url: str) -> NewsItem:
+ """
+        使用Selenium解析新华网(www.news.cn)正文页新闻详情
+ 异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败
+ """
+ news_item = NewsItem(title="", contentRows=[], url=url)
+
+ if not self.driver:
+ logger.error("WebDriver未初始化,无法获取新闻详情")
+ return news_item
+
+ try:
+ self.driver.get(url)
+ time.sleep(2)
+ except Exception as e:
+ logger.warning(f"访问新闻详情页失败: {url}, {e}")
+ return news_item
+
+ # 滑动验证处理
+ try:
+ sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+ if sliders:
+ slider = sliders[0]
+ action_chain = ActionChains(self.driver)
+ action_chain.click_and_hold(slider).perform()
+ distance = 1000
+ tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
+ for track in tracks:
+ action_chain.move_by_offset(int(track), 0).pause(1)
+ action_chain.perform()
+ action_chain.release().perform()
+ time.sleep(2)
+ except Exception as e:
+ logger.info(f"滑块验证处理失败或未出现: {e}")
+
+        # head 区域:标题、来源、发布时间;解析失败时记录告警并保留默认值
+        try:
+            head_div = self.driver.find_element(By.CSS_SELECTOR, "div.header.domPC")
+            time_div = head_div.find_element(By.CSS_SELECTOR, "div.header-time.left")
+            datetimes = (time_div.find_element(By.CSS_SELECTOR, "span.year").text + "/"
+                         + time_div.find_element(By.CSS_SELECTOR, "span.day").text + " "
+                         + time_div.find_element(By.CSS_SELECTOR, "span.time").text)
+            news_item.publishTime = str(datetime.strptime(datetimes, "%Y/%m/%d %H:%M:%S"))
+            news_item.source = head_div.find_element(By.CSS_SELECTOR, "div.source").text.split(":")[1].strip()
+            news_item.title = head_div.find_element(By.CSS_SELECTOR, "h1").text.strip()
+        except Exception as e:
+            logger.warning(f"解析新闻头部失败: {url}, {e}")
+
+ # 内容
+ try:
+ article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+ children = content_div.find_elements(By.XPATH, "./*")
+ for child in children:
+ try:
+ tag_name = child.tag_name.lower()
+ if tag_name == "p":
+ text = child.text.strip().replace("\xa0", "")
+ if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
+ continue
+
+ # 图片
+ try:
+ img = child.find_element(By.TAG_NAME, "img")
+ src = img.get_attribute("src")
+ if src and not src.startswith("http"):
+ src = self.config.base_url + src
+                            news_item.contentRows.append({"tag": "img", "content": f'<img src="{src}"/>'})
+ continue
+ except:
+ pass
+
+ # 视频
+ try:
+ video = child.find_element(By.TAG_NAME, "video")
+ src = video.get_attribute("src")
+ if src and not src.startswith("http"):
+ src = self.config.base_url + src
+                            news_item.contentRows.append({"tag": "video", "content": f'<video src="{src}"></video>'})
+ continue
+ except:
+ pass
+
+ # 普通段落
+ news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
+ elif tag_name in ["img", "video"]:
+ news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
+ except Exception as e:
+ logger.warning(f"解析段落失败: {e}")
+ continue
+ except:
+ logger.warning(f"新闻内容解析失败: {url}")
+
+ return news_item
+
def _normalize_url(self, url: str) -> str:
"""
规范化 URL,补全协议和域名
@@ -453,3 +380,121 @@ class XhwCrawler(BaseCrawler):
# 相对路径,补全域名
return self.config.base_url + url
+
+ def search(self, key:str, total=10, action="news") -> ResultDomain:
+ # 检查driver是否已初始化
+ if not self.driver:
+ logger.error("WebDriver未初始化,无法继续爬取")
+ return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)
+
+ news_urls = []
+ news_list = []
+ resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+ # 获取搜索配置
+ search_config = self.config.urls.get("search")
+ if not search_config:
+ logger.error("未找到搜索URL配置")
+        resultDomain.code = 1
+ resultDomain.message = "未找到搜索URL配置"
+ resultDomain.success = False
+ return resultDomain
+
+ pagesize = 10
+ search_data = search_config.params.copy()
+ search_data["k"] = key
+ search_data["action"] = action
+
+ try:
+ # 获取新闻url
+ url_base_map = {}
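+            # 向上取整计算需要的页数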
+ total_pages = (total + pagesize - 1) // pagesize
+ for page in range(1, total_pages + 1):
+ search_data["page"] = page
+ pageHtml = search_config.url + "?" + urlencode(search_data)
+ logger.info(f"请求URL: {pageHtml}")
+
+ # 使用Selenium访问页面
+ try:
+ self.driver.get(pageHtml)
+ time.sleep(2)
+ except Exception as e:
+ logger.warning(f"访问搜索页失败: {pageHtml}, {e}")
+ continue
+
+ # 滑动验证处理
+ try:
+ sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+ if sliders:
+ slider = sliders[0]
+ action_chain = ActionChains(self.driver)
+ action_chain.click_and_hold(slider).perform()
+ distance = 1000
+ tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
+ for track in tracks:
+ action_chain.move_by_offset(int(track), 0).pause(1)
+ action_chain.release().perform()
+ time.sleep(5)
+ except Exception as e:
+ logger.info(f"滑动验证处理失败或未出现: {e}")
+
+ # 提取新闻列表
+ try:
+ search_main = self.driver.find_element(By.CSS_SELECTOR, "div.page-search-main")
+ search_group = search_main.find_element(By.CSS_SELECTOR, "div.page-search-group")
+ news_div = search_group.find_element(By.CSS_SELECTOR, "div.page-search-news")
+ news_items = news_div.find_elements(By.CSS_SELECTOR, "div.group")
+ for news in news_items:
+ try:
+ head = news.find_element(By.CSS_SELECTOR, "div.head")
+ title_div = head.find_element(By.CSS_SELECTOR, "div.title")
+ date_div = head.find_element(By.CSS_SELECTOR, "div.date")
+ a_tag = title_div.find_element(By.TAG_NAME, "a")
+ news_url = a_tag.get_attribute("href")
+ news_title = a_tag.text.strip()
+ news_date = date_div.text.strip()
+ url_base_map[news_url] = {"title": news_title, "date": news_date}
+ news_urls.append(news_url)
+ except Exception as e:
+ logger.warning(f"提取单条新闻URL失败: {e}")
+ except Exception as e:
+ logger.warning(f"提取新闻列表失败: {e}")
+ continue
+
+ # 从新闻url中获取新闻详情
+ count = 0
+ for news_url in news_urls:
+ try:
+ news = self.parse_news_detail(news_url)
+ if news:
+ news.title = url_base_map.get(news_url, {}).get("title") or news.title
+ news.publishTime = url_base_map.get(news_url, {}).get("date") or news.publishTime
+ news_list.append(news)
+ count += 1
+ if count >= total:
+ break
+ except Exception as e:
+ logger.warning(f"解析新闻失败: {news_url}, {e}")
+ continue
+
+ except Exception as e:
+ logger.error(f"搜索过程整体异常: {e}")
+ resultDomain.success = False
+        resultDomain.code = 1
+ resultDomain.message = "爬取失败"
+
+ # 最终保证返回 dataList
+ resultDomain.dataList = news_list
+ resultDomain.success = bool(news_list)
+ return resultDomain
+
+ def close(self):
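+        """关闭浏览器并释放WebDriver资源"""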
+ if hasattr(self, 'driver') and self.driver:
+ try:
+ self.driver.quit()
+ logger.info("浏览器已关闭")
+ except Exception as e:
+ logger.warning(f"关闭浏览器失败: {str(e)}")
+ self.driver = None
diff --git a/schoolNewsCrawler/crawler/xhw/XhwSearch.py b/schoolNewsCrawler/crawler/xhw/XhwSearch.py
new file mode 100644
index 0000000..c1e8de2
--- /dev/null
+++ b/schoolNewsCrawler/crawler/xhw/XhwSearch.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+新华网搜索爬虫命令行工具
+用法: python XhwSearch.py --query "关键词" --total 10 --output output.json
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+import time
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from crawler.xhw.XhwCrawler import XhwCrawler
+from loguru import logger
+
+
+def main():
+ """主函数"""
+ parser = argparse.ArgumentParser(
+        description='新华网新闻搜索工具',
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+ """
+ )
+
+ parser.add_argument(
+ '--query', '-q',
+ type=str,
+ required=True,
+ help='搜索关键词'
+ )
+
+ parser.add_argument(
+ '--total', '-t',
+ type=int,
+ default=10,
+ help='抓取数量 (默认: 10)'
+ )
+
+ parser.add_argument(
+ '--output', '-o',
+ type=str,
+ help='输出文件路径'
+ )
+
+ args = parser.parse_args()
+
+ # 获取参数
+ key = args.query
+ total = args.total
+ output_file = args.output
+
+ logger.info("使用直接参数模式")
+
+ # 关键校验:key 必须存在
+ if not key or not key.strip():
+ parser.error("搜索关键词不能为空!")
+ try:
+ logger.info(f"开始搜索: 关键词='{key}', 数量={total}")
+ crawler = XhwCrawler()
+ time.sleep(5)
+ result = crawler.search(key=key.strip(), total=total)
+ # print(result)
+ output = {
+ "code": result.code,
+ "message": result.message,
+ "success": result.success,
+ "data": None,
+ "dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
+ }
+ # result = None
+ # with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+ # result = json.load(f)
+ # print(result)
+ # output = result
+
+
+ if output_file:
+ output_path = Path(output_file)
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(output_path, 'w', encoding='utf-8') as f:
+ json.dump(output, f, ensure_ascii=False, indent=2)
+ logger.info(f"结果已保存到: {output_file}")
+
+ crawler.close()
+ sys.exit(0 if result.success else 1)
+ # print(json.dumps(output, ensure_ascii=False, indent=2))
+ # sys.exit(0 if result["success"] else 1)
+ except Exception as e:
+ logger.error(f"执行失败: {str(e)}")
+ error_output = {
+ "code": 500,
+ "message": f"执行失败: {str(e)}",
+ "success": False,
+ "data": None,
+ "dataList": []
+ }
+ print(json.dumps(error_output, ensure_ascii=False, indent=2))
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/schoolNewsCrawler/test.ipynb b/schoolNewsCrawler/test.ipynb
index 01728a5..f18e159 100644
--- a/schoolNewsCrawler/test.ipynb
+++ b/schoolNewsCrawler/test.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 18,
"id": "948be230",
"metadata": {},
"outputs": [
@@ -41,7 +41,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 19,
"id": "31a8a0dd",
"metadata": {},
"outputs": [
@@ -49,11 +49,11 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2025-11-19 19:03:54.324\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
- "\u001b[32m2025-11-19 19:03:55.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
- "\u001b[32m2025-11-19 19:03:55.216\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
- "\u001b[32m2025-11-19 19:03:55.217\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
- "\u001b[32m2025-11-19 19:03:57.557\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
+ "\u001b[32m2025-11-20 14:39:07.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
+ "\u001b[32m2025-11-20 14:39:08.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
+ "\u001b[32m2025-11-20 14:39:08.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
+ "\u001b[32m2025-11-20 14:39:08.885\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
+ "\u001b[32m2025-11-20 14:39:10.309\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m28\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
]
}
],
@@ -63,7 +63,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"id": "e5a6e91c",
"metadata": {},
"outputs": [
@@ -71,34 +71,42 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[32m2025-11-19 19:04:12.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E4%B9%A0%E8%BF%91%E5%B9%B3&action=news&page=1\u001b[0m\n",
- "\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m188\u001b[0m - \u001b[33m\u001b[1m检测到验证页面,尝试手动处理验证\u001b[0m\n",
- "\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m189\u001b[0m - \u001b[1m请在30秒内手动完成验证...\u001b[0m\n",
-    "\u001b[32m2025-11-19 19:04:48.814\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m解析后的HTML内容: ...\u001b[0m\n"
+    "NewsItem(title='...', contentRows=[{'tag': 'p', 'content': '\\u3000\\u3000新华社北京2月24日电\\u3000《习近平总书记关于党的建设的重要思想概论》出版座谈会2月24日在京召开。与会代表结合《概论》主要内容,交流学习贯彻习近平总书记关于党的建设的重要思想的认识和体会。'}, {'tag': 'p', 'content': '\\u3000\\u3000会议认为,《概论》是广大党员、干部深入学习领会习近平总书记关于党的建设的重要思想的权威辅助读物。习近平总书记关于党的建设的重要思想,是一个逻辑严密、内涵丰富、系统全面、博大精深的科学体系,是对中国化的马克思主义党建理论体系的继承发展,构成习近平新时代中国特色社会主义思想的“党建篇”。在这一重要思想的科学指引下,我们党成功开辟百年大党自我革命新境界,推动党和国家事业取得历史性成就、发生历史性变革,为世界政党建设提供了重要借鉴。'}, {'tag': 'p', 'content': '\\u3000\\u3000会议指出,要以学好用好《概论》为契机,进一步把习近平总书记关于党的建设的重要思想领会深、把握准、落到位,深刻领会其科学体系、理论品质和实践指向,更加深刻领悟“两个确立”的决定性意义,增强“四个意识”、坚定“四个自信”、做到“两个维护”。要不断深化体系化研究、学理化阐释,深刻把握这一重要思想蕴含的深刻道理、透彻学理、深邃哲理。要坚持用这一重要思想武装头脑、指导实践、推动工作,把学习成果转化为工作实效,推进党建研究高质量发展,以党建研究新成果推进党的建设和组织工作高质量发展,为以中国式现代化全面推进强国建设、民族复兴伟业提供坚强组织保证。'}, {'tag': 'p', 'content': '\\u3000\\u3000座谈会由全国党建研究会举办,中央和国家机关有关部门,各省区市和新疆生产建设兵团党建研究会(学会),部分中管企业、高校有关负责同志,党史党建专家代表参加座谈会。'}], url='https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html', viewCount=None, publishTime='2025-02-24 22:44:25', author=None, source='新华网', category=None, executeStatus=0, executeMessage=None)"
+   ]
+  },
+  "execution_count": 20,
+  "metadata": {},
+  "output_type": "execute_result"
+ }
+ ],
+ "source": [
+  "crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fa359d5b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+}
 ],