视频

2025-11-20 14:57:20 +08:00
parent cd4e1b88fa
commit d8533ae348
2 changed files with 110 additions and 69 deletions
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -56,7 +56,27 @@ class XhwCrawler(BaseCrawler):
                        'sec-ch-ua-platform': '"Windows"'
                    }
                ),
-
+                "hot_point": UrlConfig(
+                    url="https://xhsz.news.cn/focus_news",
+                    method="GET",
+                    params={},
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                        'Accept-Encoding': 'gzip, deflate, br',
+                        'Connection': 'keep-alive',
+                        'Upgrade-Insecure-Requests': '1',
+                        'Sec-Fetch-Dest': 'document',
+                        'Sec-Fetch-Mode': 'navigate',
+                        'Sec-Fetch-Site': 'none',
+                        'Cache-Control': 'max-age=0',
+                        'Referer': 'https://xhsz.news.cn/',
+                        'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                        'sec-ch-ua-mobile': '?0',
+                        'sec-ch-ua-platform': '"Windows"'
+                    }
+                )
            },            
        )
        super().__init__(config)
@@ -318,37 +338,47 @@ class XhwCrawler(BaseCrawler):
            for child in children:
                try:
                    tag_name = child.tag_name.lower()
-                    if tag_name == "p":
+                    if tag_name == "p" or tag_name == "div":
                        text = child.text.strip().replace("\xa0", "")
                        if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
                            continue

-                        # 图片
-                        try:
-                            img = child.find_element(By.TAG_NAME, "img")
-                            src = img.get_attribute("src")
-                            if src and not src.startswith("http"):
-                                src = self.config.base_url + src
-                            news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
-                            continue
-                        except:
-                            pass

                        # 视频
                        try:
                            video = child.find_element(By.TAG_NAME, "video")
                            src = video.get_attribute("src")
                            if src and not src.startswith("http"):
-                                src = self.config.base_url + src
+                                src = self._normalize_url(src)
                            news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
                            continue
                        except:
                            pass
+                        
+                        # 图片
+                        try:
+                            img = child.find_element(By.TAG_NAME, "img")
+                            src = img.get_attribute("src")
+                            if src and not src.startswith("http"):
+                                src = self._normalize_url(src)
+                            news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
+                            continue
+                        except:
+                            pass

                        # 普通段落
                        news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
-                    elif tag_name in ["img", "video"]:
-                        news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
+                    elif tag_name == "img":
+                        src = child.get_attribute("src")
+                        if src and not src.startswith("http"):
+                            src = self._normalize_url(src)
+                        news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
+                    elif tag_name == "video":
+                        src = child.get_attribute("src")
+                        if src and not src.startswith("http"):
+                            src = self._normalize_url(src)
+                        news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
+
                except Exception as e:
                    logger.warning(f"解析段落失败: {e}")
                    continue
@@ -488,7 +518,55 @@ class XhwCrawler(BaseCrawler):
        resultDomain.success = bool(news_list)
        return resultDomain
 
-    
+    def hot_point(self) -> ResultDomain:
+        """Collect hot-point news into a ResultDomain.
+
+        Failure paths (no driver, no "hot_point" config, crawl error) use code 1.
+        """
+        # Bail out early when the driver was never initialised.
+        if not self.driver:
+            logger.error("WebDriver未初始化，无法继续爬取")
+            return ResultDomain(code=1, message="WebDriver未初始化，无法继续爬取", success=False)
+
+        news_urls = []
+        news_list = []
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+        # Look up the hot-point URL configuration.
+        hot_point_config = self.config.urls.get("hot_point")
+        if not hot_point_config:
+            logger.error("未找到搜索URL配置")
+            resultDomain.code = 1  # failure must not reuse the success code 0
+            resultDomain.message = "未找到搜索URL配置"
+            resultDomain.success = False
+            return resultDomain
+
+        try:
+            # TODO: URL discovery is not implemented yet, so news_urls and
+            # url_base_map stay empty and the loop below never runs.
+            url_base_map = {}
+
+            # Fetch the detail page for every collected news URL.
+            for news_url in news_urls:
+                try:
+                    news = self.parse_news_detail(news_url)
+                    if news:
+                        news.title = url_base_map.get(news_url, {}).get("title") or news.title
+                        news.publishTime = url_base_map.get(news_url, {}).get("date") or news.publishTime
+                        news_list.append(news)
+                except Exception as e:
+                    logger.warning(f"解析新闻失败: {news_url}, {e}")
+                    continue
+        except Exception as e:
+            logger.error(f"搜索过程整体异常: {e}")
+            resultDomain.success = False
+            resultDomain.code = 1  # keep the failure code consistent with the early returns
+            resultDomain.message = "爬取失败"
+
+        # Always hand back dataList; success reflects whether anything was parsed.
+        resultDomain.dataList = news_list
+        resultDomain.success = bool(news_list)
+        return resultDomain
    
    def close(self):
        if hasattr(self, 'driver') and self.driver: