Add 学习时评 (commentary), 综合新闻 (general news), and 中宣部发布 (Publicity Department releases) columns
@@ -61,7 +61,9 @@ class XxqgCrawler(BaseCrawler):
             "important": UrlConfig(
                 url="https://www.xuexi.cn/98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784.html",
                 method="GET",
-                params={},
+                params={
+                    "path": "98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784"
+                },
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -79,10 +81,12 @@ class XxqgCrawler(BaseCrawler):
                     'sec-ch-ua-platform': '"Windows"'
                 }
             ),
-            "home": UrlConfig(
-                url="https://www.xuexi.cn/",
+            "xuexishiping": UrlConfig(
+                url="https://www.xuexi.cn/d05cad69216e688d304bb91ef3aac4c6/9a3668c13f6e303932b5e0e100fc248b.html",
                 method="GET",
-                params={},
+                params={
+                    "path": "d05cad69216e688d304bb91ef3aac4c6/9a3668c13f6e303932b5e0e100fc248b"
+                },
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -99,7 +103,53 @@ class XxqgCrawler(BaseCrawler):
                     'sec-ch-ua-mobile': '?0',
                     'sec-ch-ua-platform': '"Windows"'
                 }
-            )
+            ),
+            "zongheshiping": UrlConfig(
+                url="https://www.xuexi.cn/7097477a9643eacffe4cc101e4906fdb/9a3668c13f6e303932b5e0e100fc248b.html",
+                method="GET",
+                params={
+                    "path": "7097477a9643eacffe4cc101e4906fdb/9a3668c13f6e303932b5e0e100fc248b"
+                },
+                headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Cache-Control': 'max-age=0',
+                    'Referer': 'https://www.xuexi.cn/',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+            ),
+            "zhongxuanbu": UrlConfig(
+                url="https://www.xuexi.cn/105c2fa2843fa9e6d17440e172115c92/9a3668c13f6e303932b5e0e100fc248b.html",
+                method="GET",
+                params={
+                    "path": "105c2fa2843fa9e6d17440e172115c92/9a3668c13f6e303932b5e0e100fc248b"
+                },
+                headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Cache-Control': 'max-age=0',
+                    'Referer': 'https://www.xuexi.cn/',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
+                }
+            ),

        },
    )
@@ -229,7 +279,7 @@ class XxqgCrawler(BaseCrawler):
                 logger.warning(f"提取发布时间失败: {e}")

             try:
-                source_div = article_area_div.find_element(By.CSS_SELECTOR, "span.render-detail-source")
+                source_div = article_area_div.find_element(By.CSS_SELECTOR, "span.render-detail-resource")
                 news_item.source = source_div.text.strip().split(":")[1]
             except Exception as e:
                 logger.warning(f"提取来源失败: {e}")
@@ -472,21 +522,11 @@ class XxqgCrawler(BaseCrawler):
                 continue

         resultDomain.dataList = news_list
-        with open("Xxqg_news_list.json", "w", encoding="utf-8") as f:
-            json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
+        # with open("Xxqg_news_list.json", "w", encoding="utf-8") as f:
+        #     json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
         return resultDomain

-    def crawl_important(self, total=10) -> ResultDomain:
-        """
-        爬取重要新闻栏目
-        参考旧版myQiangguo爬虫方式,使用requests获取文章列表,然后用Selenium解析详情
-
-        Args:
-            total: 最多爬取的文章数量,默认10
-
-        Returns:
-            ResultDomain: 包含新闻列表的结果对象
-        """
+    def crawl_base(self, config: CrawlerConfig, yesterday=True, start: Optional[str] = None, end: Optional[str] = None) -> ResultDomain:
         news_list = []
         resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)

@@ -497,16 +537,7 @@ class XxqgCrawler(BaseCrawler):
             resultDomain.message = "WebDriver未初始化"
             return resultDomain

-        # 获取important配置
-        important_config = self.config.urls.get("important")
-        if not important_config:
-            logger.error("未找到important配置")
-            resultDomain.code = 1
-            resultDomain.success = False
-            resultDomain.message = "未找到important配置"
-            return resultDomain
-
-        self.driver.get(important_config.url)
+        self.driver.get(config.url)
         try:
             if self.driver is None:
                 resultDomain.message = "driver未初始化"
@@ -525,7 +556,7 @@ class XxqgCrawler(BaseCrawler):
             time.sleep(3)  # 等待所有请求完成
             request_list = self.driver.requests
             json_request = []
-            target_path = "98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784"
+            target_path = config.params.get("path")
             target_request = None
             logger.info(f"开始查找目标JSON请求,共有 {len(request_list)} 个请求")

@@ -602,23 +633,50 @@ class XxqgCrawler(BaseCrawler):
             resultDomain.message = f"解析文章数据失败: {str(e)}"
             return resultDomain

-        for article in article_data:
-            # 判断是昨天的新闻 "publishTime": "2025-11-21 10:04:20",
-            publish_date = article['publishTime'].split(" ")[0]
-            yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
-            day_before_yesterday = (datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
+        # 确定时间筛选范围(在循环外计算,避免重复)
+        if not yesterday and start and end:
+            # 自定义时间范围
+            start_date = start
+            end_date = end
+            logger.info(f"使用自定义时间范围: {start_date} 到 {end_date}")
+        else:
+            # 默认昨天
+            yesterday_str = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
+            start_date = yesterday_str
+            end_date = yesterday_str
+            logger.info(f"使用默认时间范围(昨天): {yesterday_str}")

-            if publish_date == yesterday:
-                news_item = self.parse_news_detail(article['url'])
-                news_item.title = article['title']
-                news_item.publishTime = article['publishTime']
-                news_item.source = article['source'].split("_")[1]
-                news_item.url = article['url']
-                news_list.append(news_item)
-                logger.info(f"添加昨日新闻: {news_item.title}")
-            elif publish_date == day_before_yesterday:
-                # 遇到前天的新闻就停止,因为数据是按时间倒序排列的
-                logger.info("已到达前天的新闻,停止遍历")
+        # 计算起始日期的前一天,用于提前终止循环(优化性能)
+        day_before_start = (datetime.strptime(start_date, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
+
+        for article in article_data:
+            # 提取发布日期 "publishTime": "2025-11-21 10:04:20"
+            publish_date = article['publishTime'].split(" ")[0]
+
+            # 跳过未来的新闻(如果有)
+            if publish_date > end_date:
+                continue
+
+            # 在时间范围内的新闻
+            if publish_date >= start_date and publish_date <= end_date:
+                try:
+                    # 提取来源,安全处理
+                    source = article['source'].split("_")[1] if "_" in article.get('source', '') else article.get('source', '')
+
+                    news_item = self.parse_news_detail(article['url'])
+                    news_item.title = article['title']
+                    news_item.publishTime = article['publishTime']
+                    news_item.source = source
+                    news_item.url = article['url']
+                    news_list.append(news_item)
+                    logger.info(f"添加新闻: {news_item.title} ({publish_date})")
+                except Exception as e:
+                    logger.warning(f"解析文章详情失败: {article.get('title', 'unknown')} - {str(e)}")
+                    continue
+
+            # 如果遇到比起始日期还早的新闻,提前终止(数据按时间倒序)
+            elif publish_date < day_before_start:
+                logger.info(f"已到达时间范围之前的新闻({publish_date}),停止遍历")
                 break

         resultDomain.dataList = news_list
@@ -626,6 +684,73 @@ class XxqgCrawler(BaseCrawler):
         # json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
         return resultDomain

+    def crawl_important(self, total=10) -> ResultDomain:
+        """
+        爬取重要新闻栏目
+        参考旧版myQiangguo爬虫方式,使用requests获取文章列表,然后用Selenium解析详情
+
+        Args:
+            total: 最多爬取的文章数量,默认10
+
+        Returns:
+            ResultDomain: 包含新闻列表的结果对象
+        """
+        news_list = []
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+        if self.driver is None:
+            logger.error("WebDriver未初始化,无法继续爬取")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "WebDriver未初始化"
+            return resultDomain
+
+        # 获取important配置
+        important_config = self.config.urls.get("important")
+        if not important_config:
+            logger.error("未找到important配置")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "未找到important配置"
+            return resultDomain
+
+        resultDomain = self.crawl_base(important_config)
+
+        return resultDomain
+
+    def crawl_xuexishiping(self, total=10) -> ResultDomain:
+        """
+        爬取学习时评栏目
+
+        Args:
+            total: 最多爬取的文章数量,默认10
+
+        Returns:
+            ResultDomain: 包含新闻列表的结果对象
+        """
+        news_list = []
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
+
+        if self.driver is None:
+            logger.error("WebDriver未初始化,无法继续爬取")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "WebDriver未初始化"
+            return resultDomain
+
+        # 获取xuexishiping配置
+        xuexishiping_config = self.config.urls.get("xuexishiping")
+        if not xuexishiping_config:
+            logger.error("未找到xuexishiping配置")
+            resultDomain.code = 1
+            resultDomain.success = False
+            resultDomain.message = "未找到xuexishiping配置"
+            return resultDomain
+
+        resultDomain = self.crawl_base(xuexishiping_config)
+
+        return resultDomain
+
     def home(self, type="") -> ResultDomain:
         """获取首页数据"""
         count = 0
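
Usage sketch (illustrative only, not part of this commit): how the refactored crawl_base and the new column methods might be driven. The import path and XxqgCrawler constructor below are assumptions, since they do not appear in this diff; the method names, ResultDomain fields, and urls keys come from the changes above.

# Illustrative sketch; module path and constructor setup are assumptions.
from xxqg_crawler import XxqgCrawler  # hypothetical import path

crawler = XxqgCrawler()  # assumed to build self.config and the Selenium-Wire driver

# Column helpers such as crawl_xuexishiping() delegate to crawl_base(),
# which by default keeps only yesterday's articles.
result = crawler.crawl_xuexishiping()
if result.success:
    for item in result.dataList:
        print(item.publishTime, item.title, item.url)

# crawl_base() can also be called directly with any configured column and an
# explicit date range (yesterday=False enables start/end filtering).
zongheshiping_config = crawler.config.urls.get("zongheshiping")
if zongheshiping_config:
    ranged = crawler.crawl_base(zongheshiping_config, yesterday=False,
                                start="2025-11-18", end="2025-11-20")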