People's Daily crawler

This commit is contained in:
2025-11-10 15:22:44 +08:00
parent 08df5f1e8a
commit e8b76278e9
36 changed files with 4241 additions and 0 deletions


@@ -0,0 +1,317 @@
"""
使用 Crawl4AI 爬取人民网新闻
"""
from typing import List, Optional
from loguru import logger
import re
from datetime import datetime
from crawler.BaseCrawler import NewsItem
import asyncio
class PeopleNetCrewer:
"""使用 Crawl4AI 爬取人民网新闻"""
def __init__(self, chrome_path: Optional[str] = None):
"""
初始化人民网爬虫
Args:
chrome_path: Chrome 浏览器可执行文件路径,如果为 None 则使用系统默认路径
"""
self.base_url = "http://www.people.com.cn"
self.chrome_path = chrome_path
self.crawler = None
logger.info(f"初始化人民网爬虫 (Crawl4AI)")
    async def _get_crawler(self):
        """Get or lazily create the crawler instance."""
        if self.crawler is None:
            try:
                from crawl4ai import AsyncWebCrawler
            except ImportError:
                logger.error("Please install crawl4ai first: pip install crawl4ai")
                raise ImportError("crawl4ai is not installed")

            # Browser options: prefer the locally installed Chrome.
            browser_config = {
                "headless": True,
                "verbose": False,
            }
            if self.chrome_path:
                # An explicit Chrome path was given, so use it.
                browser_config["executable_path"] = self.chrome_path
                logger.info(f"Using the specified Chrome path: {self.chrome_path}")
            else:
                # Use the locally installed Chrome via the channel option.
                browser_config["channel"] = "chrome"
                logger.info("Using the locally installed Chrome browser")

            try:
                self.crawler = AsyncWebCrawler(
                    browser_type="chromium",
                    browser_config=browser_config
                )
            except Exception as e:
                logger.error(f"Failed to create the crawler instance: {str(e)}")
                # If the channel option failed, fall back to executable_path.
                if not self.chrome_path and "channel" in browser_config:
                    logger.warning("The channel option failed, trying another approach...")
                    # Probe common Chrome install locations.
                    import os
                    common_paths = [
                        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                        os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
                    ]
                    for path in common_paths:
                        if os.path.exists(path):
                            browser_config.pop("channel", None)
                            browser_config["executable_path"] = path
                            logger.info(f"Found Chrome at: {path}")
                            try:
                                self.crawler = AsyncWebCrawler(
                                    browser_type="chromium",
                                    browser_config=browser_config
                                )
                                break
                            except Exception:
                                continue
                    else:
                        # No fallback path worked; re-raise the original error.
                        raise
                else:
                    raise
        return self.crawler
    async def crawl(self, category: str = "politics", limit: int = 20) -> List[NewsItem]:
        """
        Crawl the People's Daily Online news list.

        Args:
            category: News category (e.g. politics, society, world).
            limit: Maximum number of news items to crawl.

        Returns:
            List of news items.
        """
        news_list = []
        try:
            crawler = await self._get_crawler()

            # Build the list-page URL.
            list_url = f"{self.base_url}/{category}/index.html"
            logger.info(f"Start crawling People's Daily Online news: {list_url}")

            # Fetch the page with Crawl4AI.
            result = await crawler.arun(url=list_url)
            if not result.success:
                logger.error(f"Failed to crawl: {list_url}")
                return news_list

            html_content = result.html
            if not html_content:
                logger.warning("No page content was returned")
                return news_list

            # Parse the news list.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Generic news-list selectors; adjust them to the actual
            # structure of people.com.cn when using this in production.
            news_items = soup.select('.news-list .news-item, .list-item, .news-item, article')
            if not news_items:
                # Fall back to other likely selectors.
                news_items = soup.select('a[href*="/n1/"], a[href*="/n2/"], a[href*="/n3/"]')
            logger.info(f"Found {len(news_items)} news links")

            # Extract the links and crawl the detail pages.
            processed_urls = set()
            for item in news_items[:limit * 2]:  # Take extra items; some links may be invalid.
                try:
                    # Extract the link.
                    if item.name == 'a':
                        link_tag = item
                    else:
                        link_tag = item.select_one('a')
                    if not link_tag:
                        continue

                    news_url = link_tag.get('href', '')
                    if not news_url:
                        continue

                    # Resolve relative paths.
                    if not news_url.startswith('http'):
                        if news_url.startswith('/'):
                            news_url = self.base_url + news_url
                        else:
                            news_url = f"{self.base_url}/{news_url}"

                    # Skip duplicate links.
                    if news_url in processed_urls:
                        continue
                    processed_urls.add(news_url)

                    # Parse the detail page.
                    news = await self.parse_news_detail(news_url)
                    if news:
                        news_list.append(news)
                        logger.info(f"Crawled news item: {news.title}")
                    if len(news_list) >= limit:
                        break
                except Exception as e:
                    logger.error(f"Failed to process a news item: {str(e)}")
                    continue

            logger.info(f"Crawling finished, got {len(news_list)} news items")
        except Exception as e:
            logger.error(f"Failed to crawl the news list: {str(e)}")
        return news_list
    async def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a People's Daily Online news detail page.

        Args:
            url: URL of the news detail page.

        Returns:
            The parsed news item, or None on failure.
        """
        try:
            crawler = await self._get_crawler()

            # Fetch the detail page with Crawl4AI.
            result = await crawler.arun(url=url)
            if not result.success:
                logger.warning(f"Failed to crawl detail page: {url}")
                return None

            html_content = result.html
            if not html_content:
                return None

            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract the title.
            title_tag = soup.select_one('h1, .title, .article-title, .p_title')
            title = title_tag.get_text(strip=True) if title_tag else ""
            if not title:
                logger.warning(f"Could not extract a title: {url}")
                return None

            # Extract the body content.
            content_tag = soup.select_one(
                '.article-content, .content, .text-content, .p_content, '
                '.article-body, .article-text, #articleContent'
            )
            content = ""
            if content_tag:
                # Drop script and style tags before extracting text.
                for script in content_tag(['script', 'style', 'noscript']):
                    script.decompose()
                content = content_tag.get_text(separator='\n', strip=True)

            # Extract the publish time.
            time_tag = soup.select_one(
                '.time, .publish-time, .date, .p_time, .article-time, '
                'time[datetime], .pubtime'
            )
            publish_time = None
            if time_tag:
                time_text = time_tag.get_text(strip=True)
                if not time_text:
                    time_text = time_tag.get('datetime', '')
                publish_time = self._parse_time(time_text)

            # Extract the author.
            author_tag = soup.select_one('.author, .writer, .p_author, .article-author')
            author = None
            if author_tag:
                author = author_tag.get_text(strip=True)

            # Extract images.
            images = []
            img_tags = soup.select('.article-content img, .content img, .p_content img')
            for img in img_tags:
                img_url = img.get('src', '') or img.get('data-src', '')
                if img_url and not img_url.startswith('data:'):
                    if not img_url.startswith('http'):
                        if img_url.startswith('/'):
                            img_url = self.base_url + img_url
                        else:
                            img_url = f"{self.base_url}/{img_url}"
                    images.append(img_url)

            # Build the news item.
            news = NewsItem(
                title=title,
                content=content,
                url=url,
                publish_time=publish_time,
                author=author,
                source="人民网",
                category="时政",
                images=images
            )
            return news
        except Exception as e:
            logger.error(f"Failed to parse news detail [{url}]: {str(e)}")
            return None
    def _parse_time(self, time_text: str) -> Optional[str]:
        """
        Parse a time string.

        Args:
            time_text: Raw time text.

        Returns:
            A normalized time string, or None if nothing usable was found.
        """
        if not time_text:
            return None
        try:
            # Common time formats seen on people.com.cn pages.
            patterns = [
                r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
                r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
                r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
                r'(\d{4})-(\d{2})-(\d{2})',
            ]
            for pattern in patterns:
                match = re.search(pattern, time_text)
                if match:
                    # Normalize matched formats to "YYYY-MM-DD HH:MM:SS",
                    # padding missing hour/minute/second fields with zeros.
                    parts = list(match.groups()) + ['00'] * (6 - len(match.groups()))
                    year, month, day, hour, minute, second = parts[:6]
                    return f"{year}-{month}-{day} {hour}:{minute}:{second}"
            # Fall back to the raw text if no known format matched.
            return time_text.strip()
        except Exception as e:
            logger.warning(f"Failed to parse time: {str(e)}")
            return None

    async def close(self):
        """Close the crawler and release resources."""
        if self.crawler:
            await self.crawler.close()
            self.crawler = None
            logger.info("Crawler closed")


@@ -0,0 +1,77 @@
# Crawl4AI People's Daily Online News Crawler
Crawls news from People's Daily Online (人民网, people.com.cn) using the Crawl4AI framework, with support for a locally installed Chrome browser.
## Install dependencies
```bash
pip install crawl4ai playwright
playwright install chromium  # or use a locally installed Chrome
```
## Usage
### Basic usage
```bash
# Use the default configuration (the locally installed Chrome is picked up automatically)
python crawl4ai/main.py [category] [limit] [output_file]
# Example
python crawl4ai/main.py politics 20 output/news.json
```
### Specify the Chrome path
```bash
# Point to the Chrome executable explicitly
python crawl4ai/main.py politics 20 output/news.json "C:\Program Files\Google\Chrome\Application\chrome.exe"
```
### Use it from code
```python
import asyncio
from crawl4ai.PeopleNetCrewer import PeopleNetCrewer

async def main():
    # Use the default Chrome (auto-detected)
    crewer = PeopleNetCrewer()
    # Or pass an explicit Chrome path
    # crewer = PeopleNetCrewer(chrome_path="C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe")
    news_list = await crewer.crawl(category="politics", limit=20)
    for news in news_list:
        print(f"Title: {news.title}")
        print(f"URL: {news.url}")
        print("-" * 50)
    await crewer.close()

if __name__ == "__main__":
    asyncio.run(main())
```
## Configuration
### Using the local Chrome
The crawler automatically tries to use the locally installed Chrome browser. If `chrome_path` is not given, it falls back to the system Chrome via the `channel="chrome"` option.
### Browser configuration
Browser behaviour can be adjusted by editing `browser_config` in the `_get_crawler` method of the `PeopleNetCrewer` class (a sketch follows the list below):
- `headless`: run without a visible browser window (default `True`)
- `verbose`: emit verbose logs (default `False`)
- `channel`: browser channel (`"chrome"` means the locally installed Chrome)
- `executable_path`: explicit path to the browser executable
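For example, a tweaked configuration might look roughly like the following. This is a minimal sketch that mirrors the dictionary built in `_get_crawler`; whether `AsyncWebCrawler` accepts these exact keyword arguments depends on your Crawl4AI version, so treat it as illustrative rather than a reference.
```python
# Illustrative sketch only: mirrors the dict built in PeopleNetCrewer._get_crawler.
# The exact keyword arguments accepted by AsyncWebCrawler depend on the installed
# Crawl4AI version.
from crawl4ai import AsyncWebCrawler

browser_config = {
    "headless": False,    # show the browser window while debugging
    "verbose": True,      # more detailed Crawl4AI logging
    "channel": "chrome",  # use the locally installed Chrome
    # "executable_path": r"C:\Program Files\Google\Chrome\Application\chrome.exe",
}

crawler = AsyncWebCrawler(browser_type="chromium", browser_config=browser_config)
```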
## Notes
1. Make sure Chrome is installed (a quick check is sketched below this list).
2. If Playwright reports that no browser can be found, run `playwright install chromium` to install Playwright's bundled browser.
3. When using the local Chrome, make sure its version is compatible with Playwright.
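If you are unsure whether a local Chrome is available, a quick check over the same common Windows install locations that `_get_crawler` falls back to (illustrative only) looks like this:
```python
# Probes the same common Windows install locations that
# PeopleNetCrewer._get_crawler falls back to; illustrative only.
import os

common_paths = [
    r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
    os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
]

for path in common_paths:
    if os.path.exists(path):
        print(f"Found Chrome at: {path}")
        break
else:
    print("No local Chrome found; consider `playwright install chromium`.")
```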


@@ -0,0 +1,9 @@
"""
Crawl4AI 爬虫模块
使用 Crawl4AI 进行动态网页爬取
"""
from .PeopleNetCrewer import PeopleNetCrewer
__all__ = ['PeopleNetCrewer']


@@ -0,0 +1,147 @@
"""
使用 Crawl4AI 爬取人民网新闻的主程序
"""
import sys
import json
import asyncio
from typing import List
from loguru import logger
from crawl4ai.PeopleNetCrewer import PeopleNetCrewer
# 配置日志
logger.remove() # 移除默认处理器
logger.add(
sys.stdout,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> - <level>{message}</level>",
level="INFO"
)
logger.add(
"logs/crewer_{time:YYYY-MM-DD}.log",
rotation="00:00",
retention="30 days",
encoding="utf-8",
level="DEBUG"
)
async def crawl_people_net_news(
    category: str = "politics",
    limit: int = 20,
    chrome_path: Optional[str] = None
) -> List[dict]:
    """
    Crawl People's Daily Online news with Crawl4AI.

    Args:
        category: News category.
        limit: Number of items to crawl.
        chrome_path: Path to the Chrome executable (optional).

    Returns:
        List of news items as dictionaries.
    """
    logger.info(f"Start crawling People's Daily Online news - category: {category}, limit: {limit}")
    crewer = None
    try:
        crewer = PeopleNetCrewer(chrome_path=chrome_path)
        news_list = await crewer.crawl(category=category, limit=limit)
        # Convert to a list of dictionaries.
        result = [news.model_dump() for news in news_list]
        logger.info(f"Crawling finished, got {len(result)} news items")
        return result
    except Exception as e:
        logger.error(f"Crawling failed: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return []
    finally:
        if crewer:
            await crewer.close()


def save_to_json(news_list: List[dict], output_file: str = "output/news.json"):
    """
    Save the news items to a JSON file.

    Args:
        news_list: News items.
        output_file: Output file path.
    """
    try:
        import os
        # Only create the directory if the output path actually contains one.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)
        logger.info(f"News saved to: {output_file}")
    except Exception as e:
        logger.error(f"Failed to save file: {str(e)}")
async def main_async():
    """Async entry point."""
    # Parse command-line arguments.
    category = "politics"
    limit = 20
    output_file = "output/news.json"
    chrome_path = None
    if len(sys.argv) > 1:
        category = sys.argv[1]
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    if len(sys.argv) > 3:
        output_file = sys.argv[3]
    if len(sys.argv) > 4:
        chrome_path = sys.argv[4]

    logger.info("=" * 60)
    logger.info("People's Daily Online news crawler started (Crawl4AI)")
    logger.info("=" * 60)

    # Crawl the news.
    news_list = await crawl_people_net_news(
        category=category,
        limit=limit,
        chrome_path=chrome_path
    )

    # Save the results.
    if news_list:
        save_to_json(news_list, output_file)
        # Print statistics.
        logger.info("Crawl statistics:")
        logger.info(f"  - succeeded: {len(news_list)}")
        logger.info(f"  - shortfall vs. limit: {limit - len(news_list)}")
    else:
        logger.warning("No news was retrieved")

    logger.info("=" * 60)
    logger.info("People's Daily Online news crawler finished")
    logger.info("=" * 60)


def main():
    """Main entry point."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
    except Exception as e:
        logger.error(f"Program error: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    main()