schoolNews/schoolNewsCrawler/crawl4AI/PeopleNetCrewer.py

"""
使用 Crawl4AI 爬取人民网新闻
"""
from typing import List, Optional
from loguru import logger
import re
from datetime import datetime
from crawler.BaseCrawler import NewsItem
import asyncio


class PeopleNetCrewer:
    """使用 Crawl4AI 爬取人民网新闻"""

    def __init__(self, chrome_path: Optional[str] = None):
        """
        初始化人民网爬虫

        Args:
            chrome_path: Chrome 浏览器可执行文件路径，如果为 None 则使用系统默认路径
        """
        self.base_url = "http://www.people.com.cn"
        self.chrome_path = chrome_path
        self.crawler = None
        logger.info(f"初始化人民网爬虫 (Crawl4AI)")

    async def _get_crawler(self):
        """获取或创建爬虫实例"""
        if self.crawler is None:
            try:
                from crawl4ai import AsyncWebCrawler
            except ImportError:
                logger.error("请先安装 crawl4ai: pip install crawl4ai")
                raise ImportError("crawl4ai 未安装")

            # 配置浏览器选项，使用本地 Chrome
            browser_config = {
                "headless": True,
                "verbose": False,
            }

            # 如果指定了 Chrome 路径，使用指定路径
            if self.chrome_path:
                browser_config["executable_path"] = self.chrome_path
                logger.info(f"使用指定的 Chrome 路径: {self.chrome_path}")
            else:
                # 使用本地安装的 Chrome（通过 channel 参数）
                browser_config["channel"] = "chrome"
                logger.info("使用本地安装的 Chrome 浏览器")

            try:
                self.crawler = AsyncWebCrawler(
                    browser_type="chromium",
                    browser_config=browser_config
                )
            except Exception as e:
                logger.error(f"创建爬虫实例失败: {str(e)}")
                # 如果使用 channel 失败，尝试使用 executable_path
                if not self.chrome_path and "channel" in browser_config:
                    logger.warning("使用 channel 参数失败，尝试其他方式...")
                    # 尝试常见的 Chrome 路径
                    import os
                    common_paths = [
                        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                        os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
                    ]
                    for path in common_paths:
                        if os.path.exists(path):
                            browser_config.pop("channel", None)
                            browser_config["executable_path"] = path
                            logger.info(f"找到 Chrome 路径: {path}")
                            try:
                                self.crawler = AsyncWebCrawler(
                                    browser_type="chromium",
                                    browser_config=browser_config
                                )
                                break
                            except Exception:
                                continue
                    else:
                        raise
                else:
                    raise
        return self.crawler

    async def crawl(self, category: str = "politics", limit: int = 20) -> List[NewsItem]:
        """
        爬取人民网新闻列表

        Args:
            category: 新闻分类（politics-政治, society-社会, world-国际等）
            limit: 爬取数量限制

        Returns:
            新闻列表
        """
        news_list = []

        try:
            crawler = await self._get_crawler()

            # 构建列表页URL
            list_url = f"{self.base_url}/{category}/index.html"
            logger.info(f"开始爬取人民网新闻: {list_url}")

            # 使用 Crawl4AI 爬取页面
            result = await crawler.arun(url=list_url)

            if not result.success:
                logger.error(f"爬取失败: {list_url}")
                return news_list

            html_content = result.html
            if not html_content:
                logger.warning("未获取到页面内容")
                return news_list

            # 解析新闻列表
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # 根据人民网实际结构调整选择器
            # 这里使用通用的新闻列表选择器，实际使用时需要根据网站结构调整
            news_items = soup.select('.news-list .news-item, .list-item, .news-item, article')

            if not news_items:
                # 尝试其他可能的选择器
                news_items = soup.select('a[href*="/n1/"], a[href*="/n2/"], a[href*="/n3/"]')

            logger.info(f"找到 {len(news_items)} 个新闻链接")

            # 提取新闻链接并爬取详情
            processed_urls = set()
            for item in news_items[:limit * 2]:  # 多取一些，因为有些链接可能无效
                try:
                    # 提取链接
                    if item.name == 'a':
                        link_tag = item
                    else:
                        link_tag = item.select_one('a')

                    if not link_tag:
                        continue

                    news_url = link_tag.get('href', '')
                    if not news_url:
                        continue

                    # 处理相对路径
                    if not news_url.startswith('http'):
                        if news_url.startswith('/'):
                            news_url = self.base_url + news_url
                        else:
                            news_url = f"{self.base_url}/{news_url}"

                    # 跳过重复链接
                    if news_url in processed_urls:
                        continue
                    processed_urls.add(news_url)

                    # 解析新闻详情
                    news = await self.parse_news_detail(news_url)
                    if news:
                        news_list.append(news)
                        logger.info(f"成功爬取新闻: {news.title}")

                        if len(news_list) >= limit:
                            break

                except Exception as e:
                    logger.error(f"处理新闻项失败: {str(e)}")
                    continue

            logger.info(f"爬取完成，共获取 {len(news_list)} 条新闻")

        except Exception as e:
            logger.error(f"爬取新闻列表失败: {str(e)}")

        return news_list

    async def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        解析人民网新闻详情

        Args:
            url: 新闻详情页URL

        Returns:
            新闻对象
        """
        try:
            crawler = await self._get_crawler()

            # 使用 Crawl4AI 爬取详情页
            result = await crawler.arun(url=url)

            if not result.success:
                logger.warning(f"爬取详情页失败: {url}")
                return None

            html_content = result.html
            if not html_content:
                return None

            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # 提取标题
            title_tag = soup.select_one('h1, .title, .article-title, .p_title')
            title = title_tag.get_text(strip=True) if title_tag else "未知标题"

            if title == "未知标题" or not title:
                logger.warning(f"无法提取标题: {url}")
                return None

            # 提取内容
            content_tag = soup.select_one(
                '.article-content, .content, .text-content, .p_content, '
                '.article-body, .article-text, #articleContent'
            )
            content = ""
            if content_tag:
                # 移除脚本和样式标签
                for script in content_tag(['script', 'style', 'noscript']):
                    script.decompose()
                content = content_tag.get_text(separator='\n', strip=True)

            # 提取发布时间
            time_tag = soup.select_one(
                '.time, .publish-time, .date, .p_time, .article-time, '
                'time[datetime], .pubtime'
            )
            publish_time = None
            if time_tag:
                time_text = time_tag.get_text(strip=True)
                if not time_text:
                    time_text = time_tag.get('datetime', '')
                publish_time = self._parse_time(time_text)

            # 提取作者
            author_tag = soup.select_one('.author, .writer, .p_author, .article-author')
            author = None
            if author_tag:
                author = author_tag.get_text(strip=True)

            # 提取图片
            images = []
            img_tags = soup.select('.article-content img, .content img, .p_content img')
            for img in img_tags:
                img_url = img.get('src', '') or img.get('data-src', '')
                if img_url and not img_url.startswith('data:'):
                    if not img_url.startswith('http'):
                        if img_url.startswith('/'):
                            img_url = self.base_url + img_url
                        else:
                            img_url = f"{self.base_url}/{img_url}"
                    images.append(img_url)

            # 创建新闻对象
            news = NewsItem(
                title=title,
                content=content,
                url=url,
                publish_time=publish_time,
                author=author,
                source="人民网",
                category="时政",
                images=images
            )

            return news

        except Exception as e:
            logger.error(f"解析新闻详情失败 [{url}]: {str(e)}")
            return None

    def _parse_time(self, time_text: str) -> Optional[str]:
        """
        解析时间字符串

        Args:
            time_text: 时间文本

        Returns:
            标准化的时间字符串
        """
        if not time_text:
            return None

        try:
            # 尝试匹配常见的时间格式
            patterns = [
                r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
                r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
                r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
                r'(\d{4})-(\d{2})-(\d{2})',
            ]

            for pattern in patterns:
                match = re.search(pattern, time_text)
                if match:
                    return time_text.strip()

            return time_text.strip()

        except Exception as e:
            logger.warning(f"时间解析失败: {str(e)}")
            return None

    async def close(self):
        """关闭爬虫，释放资源"""
        if self.crawler:
            await self.crawler.close()
            self.crawler = None
            logger.info("爬虫已关闭")