""" 使用 Crawl4AI 爬取人民网新闻 """ from typing import List, Optional from loguru import logger import re from datetime import datetime from crawler.BaseCrawler import NewsItem import asyncio class PeopleNetCrewer: """使用 Crawl4AI 爬取人民网新闻""" def __init__(self, chrome_path: Optional[str] = None): """ 初始化人民网爬虫 Args: chrome_path: Chrome 浏览器可执行文件路径,如果为 None 则使用系统默认路径 """ self.base_url = "http://www.people.com.cn" self.chrome_path = chrome_path self.crawler = None logger.info(f"初始化人民网爬虫 (Crawl4AI)") async def _get_crawler(self): """获取或创建爬虫实例""" if self.crawler is None: try: from crawl4ai import AsyncWebCrawler except ImportError: logger.error("请先安装 crawl4ai: pip install crawl4ai") raise ImportError("crawl4ai 未安装") # 配置浏览器选项,使用本地 Chrome browser_config = { "headless": True, "verbose": False, } # 如果指定了 Chrome 路径,使用指定路径 if self.chrome_path: browser_config["executable_path"] = self.chrome_path logger.info(f"使用指定的 Chrome 路径: {self.chrome_path}") else: # 使用本地安装的 Chrome(通过 channel 参数) browser_config["channel"] = "chrome" logger.info("使用本地安装的 Chrome 浏览器") try: self.crawler = AsyncWebCrawler( browser_type="chromium", browser_config=browser_config ) except Exception as e: logger.error(f"创建爬虫实例失败: {str(e)}") # 如果使用 channel 失败,尝试使用 executable_path if not self.chrome_path and "channel" in browser_config: logger.warning("使用 channel 参数失败,尝试其他方式...") # 尝试常见的 Chrome 路径 import os common_paths = [ r"C:\Program Files\Google\Chrome\Application\chrome.exe", r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe", os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"), ] for path in common_paths: if os.path.exists(path): browser_config.pop("channel", None) browser_config["executable_path"] = path logger.info(f"找到 Chrome 路径: {path}") try: self.crawler = AsyncWebCrawler( browser_type="chromium", browser_config=browser_config ) break except Exception: continue else: raise else: raise return self.crawler async def crawl(self, category: str = "politics", limit: int = 20) -> List[NewsItem]: """ 爬取人民网新闻列表 Args: category: 新闻分类(politics-政治, society-社会, world-国际等) limit: 爬取数量限制 Returns: 新闻列表 """ news_list = [] try: crawler = await self._get_crawler() # 构建列表页URL list_url = f"{self.base_url}/{category}/index.html" logger.info(f"开始爬取人民网新闻: {list_url}") # 使用 Crawl4AI 爬取页面 result = await crawler.arun(url=list_url) if not result.success: logger.error(f"爬取失败: {list_url}") return news_list html_content = result.html if not html_content: logger.warning("未获取到页面内容") return news_list # 解析新闻列表 from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, 'lxml') # 根据人民网实际结构调整选择器 # 这里使用通用的新闻列表选择器,实际使用时需要根据网站结构调整 news_items = soup.select('.news-list .news-item, .list-item, .news-item, article') if not news_items: # 尝试其他可能的选择器 news_items = soup.select('a[href*="/n1/"], a[href*="/n2/"], a[href*="/n3/"]') logger.info(f"找到 {len(news_items)} 个新闻链接") # 提取新闻链接并爬取详情 processed_urls = set() for item in news_items[:limit * 2]: # 多取一些,因为有些链接可能无效 try: # 提取链接 if item.name == 'a': link_tag = item else: link_tag = item.select_one('a') if not link_tag: continue news_url = link_tag.get('href', '') if not news_url: continue # 处理相对路径 if not news_url.startswith('http'): if news_url.startswith('/'): news_url = self.base_url + news_url else: news_url = f"{self.base_url}/{news_url}" # 跳过重复链接 if news_url in processed_urls: continue processed_urls.add(news_url) # 解析新闻详情 news = await self.parse_news_detail(news_url) if news: news_list.append(news) logger.info(f"成功爬取新闻: {news.title}") if len(news_list) >= limit: break except Exception as e: 
logger.error(f"处理新闻项失败: {str(e)}") continue logger.info(f"爬取完成,共获取 {len(news_list)} 条新闻") except Exception as e: logger.error(f"爬取新闻列表失败: {str(e)}") return news_list async def parse_news_detail(self, url: str) -> Optional[NewsItem]: """ 解析人民网新闻详情 Args: url: 新闻详情页URL Returns: 新闻对象 """ try: crawler = await self._get_crawler() # 使用 Crawl4AI 爬取详情页 result = await crawler.arun(url=url) if not result.success: logger.warning(f"爬取详情页失败: {url}") return None html_content = result.html if not html_content: return None from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, 'lxml') # 提取标题 title_tag = soup.select_one('h1, .title, .article-title, .p_title') title = title_tag.get_text(strip=True) if title_tag else "未知标题" if title == "未知标题" or not title: logger.warning(f"无法提取标题: {url}") return None # 提取内容 content_tag = soup.select_one( '.article-content, .content, .text-content, .p_content, ' '.article-body, .article-text, #articleContent' ) content = "" if content_tag: # 移除脚本和样式标签 for script in content_tag(['script', 'style', 'noscript']): script.decompose() content = content_tag.get_text(separator='\n', strip=True) # 提取发布时间 time_tag = soup.select_one( '.time, .publish-time, .date, .p_time, .article-time, ' 'time[datetime], .pubtime' ) publish_time = None if time_tag: time_text = time_tag.get_text(strip=True) if not time_text: time_text = time_tag.get('datetime', '') publish_time = self._parse_time(time_text) # 提取作者 author_tag = soup.select_one('.author, .writer, .p_author, .article-author') author = None if author_tag: author = author_tag.get_text(strip=True) # 提取图片 images = [] img_tags = soup.select('.article-content img, .content img, .p_content img') for img in img_tags: img_url = img.get('src', '') or img.get('data-src', '') if img_url and not img_url.startswith('data:'): if not img_url.startswith('http'): if img_url.startswith('/'): img_url = self.base_url + img_url else: img_url = f"{self.base_url}/{img_url}" images.append(img_url) # 创建新闻对象 news = NewsItem( title=title, content=content, url=url, publish_time=publish_time, author=author, source="人民网", category="时政", images=images ) return news except Exception as e: logger.error(f"解析新闻详情失败 [{url}]: {str(e)}") return None def _parse_time(self, time_text: str) -> Optional[str]: """ 解析时间字符串 Args: time_text: 时间文本 Returns: 标准化的时间字符串 """ if not time_text: return None try: # 尝试匹配常见的时间格式 patterns = [ r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})', r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})', r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})', r'(\d{4})-(\d{2})-(\d{2})', ] for pattern in patterns: match = re.search(pattern, time_text) if match: return time_text.strip() return time_text.strip() except Exception as e: logger.warning(f"时间解析失败: {str(e)}") return None async def close(self): """关闭爬虫,释放资源""" if self.crawler: await self.crawler.close() self.crawler = None logger.info("爬虫已关闭")