schoolNews/schoolNewsCrawler/crawl4AI/PeopleNetCrewer.py
"""
使用 Crawl4AI 爬取人民网新闻
"""
from typing import List, Optional
from loguru import logger
import re
from datetime import datetime
from crawler.BaseCrawler import NewsItem
import asyncio

class PeopleNetCrewer:
    """Crawl People's Daily Online (人民网) news with Crawl4AI."""

    def __init__(self, chrome_path: Optional[str] = None):
        """
        Initialize the People's Daily Online crawler.

        Args:
            chrome_path: Path to the Chrome executable. If None, the locally
                installed Chrome is used.
        """
        self.base_url = "http://www.people.com.cn"
        self.chrome_path = chrome_path
        self.crawler = None
        logger.info("Initializing People's Daily Online crawler (Crawl4AI)")

    async def _get_crawler(self):
        """Get or lazily create the AsyncWebCrawler instance."""
        if self.crawler is None:
            try:
                from crawl4ai import AsyncWebCrawler
            except ImportError:
                logger.error("Please install crawl4ai first: pip install crawl4ai")
                raise ImportError("crawl4ai is not installed")
            # Browser options: run the local Chrome headlessly.
            browser_config = {
                "headless": True,
                "verbose": False,
            }
            if self.chrome_path:
                # Use the explicitly configured Chrome executable.
                browser_config["executable_path"] = self.chrome_path
                logger.info(f"Using the configured Chrome path: {self.chrome_path}")
            else:
                # Use the locally installed Chrome (via the "channel" option).
                browser_config["channel"] = "chrome"
                logger.info("Using the locally installed Chrome browser")
            try:
                self.crawler = AsyncWebCrawler(
                    browser_type="chromium",
                    browser_config=browser_config
                )
            except Exception as e:
                logger.error(f"Failed to create crawler instance: {str(e)}")
                # If the "channel" option failed, fall back to probing common Chrome paths.
                if not self.chrome_path and "channel" in browser_config:
                    logger.warning("The channel option failed, trying other approaches...")
                    import os
                    common_paths = [
                        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                        os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
                    ]
                    for path in common_paths:
                        if os.path.exists(path):
                            browser_config.pop("channel", None)
                            browser_config["executable_path"] = path
                            logger.info(f"Found Chrome at: {path}")
                            try:
                                self.crawler = AsyncWebCrawler(
                                    browser_type="chromium",
                                    browser_config=browser_config
                                )
                                break
                            except Exception:
                                continue
                    else:
                        # No usable Chrome installation was found; re-raise the original error.
                        raise
                else:
                    raise
        return self.crawler

    async def crawl(self, category: str = "politics", limit: int = 20) -> List[NewsItem]:
        """
        Crawl the People's Daily Online news list.

        Args:
            category: News category (politics, society, world, etc.).
            limit: Maximum number of news items to fetch.

        Returns:
            List of news items.
        """
        news_list = []
        try:
            crawler = await self._get_crawler()
            # Build the listing page URL.
            list_url = f"{self.base_url}/{category}/index.html"
            logger.info(f"Start crawling People's Daily Online news: {list_url}")
            # Fetch the listing page with Crawl4AI.
            result = await crawler.arun(url=list_url)
            if not result.success:
                logger.error(f"Failed to crawl: {list_url}")
                return news_list
            html_content = result.html
            if not html_content:
                logger.warning("No page content was retrieved")
                return news_list
            # Parse the news list.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')
            # Generic news-list selectors; adjust them to the site's actual structure.
            news_items = soup.select('.news-list .news-item, .list-item, .news-item, article')
            if not news_items:
                # Fall back to other likely selectors (article URLs such as /n1/, /n2/, /n3/).
                news_items = soup.select('a[href*="/n1/"], a[href*="/n2/"], a[href*="/n3/"]')
            logger.info(f"Found {len(news_items)} news links")
            # Extract links and crawl each article page.
            processed_urls = set()
            for item in news_items[:limit * 2]:  # Take extra items because some links may be invalid.
                try:
                    # Extract the link.
                    if item.name == 'a':
                        link_tag = item
                    else:
                        link_tag = item.select_one('a')
                    if not link_tag:
                        continue
                    news_url = link_tag.get('href', '')
                    if not news_url:
                        continue
                    # Resolve relative URLs.
                    if not news_url.startswith('http'):
                        if news_url.startswith('/'):
                            news_url = self.base_url + news_url
                        else:
                            news_url = f"{self.base_url}/{news_url}"
                    # Skip duplicate links.
                    if news_url in processed_urls:
                        continue
                    processed_urls.add(news_url)
                    # Parse the article page.
                    news = await self.parse_news_detail(news_url)
                    if news:
                        news_list.append(news)
                        logger.info(f"Crawled news item: {news.title}")
                    if len(news_list) >= limit:
                        break
                except Exception as e:
                    logger.error(f"Failed to process news item: {str(e)}")
                    continue
            logger.info(f"Crawling finished, {len(news_list)} news items collected")
        except Exception as e:
            logger.error(f"Failed to crawl the news list: {str(e)}")
        return news_list

    async def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a People's Daily Online article page.

        Args:
            url: URL of the article page.

        Returns:
            A news item, or None on failure.
        """
        try:
            crawler = await self._get_crawler()
            # Fetch the article page with Crawl4AI.
            result = await crawler.arun(url=url)
            if not result.success:
                logger.warning(f"Failed to crawl article page: {url}")
                return None
            html_content = result.html
            if not html_content:
                return None
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')
            # Extract the title.
            title_tag = soup.select_one('h1, .title, .article-title, .p_title')
            title = title_tag.get_text(strip=True) if title_tag else ""
            if not title:
                logger.warning(f"Could not extract a title: {url}")
                return None
            # Extract the article body.
            content_tag = soup.select_one(
                '.article-content, .content, .text-content, .p_content, '
                '.article-body, .article-text, #articleContent'
            )
            content = ""
            if content_tag:
                # Remove script and style tags.
                for script in content_tag(['script', 'style', 'noscript']):
                    script.decompose()
                content = content_tag.get_text(separator='\n', strip=True)
            # Extract the publish time.
            time_tag = soup.select_one(
                '.time, .publish-time, .date, .p_time, .article-time, '
                'time[datetime], .pubtime'
            )
            publish_time = None
            if time_tag:
                time_text = time_tag.get_text(strip=True)
                if not time_text:
                    time_text = time_tag.get('datetime', '')
                publish_time = self._parse_time(time_text)
            # Extract the author.
            author_tag = soup.select_one('.author, .writer, .p_author, .article-author')
            author = None
            if author_tag:
                author = author_tag.get_text(strip=True)
            # Extract images.
            images = []
            img_tags = soup.select('.article-content img, .content img, .p_content img')
            for img in img_tags:
                img_url = img.get('src', '') or img.get('data-src', '')
                if img_url and not img_url.startswith('data:'):
                    if not img_url.startswith('http'):
                        if img_url.startswith('/'):
                            img_url = self.base_url + img_url
                        else:
                            img_url = f"{self.base_url}/{img_url}"
                    images.append(img_url)
            # Build the news item.
            news = NewsItem(
                title=title,
                content=content,
                url=url,
                publish_time=publish_time,
                author=author,
                source="人民网",
                category="时政",
                images=images
            )
            return news
        except Exception as e:
            logger.error(f"Failed to parse news detail [{url}]: {str(e)}")
            return None

    def _parse_time(self, time_text: str) -> Optional[str]:
        """
        Parse a time string.

        Args:
            time_text: Raw time text.

        Returns:
            A cleaned-up time string, or None.
        """
        if not time_text:
            return None
        try:
            # Common time formats seen on the site.
            patterns = [
                r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
                r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
                r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
                r'(\d{4})-(\d{2})-(\d{2})',
            ]
            for pattern in patterns:
                match = re.search(pattern, time_text)
                if match:
                    # Return only the matched time portion, dropping surrounding text.
                    return match.group(0)
            return time_text.strip()
        except Exception as e:
            logger.warning(f"Failed to parse time: {str(e)}")
            return None

    async def close(self):
        """Close the crawler and release resources."""
        if self.crawler:
            await self.crawler.close()
            self.crawler = None
            logger.info("Crawler closed")