schoolNews/schoolNewsCrawler/crawl4AI/PeopleNetCrewer.py

"""
使用 Crawl4AI 爬取人民网新闻
"""
from typing import List, Optional
from loguru import logger
import re
from datetime import datetime
from crawler.BaseCrawler import NewsItem
import asyncio
class PeopleNetCrewer:
    """Crawl People's Daily Online news using Crawl4AI."""

    def __init__(self, chrome_path: Optional[str] = None):
        """
        Initialize the People's Daily Online crawler.

        Args:
            chrome_path: Path to the Chrome executable; if None, the locally
                installed default Chrome is used.
        """
        self.base_url = "http://www.people.com.cn"
        self.chrome_path = chrome_path
        self.crawler = None
        logger.info("Initialized People's Daily Online crawler (Crawl4AI)")

    async def _get_crawler(self):
        """Return the crawler instance, creating it on first use."""
        if self.crawler is None:
            try:
                from crawl4ai import AsyncWebCrawler
            except ImportError:
                logger.error("Please install crawl4ai first: pip install crawl4ai")
                raise ImportError("crawl4ai is not installed")

            # Browser options: run the locally installed Chrome headlessly.
            browser_config = {
                "headless": True,
                "verbose": False,
            }
            if self.chrome_path:
                # An explicit Chrome path was supplied; use it directly.
                browser_config["executable_path"] = self.chrome_path
                logger.info(f"Using the specified Chrome path: {self.chrome_path}")
            else:
                # Use the locally installed Chrome via the channel option.
                browser_config["channel"] = "chrome"
                logger.info("Using the locally installed Chrome browser")

            try:
                self.crawler = AsyncWebCrawler(
                    browser_type="chromium",
                    browser_config=browser_config
                )
            except Exception as e:
                logger.error(f"Failed to create crawler instance: {str(e)}")
                # If the channel option failed, retry with an explicit executable path.
                if not self.chrome_path and "channel" in browser_config:
                    logger.warning("The channel option failed, trying other approaches...")
                    # Probe common Chrome installation paths on Windows.
                    import os
                    common_paths = [
                        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                        os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
                    ]
                    for path in common_paths:
                        if os.path.exists(path):
                            browser_config.pop("channel", None)
                            browser_config["executable_path"] = path
                            logger.info(f"Found Chrome at: {path}")
                            try:
                                self.crawler = AsyncWebCrawler(
                                    browser_type="chromium",
                                    browser_config=browser_config
                                )
                                break
                            except Exception:
                                continue
                    else:
                        # No usable Chrome found: re-raise the original error.
                        raise
                else:
                    raise
        return self.crawler

    async def crawl(self, category: str = "politics", limit: int = 20) -> List[NewsItem]:
        """
        Crawl the People's Daily Online news list.

        Args:
            category: News category (e.g. politics, society, world)
            limit: Maximum number of news items to fetch

        Returns:
            List of news items
        """
        news_list = []
        try:
            crawler = await self._get_crawler()

            # Build the list-page URL.
            list_url = f"{self.base_url}/{category}/index.html"
            logger.info(f"Start crawling People's Daily Online news: {list_url}")

            # Fetch the list page with Crawl4AI.
            result = await crawler.arun(url=list_url)
            if not result.success:
                logger.error(f"Crawl failed: {list_url}")
                return news_list

            html_content = result.html
            if not html_content:
                logger.warning("No page content retrieved")
                return news_list

            # Parse the news list.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Generic news-list selectors; adjust to the site's actual structure.
            news_items = soup.select('.news-list .news-item, .list-item, .news-item, article')
            if not news_items:
                # Fall back to the article-link patterns used by people.com.cn.
                news_items = soup.select('a[href*="/n1/"], a[href*="/n2/"], a[href*="/n3/"]')
            logger.info(f"Found {len(news_items)} news links")

            # Extract the links and crawl each detail page.
            processed_urls = set()
            for item in news_items[:limit * 2]:  # Take extra items; some links may be invalid
                try:
                    # Locate the link tag.
                    if item.name == 'a':
                        link_tag = item
                    else:
                        link_tag = item.select_one('a')
                    if not link_tag:
                        continue

                    news_url = link_tag.get('href', '')
                    if not news_url:
                        continue

                    # Resolve relative URLs against the site root.
                    if not news_url.startswith('http'):
                        if news_url.startswith('/'):
                            news_url = self.base_url + news_url
                        else:
                            news_url = f"{self.base_url}/{news_url}"

                    # Skip duplicate links.
                    if news_url in processed_urls:
                        continue
                    processed_urls.add(news_url)

                    # Parse the article detail page.
                    news = await self.parse_news_detail(news_url)
                    if news:
                        news_list.append(news)
                        logger.info(f"Crawled news item: {news.title}")
                    if len(news_list) >= limit:
                        break
                except Exception as e:
                    logger.error(f"Failed to process news item: {str(e)}")
                    continue

            logger.info(f"Crawl finished, {len(news_list)} news items fetched")
        except Exception as e:
            logger.error(f"Failed to crawl the news list: {str(e)}")
        return news_list

    async def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a People's Daily Online article page.

        Args:
            url: URL of the article detail page

        Returns:
            A news item, or None if parsing fails
        """
        try:
            crawler = await self._get_crawler()

            # Fetch the detail page with Crawl4AI.
            result = await crawler.arun(url=url)
            if not result.success:
                logger.warning(f"Failed to crawl detail page: {url}")
                return None

            html_content = result.html
            if not html_content:
                return None

            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract the title.
            title_tag = soup.select_one('h1, .title, .article-title, .p_title')
            title = title_tag.get_text(strip=True) if title_tag else ""
            if not title:
                logger.warning(f"Could not extract a title: {url}")
                return None

            # Extract the body text.
            content_tag = soup.select_one(
                '.article-content, .content, .text-content, .p_content, '
                '.article-body, .article-text, #articleContent'
            )
            content = ""
            if content_tag:
                # Drop script and style tags before extracting text.
                for script in content_tag(['script', 'style', 'noscript']):
                    script.decompose()
                content = content_tag.get_text(separator='\n', strip=True)

            # Extract the publish time.
            time_tag = soup.select_one(
                '.time, .publish-time, .date, .p_time, .article-time, '
                'time[datetime], .pubtime'
            )
            publish_time = None
            if time_tag:
                time_text = time_tag.get_text(strip=True)
                if not time_text:
                    time_text = time_tag.get('datetime', '')
                publish_time = self._parse_time(time_text)

            # Extract the author.
            author_tag = soup.select_one('.author, .writer, .p_author, .article-author')
            author = None
            if author_tag:
                author = author_tag.get_text(strip=True)

            # Extract images and resolve relative URLs.
            images = []
            img_tags = soup.select('.article-content img, .content img, .p_content img')
            for img in img_tags:
                img_url = img.get('src', '') or img.get('data-src', '')
                if img_url and not img_url.startswith('data:'):
                    if not img_url.startswith('http'):
                        if img_url.startswith('/'):
                            img_url = self.base_url + img_url
                        else:
                            img_url = f"{self.base_url}/{img_url}"
                    images.append(img_url)

            # Build the news item.
            news = NewsItem(
                title=title,
                content=content,
                url=url,
                publish_time=publish_time,
                author=author,
                source="人民网",
                category="时政",
                images=images
            )
            return news
        except Exception as e:
            logger.error(f"Failed to parse news detail [{url}]: {str(e)}")
            return None

    def _parse_time(self, time_text: str) -> Optional[str]:
        """
        Parse a raw timestamp string into a normalized form.

        Args:
            time_text: Raw time text from the page

        Returns:
            A time string normalized to "YYYY-MM-DD HH:MM:SS" when a known
            pattern matches, otherwise the stripped original text
        """
        if not time_text:
            return None
        try:
            # Common timestamp formats on people.com.cn pages, paired with
            # the strptime format used to normalize them.
            patterns = [
                (r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}', '%Y-%m-%d %H:%M:%S'),
                (r'\d{4}年\d{2}月\d{2}日\s+\d{2}:\d{2}', '%Y年%m月%d日 %H:%M'),
                (r'\d{4}/\d{2}/\d{2}\s+\d{2}:\d{2}', '%Y/%m/%d %H:%M'),
                (r'\d{4}-\d{2}-\d{2}', '%Y-%m-%d'),
            ]
            for pattern, fmt in patterns:
                match = re.search(pattern, time_text)
                if match:
                    parsed = datetime.strptime(match.group(0), fmt)
                    return parsed.strftime('%Y-%m-%d %H:%M:%S')
            # No known pattern matched; fall back to the raw text.
            return time_text.strip()
        except Exception as e:
            logger.warning(f"Failed to parse time: {str(e)}")
            return None

    async def close(self):
        """Close the crawler and release its resources."""
        if self.crawler:
            await self.crawler.close()
            self.crawler = None
            logger.info("Crawler closed")