People's Daily crawler
schoolNewsCrawler/crawl4AI/PeopleNetCrewer.py (new file, 317 lines)
@@ -0,0 +1,317 @@
"""
Crawl People's Daily Online (people.com.cn) news with Crawl4AI.
"""
from typing import List, Optional
from loguru import logger
import re
from datetime import datetime
from crawler.BaseCrawler import NewsItem
import asyncio


class PeopleNetCrewer:
    """Crawl People's Daily Online news with Crawl4AI."""

    def __init__(self, chrome_path: Optional[str] = None):
        """
        Initialize the People's Daily Online crawler.

        Args:
            chrome_path: Path to the Chrome executable; if None, the system default is used.
        """
        self.base_url = "http://www.people.com.cn"
        self.chrome_path = chrome_path
        self.crawler = None
        logger.info("Initializing People's Daily Online crawler (Crawl4AI)")

    async def _get_crawler(self):
        """Get or lazily create the crawler instance."""
        if self.crawler is None:
            try:
                from crawl4ai import AsyncWebCrawler
            except ImportError:
                logger.error("Please install crawl4ai first: pip install crawl4ai")
                raise ImportError("crawl4ai is not installed")

            # Browser options: use the locally installed Chrome
            browser_config = {
                "headless": True,
                "verbose": False,
            }

            # If a Chrome path was given, use it directly
            if self.chrome_path:
                browser_config["executable_path"] = self.chrome_path
                logger.info(f"Using the specified Chrome path: {self.chrome_path}")
            else:
                # Use the locally installed Chrome (via the channel option)
                browser_config["channel"] = "chrome"
                logger.info("Using the locally installed Chrome browser")

            try:
                self.crawler = AsyncWebCrawler(
                    browser_type="chromium",
                    browser_config=browser_config
                )
            except Exception as e:
                logger.error(f"Failed to create the crawler instance: {str(e)}")
                # If the channel option failed, fall back to executable_path
                if not self.chrome_path and "channel" in browser_config:
                    logger.warning("The channel option failed, trying alternatives...")
                    # Probe common Chrome install locations
                    import os
                    common_paths = [
                        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                        os.path.expanduser(r"~\AppData\Local\Google\Chrome\Application\chrome.exe"),
                    ]
                    for path in common_paths:
                        if os.path.exists(path):
                            browser_config.pop("channel", None)
                            browser_config["executable_path"] = path
                            logger.info(f"Found Chrome at: {path}")
                            try:
                                self.crawler = AsyncWebCrawler(
                                    browser_type="chromium",
                                    browser_config=browser_config
                                )
                                break
                            except Exception:
                                continue
                    else:
                        # No usable Chrome found; re-raise the original error
                        raise
                else:
                    raise
        return self.crawler

    async def crawl(self, category: str = "politics", limit: int = 20) -> List[NewsItem]:
        """
        Crawl the People's Daily Online news list.

        Args:
            category: News category (e.g. politics, society, world)
            limit: Maximum number of articles to crawl

        Returns:
            List of news items
        """
        news_list = []

        try:
            crawler = await self._get_crawler()

            # Build the list-page URL
            list_url = f"{self.base_url}/{category}/index.html"
            logger.info(f"Start crawling People's Daily Online news: {list_url}")

            # Fetch the page with Crawl4AI
            result = await crawler.arun(url=list_url)

            if not result.success:
                logger.error(f"Crawl failed: {list_url}")
                return news_list

            html_content = result.html
            if not html_content:
                logger.warning("No page content was returned")
                return news_list

            # Parse the news list
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Generic news-list selectors; adjust them to the actual
            # people.com.cn page structure when deploying.
            news_items = soup.select('.news-list .news-item, .list-item, .news-item, article')

            if not news_items:
                # Try other likely selectors
                news_items = soup.select('a[href*="/n1/"], a[href*="/n2/"], a[href*="/n3/"]')

            logger.info(f"Found {len(news_items)} news links")

            # Extract links and crawl the article details
            processed_urls = set()
            for item in news_items[:limit * 2]:  # take extras because some links may be invalid
                try:
                    # Extract the link
                    if item.name == 'a':
                        link_tag = item
                    else:
                        link_tag = item.select_one('a')

                    if not link_tag:
                        continue

                    news_url = link_tag.get('href', '')
                    if not news_url:
                        continue

                    # Resolve relative paths
                    if not news_url.startswith('http'):
                        if news_url.startswith('/'):
                            news_url = self.base_url + news_url
                        else:
                            news_url = f"{self.base_url}/{news_url}"

                    # Skip duplicate links
                    if news_url in processed_urls:
                        continue
                    processed_urls.add(news_url)

                    # Parse the article detail page
                    news = await self.parse_news_detail(news_url)
                    if news:
                        news_list.append(news)
                        logger.info(f"Crawled article: {news.title}")

                        if len(news_list) >= limit:
                            break

                except Exception as e:
                    logger.error(f"Failed to process news item: {str(e)}")
                    continue

            logger.info(f"Crawl finished, {len(news_list)} articles collected")

        except Exception as e:
            logger.error(f"Failed to crawl the news list: {str(e)}")

        return news_list

    async def parse_news_detail(self, url: str) -> Optional[NewsItem]:
        """
        Parse a People's Daily Online article page.

        Args:
            url: Article detail-page URL

        Returns:
            News item, or None on failure
        """
        try:
            crawler = await self._get_crawler()

            # Fetch the detail page with Crawl4AI
            result = await crawler.arun(url=url)

            if not result.success:
                logger.warning(f"Failed to crawl detail page: {url}")
                return None

            html_content = result.html
            if not html_content:
                return None

            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html_content, 'lxml')

            # Extract the title
            title_tag = soup.select_one('h1, .title, .article-title, .p_title')
            title = title_tag.get_text(strip=True) if title_tag else ""

            if not title:
                logger.warning(f"Could not extract a title: {url}")
                return None

            # Extract the body content
            content_tag = soup.select_one(
                '.article-content, .content, .text-content, .p_content, '
                '.article-body, .article-text, #articleContent'
            )
            content = ""
            if content_tag:
                # Remove script and style tags
                for script in content_tag(['script', 'style', 'noscript']):
                    script.decompose()
                content = content_tag.get_text(separator='\n', strip=True)

            # Extract the publish time
            time_tag = soup.select_one(
                '.time, .publish-time, .date, .p_time, .article-time, '
                'time[datetime], .pubtime'
            )
            publish_time = None
            if time_tag:
                time_text = time_tag.get_text(strip=True)
                if not time_text:
                    time_text = time_tag.get('datetime', '')
                publish_time = self._parse_time(time_text)

            # Extract the author
            author_tag = soup.select_one('.author, .writer, .p_author, .article-author')
            author = None
            if author_tag:
                author = author_tag.get_text(strip=True)

            # Extract images
            images = []
            img_tags = soup.select('.article-content img, .content img, .p_content img')
            for img in img_tags:
                img_url = img.get('src', '') or img.get('data-src', '')
                if img_url and not img_url.startswith('data:'):
                    if not img_url.startswith('http'):
                        if img_url.startswith('/'):
                            img_url = self.base_url + img_url
                        else:
                            img_url = f"{self.base_url}/{img_url}"
                    images.append(img_url)

            # Build the news item
            news = NewsItem(
                title=title,
                content=content,
                url=url,
                publish_time=publish_time,
                author=author,
                source="人民网",
                category="时政",
                images=images
            )

            return news

        except Exception as e:
            logger.error(f"Failed to parse article detail [{url}]: {str(e)}")
            return None

    def _parse_time(self, time_text: str) -> Optional[str]:
        """
        Parse a time string.

        Args:
            time_text: Raw time text

        Returns:
            Normalized time string, or None
        """
        if not time_text:
            return None

        try:
            # Try common time formats
            patterns = [
                r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})',
                r'(\d{4})年(\d{2})月(\d{2})日\s+(\d{2}):(\d{2})',
                r'(\d{4})/(\d{2})/(\d{2})\s+(\d{2}):(\d{2})',
                r'(\d{4})-(\d{2})-(\d{2})',
            ]

            for pattern in patterns:
                match = re.search(pattern, time_text)
                if match:
                    # Return only the matched timestamp, stripped of surrounding text
                    return match.group(0)

            # No known pattern matched; fall back to the raw text
            return time_text.strip()

        except Exception as e:
            logger.warning(f"Failed to parse time: {str(e)}")
            return None

    async def close(self):
        """Close the crawler and release resources."""
        if self.crawler:
            await self.crawler.close()
            self.crawler = None
            logger.info("Crawler closed")
schoolNewsCrawler/crawl4AI/README.md (new file, 77 lines)
@@ -0,0 +1,77 @@
# Crawl4AI People's Daily Online News Crawler

Crawls People's Daily Online (people.com.cn) news with the Crawl4AI framework; a locally installed Chrome browser can be used.

## Installing dependencies

```bash
pip install crawl4ai playwright
playwright install chromium  # or use a locally installed Chrome
```

## Usage

### Basic usage

```bash
# Default configuration (the locally installed Chrome is used automatically)
python crawl4AI/main.py [category] [limit] [output_file]

# Example
python crawl4AI/main.py politics 20 output/news.json
```

### Specifying the Chrome path

```bash
# Point to the Chrome executable explicitly
python crawl4AI/main.py politics 20 output/news.json "C:\Program Files\Google\Chrome\Application\chrome.exe"
```

### Using it from code

```python
import asyncio
from crawl4AI.PeopleNetCrewer import PeopleNetCrewer

async def main():
    # Use the default Chrome (auto-detected)
    crewer = PeopleNetCrewer()

    # Or specify the Chrome path explicitly
    # crewer = PeopleNetCrewer(chrome_path="C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe")

    news_list = await crewer.crawl(category="politics", limit=20)

    for news in news_list:
        print(f"Title: {news.title}")
        print(f"URL: {news.url}")
        print("-" * 50)

    await crewer.close()

if __name__ == "__main__":
    asyncio.run(main())
```

## Configuration

### Using a local Chrome

The code automatically tries to use the locally installed Chrome browser. If `chrome_path` is not given, the system Chrome is selected through the `channel="chrome"` option.
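
Crawl4AI drives the browser through Playwright (hence the `playwright` dependency above), so these two options correspond to Playwright launch arguments. A minimal standalone Playwright sketch, purely illustrative and not part of this project, of what `channel="chrome"` versus `executable_path` mean:

```python
import asyncio
from playwright.async_api import async_playwright

async def demo():
    async with async_playwright() as p:
        # channel="chrome" launches the locally installed Google Chrome
        browser = await p.chromium.launch(channel="chrome", headless=True)
        # Alternatively, point at a specific executable:
        # browser = await p.chromium.launch(
        #     executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe"
        # )
        page = await browser.new_page()
        await page.goto("http://www.people.com.cn")
        print(await page.title())
        await browser.close()

asyncio.run(demo())
```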

### Browser configuration

In the `PeopleNetCrewer` class, browser behavior can be adjusted by editing `browser_config` in the `_get_crawler` method (a sketch of the resulting dict follows this list):

- `headless`: whether to run headless (default True)
- `verbose`: whether to print detailed logs (default False)
- `channel`: browser channel ("chrome" means the locally installed Chrome)
- `executable_path`: explicit path to the browser executable
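
For reference, a dict with these keys, mirroring how `_get_crawler` assembles its defaults (a sketch, not an exhaustive list of Crawl4AI options):

```python
# Mirrors the defaults assembled in PeopleNetCrewer._get_crawler()
browser_config = {
    "headless": True,      # run the browser without a visible window
    "verbose": False,      # keep Crawl4AI's browser logging quiet
    "channel": "chrome",   # use the locally installed Google Chrome
    # "executable_path": r"C:\Program Files\Google\Chrome\Application\chrome.exe",
}
```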

## Notes

1. Make sure the Chrome browser is installed.
2. If you hit a "Playwright browser not found" error, run `playwright install chromium` to install the browser bundled with Playwright.
3. When using a local Chrome, make sure its version is compatible with Playwright.
schoolNewsCrawler/crawl4AI/__init__.py (new file, 9 lines)
@@ -0,0 +1,9 @@
"""
Crawl4AI crawler package.
Dynamic web crawling with Crawl4AI.
"""

from .PeopleNetCrewer import PeopleNetCrewer

__all__ = ['PeopleNetCrewer']
schoolNewsCrawler/crawl4AI/main.py (new file, 147 lines)
@@ -0,0 +1,147 @@
"""
Entry point for crawling People's Daily Online news with Crawl4AI.
"""
import sys
import json
import asyncio
from typing import List, Optional
from loguru import logger
from crawl4AI.PeopleNetCrewer import PeopleNetCrewer


# Logging configuration
logger.remove()  # remove the default handler
logger.add(
    sys.stdout,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan> - <level>{message}</level>",
    level="INFO"
)
logger.add(
    "logs/crewer_{time:YYYY-MM-DD}.log",
    rotation="00:00",
    retention="30 days",
    encoding="utf-8",
    level="DEBUG"
)


async def crawl_people_net_news(
    category: str = "politics",
    limit: int = 20,
    chrome_path: Optional[str] = None
) -> List[dict]:
    """
    Crawl People's Daily Online news with Crawl4AI.

    Args:
        category: News category
        limit: Number of articles to crawl
        chrome_path: Path to the Chrome executable (optional)

    Returns:
        List of news items as dicts
    """
    logger.info(f"Start crawling People's Daily Online news - category: {category}, limit: {limit}")

    crewer = None
    try:
        crewer = PeopleNetCrewer(chrome_path=chrome_path)
        news_list = await crewer.crawl(category=category, limit=limit)

        # Convert to a list of dicts
        result = [news.model_dump() for news in news_list]

        logger.info(f"Crawl finished, {len(result)} articles collected")
        return result

    except Exception as e:
        logger.error(f"Crawl failed: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return []

    finally:
        if crewer:
            await crewer.close()


def save_to_json(news_list: List[dict], output_file: str = "output/news.json"):
    """
    Save the news items to a JSON file.

    Args:
        news_list: List of news dicts
        output_file: Output file path
    """
    try:
        import os
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(news_list, f, ensure_ascii=False, indent=2)

        logger.info(f"News saved to: {output_file}")

    except Exception as e:
        logger.error(f"Failed to save file: {str(e)}")


async def main_async():
    """Async entry point."""
    # Parse command-line arguments
    category = "politics"
    limit = 20
    output_file = "output/news.json"
    chrome_path = None

    if len(sys.argv) > 1:
        category = sys.argv[1]
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])
    if len(sys.argv) > 3:
        output_file = sys.argv[3]
    if len(sys.argv) > 4:
        chrome_path = sys.argv[4]

    logger.info("=" * 60)
    logger.info("People's Daily Online news crawler starting (Crawl4AI)")
    logger.info("=" * 60)

    # Crawl the news
    news_list = await crawl_people_net_news(
        category=category,
        limit=limit,
        chrome_path=chrome_path
    )

    # Save the results
    if news_list:
        save_to_json(news_list, output_file)

        # Print statistics
        logger.info("Crawl statistics:")
        logger.info(f"  - succeeded: {len(news_list)}")
        logger.info(f"  - failed: {limit - len(news_list)}")
    else:
        logger.warning("No news was collected")

    logger.info("=" * 60)
    logger.info("People's Daily Online news crawler finished")
    logger.info("=" * 60)


def main():
    """Main entry point."""
    try:
        asyncio.run(main_async())
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
    except Exception as e:
        logger.error(f"Program error: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    main()