178 lines
7.9 KiB
Plaintext
178 lines
7.9 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# 爬虫测试 Notebook\n",
|
||
"用于测试人民日报爬虫 `RmrbCrawler`:初始化爬虫、解析单篇新闻详情页(`parse_news_detail`)并关闭会话。\n",
|
||
"\n",
|
||
"## 1. 导入依赖"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"项目根目录: f:\\Project\\schoolNews\\schoolNewsCrawler\n",
|
||
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 自动重载模块(当文件修改后自动刷新)\n",
|
||
"%reload_ext autoreload\n",
|
||
"%autoreload 2\n",
|
||
"\n",
|
||
"import sys\n",
|
||
"import os\n",
|
||
"\n",
|
||
"# 先添加项目根目录到路径(必须在导入之前)\n",
|
||
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
|
||
"if project_root not in sys.path:\n",
|
||
" sys.path.insert(0, project_root)\n",
|
||
"\n",
|
||
"# 然后再导入模块\n",
|
||
"from crawler.RmrbCrawler import RmrbCrawler\n",
|
||
"from crawler.BaseCrawler import NewsItem\n",
|
||
"from loguru import logger\n",
|
||
"import json\n",
|
||
"from pprint import pprint\n",
|
||
"\n",
|
||
"print(f\"项目根目录: {project_root}\")\n",
|
||
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 2. 初始化爬虫、解析新闻详情并关闭会话"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-12 19:16:32.990\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m69\u001b[0m - \u001b[1m初始化爬虫: RmrbCrawler\u001b[0m\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"爬虫初始化成功!\n",
|
||
"基础URL: http://www.people.com.cn\n",
|
||
"URLS: {'search': UrlConfig(url='http://search.people.cn/search-platform/front/search', params={'key': '', 'page': 1, 'limit': 10, 'hasTitle': True, 'hasContent': True, 'isFuzzy': True, 'type': 0, 'sortType': 2, 'startTime': 0, 'endTime': 0}, method='POST', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Content-Type': 'application/json;charset=UTF-8'}), 'hot_point_rank': UrlConfig(url='http://search.people.cn/search-platform/front/searchRank', params={}, method='GET', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Content-Type': 'application/json;charset=UTF-8'}), 'one_day_trending_news': UrlConfig(url='http://www.people.com.cn/GB/59476/review/{date}.html', params={}, method='GET', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'zh-CN,zh;q=0.9'})}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 创建爬虫实例\n",
|
||
"crawler = RmrbCrawler()\n",
|
||
"print(\"爬虫初始化成功!\")\n",
|
||
"print(f\"基础URL: {crawler.config.base_url}\")\n",
|
||
"print(f\"URLS: {crawler.config.urls}\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-12 19:16:33.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36mfetch\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1m请求URL: https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html (尝试 1/3)\u001b[0m\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"cpc\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-12 19:16:33.152\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.RmrbCrawler\u001b[0m:\u001b[36mparse_cpc_news_detail\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1m成功解析新闻: 时习之丨“柚”见乡村好风光\u001b[0m\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"NewsItem(title='时习之丨“柚”见乡村好风光', contentRows=[{'tag': 'video', 'content': \"<video style='text-align: center;' poster='https://video.people.cn/userUpload/1739759454736028/1743494135227389/picture/e2cd514c-2af8-4b2c-b35a-c70606de0fff.png' controls src='https://video.people.cn/upload/vod/user1739759454736028/1762614745429386/origin.mp4'></video>\"}, {'tag': 'p', 'content': '<p>\\u3000\\u30002025年11月7日,习近平总书记在广东省梅州市梅县区雁洋镇考察南福金柚种植基地,了解当地推进乡村全面振兴等情况。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000乡村全面振兴是总书记地方考察中关切的重要主题之一。“乡村振兴要靠产业,各地要各展其长,走适合自己的振兴道路”,他多次走进田间地头,为特色产业谋思路、为农民致富找门路。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000一个个“土特产”正变身成乡亲增收致富的大产业!</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000<span style=\"font-family: 楷体;\">制作:彭静</span></p>'}, {'tag': 'p', 'content': '<p><span style=\"font-family: 楷体;\">\\u3000\\u3000素材来源:人民日报、新华社、央视网等</span></p>'}], url='https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html', publishTime='', author='', source='人民网', category='', executeStatus=1, executeMessage='成功解析新闻')"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"crawler.parse_news_detail(\"https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-12 19:16:33.254\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36mclose\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m关闭爬虫: RmrbCrawler\u001b[0m\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"爬虫会话已关闭\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 关闭爬虫会话\n",
|
||
"crawler.close()\n",
|
||
"print(\"爬虫会话已关闭\")\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "schoolNewsCrawler",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|