Files
schoolNews/schoolNewsCrawler/crawler/rmrb/test_crawler.ipynb
2025-11-19 19:05:31 +08:00

178 lines
7.9 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 爬虫测试 Notebook\n",
"用于测试人民日报爬虫功能\n",
"\n",
"## 1. 导入依赖"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\\schoolNewsCrawler\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# Reload edited .py modules automatically so changes apply without restarting the kernel\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import json\n",
"import os\n",
"import sys\n",
"from pprint import pprint\n",
"\n",
"\n",
"def _find_project_root(start_dir, marker=os.path.join('crawler', 'rmrb')):\n",
"    \"\"\"Return the closest directory at or above start_dir that contains marker.\n",
"\n",
"    Args:\n",
"        start_dir: Directory where the upward search begins.\n",
"        marker: Relative path of a sub-directory that identifies the project root.\n",
"\n",
"    Returns:\n",
"        Absolute path of the first matching directory.\n",
"\n",
"    Raises:\n",
"        RuntimeError: If no directory up to the filesystem root contains marker.\n",
"    \"\"\"\n",
"    candidate = os.path.abspath(start_dir)\n",
"    while not os.path.isdir(os.path.join(candidate, marker)):\n",
"        parent_dir = os.path.dirname(candidate)\n",
"        if parent_dir == candidate:  # filesystem root reached without a match\n",
"            raise RuntimeError(f\"未找到包含 {marker} 的项目根目录 (起点: {start_dir})\")\n",
"        candidate = parent_dir\n",
"    return candidate\n",
"\n",
"\n",
"# The project root must be on sys.path before the project imports below.\n",
"# A fixed '..' from the working directory only hits the root when the kernel starts\n",
"# exactly one level below it, so search upward for the crawler/rmrb package instead.\n",
"project_root = _find_project_root(os.getcwd())\n",
"if project_root not in sys.path:\n",
"    sys.path.insert(0, project_root)\n",
"\n",
"# Project and third-party imports (resolved after the path setup above)\n",
"from crawler.rmrb.RmrbCrawler import RmrbCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"from loguru import logger\n",
"\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 初始化爬虫"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-12 19:16:32.990\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m69\u001b[0m - \u001b[1m初始化爬虫: RmrbCrawler\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"爬虫初始化成功!\n",
"基础URL: http://www.people.com.cn\n",
"URLS: {'search': UrlConfig(url='http://search.people.cn/search-platform/front/search', params={'key': '', 'page': 1, 'limit': 10, 'hasTitle': True, 'hasContent': True, 'isFuzzy': True, 'type': 0, 'sortType': 2, 'startTime': 0, 'endTime': 0}, method='POST', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Content-Type': 'application/json;charset=UTF-8'}), 'hot_point_rank': UrlConfig(url='http://search.people.cn/search-platform/front/searchRank', params={}, method='GET', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Content-Type': 'application/json;charset=UTF-8'}), 'one_day_trending_news': UrlConfig(url='http://www.people.com.cn/GB/59476/review/{date}.html', params={}, method='GET', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'zh-CN,zh;q=0.9'})}\n"
]
}
],
"source": [
"# Instantiate the People's Daily crawler; the name `crawler` is reused by the cells below\n",
"crawler = RmrbCrawler()\n",
"print(\"爬虫初始化成功!\")\n",
"\n",
"# Show the base URL and the URL table held by the crawler's config, one per line\n",
"print(\n",
"    f\"基础URL: {crawler.config.base_url}\",\n",
"    f\"URLS: {crawler.config.urls}\",\n",
"    sep=\"\\n\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-12 19:16:33.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36mfetch\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1m请求URL: https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html (尝试 1/3)\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"cpc\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-12 19:16:33.152\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.RmrbCrawler\u001b[0m:\u001b[36mparse_cpc_news_detail\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1m成功解析新闻: 时习之丨“柚”见乡村好风光\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"NewsItem(title='时习之丨“柚”见乡村好风光', contentRows=[{'tag': 'video', 'content': \"<video style='text-align: center;' poster='https://video.people.cn/userUpload/1739759454736028/1743494135227389/picture/e2cd514c-2af8-4b2c-b35a-c70606de0fff.png' controls src='https://video.people.cn/upload/vod/user1739759454736028/1762614745429386/origin.mp4'></video>\"}, {'tag': 'p', 'content': '<p>\\u3000\\u30002025年11月7日习近平总书记在广东省梅州市梅县区雁洋镇考察南福金柚种植基地了解当地推进乡村全面振兴等情况。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000乡村全面振兴是总书记地方考察中关切的重要主题之一。“乡村振兴要靠产业各地要各展其长走适合自己的振兴道路”他多次走进田间地头为特色产业谋思路、为农民致富找门路。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000一个个“土特产”正变身成乡亲增收致富的大产业</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000<span style=\"font-family: 楷体;\">制作:彭静</span></p>'}, {'tag': 'p', 'content': '<p><span style=\"font-family: 楷体;\">\\u3000\\u3000素材来源人民日报、新华社、央视网等</span></p>'}], url='https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html', publishTime='', author='', source='人民网', category='', executeStatus=1, executeMessage='成功解析新闻')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fetch and parse a single article page; the parsed result is displayed as the cell output\n",
"crawler.parse_news_detail(\n",
"    \"https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-12 19:16:33.254\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36mclose\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m关闭爬虫: RmrbCrawler\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"爬虫会话已关闭\n"
]
}
],
"source": [
"crawler.close()  # close the crawler session once testing is finished\n",
"print(\"爬虫会话已关闭\")  # confirmation message on stdout\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}