{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 爬虫测试 Notebook\n", "用于测试人民日报爬虫功能\n", "\n", "## 1. 导入依赖" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "项目根目录: f:\\Project\\schoolNews\\schoolNewsCrawler\n", "✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n" ] } ], "source": [ "# Turn on autoreload so edits to project .py files take effect without a kernel restart\n", "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import json\n", "import os\n", "import sys\n", "from pprint import pprint\n", "\n", "# Put the parent of the working directory on sys.path first;\n", "# the project imports further down are placed after this step on purpose\n", "project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n", "if project_root not in sys.path:\n", "    sys.path.insert(0, project_root)\n", "\n", "# Project and third-party imports\n", "from crawler.rmrb.RmrbCrawler import RmrbCrawler\n", "from crawler.BaseCrawler import NewsItem\n", "from loguru import logger\n", "\n", "print(f\"项目根目录: {project_root}\")\n", "print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. 
初始化爬虫" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-12 19:16:32.990\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m69\u001b[0m - \u001b[1m初始化爬虫: RmrbCrawler\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "爬虫初始化成功!\n", "基础URL: http://www.people.com.cn\n", "URLS: {'search': UrlConfig(url='http://search.people.cn/search-platform/front/search', params={'key': '', 'page': 1, 'limit': 10, 'hasTitle': True, 'hasContent': True, 'isFuzzy': True, 'type': 0, 'sortType': 2, 'startTime': 0, 'endTime': 0}, method='POST', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Content-Type': 'application/json;charset=UTF-8'}), 'hot_point_rank': UrlConfig(url='http://search.people.cn/search-platform/front/searchRank', params={}, method='GET', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'zh-CN,zh;q=0.9', 'Content-Type': 'application/json;charset=UTF-8'}), 'one_day_trending_news': UrlConfig(url='http://www.people.com.cn/GB/59476/review/{date}.html', params={}, method='GET', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'zh-CN,zh;q=0.9'})}\n" ] } ], "source": [ "# Build the crawler under test and echo the configuration it starts with\n", "crawler = RmrbCrawler()\n", "print(\"爬虫初始化成功!\")\n", "base_url = crawler.config.base_url\n", "print(f\"基础URL: {base_url}\")\n", "print(f\"URLS: 
{crawler.config.urls}\")\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-12 19:16:33.072\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36mfetch\u001b[0m:\u001b[36m87\u001b[0m - \u001b[1m请求URL: https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html (尝试 1/3)\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "cpc\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-12 19:16:33.152\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.RmrbCrawler\u001b[0m:\u001b[36mparse_cpc_news_detail\u001b[0m:\u001b[36m129\u001b[0m - \u001b[1m成功解析新闻: 时习之丨“柚”见乡村好风光\u001b[0m\n" ] }, { "data": { "text/plain": [ "NewsItem(title='时习之丨“柚”见乡村好风光', contentRows=[{'tag': 'video', 'content': \"\"}, {'tag': 'p', 'content': '

\\u3000\\u30002025年11月7日,习近平总书记在广东省梅州市梅县区雁洋镇考察南福金柚种植基地,了解当地推进乡村全面振兴等情况。

'}, {'tag': 'p', 'content': '

\\u3000\\u3000乡村全面振兴是总书记地方考察中关切的重要主题之一。“乡村振兴要靠产业,各地要各展其长,走适合自己的振兴道路”,他多次走进田间地头,为特色产业谋思路、为农民致富找门路。

'}, {'tag': 'p', 'content': '

\\u3000\\u3000一个个“土特产”正变身成乡亲增收致富的大产业!

'}, {'tag': 'p', 'content': '

\\u3000\\u3000制作:彭静

'}, {'tag': 'p', 'content': '

\\u3000\\u3000素材来源:人民日报、新华社、央视网等

'}], url='https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html', publishTime='', author='', source='人民网', category='', executeStatus=1, executeMessage='成功解析新闻')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Parse one article from the cpc.people.com.cn channel; the bare call on the\n", "# last line lets the notebook render the returned NewsItem\n", "cpc_article_url = \"https://cpc.people.com.cn/n1/2025/1110/c164113-40600321.html\"\n", "crawler.parse_news_detail(cpc_article_url)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-12 19:16:33.254\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36mclose\u001b[0m:\u001b[36m155\u001b[0m - \u001b[1m关闭爬虫: RmrbCrawler\u001b[0m\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "爬虫会话已关闭\n" ] } ], "source": [ "# Shut the crawler down now that the test run is over\n", "crawler.close()\n", "print(\"爬虫会话已关闭\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "schoolNewsCrawler", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 2 }