{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "948be230",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "项目根目录: f:\\Project\\schoolNews\n",
      "✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
     ]
    }
   ],
   "source": [
    "# Reload edited project modules automatically so changes to .py files apply without restarting the kernel\n",
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# Put the parent of the working directory on sys.path first; the crawler imports below depend on it\n",
    "project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
    "if project_root not in sys.path:\n",
    "    sys.path.insert(0, project_root)\n",
    "\n",
    "# Project and helper imports (must come after the sys.path update above)\n",
    "from crawler.xhw.XhwCrawler import XhwCrawler\n",
    "from crawler.BaseCrawler import NewsItem\n",
    "from loguru import logger\n",
    "import json\n",
    "from pprint import pprint\n",
    "\n",
    "print(f\"项目根目录: {project_root}\")\n",
    "print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "31a8a0dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[32m2025-11-20 14:48:38.587\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
      "\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
      "\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
      "\u001b[32m2025-11-20 14:48:39.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
      "\u001b[32m2025-11-20 14:48:41.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m126\u001b[0m - 
\u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "# Build the crawler; per the saved log output, construction also starts Chrome and visits https://xhsz.news.cn/ for initial cookies\n",
    "crawler = XhwCrawler()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e5a6e91c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled example searches; uncomment a single line to run it.\n",
    "# NOTE(review): the arguments look like (keyword, count, channel) - confirm against XhwCrawler.search.\n",
    "# crawler.search(\"大学\", 1)\n",
    "# crawler.search(\"中国\", 10, \"xhsz\")\n",
    "# crawler.search(\"中国\", 10, \"news\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7e0f56fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled example: parse a single xhsz.news.cn detail page.\n",
    "# crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=9752\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "47327ebf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NewsItem(title='微纪录片|习近平的“三农”情', contentRows=[{'tag': 'video', 'content': \"\"}, {'tag': 'p', 'content': '
习近平对农民有着深厚的感情
'}, {'tag': 'p', 'content': '对“三农”问题有深入的思考
'}, {'tag': 'p', 'content': '一路走来
'}, {'tag': 'p', 'content': '他经常和农民在一起
'}, {'tag': 'p', 'content': '从小小山村的党支部书记
'}, {'tag': 'p', 'content': '到党的总书记
'}, {'tag': 'p', 'content': '寸寸光阴
'}, {'tag': 'p', 'content': '见证着他的“三农”情
'}, {'tag': 'p', 'content': '\\u2003\\u2003总策划:刘健
'}, {'tag': 'p', 'content': '\\u2003\\u2003策划:李拯宇
'}, {'tag': 'p', 'content': '\\u2003\\u2003监制:孙志平
'}, {'tag': 'p', 'content': '\\u2003\\u2003制片:樊华
'}, {'tag': 'p', 'content': '\\u2003\\u2003统筹:韩珅、王志斌
'}, {'tag': 'p', 'content': '\\u2003\\u2003编导:陈晓宇
'}, {'tag': 'p', 'content': '\\u2003\\u2003记者:陈晓宇、范世辉、岳文婷、邹尚伯、张晨俊、王怿文、李涛、胡友松、朱晓光
'}, {'tag': 'p', 'content': '\\u2003\\u2003报道员:刘鹏飞、李树锋、张伟、朱海亮、徐涛、王盟、王静
'}, {'tag': 'p', 'content': '\\u2003\\u2003海报:韩彤(实习)
'}, {'tag': 'p', 'content': '\\u2003\\u2003鸣谢:中共延川县委宣传部
'}, {'tag': 'p', 'content': '\\u2003\\u2003中共石家庄市委宣传部
'}, {'tag': 'p', 'content': '\\u2003\\u2003中共曹县县委宣传部
'}, {'tag': 'p', 'content': '\\u2003\\u2003新华社音视频部制作
'}, {'tag': 'p', 'content': '\\u2003\\u2003新华通讯社出品
'}], url='https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html', viewCount=None, publishTime='2024-02-07 12:43:29', author=None, source='新华社', category=None, executeStatus=0, executeMessage=None)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Disabled alternative: a different article under /politics/leaders/\n",
    "# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
    "\n",
    "# Article with a video: the saved output starts contentRows with a 'video' row\n",
    "crawler.parse_xh_news_detail(\n",
    "    \"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa359d5b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "schoolNewsCrawler",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}