schoolNews/schoolNewsCrawler/crawler/test_crawler.ipynb

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 爬虫测试 Notebook\n",
        "用于测试人民日报爬虫功能\n",
        "\n",
        "## 1. 导入依赖"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "项目根目录: f:\\Project\\schoolNews\\schoolNewsCrawler\n",
            "✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
          ]
        }
      ],
      "source": [
        "# Reload edited project modules automatically, so changes to .py files\n",
        "# take effect without restarting the kernel.\n",
        "%reload_ext autoreload\n",
        "%autoreload 2\n",
        "\n",
        "import json\n",
        "import os\n",
        "import sys\n",
        "from pprint import pprint\n",
        "\n",
        "# The project root (parent of the current working directory) has to be on\n",
        "# sys.path before the `crawler` package can be imported below.\n",
        "project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
        "if project_root not in sys.path:\n",
        "    sys.path.insert(0, project_root)\n",
        "\n",
        "# Project and third-party imports (must come after the sys.path change above).\n",
        "from crawler.RmrbCrawler import RmrbCrawler\n",
        "from crawler.BaseCrawler import NewsItem\n",
        "from loguru import logger\n",
        "\n",
        "print(f\"项目根目录: {project_root}\")\n",
        "print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. 初始化爬虫"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\u001b[32m2025-11-10 11:09:38.821\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m69\u001b[0m - \u001b[1m初始化爬虫: RmrbCrawler\u001b[0m\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "爬虫初始化成功！\n",
            "基础URL: http://www.people.com.cn\n",
            "URLS: {'search': UrlConfig(url='http://search.people.cn/search-platform/front/search', params={'key': '', 'page': 1, 'limit': 10, 'hasTitle': True, 'hasContent': True, 'isFuzzy': True, 'type': 0, 'sortType': 2, 'startTime': 0, 'endTime': 0}, method='POST'), 'hot_point_rank': UrlConfig(url='http://search.people.cn/search-platform/front/searchRank', params={}, method='GET')}\n"
          ]
        }
      ],
      "source": [
        "# Build the crawler used by every cell below, then echo the configuration\n",
        "# it loaded (base URL and the endpoint table).\n",
        "crawler = RmrbCrawler()\n",
        "print(\"爬虫初始化成功！\")\n",
        "\n",
        "crawler_config = crawler.config\n",
        "print(f\"基础URL: {crawler_config.base_url}\")\n",
        "print(f\"URLS: {crawler_config.urls}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Parse one known article and show a bounded preview instead of echoing the\n",
        "# whole NewsItem repr, which runs to many kilobytes for a long article and\n",
        "# bloats the saved notebook.\n",
        "sample_url = \"http://politics.people.com.cn/n1/2025/1110/c461001-40600372.html\"\n",
        "sample_detail = crawler.parse_news_detail(sample_url)\n",
        "\n",
        "if sample_detail:\n",
        "    # `or []` covers an item whose contentRows is None rather than a list.\n",
        "    sample_rows = sample_detail.contentRows or []\n",
        "    print(f\"标题: {sample_detail.title}\")\n",
        "    print(f\"内容行数: {len(sample_rows)}\")\n",
        "    # Only the first few rows are shown; inspect sample_detail directly for more.\n",
        "    pprint(sample_rows[:3])\n",
        "else:\n",
        "    print(\"解析失败\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. 测试搜索功能"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Run a keyword search and list the hits.\n",
        "# NOTE(review): the previously stored output for this cell was a stale run\n",
        "# containing a raw multi-kilobyte response log; it was cleared. Re-run the\n",
        "# cell to regenerate it.\n",
        "search_keyword = \"视频新闻\"\n",
        "# `or []` keeps the summary below working if search() yields None on failure\n",
        "# (presumably possible; verify against RmrbCrawler.search).\n",
        "search_results = crawler.search(key=search_keyword, page=1, limit=10, news_type=0) or []\n",
        "\n",
        "print(f\"\\n搜索关键词: {search_keyword}\")\n",
        "print(f\"搜索结果数量: {len(search_results)}\")\n",
        "\n",
        "# One numbered entry per hit: title, link and publish time.\n",
        "for i, news in enumerate(search_results, 1):\n",
        "    print(f\"\\n{i}. {news.title}\")\n",
        "    print(f\"   链接: {news.url}\")\n",
        "    print(f\"   时间: {news.publish_time}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 4. 测试爬取列表页"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Crawl the list page of one category; `news_list` is reused by later cells.\n",
        "# NOTE(review): confirm that RmrbCrawler.crawl accepts these category names.\n",
        "category = \"politics\"  # options: politics, society, world, finance, tech, culture, education\n",
        "limit = 5\n",
        "\n",
        "news_list = crawler.crawl(category=category, limit=limit)\n",
        "\n",
        "print(f\"\\n爬取分类: {category}\")\n",
        "print(f\"爬取数量: {len(news_list)}/{limit}\")\n",
        "\n",
        "# Numbered summary of each crawled item.\n",
        "for position, item in enumerate(news_list, start=1):\n",
        "    print(f\"\\n{position}. {item.title}\")\n",
        "    print(f\"   链接: {item.url}\")\n",
        "    print(f\"   来源: {item.source}\")\n",
        "    print(f\"   时间: {item.publish_time}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 5. 测试解析单个新闻详情"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Parse a single article page and print a short summary of the result.\n",
        "# Fallback URL used when no crawled list is available; replace it with a\n",
        "# real article URL.\n",
        "test_url = \"http://www.people.com.cn/\"\n",
        "\n",
        "# Prefer the first crawled item when there is one. globals().get() keeps the\n",
        "# cell runnable on its own: a bare `if news_list:` raises NameError when the\n",
        "# list-page cell has not been executed in this kernel.\n",
        "crawled_items = globals().get(\"news_list\") or []\n",
        "if crawled_items:\n",
        "    test_url = crawled_items[0].url\n",
        "\n",
        "print(f\"测试URL: {test_url}\")\n",
        "news_detail = crawler.parse_news_detail(test_url)\n",
        "\n",
        "# NOTE(review): confirm NewsItem exposes author / images / content;\n",
        "# verify against crawler.BaseCrawler.\n",
        "if news_detail:\n",
        "    print(\"\\n新闻详情:\")\n",
        "    print(f\"标题: {news_detail.title}\")\n",
        "    print(f\"作者: {news_detail.author}\")\n",
        "    print(f\"时间: {news_detail.publish_time}\")\n",
        "    print(f\"来源: {news_detail.source}\")\n",
        "    # `or []` avoids a TypeError when images is None instead of an empty list.\n",
        "    print(f\"图片数量: {len(news_detail.images or [])}\")\n",
        "    print(\"\\n内容预览 (前200字):\")\n",
        "    print(news_detail.content[:200] if news_detail.content else \"无内容\")\n",
        "else:\n",
        "    print(\"解析失败\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6. 导出数据"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Export the crawled items to a JSON file; nothing is written when the crawl\n",
        "# returned no items.\n",
        "if news_list:\n",
        "    # One plain dict per item.\n",
        "    news_data = [news.model_dump() for news in news_list]\n",
        "\n",
        "    # The path is relative to the notebook's working directory.\n",
        "    output_file = \"../output/test_news_from_notebook.json\"\n",
        "    os.makedirs(os.path.dirname(output_file), exist_ok=True)\n",
        "\n",
        "    with open(output_file, 'w', encoding='utf-8') as f:\n",
        "        # default=str is used only for values json cannot encode natively\n",
        "        # (e.g. datetime), so one such field no longer aborts the export;\n",
        "        # JSON-native data is written exactly as before.\n",
        "        json.dump(news_data, f, ensure_ascii=False, indent=2, default=str)\n",
        "\n",
        "    print(f\"数据已保存到: {output_file}\")\n",
        "    print(f\"总计: {len(news_data)} 条新闻\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 7. 数据统计分析"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Summary statistics over the crawled items; skipped when the list is empty,\n",
        "# which also rules out division by zero below.\n",
        "if news_list:\n",
        "    total_news = len(news_list)\n",
        "    news_with_images = sum(1 for news in news_list if news.images)\n",
        "    news_with_author = sum(1 for news in news_list if news.author)\n",
        "    news_with_time = sum(1 for news in news_list if news.publish_time)\n",
        "\n",
        "    print(\"=\" * 50)\n",
        "    print(\"爬取数据统计\")\n",
        "    print(\"=\" * 50)\n",
        "    print(f\"总新闻数量: {total_news}\")\n",
        "\n",
        "    # Count plus percentage of the total, one line per tracked field.\n",
        "    coverage = (\n",
        "        (\"包含图片\", news_with_images),\n",
        "        (\"有作者信息\", news_with_author),\n",
        "        (\"有时间信息\", news_with_time),\n",
        "    )\n",
        "    for label, count in coverage:\n",
        "        print(f\"{label}: {count} ({count/total_news*100:.1f}%)\")\n",
        "\n",
        "    # Average content length; `or \"\"` counts a missing (None) content as\n",
        "    # length 0 instead of raising TypeError.\n",
        "    avg_content_length = sum(len(news.content or \"\") for news in news_list) / total_news\n",
        "    print(f\"平均内容长度: {avg_content_length:.0f} 字符\")\n",
        "\n",
        "    # Total image count; `or []` treats a None image list as empty.\n",
        "    total_images = sum(len(news.images or []) for news in news_list)\n",
        "    print(f\"图片总数: {total_images}\")\n",
        "    print(\"=\" * 50)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 8. 清理资源"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Close the crawler's session now that all test cells have run.\n",
        "crawler.close()\n",
        "print(\"爬虫会话已关闭\")\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "shoolNewsCrewer",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.19"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}