{ "cells": [ { "cell_type": "code", "execution_count": 22, "id": "948be230", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "项目根目录: f:\\Project\\schoolNews\n", "✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n" ] } ], "source": [ "# 自动重载模块(当文件修改后自动刷新)\n", "%reload_ext autoreload\n", "%autoreload 2\n", "\n", "import sys\n", "import os\n", "\n", "# 先添加项目根目录到路径(必须在导入之前)\n", "project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n", "if project_root not in sys.path:\n", " sys.path.insert(0, project_root)\n", "\n", "# 然后再导入模块\n", "from crawler.xxqg.XxqgCrawler import XxqgCrawler\n", "from crawler.BaseCrawler import NewsItem\n", "from loguru import logger\n", "import json\n", "from pprint import pprint\n", "\n", "print(f\"项目根目录: {project_root}\")\n", "print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n" ] }, { "cell_type": "code", "execution_count": 23, "id": "31a8a0dd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-21 11:46:12.368\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m72\u001b[0m - \u001b[1m初始化爬虫: XxqgCrawler\u001b[0m\n", "\u001b[32m2025-11-21 11:46:13.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m119\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n", "\u001b[32m2025-11-21 11:46:13.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n" ] } ], "source": [ "crawler = XxqgCrawler()" ] }, { "cell_type": "code", "execution_count": 25, "id": "afca4191", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[32m2025-11-21 11:48:19.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m268\u001b[0m - \u001b[1m访问搜索页面并手动点击搜索\u001b[0m\n", "\u001b[32m2025-11-21 11:48:22.988\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m\u001b[0m\n", "\u001b[32m2025-11-21 11:48:25.405\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_search_url\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1m本页提取到 13 条搜索结果\u001b[0m\n", "\u001b[32m2025-11-21 11:48:25.405\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m312\u001b[0m - \u001b[1m共提取 3 条URL\u001b[0m\n", "\u001b[32m2025-11-21 11:48:35.821\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m348\u001b[0m - \u001b[33m\u001b[1m访问文章页失败或未找到文章区域: https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&item_id=18044198242183277818, Message: \n", "Stacktrace:\n", "Symbols not available. Dumping unresolved backtrace:\n", "\t0x7ff75a08a235\n", "\t0x7ff759de2630\n", "\t0x7ff759b716dd\n", "\t0x7ff759bca27e\n", "\t0x7ff759bca58c\n", "\t0x7ff759c1ed77\n", "\t0x7ff759c1baba\n", "\t0x7ff759bbb0ed\n", "\t0x7ff759bbbf63\n", "\t0x7ff75a0b5d60\n", "\t0x7ff75a0afe8a\n", "\t0x7ff75a0d1005\n", "\t0x7ff759dfd71e\n", "\t0x7ff759e04e1f\n", "\t0x7ff759deb7c4\n", "\t0x7ff759deb97f\n", "\t0x7ff759dd18e8\n", "\t0x7ffb85a47374\n", "\t0x7ffb8797cc91\n", "\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.085\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m348\u001b[0m - \u001b[33m\u001b[1m访问文章页失败或未找到文章区域: https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&item_id=5133206409192666185, Message: \n", "Stacktrace:\n", "Symbols not available. Dumping unresolved backtrace:\n", "\t0x7ff75a08a235\n", "\t0x7ff759de2630\n", "\t0x7ff759b716dd\n", "\t0x7ff759bca27e\n", "\t0x7ff759bca58c\n", "\t0x7ff759c1ed77\n", "\t0x7ff759c1baba\n", "\t0x7ff759bbb0ed\n", "\t0x7ff759bbbf63\n", "\t0x7ff75a0b5d60\n", "\t0x7ff75a0afe8a\n", "\t0x7ff75a0d1005\n", "\t0x7ff759dfd71e\n", "\t0x7ff759e04e1f\n", "\t0x7ff759deb7c4\n", "\t0x7ff759deb97f\n", "\t0x7ff759dd18e8\n", "\t0x7ffb85a47374\n", "\t0x7ffb8797cc91\n", "\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.780\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m362\u001b[0m - \u001b[33m\u001b[1m提取发布时间失败: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"div.render-detail-time\"}\n", " (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception\n", "Stacktrace:\n", "Symbols not available. Dumping unresolved backtrace:\n", "\t0x7ff75a08a235\n", "\t0x7ff759de2630\n", "\t0x7ff759b716dd\n", "\t0x7ff759bca27e\n", "\t0x7ff759bca58c\n", "\t0x7ff759bbcd7c\n", "\t0x7ff759bbcc36\n", "\t0x7ff759c1baba\n", "\t0x7ff759bbb0ed\n", "\t0x7ff759bbbf63\n", "\t0x7ff75a0b5d60\n", "\t0x7ff75a0afe8a\n", "\t0x7ff75a0d1005\n", "\t0x7ff759dfd71e\n", "\t0x7ff759e04e1f\n", "\t0x7ff759deb7c4\n", "\t0x7ff759deb97f\n", "\t0x7ff759dd18e8\n", "\t0x7ffb85a47374\n", "\t0x7ffb8797cc91\n", "\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.786\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m368\u001b[0m - \u001b[33m\u001b[1m提取来源失败: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"div.render-detail-source\"}\n", " (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception\n", "Stacktrace:\n", "Symbols not available. Dumping unresolved backtrace:\n", "\t0x7ff75a08a235\n", "\t0x7ff759de2630\n", "\t0x7ff759b716dd\n", "\t0x7ff759bca27e\n", "\t0x7ff759bca58c\n", "\t0x7ff759bbcd7c\n", "\t0x7ff759bbcc36\n", "\t0x7ff759c1baba\n", "\t0x7ff759bbb0ed\n", "\t0x7ff759bbbf63\n", "\t0x7ff75a0b5d60\n", "\t0x7ff75a0afe8a\n", "\t0x7ff75a0d1005\n", "\t0x7ff759dfd71e\n", "\t0x7ff759e04e1f\n", "\t0x7ff759deb7c4\n", "\t0x7ff759deb97f\n", "\t0x7ff759dd18e8\n", "\t0x7ffb85a47374\n", "\t0x7ffb8797cc91\n", "\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.809\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 10月28日上午10时,由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港,正式开启对...\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.817\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,...\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.829\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 此次到访新加坡,海军83舰编队将进行为期4天的友好访问,编队任务官兵将参观新加坡海军军事设施及海军博...\u001b[0m\n", "\u001b[32m2025-11-21 11:48:46.841\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m455\u001b[0m - \u001b[1m解析文章详情完成: {'title': '中国海军83舰编队对新加坡进行友好访问', 'contentRows': [{'type': 'text', 'content': '10月28日上午10时,由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港,正式开启对新加坡的友好访问。'}, {'type': 'text', 'content': '中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,共同迎接编队的到来。'}, {'type': 'text', 'content': '此次到访新加坡,海军83舰编队将进行为期4天的友好访问,编队任务官兵将参观新加坡海军军事设施及海军博物馆。中新两国海军将开展务实高效的军事交流,并计划互派官兵登舰参观。靠泊期间,戚继光舰还将举行甲板招待会。(总台报道员 海月 通讯员 李大公)'}], 'url': 'https://www.xuexi.cn/lgpage/detail/index.html?id=16022165268290507477&item_id=16022165268290507477', 'viewCount': None, 'publishTime': None, 'author': None, 'source': None, 'category': None, 'executeStatus': 0, 'executeMessage': None}\u001b[0m\n" ] }, { "data": { "text/plain": [ "ResultDomain(code=0, message='', success=True, data=None, dataList=[NewsItem(title='https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&item_id=18044198242183277818', contentRows=[], url='https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&item_id=18044198242183277818', viewCount=None, publishTime='2025-11-10', author=None, source='央视军事', category=None, executeStatus=0, executeMessage=None), NewsItem(title='https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&item_id=5133206409192666185', contentRows=[], url='https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&item_id=5133206409192666185', viewCount=None, publishTime='2025-10-29', author=None, source='央视军事', category=None, executeStatus=0, executeMessage=None), NewsItem(title='中国海军83舰编队对新加坡进行友好访问', contentRows=[{'type': 'text', 'content': '10月28日上午10时,由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港,正式开启对新加坡的友好访问。'}, {'type': 'text', 'content': '中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,共同迎接编队的到来。'}, {'type': 'text', 'content': '此次到访新加坡,海军83舰编队将进行为期4天的友好访问,编队任务官兵将参观新加坡海军军事设施及海军博物馆。中新两国海军将开展务实高效的军事交流,并计划互派官兵登舰参观。靠泊期间,戚继光舰还将举行甲板招待会。(总台报道员 海月 通讯员 李大公)'}], url='https://www.xuexi.cn/lgpage/detail/index.html?id=16022165268290507477&item_id=16022165268290507477', viewCount=None, publishTime='2025-10-28', author=None, source='中央广播电视总台', category=None, executeStatus=0, executeMessage=None)])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crawler.search(\"新加坡\", total=3)" ] }, { "cell_type": "code", "execution_count": null, "id": "7fac804d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "schoolNewsCrawler", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.12" } }, "nbformat": 4, "nbformat_minor": 5 }