224 lines
13 KiB
Plaintext
224 lines
13 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "948be230",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"项目根目录: f:\\Project\\schoolNews\n",
|
||
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 自动重载模块(当文件修改后自动刷新)\n",
|
||
"%reload_ext autoreload\n",
|
||
"%autoreload 2\n",
|
||
"\n",
|
||
"import sys\n",
|
||
"import os\n",
|
||
"\n",
|
||
"# 先添加项目根目录到路径(必须在导入之前)\n",
|
||
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
|
||
"if project_root not in sys.path:\n",
|
||
" sys.path.insert(0, project_root)\n",
|
||
"\n",
|
||
"# 然后再导入模块\n",
|
||
"from crawler.xxqg.XxqgCrawler import XxqgCrawler\n",
|
||
"from crawler.BaseCrawler import NewsItem\n",
|
||
"from loguru import logger\n",
|
||
"import json\n",
|
||
"from pprint import pprint\n",
|
||
"\n",
|
||
"print(f\"项目根目录: {project_root}\")\n",
|
||
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "31a8a0dd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-21 11:46:12.368\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m72\u001b[0m - \u001b[1m初始化爬虫: XxqgCrawler\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:46:13.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m119\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:46:13.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"crawler = XxqgCrawler()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "afca4191",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-21 11:48:19.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m268\u001b[0m - \u001b[1m访问搜索页面并手动点击搜索\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:22.988\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m<selenium.webdriver.remote.webelement.WebElement (session=\"e9cb78c65ab607f72ead921b77daa63e\", element=\"f.A4561956ABC35C3E0609958DE7693C5E.d.D6A7A659A5178F4B104A1069F7A2AA3A.e.99\")>\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:25.405\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_search_url\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1m本页提取到 13 条搜索结果\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:25.405\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m312\u001b[0m - \u001b[1m共提取 3 条URL\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:35.821\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m348\u001b[0m - \u001b[33m\u001b[1m访问文章页失败或未找到文章区域: https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&item_id=18044198242183277818, Message: \n",
|
||
"Stacktrace:\n",
|
||
"Symbols not available. Dumping unresolved backtrace:\n",
|
||
"\t0x7ff75a08a235\n",
|
||
"\t0x7ff759de2630\n",
|
||
"\t0x7ff759b716dd\n",
|
||
"\t0x7ff759bca27e\n",
|
||
"\t0x7ff759bca58c\n",
|
||
"\t0x7ff759c1ed77\n",
|
||
"\t0x7ff759c1baba\n",
|
||
"\t0x7ff759bbb0ed\n",
|
||
"\t0x7ff759bbbf63\n",
|
||
"\t0x7ff75a0b5d60\n",
|
||
"\t0x7ff75a0afe8a\n",
|
||
"\t0x7ff75a0d1005\n",
|
||
"\t0x7ff759dfd71e\n",
|
||
"\t0x7ff759e04e1f\n",
|
||
"\t0x7ff759deb7c4\n",
|
||
"\t0x7ff759deb97f\n",
|
||
"\t0x7ff759dd18e8\n",
|
||
"\t0x7ffb85a47374\n",
|
||
"\t0x7ffb8797cc91\n",
|
||
"\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.085\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m348\u001b[0m - \u001b[33m\u001b[1m访问文章页失败或未找到文章区域: https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&item_id=5133206409192666185, Message: \n",
|
||
"Stacktrace:\n",
|
||
"Symbols not available. Dumping unresolved backtrace:\n",
|
||
"\t0x7ff75a08a235\n",
|
||
"\t0x7ff759de2630\n",
|
||
"\t0x7ff759b716dd\n",
|
||
"\t0x7ff759bca27e\n",
|
||
"\t0x7ff759bca58c\n",
|
||
"\t0x7ff759c1ed77\n",
|
||
"\t0x7ff759c1baba\n",
|
||
"\t0x7ff759bbb0ed\n",
|
||
"\t0x7ff759bbbf63\n",
|
||
"\t0x7ff75a0b5d60\n",
|
||
"\t0x7ff75a0afe8a\n",
|
||
"\t0x7ff75a0d1005\n",
|
||
"\t0x7ff759dfd71e\n",
|
||
"\t0x7ff759e04e1f\n",
|
||
"\t0x7ff759deb7c4\n",
|
||
"\t0x7ff759deb97f\n",
|
||
"\t0x7ff759dd18e8\n",
|
||
"\t0x7ffb85a47374\n",
|
||
"\t0x7ffb8797cc91\n",
|
||
"\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.780\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m362\u001b[0m - \u001b[33m\u001b[1m提取发布时间失败: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"div.render-detail-time\"}\n",
|
||
" (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception\n",
|
||
"Stacktrace:\n",
|
||
"Symbols not available. Dumping unresolved backtrace:\n",
|
||
"\t0x7ff75a08a235\n",
|
||
"\t0x7ff759de2630\n",
|
||
"\t0x7ff759b716dd\n",
|
||
"\t0x7ff759bca27e\n",
|
||
"\t0x7ff759bca58c\n",
|
||
"\t0x7ff759bbcd7c\n",
|
||
"\t0x7ff759bbcc36\n",
|
||
"\t0x7ff759c1baba\n",
|
||
"\t0x7ff759bbb0ed\n",
|
||
"\t0x7ff759bbbf63\n",
|
||
"\t0x7ff75a0b5d60\n",
|
||
"\t0x7ff75a0afe8a\n",
|
||
"\t0x7ff75a0d1005\n",
|
||
"\t0x7ff759dfd71e\n",
|
||
"\t0x7ff759e04e1f\n",
|
||
"\t0x7ff759deb7c4\n",
|
||
"\t0x7ff759deb97f\n",
|
||
"\t0x7ff759dd18e8\n",
|
||
"\t0x7ffb85a47374\n",
|
||
"\t0x7ffb8797cc91\n",
|
||
"\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.786\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m368\u001b[0m - \u001b[33m\u001b[1m提取来源失败: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"div.render-detail-source\"}\n",
|
||
" (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception\n",
|
||
"Stacktrace:\n",
|
||
"Symbols not available. Dumping unresolved backtrace:\n",
|
||
"\t0x7ff75a08a235\n",
|
||
"\t0x7ff759de2630\n",
|
||
"\t0x7ff759b716dd\n",
|
||
"\t0x7ff759bca27e\n",
|
||
"\t0x7ff759bca58c\n",
|
||
"\t0x7ff759bbcd7c\n",
|
||
"\t0x7ff759bbcc36\n",
|
||
"\t0x7ff759c1baba\n",
|
||
"\t0x7ff759bbb0ed\n",
|
||
"\t0x7ff759bbbf63\n",
|
||
"\t0x7ff75a0b5d60\n",
|
||
"\t0x7ff75a0afe8a\n",
|
||
"\t0x7ff75a0d1005\n",
|
||
"\t0x7ff759dfd71e\n",
|
||
"\t0x7ff759e04e1f\n",
|
||
"\t0x7ff759deb7c4\n",
|
||
"\t0x7ff759deb97f\n",
|
||
"\t0x7ff759dd18e8\n",
|
||
"\t0x7ffb85a47374\n",
|
||
"\t0x7ffb8797cc91\n",
|
||
"\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.809\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 10月28日上午10时,由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港,正式开启对...\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.817\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,...\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.829\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 此次到访新加坡,海军83舰编队将进行为期4天的友好访问,编队任务官兵将参观新加坡海军军事设施及海军博...\u001b[0m\n",
|
||
"\u001b[32m2025-11-21 11:48:46.841\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m455\u001b[0m - \u001b[1m解析文章详情完成: {'title': '中国海军83舰编队对新加坡进行友好访问', 'contentRows': [{'type': 'text', 'content': '10月28日上午10时,由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港,正式开启对新加坡的友好访问。'}, {'type': 'text', 'content': '中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,共同迎接编队的到来。'}, {'type': 'text', 'content': '此次到访新加坡,海军83舰编队将进行为期4天的友好访问,编队任务官兵将参观新加坡海军军事设施及海军博物馆。中新两国海军将开展务实高效的军事交流,并计划互派官兵登舰参观。靠泊期间,戚继光舰还将举行甲板招待会。(总台报道员 海月 通讯员 李大公)'}], 'url': 'https://www.xuexi.cn/lgpage/detail/index.html?id=16022165268290507477&item_id=16022165268290507477', 'viewCount': None, 'publishTime': None, 'author': None, 'source': None, 'category': None, 'executeStatus': 0, 'executeMessage': None}\u001b[0m\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"ResultDomain(code=0, message='', success=True, data=None, dataList=[NewsItem(title='https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&item_id=18044198242183277818', contentRows=[], url='https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&item_id=18044198242183277818', viewCount=None, publishTime='2025-11-10', author=None, source='央视军事', category=None, executeStatus=0, executeMessage=None), NewsItem(title='https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&item_id=5133206409192666185', contentRows=[], url='https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&item_id=5133206409192666185', viewCount=None, publishTime='2025-10-29', author=None, source='央视军事', category=None, executeStatus=0, executeMessage=None), NewsItem(title='中国海军83舰编队对新加坡进行友好访问', contentRows=[{'type': 'text', 'content': '10月28日上午10时,由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港,正式开启对新加坡的友好访问。'}, {'type': 'text', 'content': '中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,共同迎接编队的到来。'}, {'type': 'text', 'content': '此次到访新加坡,海军83舰编队将进行为期4天的友好访问,编队任务官兵将参观新加坡海军军事设施及海军博物馆。中新两国海军将开展务实高效的军事交流,并计划互派官兵登舰参观。靠泊期间,戚继光舰还将举行甲板招待会。(总台报道员 海月 通讯员 李大公)'}], url='https://www.xuexi.cn/lgpage/detail/index.html?id=16022165268290507477&item_id=16022165268290507477', viewCount=None, publishTime='2025-10-28', author=None, source='中央广播电视总台', category=None, executeStatus=0, executeMessage=None)])"
|
||
]
|
||
},
|
||
"execution_count": 25,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"crawler.search(\"新加坡\", total=3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7fac804d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "schoolNewsCrawler",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|