Files
schoolNews/schoolNewsCrawler/XxqgTest.ipynb

224 lines
13 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 22,
"id": "948be230",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# Hot-reload project modules: edits to .py files take effect without restarting the kernel.\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"import sys\n",
"\n",
"# Put the parent of the kernel's working directory on sys.path.\n",
"# This has to run before any of the imports below.\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
"if project_root not in sys.path:\n",
"    sys.path.insert(0, project_root)\n",
"\n",
"# Standard library helpers\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"# Third-party\n",
"from loguru import logger\n",
"\n",
"# Project modules\n",
"from crawler.xxqg.XxqgCrawler import XxqgCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "31a8a0dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-21 11:46:12.368\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m72\u001b[0m - \u001b[1m初始化爬虫: XxqgCrawler\u001b[0m\n",
"\u001b[32m2025-11-21 11:46:13.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m119\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
"\u001b[32m2025-11-21 11:46:13.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n"
]
}
],
"source": [
"crawler = XxqgCrawler()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "afca4191",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-21 11:48:19.413\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m268\u001b[0m - \u001b[1m访问搜索页面并手动点击搜索\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:22.988\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m273\u001b[0m - \u001b[1m<selenium.webdriver.remote.webelement.WebElement (session=\"e9cb78c65ab607f72ead921b77daa63e\", element=\"f.A4561956ABC35C3E0609958DE7693C5E.d.D6A7A659A5178F4B104A1069F7A2AA3A.e.99\")>\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:25.405\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_search_url\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1m本页提取到 13 条搜索结果\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:25.405\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m312\u001b[0m - \u001b[1m共提取 3 条URL\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:35.821\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m348\u001b[0m - \u001b[33m\u001b[1m访问文章页失败或未找到文章区域: https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&amp;item_id=18044198242183277818, Message: \n",
"Stacktrace:\n",
"Symbols not available. Dumping unresolved backtrace:\n",
"\t0x7ff75a08a235\n",
"\t0x7ff759de2630\n",
"\t0x7ff759b716dd\n",
"\t0x7ff759bca27e\n",
"\t0x7ff759bca58c\n",
"\t0x7ff759c1ed77\n",
"\t0x7ff759c1baba\n",
"\t0x7ff759bbb0ed\n",
"\t0x7ff759bbbf63\n",
"\t0x7ff75a0b5d60\n",
"\t0x7ff75a0afe8a\n",
"\t0x7ff75a0d1005\n",
"\t0x7ff759dfd71e\n",
"\t0x7ff759e04e1f\n",
"\t0x7ff759deb7c4\n",
"\t0x7ff759deb97f\n",
"\t0x7ff759dd18e8\n",
"\t0x7ffb85a47374\n",
"\t0x7ffb8797cc91\n",
"\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.085\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m348\u001b[0m - \u001b[33m\u001b[1m访问文章页失败或未找到文章区域: https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&amp;item_id=5133206409192666185, Message: \n",
"Stacktrace:\n",
"Symbols not available. Dumping unresolved backtrace:\n",
"\t0x7ff75a08a235\n",
"\t0x7ff759de2630\n",
"\t0x7ff759b716dd\n",
"\t0x7ff759bca27e\n",
"\t0x7ff759bca58c\n",
"\t0x7ff759c1ed77\n",
"\t0x7ff759c1baba\n",
"\t0x7ff759bbb0ed\n",
"\t0x7ff759bbbf63\n",
"\t0x7ff75a0b5d60\n",
"\t0x7ff75a0afe8a\n",
"\t0x7ff75a0d1005\n",
"\t0x7ff759dfd71e\n",
"\t0x7ff759e04e1f\n",
"\t0x7ff759deb7c4\n",
"\t0x7ff759deb97f\n",
"\t0x7ff759dd18e8\n",
"\t0x7ffb85a47374\n",
"\t0x7ffb8797cc91\n",
"\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.780\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m362\u001b[0m - \u001b[33m\u001b[1m提取发布时间失败: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"div.render-detail-time\"}\n",
" (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception\n",
"Stacktrace:\n",
"Symbols not available. Dumping unresolved backtrace:\n",
"\t0x7ff75a08a235\n",
"\t0x7ff759de2630\n",
"\t0x7ff759b716dd\n",
"\t0x7ff759bca27e\n",
"\t0x7ff759bca58c\n",
"\t0x7ff759bbcd7c\n",
"\t0x7ff759bbcc36\n",
"\t0x7ff759c1baba\n",
"\t0x7ff759bbb0ed\n",
"\t0x7ff759bbbf63\n",
"\t0x7ff75a0b5d60\n",
"\t0x7ff75a0afe8a\n",
"\t0x7ff75a0d1005\n",
"\t0x7ff759dfd71e\n",
"\t0x7ff759e04e1f\n",
"\t0x7ff759deb7c4\n",
"\t0x7ff759deb97f\n",
"\t0x7ff759dd18e8\n",
"\t0x7ffb85a47374\n",
"\t0x7ffb8797cc91\n",
"\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.786\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m368\u001b[0m - \u001b[33m\u001b[1m提取来源失败: Message: no such element: Unable to locate element: {\"method\":\"css selector\",\"selector\":\"div.render-detail-source\"}\n",
" (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception\n",
"Stacktrace:\n",
"Symbols not available. Dumping unresolved backtrace:\n",
"\t0x7ff75a08a235\n",
"\t0x7ff759de2630\n",
"\t0x7ff759b716dd\n",
"\t0x7ff759bca27e\n",
"\t0x7ff759bca58c\n",
"\t0x7ff759bbcd7c\n",
"\t0x7ff759bbcc36\n",
"\t0x7ff759c1baba\n",
"\t0x7ff759bbb0ed\n",
"\t0x7ff759bbbf63\n",
"\t0x7ff75a0b5d60\n",
"\t0x7ff75a0afe8a\n",
"\t0x7ff75a0d1005\n",
"\t0x7ff759dfd71e\n",
"\t0x7ff759e04e1f\n",
"\t0x7ff759deb7c4\n",
"\t0x7ff759deb97f\n",
"\t0x7ff759dd18e8\n",
"\t0x7ffb85a47374\n",
"\t0x7ffb8797cc91\n",
"\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.809\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 10月28日上午10时由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港正式开启对...\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.817\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,...\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.829\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mget_content_rows\u001b[0m:\u001b[36m445\u001b[0m - \u001b[34m\u001b[1m提取文字: 此次到访新加坡海军83舰编队将进行为期4天的友好访问编队任务官兵将参观新加坡海军军事设施及海军博...\u001b[0m\n",
"\u001b[32m2025-11-21 11:48:46.841\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xxqg.XxqgCrawler\u001b[0m:\u001b[36mparse_news_detail\u001b[0m:\u001b[36m455\u001b[0m - \u001b[1m解析文章详情完成: {'title': '中国海军83舰编队对新加坡进行友好访问', 'contentRows': [{'type': 'text', 'content': '10月28日上午10时由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港正式开启对新加坡的友好访问。'}, {'type': 'text', 'content': '中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,共同迎接编队的到来。'}, {'type': 'text', 'content': '此次到访新加坡海军83舰编队将进行为期4天的友好访问编队任务官兵将参观新加坡海军军事设施及海军博物馆。中新两国海军将开展务实高效的军事交流并计划互派官兵登舰参观。靠泊期间戚继光舰还将举行甲板招待会。总台报道员 海月 通讯员 李大公)'}], 'url': 'https://www.xuexi.cn/lgpage/detail/index.html?id=16022165268290507477&amp;item_id=16022165268290507477', 'viewCount': None, 'publishTime': None, 'author': None, 'source': None, 'category': None, 'executeStatus': 0, 'executeMessage': None}\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"ResultDomain(code=0, message='', success=True, data=None, dataList=[NewsItem(title='https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&amp;item_id=18044198242183277818', contentRows=[], url='https://www.xuexi.cn/lgpage/detail/index.html?id=18044198242183277818&amp;item_id=18044198242183277818', viewCount=None, publishTime='2025-11-10', author=None, source='央视军事', category=None, executeStatus=0, executeMessage=None), NewsItem(title='https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&amp;item_id=5133206409192666185', contentRows=[], url='https://www.xuexi.cn/lgpage/detail/index.html?id=5133206409192666185&amp;item_id=5133206409192666185', viewCount=None, publishTime='2025-10-29', author=None, source='央视军事', category=None, executeStatus=0, executeMessage=None), NewsItem(title='中国海军83舰编队对新加坡进行友好访问', contentRows=[{'type': 'text', 'content': '10月28日上午10时由中国海军戚继光舰、沂蒙山舰组成的海军83舰编队抵达新加坡樟宜港正式开启对新加坡的友好访问。'}, {'type': 'text', 'content': '中国驻新加坡大使曹忠明、使馆领导及工作人员、当地留学生代表以及新加坡海军官兵代表到码头举行欢迎仪式,共同迎接编队的到来。'}, {'type': 'text', 'content': '此次到访新加坡海军83舰编队将进行为期4天的友好访问编队任务官兵将参观新加坡海军军事设施及海军博物馆。中新两国海军将开展务实高效的军事交流并计划互派官兵登舰参观。靠泊期间戚继光舰还将举行甲板招待会。总台报道员 海月 通讯员 李大公)'}], url='https://www.xuexi.cn/lgpage/detail/index.html?id=16022165268290507477&amp;item_id=16022165268290507477', viewCount=None, publishTime='2025-10-28', author=None, source='中央广播电视总台', category=None, executeStatus=0, executeMessage=None)])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bind the search result to a name so later cells can inspect it without\n",
"# re-running the crawl; the bare last expression still renders it as the cell output.\n",
"search_result = crawler.search(\"新加坡\", total=3)\n",
"search_result"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fac804d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}