Files
schoolNews/schoolNewsCrawler/test.ipynb

161 lines
5.2 KiB
Plaintext
Raw Normal View History

2025-11-19 19:05:31 +08:00
{
"cells": [
{
"cell_type": "code",
2025-11-20 15:46:53 +08:00
"execution_count": 9,
2025-11-19 19:05:31 +08:00
"id": "948be230",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# Enable autoreload so edits to imported .py modules take effect without restarting the kernel\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"import sys\n",
"\n",
"# Put the parent of the current working directory on sys.path;\n",
"# this has to happen before the project imports below are executed\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
"if project_root not in sys.path:\n",
"    sys.path.insert(0, project_root)\n",
"\n",
"# Project and helper imports (only after sys.path has been prepared)\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"from loguru import logger\n",
"\n",
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"\n",
"# Report the resolved root and confirm that autoreload is active\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "code",
2025-11-20 15:46:53 +08:00
"execution_count": 10,
2025-11-19 19:05:31 +08:00
"id": "31a8a0dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2025-11-20 15:46:53 +08:00
"\u001b[32m2025-11-20 15:39:21.410\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
"\u001b[32m2025-11-20 15:39:22.502\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
"\u001b[32m2025-11-20 15:39:22.502\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n"
2025-11-19 19:05:31 +08:00
]
}
],
"source": [
"crawler = XhwCrawler()"
]
},
{
"cell_type": "code",
2025-11-20 15:46:53 +08:00
"execution_count": 11,
2025-11-19 19:05:31 +08:00
"id": "e5a6e91c",
"metadata": {},
2025-11-20 14:57:20 +08:00
"outputs": [],
2025-11-19 19:05:31 +08:00
"source": [
2025-11-20 14:57:20 +08:00
"#crawler.search(\"大学\", 1)\n",
2025-11-19 19:05:31 +08:00
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"news\")"
]
},
{
"cell_type": "code",
2025-11-20 15:46:53 +08:00
"execution_count": 12,
2025-11-19 19:05:31 +08:00
"id": "7e0f56fa",
"metadata": {},
"outputs": [],
"source": [
"# crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=9752\")"
]
},
{
"cell_type": "code",
2025-11-20 15:46:53 +08:00
"execution_count": 13,
"id": "47327ebf",
"metadata": {},
2025-11-20 15:46:53 +08:00
"outputs": [],
"source": [
"#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
"# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "fa359d5b",
"metadata": {},
"outputs": [
2025-11-20 15:46:53 +08:00
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-20 15:45:21.322\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m44\u001b[0m - \u001b[1m轮播图新闻url: 5\u001b[0m\n",
"\u001b[32m2025-11-20 15:45:21.483\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m54\u001b[0m - \u001b[1m聚焦新闻url: 21\u001b[0m\n",
"\u001b[32m2025-11-20 15:45:22.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m44\u001b[0m - \u001b[1m轮播图新闻url: 7\u001b[0m\n",
"\u001b[32m2025-11-20 15:45:23.134\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m54\u001b[0m - \u001b[1m聚焦新闻url: 124\u001b[0m\n",
"\u001b[32m2025-11-20 15:45:23.135\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mcommend\u001b[0m:\u001b[36m55\u001b[0m - \u001b[1m获取到新闻url:157\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
2025-11-20 15:46:53 +08:00
"ResultDomain(code=0, message='', success=True, data=None, dataList=[])"
]
},
2025-11-20 15:46:53 +08:00
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2025-11-20 15:46:53 +08:00
"crawler.commend()"
]
},
{
"cell_type": "code",
"execution_count": null,
2025-11-20 15:46:53 +08:00
"id": "ebabbd5b",
"metadata": {},
"outputs": [],
2025-11-19 19:05:31 +08:00
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}