140 lines
8.0 KiB
Plaintext
140 lines
8.0 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "948be230",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"项目根目录: f:\\Project\\schoolNews\n",
|
||
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 自动重载模块(当文件修改后自动刷新)\n",
|
||
"%reload_ext autoreload\n",
|
||
"%autoreload 2\n",
|
||
"\n",
|
||
"import sys\n",
|
||
"import os\n",
|
||
"\n",
|
||
"# 先添加项目根目录到路径(必须在导入之前)\n",
|
||
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
|
||
"if project_root not in sys.path:\n",
|
||
" sys.path.insert(0, project_root)\n",
|
||
"\n",
|
||
"# 然后再导入模块\n",
|
||
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
|
||
"from crawler.BaseCrawler import NewsItem\n",
|
||
"from loguru import logger\n",
|
||
"import json\n",
|
||
"from pprint import pprint\n",
|
||
"\n",
|
||
"print(f\"项目根目录: {project_root}\")\n",
|
||
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "31a8a0dd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-19 19:03:54.324\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:03:55.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:03:55.216\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:03:55.217\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:03:57.557\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"crawler = XhwCrawler()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "e5a6e91c",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\u001b[32m2025-11-19 19:04:12.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E4%B9%A0%E8%BF%91%E5%B9%B3&action=news&page=1\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m188\u001b[0m - \u001b[33m\u001b[1m检测到验证页面,尝试手动处理验证\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m189\u001b[0m - \u001b[1m请在30秒内手动完成验证...\u001b[0m\n",
|
||
"\u001b[32m2025-11-19 19:04:48.814\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m解析后的HTML内容: <html lang=\"en\"><head>\n",
|
||
"<meta charset=\"utf-8\"/>\n",
|
||
"<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n",
|
||
"<meta content=\"webkit\" name=\"renderer\"/>\n",
|
||
"<title>新华网新华思政-全国高校课程思政教学资源服务平台</title>\n",
|
||
"<meta content=\"新华思政,课程思政,全国高校课程思政教学资源服务平台,新华网,新华教育,思政教育.\" name=\"keywords\"/>\n",
|
||
"<meta content=\"新华网作为党和国家重要的网上舆论阵地,适时推出新华思政—全国高校课程思政教学资源服务平台,为全国高校教师针对课程思政建设、交流、学习和共享于一体的教学服务平台,旨在推广课程思政建设先进经验和做法,助力高校课程思政教学资源需求,深入挖掘课程思政元素,助力广泛开展课程思政建设的良好氛围,提升教师开展课程思政建设的意识和能力。\" name=\"description\"/>\n",
|
||
"<link href=\"/static/skin4/favicon.ico\" rel...\u001b[0m\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "AttributeError",
|
||
"evalue": "'NoneType' object has no attribute 'find'",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mcrawler\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m习近平\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32mf:\\Project\\schoolNews\\schoolNewsCrawler\\crawler\\xhw\\XhwCrawler.py:241\u001b[39m, in \u001b[36msearch\u001b[39m\u001b[34m(self, key, total, action)\u001b[39m\n\u001b[32m 239\u001b[39m news_info = news.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.head\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 240\u001b[39m news_title = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.title\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m241\u001b[39m news_date = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.date\u001b[39m\u001b[33m\"\u001b[39m).text.strip()\n\u001b[32m 242\u001b[39m url = news_title.find(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m).get(\u001b[33m\"\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 243\u001b[39m url_base_map[url] = {\u001b[33m\"\u001b[39m\u001b[33mtitle\u001b[39m\u001b[33m\"\u001b[39m: news_title.get_text(strip=\u001b[38;5;28;01mTrue\u001b[39;00m), \u001b[33m\"\u001b[39m\u001b[33mdate\u001b[39m\u001b[33m\"\u001b[39m: news_date}\n",
|
||
"\u001b[31mAttributeError\u001b[39m: 'NoneType' object has no attribute 'find'"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"crawler.search(\"习近平\", 10)\n",
|
||
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
||
"# crawler.search(\"中国\", 10, \"news\")\n",
|
||
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
||
"# crawler.search(\"中国\", 10, \"news\")\n",
|
||
"# crawler.search(\"中国\", 10, \"news\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7e0f56fa",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "schoolNewsCrawler",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|