Files
schoolNews/schoolNewsCrawler/test.ipynb
2025-11-19 19:05:31 +08:00

140 lines
8.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "948be230",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# 自动重载模块(当文件修改后自动刷新)\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import sys\n",
"import os\n",
"\n",
"# 先添加项目根目录到路径(必须在导入之前)\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
"if project_root not in sys.path:\n",
" sys.path.insert(0, project_root)\n",
"\n",
"# 然后再导入模块\n",
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"from loguru import logger\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "31a8a0dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-19 19:03:54.324\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:55.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:55.216\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:55.217\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:57.557\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
]
}
],
"source": [
"crawler = XhwCrawler()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e5a6e91c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-19 19:04:12.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E4%B9%A0%E8%BF%91%E5%B9%B3&action=news&page=1\u001b[0m\n",
"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m188\u001b[0m - \u001b[33m\u001b[1m检测到验证页面尝试手动处理验证\u001b[0m\n",
"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m189\u001b[0m - \u001b[1m请在30秒内手动完成验证...\u001b[0m\n",
"\u001b[32m2025-11-19 19:04:48.814\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m解析后的HTML内容: <html lang=\"en\"><head>\n",
"<meta charset=\"utf-8\"/>\n",
"<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n",
"<meta content=\"webkit\" name=\"renderer\"/>\n",
"<title>新华网新华思政-全国高校课程思政教学资源服务平台</title>\n",
"<meta content=\"新华思政,课程思政,全国高校课程思政教学资源服务平台,新华网,新华教育,思政教育.\" name=\"keywords\"/>\n",
"<meta content=\"新华网作为党和国家重要的网上舆论阵地,适时推出新华思政—全国高校课程思政教学资源服务平台,为全国高校教师针对课程思政建设、交流、学习和共享于一体的教学服务平台,旨在推广课程思政建设先进经验和做法,助力高校课程思政教学资源需求,深入挖掘课程思政元素,助力广泛开展课程思政建设的良好氛围,提升教师开展课程思政建设的意识和能力。\" name=\"description\"/>\n",
"<link href=\"/static/skin4/favicon.ico\" rel...\u001b[0m\n"
]
},
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'find'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mcrawler\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m习近平\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32mf:\\Project\\schoolNews\\schoolNewsCrawler\\crawler\\xhw\\XhwCrawler.py:241\u001b[39m, in \u001b[36msearch\u001b[39m\u001b[34m(self, key, total, action)\u001b[39m\n\u001b[32m 239\u001b[39m news_info = news.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.head\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 240\u001b[39m news_title = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.title\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m241\u001b[39m news_date = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.date\u001b[39m\u001b[33m\"\u001b[39m).text.strip()\n\u001b[32m 242\u001b[39m url = news_title.find(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m).get(\u001b[33m\"\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 243\u001b[39m url_base_map[url] = {\u001b[33m\"\u001b[39m\u001b[33mtitle\u001b[39m\u001b[33m\"\u001b[39m: news_title.get_text(strip=\u001b[38;5;28;01mTrue\u001b[39;00m), \u001b[33m\"\u001b[39m\u001b[33mdate\u001b[39m\u001b[33m\"\u001b[39m: news_date}\n",
"\u001b[31mAttributeError\u001b[39m: 'NoneType' object has no attribute 'find'"
]
}
],
"source": [
"crawler.search(\"习近平\", 10)\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"news\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e0f56fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}