2025-11-19 19:05:31 +08:00
|
|
|
|
{
|
|
|
|
|
|
"cells": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": 1,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "948be230",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
"项目根目录: f:\\Project\\schoolNews\n",
|
|
|
|
|
|
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
|
|
|
|
|
|
]
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
"# 自动重载模块(当文件修改后自动刷新)\n",
|
|
|
|
|
|
"%reload_ext autoreload\n",
|
|
|
|
|
|
"%autoreload 2\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"import sys\n",
|
|
|
|
|
|
"import os\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"# 先添加项目根目录到路径(必须在导入之前)\n",
|
|
|
|
|
|
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
|
|
|
|
|
|
"if project_root not in sys.path:\n",
|
|
|
|
|
|
" sys.path.insert(0, project_root)\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"# 然后再导入模块\n",
|
|
|
|
|
|
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
|
|
|
|
|
|
"from crawler.BaseCrawler import NewsItem\n",
|
|
|
|
|
|
"from loguru import logger\n",
|
|
|
|
|
|
"import json\n",
|
|
|
|
|
|
"from pprint import pprint\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"print(f\"项目根目录: {project_root}\")\n",
|
|
|
|
|
|
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": 2,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "31a8a0dd",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
"text": [
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"\u001b[32m2026-01-09 13:28:20.374\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m72\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2026-01-09 13:28:20.375\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m147\u001b[0m - \u001b[1m使用Chrome二进制: win/chrome-headless/chrome-headless-shell-win64/chrome-headless-shell.exe\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2026-01-09 13:28:21.675\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m154\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2026-01-09 13:28:21.676\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m163\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n"
|
2025-11-19 19:05:31 +08:00
|
|
|
|
]
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
"crawler = XhwCrawler()"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": 3,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "e5a6e91c",
|
|
|
|
|
|
"metadata": {},
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"outputs": [],
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"source": [
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"#crawler.search(\"大学\", 1)\n",
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"news\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"news\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"news\")"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": 7,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "7e0f56fa",
|
|
|
|
|
|
"metadata": {},
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"outputs": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
"\u001b[32m2026-01-09 13:29:16.510\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xh_news_detail\u001b[0m:\u001b[36m442\u001b[0m - \u001b[33m\u001b[1m新闻内容解析失败: http://www.xinhuanet.com/politics/leaders/2021-01/21/c_1127006370.htm\u001b[0m\n"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
"NewsItem(title='第一观察|习近平考察冬奥,首次部署这项任务', contentRows=[], url='https://xhsz.news.cn/focus_news/detail?id=2709', viewCount=None, publishTime='2021-01-21 00:28:02', author=None, source='新华网', category=None, executeStatus=0, executeMessage=None)"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"source": [
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=2709\")"
|
2025-11-20 14:42:15 +08:00
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": null,
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"id": "47327ebf",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [
|
2026-01-09 13:33:39 +08:00
|
|
|
|
{
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
"\u001b[32m2026-01-09 13:28:34.186\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xh_news_detail\u001b[0m:\u001b[36m442\u001b[0m - \u001b[33m\u001b[1m新闻内容解析失败: https://www.news.cn/politics/leaders/20240201/a2d6ef2017fa4a589804e72a72cbc97a/c.html\u001b[0m\n"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
2025-11-20 14:42:15 +08:00
|
|
|
|
{
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
"text/plain": [
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"NewsItem(title='特稿丨习近平:文化传承创新的引领人', contentRows=[{'tag': 'img', 'content': \"<img src='https://www.news.cn/politics/leaders/20240201/a2d6ef2017fa4a589804e72a72cbc97a/20240201a2d6ef2017fa4a589804e72a72cbc97a_202402018fe651002e974486896e39450c6e73d8.jpg' />\"}, {'tag': 'p', 'content': '<p><span style=\"font-family: 楷体; color: #000080;\">\\u2003\\u20032023年10月18日上午,习近平在北京人民大会堂出席第三届“一带一路”国际合作高峰论坛开幕式并发表题为《建设开放包容、互联互通、共同发展的世界》的主旨演讲。新华社记者 王晔 摄</span></p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社北京2月1日电 <span style=\"color: #000080;\"><strong>题:习近平:文化传承创新的引领人</strong></span></p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社记者</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003在杭州,跨湖桥遗址博物馆为迎接更多游客探寻八千年前此地先民的生活,春节前夕推出增强现实技术服务。在西安,陕菜文化体验博物馆正尝试把汉赋唐诗中描写的古代菜肴复原出来,让客人领略传统美食的“文韵”。在北京,新成立的中国影协科幻电影工作委员会的成员们忙着为《流浪地球》后的新科幻电影选景,为观众提供好莱坞外的另一种选择。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003拥有五千年文明史的中国,到处出现新的“文化热”。今年1月召开的全国宣传部长会议,提出深入学习贯彻习近平文化思想,“为全面推进强国建设、民族复兴伟业提供坚强思想保证、强大精神力量、有利文化条件”。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003这意味着,一百多年前随新文化运动到来、马克思主义传播而诞生的中国共产党,正在习近平引领下,再次应用文化夯实其执政地位,推动国家实现现代化。</p>'}, {'tag': 'p', 'content': '<p>\\u2003<span style=\"color: #000080;\"><strong>\\u2003深厚的文化情怀</strong></span></p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003习近平1953年出生在一个文化氛围浓郁的红色家庭。像几千年来中国的家庭一样,父母十分注重家传。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003习近平五六岁时,母亲齐心给他讲“岳母刺字”。习近平说,把字刺上去,多疼啊!齐心说,是疼,但心里铭记住了。习近平后来说,从那时起,“精忠报国”就成了他一生追求的目标。</p>'}, {'tag': 'img', 'content': \"<img src='https://www.news.cn/politics/leaders/20240201/a2d6ef2017fa4a589804e72a72cbc97a/20240201a2d6ef2017fa4a589804e72a72cbc97a_2024020172b0b7aecc1044b8a8bea3e57c3d1ac9.jpg' />\"}, {'tag': 'p', 'content': '<p><span style=\"font-family: 楷体; color: #000080;\">\\u2003\\u2003这是习近平陪母亲齐心散步。新华社发</span></p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003习近平说,他“最大的爱好是读书”。习近平的小学语文老师田潞英曾回忆,常穿着补丁衣服的习近平“学习很好,喜欢读经典名著”。也有初中老师记得,有次下课后习近平主动来找老师,说他十分喜爱杜甫,希望多读一些他的诗作。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003上世纪六十年代,习近平从北京来到陕西梁家河当知青。他带了满箱子书去,并给自己定下“修身”座右铭,“一物不知,深以为耻”。习近平年轻时读的书既有《三国志》、《古诗源》、《史记》等古籍,也有《战争与和平》、《浮士德》、《海底两万里》等名著,还有《资本论》、《共产党宣言》、《为人民服务》等经典。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003约40年后,这位曾在窑洞煤油灯下一看书就是半宿、放羊锄地都揣着书的人,以中国国家主席身份站在联合国教科文组织总部的讲台上,分享他对文化和
|
2025-11-20 14:42:15 +08:00
|
|
|
|
]
|
|
|
|
|
|
},
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": 5,
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"source": [
|
2025-11-20 16:09:29 +08:00
|
|
|
|
"#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
|
|
|
|
|
|
"# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频\n",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240201/a2d6ef2017fa4a589804e72a72cbc97a/c.html\") # 分页"
|
2025-11-20 16:09:29 +08:00
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"execution_count": 6,
|
2025-11-20 16:09:29 +08:00
|
|
|
|
"id": "fa359d5b",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
"# crawler.commend()"
|
2025-11-20 14:42:15 +08:00
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
"execution_count": null,
|
2025-11-20 15:46:53 +08:00
|
|
|
|
"id": "ebabbd5b",
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [],
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"source": []
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"metadata": {
|
|
|
|
|
|
"kernelspec": {
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"display_name": "crawler",
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"language": "python",
|
|
|
|
|
|
"name": "python3"
|
|
|
|
|
|
},
|
|
|
|
|
|
"language_info": {
|
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
|
"version": 3
|
|
|
|
|
|
},
|
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
|
"name": "python",
|
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
|
"pygments_lexer": "ipython3",
|
2026-01-09 13:33:39 +08:00
|
|
|
|
"version": "3.11.14"
|
2025-11-19 19:05:31 +08:00
|
|
|
|
}
|
|
|
|
|
|
},
|
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
|
}
|