2025-11-19 19:05:31 +08:00
|
|
|
|
{
|
|
|
|
|
|
"cells": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"execution_count": 1,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "948be230",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
"项目根目录: f:\\Project\\schoolNews\n",
|
|
|
|
|
|
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
|
|
|
|
|
|
]
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
"# 自动重载模块(当文件修改后自动刷新)\n",
|
|
|
|
|
|
"%reload_ext autoreload\n",
|
|
|
|
|
|
"%autoreload 2\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"import sys\n",
|
|
|
|
|
|
"import os\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"# 先添加项目根目录到路径(必须在导入之前)\n",
|
|
|
|
|
|
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
|
|
|
|
|
|
"if project_root not in sys.path:\n",
|
|
|
|
|
|
" sys.path.insert(0, project_root)\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"# 然后再导入模块\n",
|
|
|
|
|
|
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
|
|
|
|
|
|
"from crawler.BaseCrawler import NewsItem\n",
|
|
|
|
|
|
"from loguru import logger\n",
|
|
|
|
|
|
"import json\n",
|
|
|
|
|
|
"from pprint import pprint\n",
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
"print(f\"项目根目录: {project_root}\")\n",
|
|
|
|
|
|
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"execution_count": 2,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "31a8a0dd",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
"text": [
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"\u001b[32m2025-11-20 14:48:38.587\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2025-11-20 14:48:39.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
|
|
|
|
|
|
"\u001b[32m2025-11-20 14:48:41.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m126\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
|
2025-11-19 19:05:31 +08:00
|
|
|
|
]
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
"crawler = XhwCrawler()"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"execution_count": 3,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "e5a6e91c",
|
|
|
|
|
|
"metadata": {},
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"outputs": [],
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"source": [
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"#crawler.search(\"大学\", 1)\n",
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"news\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"news\")\n",
|
|
|
|
|
|
"# crawler.search(\"中国\", 10, \"news\")"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"execution_count": 4,
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"id": "7e0f56fa",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [],
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"source": [
|
|
|
|
|
|
"# crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=9752\")"
|
|
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"execution_count": 7,
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"id": "47327ebf",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
{
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
"text/plain": [
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"NewsItem(title='微纪录片|习近平的“三农”情', contentRows=[{'tag': 'video', 'content': \"<video src='https://vodpub6.v.news.cn/yqfbzx-original/20240207/202402072819fe60663140eab9599581dcae8c1e_73db5fe0318c44469be3ea83adfc730d.mp4' />\"}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">习近平对农民有着深厚的感情</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">对“三农”问题有深入的思考</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">一路走来</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">他经常和农民在一起</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">从小小山村的党支部书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">到党的总书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">寸寸光阴</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">见证着他的“三农”情</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003总策划:刘健</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003策划:李拯宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003监制:孙志平</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003制片:樊华</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003统筹:韩珅、王志斌</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003编导:陈晓宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003记者:陈晓宇、范世辉、岳文婷、邹尚伯、张晨俊、王怿文、李涛、胡友松、朱晓光</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003报道员:刘鹏飞、李树锋、张伟、朱海亮、徐涛、王盟、王静</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003海报:韩彤(实习)</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003鸣谢:中共延川县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共石家庄市委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共曹县县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社音视频部制作</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华通讯社出品</p>'}], url='https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html', viewCount=None, publishTime='2024-02-07 12:43:29', author=None, source='新华社', category=None, executeStatus=0, executeMessage=None)"
|
2025-11-20 14:42:15 +08:00
|
|
|
|
]
|
|
|
|
|
|
},
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"execution_count": 7,
|
2025-11-20 14:42:15 +08:00
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"source": [
|
2025-11-20 14:57:20 +08:00
|
|
|
|
"#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
|
|
|
|
|
|
"crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频"
|
2025-11-20 14:42:15 +08:00
|
|
|
|
]
|
|
|
|
|
|
},
|
|
|
|
|
|
{
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
|
"id": "fa359d5b",
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
"outputs": [],
|
2025-11-19 19:05:31 +08:00
|
|
|
|
"source": []
|
|
|
|
|
|
}
|
|
|
|
|
|
],
|
|
|
|
|
|
"metadata": {
|
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
|
"display_name": "schoolNewsCrawler",
|
|
|
|
|
|
"language": "python",
|
|
|
|
|
|
"name": "python3"
|
|
|
|
|
|
},
|
|
|
|
|
|
"language_info": {
|
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
|
"version": 3
|
|
|
|
|
|
},
|
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
|
"name": "python",
|
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
|
"version": "3.12.12"
|
|
|
|
|
|
}
|
|
|
|
|
|
},
|
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
|
"nbformat_minor": 5
|
|
|
|
|
|
}
|