Files
schoolNews/schoolNewsCrawler/test.ipynb
2025-11-20 14:57:20 +08:00

142 lines
6.6 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "948be230",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# Enable autoreload so that edits to imported .py modules take effect without restarting the kernel.\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import os\n",
"import sys\n",
"\n",
"# Put the parent of the current working directory on sys.path first (must happen before the imports below).\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
"if project_root not in sys.path:\n",
"    sys.path.insert(0, project_root)\n",
"\n",
"# Only now import the crawler modules and the remaining helpers.\n",
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"from loguru import logger\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"# Report the resolved root and confirm that autoreload is active.\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "31a8a0dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-20 14:48:38.587\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
"\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
"\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
"\u001b[32m2025-11-20 14:48:39.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
"\u001b[32m2025-11-20 14:48:41.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m126\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
]
}
],
"source": [
"# Create the crawler instance used by the cells below.\n",
"# Per the log output recorded for this cell, construction initializes a Chrome browser\n",
"# and visits https://xhsz.news.cn/ to obtain the initial cookies.\n",
"crawler = XhwCrawler()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e5a6e91c",
"metadata": {},
"outputs": [],
"source": [
"#crawler.search(\"大学\", 1)\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"news\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7e0f56fa",
"metadata": {},
"outputs": [],
"source": [
"# crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=9752\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "47327ebf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NewsItem(title='微纪录片|习近平的“三农”情', contentRows=[{'tag': 'video', 'content': \"<video src='https://vodpub6.v.news.cn/yqfbzx-original/20240207/202402072819fe60663140eab9599581dcae8c1e_73db5fe0318c44469be3ea83adfc730d.mp4' />\"}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">习近平对农民有着深厚的感情</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">对“三农”问题有深入的思考</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">一路走来</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">他经常和农民在一起</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">从小小山村的党支部书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">到党的总书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">寸寸光阴</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">见证着他的“三农”情</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003总策划刘健</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003策划李拯宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003监制孙志平</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003制片樊华</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003统筹韩珅、王志斌</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003编导陈晓宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003记者陈晓宇、范世辉、岳文婷、邹尚伯、张晨俊、王怿文、李涛、胡友松、朱晓光</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003报道员刘鹏飞、李树锋、张伟、朱海亮、徐涛、王盟、王静</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003海报韩彤实习</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003鸣谢中共延川县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共石家庄市委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共曹县县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社音视频部制作</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华通讯社出品</p>'}], url='https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html', viewCount=None, publishTime='2024-02-07 12:43:29', author=None, source='新华社', category=None, executeStatus=0, executeMessage=None)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parse a single www.news.cn article detail page; the returned value is shown as the cell output.\n",
"# Alternative sample URL, currently disabled:\n",
"#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
"crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") # video article"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa359d5b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}