Files
schoolNews/schoolNewsCrawler/XhwTest.ipynb

151 lines
7.6 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"id": "948be230",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# 自动重载模块(当文件修改后自动刷新)\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import sys\n",
"import os\n",
"\n",
"# 先添加项目根目录到路径(必须在导入之前)\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
"if project_root not in sys.path:\n",
" sys.path.insert(0, project_root)\n",
"\n",
"# 然后再导入模块\n",
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"from loguru import logger\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "31a8a0dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-20 16:06:16.802\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
"\u001b[32m2025-11-20 16:06:17.899\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
"\u001b[32m2025-11-20 16:06:17.900\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n"
]
}
],
"source": [
"crawler = XhwCrawler()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e5a6e91c",
"metadata": {},
"outputs": [],
"source": [
"#crawler.search(\"大学\", 1)\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"news\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7e0f56fa",
"metadata": {},
"outputs": [],
"source": [
"# crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=9752\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "47327ebf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"NewsItem(title='全国精神文明建设表彰大会在京召开', contentRows=[{'tag': 'img', 'content': \"<img src='https://www.news.cn/photo/20250523/9fc5e377b19047918dfe0eca2aad5c67/202505239fc5e377b19047918dfe0eca2aad5c67_202505232bbfd057329e448b9540fc5dbf050283.jpg' />\"}, {'tag': 'p', 'content': '<p>\\u2003\\u20035月23日全国精神文明建设表彰大会在北京召开。中央宣传思想文化工作领导小组决定授予202个城市全国文明城市称号授予3316个村镇全国文明村镇称号授予4688个单位全国文明单位称号授予798户家庭全国文明家庭称号授予890所学校全国文明校园称号授予60名同志第九届全国道德模范荣誉称号授予239名同志第九届全国道德模范提名奖。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社记者 翟健岚 摄</p>'}, {'tag': 'img', 'content': \"<img src='https://www.news.cn/photo/20250523/9fc5e377b19047918dfe0eca2aad5c67/202505239fc5e377b19047918dfe0eca2aad5c67_20250523ead9d290ee4c45928fc7ca35e227727b.jpg' />\"}, {'tag': 'p', 'content': '<p>\\u2003\\u20035月23日全国精神文明建设表彰大会在北京召开。中央宣传思想文化工作领导小组决定授予202个城市全国文明城市称号授予3316个村镇全国文明村镇称号授予4688个单位全国文明单位称号授予798户家庭全国文明家庭称号授予890所学校全国文明校园称号授予60名同志第九届全国道德模范荣誉称号授予239名同志第九届全国道德模范提名奖。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社记者 翟健岚 摄</p>'}, {'tag': 'img', 'content': \"<img src='https://www.news.cn/photo/20250523/9fc5e377b19047918dfe0eca2aad5c67/202505239fc5e377b19047918dfe0eca2aad5c67_20250523971c4e94467b451c93cd4810992e2025.jpg' />\"}, {'tag': 'p', 'content': '<p>\\u2003\\u20035月23日全国精神文明建设表彰大会在北京召开。中央宣传思想文化工作领导小组决定授予202个城市全国文明城市称号授予3316个村镇全国文明村镇称号授予4688个单位全国文明单位称号授予798户家庭全国文明家庭称号授予890所学校全国文明校园称号授予60名同志第九届全国道德模范荣誉称号授予239名同志第九届全国道德模范提名奖。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社记者 翟健岚 摄</p>'}, {'tag': 'img', 'content': \"<img src='https://www.news.cn/photo/20250523/9fc5e377b19047918dfe0eca2aad5c67/202505239fc5e377b19047918dfe0eca2aad5c67_20250523cf0f3bf322184d339a91049891f265bb.jpg' />\"}, {'tag': 'p', 'content': '<p>\\u2003\\u20035月23日全国精神文明建设表彰大会在北京召开。中央宣传思想文化工作领导小组决定授予202个城市全国文明城市称号授予3316个村镇全国文明村镇称号授予4688个单位全国文明单位称号授予798户家庭全国文明家庭称号授予890所学校全国文明校园称号授予60名同志第九届全国道德模范荣誉称号授予239名同志第九届全国道德模范提名奖。</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社记者 翟健岚 摄</p>'}], url='https://www.news.cn/photo/20250523/9fc5e377b19047918dfe0eca2aad5c67/c.html', viewCount=None, publishTime='2025-05-23 18:33:10', author=None, source='新华网', category=None, executeStatus=0, executeMessage=None)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
"# crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频\n",
"crawler.parse_xh_news_detail(\"https://www.news.cn/photo/20250523/9fc5e377b19047918dfe0eca2aad5c67/c.html\") # 分页"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "fa359d5b",
"metadata": {},
"outputs": [],
"source": [
"# crawler.commend()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ebabbd5b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}