视频
This commit is contained in:
@@ -56,7 +56,27 @@ class XhwCrawler(BaseCrawler):
|
|||||||
'sec-ch-ua-platform': '"Windows"'
|
'sec-ch-ua-platform': '"Windows"'
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
|
"hot_point": UrlConfig(
|
||||||
|
url="https://xhsz.news.cn/focus_news",
|
||||||
|
method="GET",
|
||||||
|
params={},
|
||||||
|
headers={
|
||||||
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||||
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||||
|
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'Connection': 'keep-alive',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
'Sec-Fetch-Dest': 'document',
|
||||||
|
'Sec-Fetch-Mode': 'navigate',
|
||||||
|
'Sec-Fetch-Site': 'none',
|
||||||
|
'Cache-Control': 'max-age=0',
|
||||||
|
'Referer': 'https://xhsz.news.cn/',
|
||||||
|
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
|
||||||
|
'sec-ch-ua-mobile': '?0',
|
||||||
|
'sec-ch-ua-platform': '"Windows"'
|
||||||
|
}
|
||||||
|
)
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
@@ -318,37 +338,47 @@ class XhwCrawler(BaseCrawler):
|
|||||||
for child in children:
|
for child in children:
|
||||||
try:
|
try:
|
||||||
tag_name = child.tag_name.lower()
|
tag_name = child.tag_name.lower()
|
||||||
if tag_name == "p":
|
if tag_name == "p" or tag_name == "div":
|
||||||
text = child.text.strip().replace("\xa0", "")
|
text = child.text.strip().replace("\xa0", "")
|
||||||
if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
|
if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 图片
|
|
||||||
try:
|
|
||||||
img = child.find_element(By.TAG_NAME, "img")
|
|
||||||
src = img.get_attribute("src")
|
|
||||||
if src and not src.startswith("http"):
|
|
||||||
src = self.config.base_url + src
|
|
||||||
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
|
|
||||||
continue
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# 视频
|
# 视频
|
||||||
try:
|
try:
|
||||||
video = child.find_element(By.TAG_NAME, "video")
|
video = child.find_element(By.TAG_NAME, "video")
|
||||||
src = video.get_attribute("src")
|
src = video.get_attribute("src")
|
||||||
if src and not src.startswith("http"):
|
if src and not src.startswith("http"):
|
||||||
src = self.config.base_url + src
|
src = self._normalize_url(src)
|
||||||
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
|
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
|
||||||
continue
|
continue
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# 图片
|
||||||
|
try:
|
||||||
|
img = child.find_element(By.TAG_NAME, "img")
|
||||||
|
src = img.get_attribute("src")
|
||||||
|
if src and not src.startswith("http"):
|
||||||
|
src = self._normalize_url(src)
|
||||||
|
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
|
||||||
|
continue
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# 普通段落
|
# 普通段落
|
||||||
news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
|
news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
|
||||||
elif tag_name in ["img", "video"]:
|
elif tag_name == "img":
|
||||||
news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
|
src = child.get_attribute("src")
|
||||||
|
if src and not src.startswith("http"):
|
||||||
|
src = self._normalize_url(src)
|
||||||
|
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
|
||||||
|
elif tag_name == "video":
|
||||||
|
src = child.get_attribute("src")
|
||||||
|
if src and not src.startswith("http"):
|
||||||
|
src = self._normalize_url(src)
|
||||||
|
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"解析段落失败: {e}")
|
logger.warning(f"解析段落失败: {e}")
|
||||||
continue
|
continue
|
||||||
@@ -488,7 +518,55 @@ class XhwCrawler(BaseCrawler):
|
|||||||
resultDomain.success = bool(news_list)
|
resultDomain.success = bool(news_list)
|
||||||
return resultDomain
|
return resultDomain
|
||||||
|
|
||||||
|
def hot_point(self) -> ResultDomain:
|
||||||
|
# 检查driver是否已初始化
|
||||||
|
if not self.driver:
|
||||||
|
logger.error("WebDriver未初始化,无法继续爬取")
|
||||||
|
return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)
|
||||||
|
|
||||||
|
news_urls = []
|
||||||
|
news_list = []
|
||||||
|
resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
|
||||||
|
|
||||||
|
# 获取搜索配置
|
||||||
|
hot_point_config = self.config.urls.get("hot_point")
|
||||||
|
if not hot_point_config:
|
||||||
|
logger.error("未找到搜索URL配置")
|
||||||
|
resultDomain.code = 0
|
||||||
|
resultDomain.message = "未找到搜索URL配置"
|
||||||
|
resultDomain.success = False
|
||||||
|
return resultDomain
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 获取新闻url
|
||||||
|
url_base_map = {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# 从新闻url中获取新闻详情
|
||||||
|
for news_url in news_urls:
|
||||||
|
try:
|
||||||
|
news = self.parse_news_detail(news_url)
|
||||||
|
if news:
|
||||||
|
news.title = url_base_map.get(news_url, {}).get("title") or news.title
|
||||||
|
news.publishTime = url_base_map.get(news_url, {}).get("date") or news.publishTime
|
||||||
|
news_list.append(news)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"解析新闻失败: {news_url}, {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"搜索过程整体异常: {e}")
|
||||||
|
resultDomain.success = False
|
||||||
|
resultDomain.code = 0
|
||||||
|
resultDomain.message = "爬取失败"
|
||||||
|
|
||||||
|
# 最终保证返回 dataList
|
||||||
|
resultDomain.dataList = news_list
|
||||||
|
resultDomain.success = bool(news_list)
|
||||||
|
return resultDomain
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if hasattr(self, 'driver') and self.driver:
|
if hasattr(self, 'driver') and self.driver:
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 1,
|
||||||
"id": "948be230",
|
"id": "948be230",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -41,7 +41,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": 2,
|
||||||
"id": "31a8a0dd",
|
"id": "31a8a0dd",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -49,11 +49,11 @@
|
|||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"\u001b[32m2025-11-20 14:39:07.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
|
"\u001b[32m2025-11-20 14:48:38.587\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
|
||||||
"\u001b[32m2025-11-20 14:39:08.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
|
"\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m113\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
|
||||||
"\u001b[32m2025-11-20 14:39:08.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
|
"\u001b[32m2025-11-20 14:48:39.615\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m122\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
|
||||||
"\u001b[32m2025-11-20 14:39:08.885\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
|
"\u001b[32m2025-11-20 14:48:39.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
|
||||||
"\u001b[32m2025-11-20 14:39:10.309\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m28\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
|
"\u001b[32m2025-11-20 14:48:41.227\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m126\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -63,50 +63,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 3,
|
||||||
"id": "e5a6e91c",
|
"id": "e5a6e91c",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\u001b[32m2025-11-20 13:19:51.853\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E5%A4%A7%E5%AD%A6&action=news&page=1\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:20:15.300\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:20:20.310\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.8B0C5F90441ED5455E088CF6DF7032DE.e.84\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:20:36.428\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:20:41.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.D41E40A40777EF2D881878B18F35342A.e.114\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:20:57.656\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:21:02.664\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.2BA293A49BA4DA88D492D8BDC1E07365.e.157\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:21:18.808\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:21:23.814\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.DDC416596722BE8B22A5E84011EA59C3.e.198\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:22:32.631\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:22:37.642\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.B9E24DEEF281C700F90635CABAA2B108.e.230\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:22:53.636\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:22:58.643\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.EECC90A746E37A0994443791EFF7C402.e.290\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:23:15.189\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:23:20.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.0188441312BE753DFF48394C16A44F8F.e.330\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:23:36.050\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:23:41.057\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.F7A148D8A30D006FFCDAC45B01A2E7B5.e.374\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:23:56.819\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:24:01.826\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.5A632E0B79568A5FFC8E29FFD5B09507.e.396\")>\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:24:17.976\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
|
|
||||||
"\u001b[32m2025-11-20 13:24:22.983\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.6B5B529215D1C2221EEF1597FF0C3D0A.e.445\")>\u001b[0m\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"ResultDomain(code=0, message='', success=True, data=None, dataList=[])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 16,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"crawler.search(\"大学\", 1)\n",
|
"#crawler.search(\"大学\", 1)\n",
|
||||||
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
||||||
"# crawler.search(\"中国\", 10, \"news\")\n",
|
"# crawler.search(\"中国\", 10, \"news\")\n",
|
||||||
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
|
||||||
@@ -116,7 +78,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": 4,
|
||||||
"id": "7e0f56fa",
|
"id": "7e0f56fa",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -126,23 +88,24 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 7,
|
||||||
"id": "47327ebf",
|
"id": "47327ebf",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"NewsItem(title='《习近平总书记关于党的建设的重要思想概论》出版座谈会在北京召开', contentRows=[{'tag': 'p', 'content': '<p>\\u3000\\u3000新华社北京2月24日电\\u3000《习近平总书记关于党的建设的重要思想概论》出版座谈会2月24日在京召开。与会代表结合《概论》主要内容,交流学习贯彻习近平总书记关于党的建设的重要思想的认识和体会。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000会议认为,《概论》是广大党员、干部深入学习领会习近平总书记关于党的建设的重要思想的权威辅助读物。习近平总书记关于党的建设的重要思想,是一个逻辑严密、内涵丰富、系统全面、博大精深的科学体系,是对中国化的马克思主义党建理论体系的继承发展,构成习近平新时代中国特色社会主义思想的“党建篇”。在这一重要思想的科学指引下,我们党成功开辟百年大党自我革命新境界,推动党和国家事业取得历史性成就、发生历史性变革,为世界政党建设提供了重要借鉴。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000会议指出,要以学好用好《概论》为契机,进一步把习近平总书记关于党的建设的重要思想领会深、把握准、落到位,深刻领会其科学体系、理论品质和实践指向,更加深刻领悟“两个确立”的决定性意义,增强“四个意识”、坚定“四个自信”、做到“两个维护”。要不断深化体系化研究、学理化阐释,深刻把握这一重要思想蕴含的深刻道理、透彻学理、深邃哲理。要坚持用这一重要思想武装头脑、指导实践、推动工作,把学习成果转化为工作实效,推进党建研究高质量发展,以党建研究新成果推进党的建设和组织工作高质量发展,为以中国式现代化全面推进强国建设、民族复兴伟业提供坚强组织保证。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000座谈会由全国党建研究会举办,中央和国家机关有关部门,各省区市和新疆生产建设兵团党建研究会(学会),部分中管企业、高校有关负责同志,党史党建专家代表参加座谈会。</p>'}], url='https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html', viewCount=None, publishTime='2025-02-24 22:44:25', author=None, source='新华网', category=None, executeStatus=0, executeMessage=None)"
|
"NewsItem(title='微纪录片|习近平的“三农”情', contentRows=[{'tag': 'video', 'content': \"<video src='https://vodpub6.v.news.cn/yqfbzx-original/20240207/202402072819fe60663140eab9599581dcae8c1e_73db5fe0318c44469be3ea83adfc730d.mp4' />\"}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">习近平对农民有着深厚的感情</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">对“三农”问题有深入的思考</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">一路走来</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">他经常和农民在一起</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">从小小山村的党支部书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">到党的总书记</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">寸寸光阴</p>'}, {'tag': 'p', 'content': '<p style=\"text-align: center;\">见证着他的“三农”情</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003总策划:刘健</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003策划:李拯宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003监制:孙志平</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003制片:樊华</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003统筹:韩珅、王志斌</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003编导:陈晓宇</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003记者:陈晓宇、范世辉、岳文婷、邹尚伯、张晨俊、王怿文、李涛、胡友松、朱晓光</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003报道员:刘鹏飞、李树锋、张伟、朱海亮、徐涛、王盟、王静</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003海报:韩彤(实习)</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003鸣谢:中共延川县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共石家庄市委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003中共曹县县委宣传部</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华社音视频部制作</p>'}, {'tag': 'p', 'content': '<p>\\u2003\\u2003新华通讯社出品</p>'}], url='https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html', viewCount=None, publishTime='2024-02-07 12:43:29', author=None, source='新华社', category=None, executeStatus=0, executeMessage=None)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 20,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")"
|
"#crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")\n",
|
||||||
|
"crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20240207/2819fe60663140eab9599581dcae8c1e/c.html\") #视频"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user