视频
This commit is contained in:
@@ -56,7 +56,27 @@ class XhwCrawler(BaseCrawler):
|
||||
'sec-ch-ua-platform': '"Windows"'
|
||||
}
|
||||
),
|
||||
|
||||
"hot_point": UrlConfig(
|
||||
url="https://xhsz.news.cn/focus_news",
|
||||
method="GET",
|
||||
params={},
|
||||
headers={
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Cache-Control': 'max-age=0',
|
||||
'Referer': 'https://xhsz.news.cn/',
|
||||
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
|
||||
'sec-ch-ua-mobile': '?0',
|
||||
'sec-ch-ua-platform': '"Windows"'
|
||||
}
|
||||
)
|
||||
},
|
||||
)
|
||||
super().__init__(config)
|
||||
@@ -318,37 +338,47 @@ class XhwCrawler(BaseCrawler):
|
||||
for child in children:
|
||||
try:
|
||||
tag_name = child.tag_name.lower()
|
||||
if tag_name == "p":
|
||||
if tag_name == "p" or tag_name == "div":
|
||||
text = child.text.strip().replace("\xa0", "")
|
||||
if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
|
||||
continue
|
||||
|
||||
# 图片
|
||||
try:
|
||||
img = child.find_element(By.TAG_NAME, "img")
|
||||
src = img.get_attribute("src")
|
||||
if src and not src.startswith("http"):
|
||||
src = self.config.base_url + src
|
||||
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
|
||||
# 视频
|
||||
try:
|
||||
video = child.find_element(By.TAG_NAME, "video")
|
||||
src = video.get_attribute("src")
|
||||
if src and not src.startswith("http"):
|
||||
src = self.config.base_url + src
|
||||
src = self._normalize_url(src)
|
||||
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
|
||||
# 图片
|
||||
try:
|
||||
img = child.find_element(By.TAG_NAME, "img")
|
||||
src = img.get_attribute("src")
|
||||
if src and not src.startswith("http"):
|
||||
src = self._normalize_url(src)
|
||||
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
|
||||
# 普通段落
|
||||
news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
|
||||
elif tag_name in ["img", "video"]:
|
||||
news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
|
||||
elif tag_name == "img":
|
||||
src = child.get_attribute("src")
|
||||
if src and not src.startswith("http"):
|
||||
src = self._normalize_url(src)
|
||||
news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
|
||||
elif tag_name == "video":
|
||||
src = child.get_attribute("src")
|
||||
if src and not src.startswith("http"):
|
||||
src = self._normalize_url(src)
|
||||
news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"解析段落失败: {e}")
|
||||
continue
|
||||
@@ -488,7 +518,55 @@ class XhwCrawler(BaseCrawler):
|
||||
resultDomain.success = bool(news_list)
|
||||
return resultDomain
|
||||
|
||||
|
||||
def hot_point(self) -> ResultDomain:
    """Crawl the configured "hot_point" page and return the parsed news items.

    Returns:
        ResultDomain: ``dataList`` holds every item for which
        ``parse_news_detail`` returned a truthy value. ``success`` is true
        only when that list is non-empty. When the WebDriver is missing or
        the "hot_point" URL configuration is absent, a failed result is
        returned immediately.
    """
    # Nothing can be fetched without an initialised WebDriver.
    if not self.driver:
        logger.error("WebDriver未初始化,无法继续爬取")
        return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)

    news_urls = []
    news_list = []
    resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)

    # Look up the hot-point URL configuration.
    hot_point_config = self.config.urls.get("hot_point")
    if not hot_point_config:
        # The message names the hot-point config; it previously reported a
        # missing *search* config, copied from the search method.
        logger.error("未找到热点URL配置")
        # NOTE(review): code stays 0 here although the driver check above
        # uses 1 for failure -- confirm the intended convention.
        resultDomain.code = 0
        resultDomain.message = "未找到热点URL配置"
        resultDomain.success = False
        return resultDomain

    try:
        # TODO: collecting article URLs from the hot-point page is not
        # implemented yet. `news_urls` and `url_base_map` stay empty, so the
        # loop below never runs and the result is always unsuccessful.
        url_base_map = {}

        # Fetch the detail page of every collected URL.
        for news_url in news_urls:
            try:
                news = self.parse_news_detail(news_url)
                if news:
                    # Title/date recorded for this URL take precedence over
                    # the values parsed from the detail page.
                    listing_meta = url_base_map.get(news_url, {})
                    news.title = listing_meta.get("title") or news.title
                    news.publishTime = listing_meta.get("date") or news.publishTime
                    news_list.append(news)
            except Exception as e:
                # One failing article must not abort the whole crawl.
                logger.warning(f"解析新闻失败: {news_url}, {e}")
                continue

    except Exception as e:
        logger.error(f"热点爬取过程整体异常: {e}")
        resultDomain.success = False
        resultDomain.code = 0
        resultDomain.message = "爬取失败"

    # Always hand back the collected list; success reflects whether any
    # article was actually parsed.
    resultDomain.dataList = news_list
    resultDomain.success = bool(news_list)
    return resultDomain
|
||||
|
||||
def close(self):
|
||||
if hasattr(self, 'driver') and self.driver:
|
||||
|
||||
Reference in New Issue
Block a user