2026-01-09 17:27:28 +08:00
parent 8042675d71
commit 6336f89f0d
2 changed files with 64 additions and 16 deletions

File diff suppressed because one or more lines are too long


@@ -346,6 +346,14 @@ class XhwCrawler(BaseCrawler):
except:
logger.warning(f"新闻内容解析失败: {url}")
# Set the execution status based on whether any content was parsed
if news_item.contentRows:
news_item.executeStatus = 1
news_item.executeMessage = "解析成功"
else:
news_item.executeStatus = 0
news_item.executeMessage = "未解析到内容"
return news_item
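The status-setting block above recurs in each of the parse methods changed by this commit; a minimal sketch of how it could be factored into a shared helper (the method name _set_execute_status is hypothetical, not part of the commit):

def _set_execute_status(self, news_item):
    # Mark the item as successfully parsed only when content rows were collected
    if news_item.contentRows:
        news_item.executeStatus = 1
        news_item.executeMessage = "解析成功"  # "parsed successfully"
    else:
        news_item.executeStatus = 0
        news_item.executeMessage = "未解析到内容"  # "no content parsed"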
@@ -447,7 +455,11 @@ class XhwCrawler(BaseCrawler):
# 内容
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
# Compatibility handling: try the 2-level structure first, fall back to the 1-level one if not found
try:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
except:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
parse_content(content_div)
except Exception as e:
logger.warning(f"新闻内容解析失败: {url}, {e}")
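The same 2-level/1-level selector fallback appears in every hunk below; a minimal sketch of the pattern as a shared helper, assuming Selenium's NoSuchElementException (the name _find_detail_content is hypothetical, not part of the commit):

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

def _find_detail_content(self, root):
    # Prefer the 2-level structure div#detail > span#detailContent;
    # fall back to the bare div#detail when the inner span is missing.
    try:
        return root.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
    except NoSuchElementException:
        return root.find_element(By.CSS_SELECTOR, "div#detail")

Catching NoSuchElementException rather than a bare except would also keep unrelated WebDriver errors from being silently swallowed.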
@@ -455,7 +467,11 @@ class XhwCrawler(BaseCrawler):
# Pagination handling (separate try-except so a failure here does not affect already-parsed content)
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
# Compatibility handling: try the 2-level structure first, fall back to the 1-level one if not found
try:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
except:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
page_urls = []
if page_div:
@@ -468,12 +484,24 @@ class XhwCrawler(BaseCrawler):
for page_url in page_urls:
self.driver.get(page_url)
time.sleep(2)
# Compatibility handling
try:
content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
except:
content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
parse_content(content_div)
except:
# Having no pagination is the normal case; no warning needed
pass
# Set the execution status based on whether any content was parsed
if news_item.contentRows:
news_item.executeStatus = 1
news_item.executeMessage = "解析成功"
else:
news_item.executeStatus = 0
news_item.executeMessage = "未解析到内容"
return news_item
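The hunk elides how page_urls is filled from the center.xinhuaPager element; a hypothetical sketch of the per-page loop under that assumption (the link scraping below is an illustration, not the commit's code, and it reuses the NoSuchElementException fallback from above):

# Assumption: follow-up pages are linked as anchors inside the pager element
links = page_div.find_elements(By.CSS_SELECTOR, "a")
page_urls = [a.get_attribute("href") for a in links if a.get_attribute("href")]
for page_url in page_urls:
    self.driver.get(page_url)
    time.sleep(2)  # crude wait for the follow-up page to render
    try:
        content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
    except NoSuchElementException:
        content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
    parse_content(content_div)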
@@ -590,6 +618,10 @@ class XhwCrawler(BaseCrawler):
# 内容
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
# Compatibility handling: try the 2-level structure first, fall back to the 1-level one if not found
try:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
except:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
parse_content(content_div)
except Exception as e:
@@ -598,6 +630,10 @@ class XhwCrawler(BaseCrawler):
# Pagination handling (separate try-except so a failure here does not affect already-parsed content)
try:
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
# Compatibility handling: try the 2-level structure first, fall back to the 1-level one if not found
try:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
except:
content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
page_urls = []
@@ -611,12 +647,24 @@ class XhwCrawler(BaseCrawler):
for page_url in page_urls:
self.driver.get(page_url)
time.sleep(2)
# Compatibility handling
try:
content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
except:
content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
parse_content(content_div)
except:
# Having no pagination is the normal case; no warning needed
pass
# Set the execution status based on whether any content was parsed
if news_item.contentRows:
news_item.executeStatus = 1
news_item.executeMessage = "解析成功"
else:
news_item.executeStatus = 0
news_item.executeMessage = "未解析到内容"
return news_item
def _normalize_url(self, url: str) -> str: