更新
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -346,6 +346,14 @@ class XhwCrawler(BaseCrawler):
|
||||
except:
|
||||
logger.warning(f"新闻内容解析失败: {url}")
|
||||
|
||||
+ # 根据是否有内容设置执行状态
|
||||
+ if news_item.contentRows:
|
||||
+ news_item.executeStatus = 1
|
||||
+ news_item.executeMessage = "解析成功"
|
||||
+ else:
|
||||
+ news_item.executeStatus = 0
|
||||
+ news_item.executeMessage = "未解析到内容"
|
||||
|
||||
return news_item
|
||||
|
||||
|
||||
@@ -447,7 +455,11 @@ class XhwCrawler(BaseCrawler):
|
||||
# 内容
|
||||
try:
|
||||
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
|
||||
- content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ # 兼容处理:先尝试2层结构,找不到再用1层
|
||||
+ try:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ except:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
parse_content(content_div)
|
||||
except Exception as e:
|
||||
logger.warning(f"新闻内容解析失败: {url}, {e}")
|
||||
@@ -455,7 +467,11 @@ class XhwCrawler(BaseCrawler):
|
||||
# 分页处理(单独 try-except,避免影响已解析的内容)
|
||||
try:
|
||||
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
|
||||
- content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ # 兼容处理:先尝试2层结构,找不到再用1层
|
||||
+ try:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ except:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
|
||||
page_urls = []
|
||||
if page_div:
|
||||
@@ -468,12 +484,24 @@ class XhwCrawler(BaseCrawler):
|
||||
for page_url in page_urls:
|
||||
self.driver.get(page_url)
|
||||
time.sleep(2)
|
||||
- content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ # 兼容处理
|
||||
+ try:
|
||||
+ content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ except:
|
||||
+ content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
parse_content(content_div)
|
||||
except:
|
||||
# 没有分页是正常情况,不需要警告
|
||||
pass
|
||||
|
||||
+ # 根据是否有内容设置执行状态
|
||||
+ if news_item.contentRows:
|
||||
+ news_item.executeStatus = 1
|
||||
+ news_item.executeMessage = "解析成功"
|
||||
+ else:
|
||||
+ news_item.executeStatus = 0
|
||||
+ news_item.executeMessage = "未解析到内容"
|
||||
|
||||
return news_item
|
||||
|
||||
|
||||
@@ -590,7 +618,11 @@ class XhwCrawler(BaseCrawler):
|
||||
# 内容
|
||||
try:
|
||||
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
|
||||
- content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
+ # 兼容处理:先尝试2层结构,找不到再用1层
|
||||
+ try:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ except:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
parse_content(content_div)
|
||||
except Exception as e:
|
||||
logger.warning(f"新闻内容解析失败: {url}, {e}")
|
||||
@@ -598,7 +630,11 @@ class XhwCrawler(BaseCrawler):
|
||||
# 分页处理(单独 try-except,避免影响已解析的内容)
|
||||
try:
|
||||
article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
|
||||
- content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
+ # 兼容处理:先尝试2层结构,找不到再用1层
|
||||
+ try:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ except:
|
||||
+ content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
|
||||
page_urls = []
|
||||
if page_div:
|
||||
@@ -611,12 +647,24 @@ class XhwCrawler(BaseCrawler):
|
||||
for page_url in page_urls:
|
||||
self.driver.get(page_url)
|
||||
time.sleep(2)
|
||||
- content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
+ # 兼容处理
|
||||
+ try:
|
||||
+ content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
|
||||
+ except:
|
||||
+ content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
|
||||
parse_content(content_div)
|
||||
except:
|
||||
# 没有分页是正常情况,不需要警告
|
||||
pass
|
||||
|
||||
+ # 根据是否有内容设置执行状态
|
||||
+ if news_item.contentRows:
|
||||
+ news_item.executeStatus = 1
|
||||
+ news_item.executeMessage = "解析成功"
|
||||
+ else:
|
||||
+ news_item.executeStatus = 0
|
||||
+ news_item.executeMessage = "未解析到内容"
|
||||
|
||||
return news_item
|
||||
|
||||
def _normalize_url(self, url: str) -> str:
|
||||
|
||||
Reference in New Issue
Block a user