Update
@@ -346,6 +346,14 @@ class XhwCrawler(BaseCrawler):
         except:
             logger.warning(f"新闻内容解析失败: {url}")
 
+        # Set the execution status according to whether any content was parsed
+        if news_item.contentRows:
+            news_item.executeStatus = 1
+            news_item.executeMessage = "解析成功"
+        else:
+            news_item.executeStatus = 0
+            news_item.executeMessage = "未解析到内容"
+
         return news_item
 
 
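The status-setting block above is added verbatim to each of the three parse paths in this commit (see also the hunks at -468 and -611 below). A hypothetical helper could centralize it; a minimal sketch, where the name _finalize_status is my own and not from the source:

    def _finalize_status(news_item):
        # executeStatus: 1 = parsed successfully, 0 = no content found
        if news_item.contentRows:
            news_item.executeStatus = 1
            news_item.executeMessage = "解析成功"
        else:
            news_item.executeStatus = 0
            news_item.executeMessage = "未解析到内容"
        return news_item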
@@ -447,7 +455,11 @@ class XhwCrawler(BaseCrawler):
         # Content
         try:
             article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
-            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            # Compatibility handling: try the 2-level structure first, fall back to 1 level
+            try:
+                content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            except:
+                content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
             parse_content(content_div)
         except Exception as e:
             logger.warning(f"新闻内容解析失败: {url}, {e}")
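The two-level/one-level selector fallback introduced here recurs in every hunk of this commit. A minimal sketch of a shared helper, assuming only Selenium's standard find_element API; the function name find_detail_container is illustrative, not from the source:

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def find_detail_container(root):
        # Prefer the two-level layout; fall back to the one-level layout.
        try:
            return root.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
        except NoSuchElementException:
            return root.find_element(By.CSS_SELECTOR, "div#detail")

Catching NoSuchElementException instead of using a bare except would also avoid masking unrelated errors.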
@@ -455,7 +467,11 @@ class XhwCrawler(BaseCrawler):
         # Pagination handling (separate try-except so it cannot affect content already parsed)
         try:
             article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
-            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            # Compatibility handling: try the 2-level structure first, fall back to 1 level
+            try:
+                content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            except:
+                content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
             page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
             page_urls = []
             if page_div:
@@ -468,12 +484,24 @@ class XhwCrawler(BaseCrawler):
                 for page_url in page_urls:
                     self.driver.get(page_url)
                     time.sleep(2)
-                    content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+                    # Compatibility handling
+                    try:
+                        content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+                    except:
+                        content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
                     parse_content(content_div)
         except:
             # Having no pagination is the normal case; no warning needed
             pass
 
+        # Set the execution status according to whether any content was parsed
+        if news_item.contentRows:
+            news_item.executeStatus = 1
+            news_item.executeMessage = "解析成功"
+        else:
+            news_item.executeStatus = 0
+            news_item.executeMessage = "未解析到内容"
+
         return news_item
 
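The hunks above do not show how page_urls is populated from center.xinhuaPager. A sketch of one plausible way to collect the page links, assuming the pager contains plain <a href> anchors (an assumption, since that part of the file lies outside the diff):

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def collect_page_urls(content_div):
        # A missing pager means a single-page article, which is the normal case.
        try:
            pager = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
        except NoSuchElementException:
            return []
        return [a.get_attribute("href") for a in pager.find_elements(By.TAG_NAME, "a")]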
@@ -590,6 +618,10 @@ class XhwCrawler(BaseCrawler):
         # Content
         try:
             article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            # Compatibility handling: try the 2-level structure first, fall back to 1 level
+            try:
+                content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            except:
                 content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
             parse_content(content_div)
         except Exception as e:
@@ -598,6 +630,10 @@ class XhwCrawler(BaseCrawler):
         # Pagination handling (separate try-except so it cannot affect content already parsed)
         try:
             article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            # Compatibility handling: try the 2-level structure first, fall back to 1 level
+            try:
+                content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            except:
                 content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
             page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
             page_urls = []
@@ -611,12 +647,24 @@ class XhwCrawler(BaseCrawler):
                 for page_url in page_urls:
                     self.driver.get(page_url)
                     time.sleep(2)
+                    # Compatibility handling
+                    try:
+                        content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+                    except:
                         content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
                     parse_content(content_div)
         except:
             # Having no pagination is the normal case; no warning needed
             pass
 
+        # Set the execution status according to whether any content was parsed
+        if news_item.contentRows:
+            news_item.executeStatus = 1
+            news_item.executeMessage = "解析成功"
+        else:
+            news_item.executeStatus = 0
+            news_item.executeMessage = "未解析到内容"
+
         return news_item
 
     def _normalize_url(self, url: str) -> str:
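The diff ends at the signature of _normalize_url; its body is not part of this commit. For orientation only, a minimal sketch of what a crawler's URL normalizer typically does, where BASE_URL and the exact steps are assumptions rather than anything taken from the source:

    from urllib.parse import urljoin, urldefrag

    BASE_URL = "https://www.news.cn/"  # assumed base, for illustration only

    def normalize_url(url: str) -> str:
        absolute = urljoin(BASE_URL, url.strip())  # resolve relative links
        clean, _fragment = urldefrag(absolute)     # drop any "#..." fragment
        return clean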