Recommendation script + pagination handling
@@ -348,11 +348,7 @@ class XhwCrawler(BaseCrawler):
         title = head_div.find_element(By.CSS_SELECTOR, "h1").text
         news_item.title = title

-        # Content
-        try:
-            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
-            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
         def parse_content(content_div):
             children = content_div.find_elements(By.XPATH, "./*")
             for child in children:
                 try:
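The commit hoists the element-walking logic into a local parse_content helper so the same routine can run on the first page and on every follow-up page. Below is a minimal standalone sketch of that child-walking pattern; the per-tag handling is illustrative only (a hypothetical paragraphs list), not the project's actual logic:

from selenium.webdriver.common.by import By

def parse_content(content_div, paragraphs):
    # Walk only the direct children of the content span, in document order.
    for child in content_div.find_elements(By.XPATH, "./*"):
        try:
            # Illustrative handling: keep non-empty <p> text; the real
            # crawler dispatches on more tag types than this.
            if child.tag_name == "p" and child.text.strip():
                paragraphs.append(child.text.strip())
        except Exception as e:
            # Skip a malformed node rather than abort the whole article.
            print(f"Failed to parse paragraph: {e}")
            continue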
@@ -401,6 +397,27 @@ class XhwCrawler(BaseCrawler):
                 except Exception as e:
                     logger.warning(f"Failed to parse paragraph: {e}")
                     continue
+        # Content
+        try:
+            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            parse_content(content_div)
+
+            page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
+            page_urls = []
+            if page_div:
+                page_as = page_div.find_elements(By.CSS_SELECTOR, "span#xinhuaPagerBox > a")
+                for page_a in page_as:
+                    page_url = page_a.get_attribute("href")
+                    if page_url and not page_url.startswith("http"):
+                        page_url = self._normalize_url(page_url)
+                    page_urls.append(page_url)
+            for page_url in page_urls:
+                self.driver.get(page_url)
+                time.sleep(2)
+                content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+                parse_content(content_div)
+
         except Exception:
             logger.warning(f"News content parsing failed: {url}")
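The pagination pass collects every href under span#xinhuaPagerBox before navigating, because driver.get() leaves the anchor elements gathered from the previous page stale. Here is a self-contained sketch of that collect-then-visit pattern, with urllib.parse.urljoin standing in for the project's _normalize_url helper (an assumption; its real behavior is not shown in this diff):

import time
from urllib.parse import urljoin
from selenium.webdriver.common.by import By

def crawl_follow_up_pages(driver, base_url, parse_content):
    # Collect all pager hrefs first; navigating away would invalidate
    # the WebElement references.
    page_urls = []
    for a in driver.find_elements(By.CSS_SELECTOR, "span#xinhuaPagerBox > a"):
        href = a.get_attribute("href")
        if href and not href.startswith("http"):
            href = urljoin(base_url, href)  # resolve a relative link
        if href:
            page_urls.append(href)
    # Visit each follow-up page and reuse the same parsing helper.
    for page_url in page_urls:
        driver.get(page_url)
        time.sleep(2)  # crude fixed wait; WebDriverWait would be sturdier
        parse_content(driver.find_element(
            By.CSS_SELECTOR, "div#detail span#detailContent"))

Note that in the commit itself, find_element raises NoSuchElementException when center.xinhuaPager is absent, so the `if page_div:` guard is effectively always true; single-page articles are handled by the surrounding try/except instead.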