Recommendation script + pagination handling

2025-11-20 16:09:29 +08:00
parent 078d86db6e
commit 97da821799
3 changed files with 133 additions and 38 deletions


@@ -348,11 +348,7 @@ class XhwCrawler(BaseCrawler):
             title = head_div.find_element(By.CSS_SELECTOR, "h1").text
             news_item.title = title
-        # Content
-        try:
-            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
-            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+        def parse_content(content_div):
             children = content_div.find_elements(By.XPATH, "./*")
             for child in children:
                 try:
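
For readers skimming the diff: this hunk only hoists the DOM lookups out of the parsing loop and wraps the loop in a local parse_content(content_div) helper, so the same logic can later be rerun on each paginated page. Below is a minimal standalone sketch of that shape; the collect_paragraphs name and the placeholder URL are illustrative, not taken from this repository.

from selenium import webdriver
from selenium.webdriver.common.by import By

def collect_paragraphs(content_div):
    # Hypothetical stand-in for parse_content: it only needs the content
    # WebElement, so the caller can hand it the container of any page.
    paragraphs = []
    for child in content_div.find_elements(By.XPATH, "./*"):
        try:
            if child.tag_name == "p" and child.text.strip():
                paragraphs.append(child.text.strip())
        except Exception:
            continue  # skip children that are stale or not readable
    return paragraphs

if __name__ == "__main__":
    driver = webdriver.Chrome()
    try:
        driver.get("https://example.com/article")  # placeholder URL
        content_div = driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
        print(collect_paragraphs(content_div))
    finally:
        driver.quit()
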
@@ -401,6 +397,27 @@ class XhwCrawler(BaseCrawler):
                 except Exception as e:
                     logger.warning(f"Failed to parse paragraph: {e}")
                     continue
+        # Content
+        try:
+            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+            parse_content(content_div)
+            page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
+            page_urls = []
+            if page_div:
+                page_as = page_div.find_elements(By.CSS_SELECTOR, "span#xinhuaPagerBox > a")
+                for page_a in page_as:
+                    page_url = page_a.get_attribute("href")
+                    if page_url and not page_url.startswith("http"):
+                        page_url = self._normalize_url(page_url)
+                    page_urls.append(page_url)
+            for page_url in page_urls:
+                self.driver.get(page_url)
+                time.sleep(2)
+                content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+                parse_content(content_div)
+        except Exception as e:
+            logger.warning(f"Failed to parse news content: {url}: {e}")
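
_normalize_url is referenced but not shown in this diff; a common way to get the same effect is urllib.parse.urljoin against the article URL. The sketch below is an assumption-laden rewrite of the pagination step, not the repository's code: it collects the pager links with find_elements (which returns an empty list when there is no pager, instead of raising like find_element), treats urljoin as the stand-in for _normalize_url, and replaces the fixed time.sleep(2) with an explicit WebDriverWait.

from urllib.parse import urljoin

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

CONTENT_SELECTOR = "div#detail span#detailContent"  # selector from the diff
PAGER_LINKS_SELECTOR = "center.xinhuaPager span#xinhuaPagerBox > a"

def follow_pagination(driver, article_url, parse_content):
    # parse_content is any callable taking the content WebElement,
    # e.g. the helper introduced by this commit.
    content_div = driver.find_element(By.CSS_SELECTOR, CONTENT_SELECTOR)
    parse_content(content_div)

    # Collect absolute page URLs before navigating: the <a> elements go
    # stale as soon as driver.get() loads the next page.
    page_urls = []
    for a in driver.find_elements(By.CSS_SELECTOR, PAGER_LINKS_SELECTOR):
        href = a.get_attribute("href")
        if href:
            page_urls.append(urljoin(article_url, href))  # assumed equivalent of _normalize_url

    for page_url in page_urls:
        driver.get(page_url)
        # Wait for the content container rather than sleeping a fixed 2 seconds.
        content_div = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, CONTENT_SELECTOR))
        )
        parse_content(content_div)

Collecting the page URLs up front mirrors the page_urls list in the commit and is what keeps the second loop from touching elements that went stale after navigation.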