Xinhuanet crawler update
@@ -106,6 +106,16 @@ class XhwCrawler(BaseCrawler):
             "热点发布": "news"
         }
 
+        # Detail-page parser map: the parse method is selected by URL domain
+        # xhsz.news.cn -> parse_xhsz_news_detail
+        # www.news.cn -> parse_xh_news_detail
+        # www.xinhuanet.com -> parse_xinhuanet_news_detail
+        self.detail_map = {
+            "xhsz.news.cn": self.parse_xhsz_news_detail,  # Xinhua politics
+            "www.news.cn": self.parse_xh_news_detail,  # Xinhuanet main site
+            "www.xinhuanet.com": self.parse_xinhuanet_news_detail,  # legacy Xinhuanet site
+        }
+
         # Create the driver at initialization time
         self.driver = self._init_driver()
 
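A side note on the map keys: urlparse(...).netloc returns the host exactly as it appears in the URL, subdomain (and port, if any) included, which is why the keys carry the "www." prefix. A minimal check (the paths are illustrative):

    from urllib.parse import urlparse

    # netloc keeps the subdomain, so "news.cn" alone would never match
    assert urlparse("https://www.news.cn/politics/page.html").netloc == "www.news.cn"
    assert urlparse("http://xhsz.news.cn/detail/page.htm").netloc == "xhsz.news.cn"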
@@ -192,7 +202,20 @@ class XhwCrawler(BaseCrawler):
         return driver
 
     def parse_news_detail(self, url: str) -> Optional[NewsItem]:
-        return self.parse_xhsz_news_detail(url)
+        """
+        Automatically select the parser that matches the URL's domain.
+        """
+        # Extract the full domain from the URL
+        netloc = urlparse(url).netloc
+
+        # Look up the matching parse function in detail_map
+        parser_func = self.detail_map.get(netloc)
+
+        if parser_func is None:
+            logger.warning(f"No parser found, netloc={netloc}, url={url}; falling back to the default parser")
+            parser_func = self.parse_xhsz_news_detail
+
+        return parser_func(url)
 
     def parse_xhsz_news_detail(self, url: str) -> NewsItem:
         """
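For illustration only, assuming XhwCrawler() can be constructed with defaults (the URL paths here are made up), dispatch now needs nothing beyond the URL itself:

    crawler = XhwCrawler()
    # Routed to parse_xhsz_news_detail via detail_map["xhsz.news.cn"]
    item = crawler.parse_news_detail("http://xhsz.news.cn/20240101/example.htm")
    # An unmapped domain logs a warning, then falls back to parse_xhsz_news_detail
    item = crawler.parse_news_detail("http://unknown.example.com/a.htm")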
@@ -231,7 +254,11 @@ class XhwCrawler(BaseCrawler):
 
         final_url = self.driver.current_url
         if final_url != url:
-            news_item = self.parse_xh_news_detail(final_url)
+            # The URL was redirected; pick the parser that matches the new URL
+            netloc = urlparse(final_url).netloc
+
+            parser_func = self.detail_map.get(netloc, self.parse_xhsz_news_detail)
+            news_item = parser_func(final_url)
             news_item.url = url
             return news_item
 
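Note the two lookup styles: parse_news_detail above uses the one-argument get so it can log before falling back, while this redirect path uses the two-argument get and falls back silently. A standalone sketch of the difference (toy functions, not the crawler's):

    detail_map = {"www.news.cn": lambda u: "main-site parser"}
    default_fn = lambda u: "default parser"

    # Logged fallback, as in parse_news_detail
    fn = detail_map.get("nosuch.news.cn")
    if fn is None:
        fn = default_fn  # the real code logs a warning at this point
    # Silent fallback, as in the redirect branch
    fn = detail_map.get("nosuch.news.cn", default_fn)
    print(fn("http://nosuch.news.cn/x.htm"))  # -> default parser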
@@ -422,7 +449,13 @@ class XhwCrawler(BaseCrawler):
             article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
             content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
             parse_content(content_div)
+        except Exception as e:
+            logger.warning(f"News content parsing failed: {url}, {e}")
+
+        # Pagination (its own try-except, so a failure cannot discard content already parsed)
+        try:
+            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
             page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
             page_urls = []
             if page_div:
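The two-stage layout generalizes: parse the must-have content first, then attempt the optional stage in its own try so its failure cannot discard what already succeeded. A runnable schematic with hypothetical stand-ins for the two stages:

    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def parse_required(page):    # stand-in: main article body
        return ["body paragraph"]

    def parse_pagination(page):  # stand-in: pager that may not exist
        raise LookupError("no pager element")

    def parse_page(page):
        rows = []
        try:
            rows.extend(parse_required(page))    # must-have content
        except Exception as e:
            logger.warning(f"content parsing failed: {e}")
        try:
            rows.extend(parse_pagination(page))  # best-effort extras
        except Exception:
            pass  # a missing pager is the normal case, not an error
        return rows

    print(parse_page(None))  # ['body paragraph'] despite the pager failure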
@@ -437,9 +470,152 @@ class XhwCrawler(BaseCrawler):
                 time.sleep(2)
                 content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
                 parse_content(content_div)
 
         except:
-            logger.warning(f"News content parsing failed: {url}")
+            # No pagination is the normal case; no warning needed
+            pass
 
         return news_item
 
+
+    def parse_xinhuanet_news_detail(self, url: str) -> NewsItem:
+        """
+        Use Selenium to parse a news detail page from the legacy Xinhuanet site (xinhuanet.com).
+        Exceptions are caught locally so that a NewsItem is always returned,
+        even if part of the content fails to parse.
+        """
+        news_item = NewsItem(title="", contentRows=[], url=url)
+
+        if not self.driver:
+            logger.error("WebDriver not initialized; cannot fetch news detail")
+            return news_item
+
+        try:
+            self.driver.get(url)
+            time.sleep(2)
+        except Exception as e:
+            logger.warning(f"Failed to open news detail page: {url}, {e}")
+            return news_item
+
+        # Slider-captcha handling
+        try:
+            sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+            if sliders:
+                slider = sliders[0]
+                action_chain = ActionChains(self.driver)
+                action_chain.click_and_hold(slider).perform()
+                distance = 1000
+                # Drag in four uneven steps (20% / 30% / 25% / 25% of the distance)
+                tracks = [distance * 0.2, distance * 0.3, distance * 0.25, distance * 0.25]
+                for track in tracks:
+                    action_chain.move_by_offset(int(track), 0).pause(1)
+                action_chain.perform()
+                action_chain.release().perform()
+                time.sleep(2)
+        except Exception as e:
+            logger.info(f"Slider verification failed or did not appear: {e}")
+
+        # Header - xinhuanet format: <span class="year"><em> 2021</em></span><span class="day">01/21</span><span class="time"> 00:28:02</span>
+        try:
+            head_div = self.driver.find_element(By.CSS_SELECTOR, "div.header.domPC")
+            time_div = head_div.find_element(By.CSS_SELECTOR, "div.header-time.left")
+            year = time_div.find_element(By.CSS_SELECTOR, "span.year").text.strip()
+            day = time_div.find_element(By.CSS_SELECTOR, "span.day").text.strip()
+            time_str = time_div.find_element(By.CSS_SELECTOR, "span.time").text.strip()
+            datetimes = f"{year}/{day} {time_str}"
+            news_item.publishTime = str(datetime.strptime(datetimes, "%Y/%m/%d %H:%M:%S"))
+
+            source_div = head_div.find_element(By.CSS_SELECTOR, "div.source")
+            source_text = source_div.text.strip()
+            if ":" in source_text:
+                news_item.source = source_text.split(":")[1]
+            else:
+                news_item.source = source_text
+
+            # The title sits in h1 > span.title
+            title = head_div.find_element(By.CSS_SELECTOR, "h1 span.title").text.strip()
+            news_item.title = title
+        except Exception as e:
+            logger.warning(f"Failed to parse header info: {url}, {e}")
+
+        def parse_content(content_div):
+            children = content_div.find_elements(By.XPATH, "./*")
+            for child in children:
+                try:
+                    tag_name = child.tag_name.lower()
+                    if tag_name == "p" or tag_name == "div":
+                        text = child.text.strip().replace("\xa0", "")
+                        # Skip blocks that have neither text nor images
+                        if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
+                            continue
+
+                        # Video
+                        try:
+                            video = child.find_element(By.TAG_NAME, "video")
+                            src = video.get_attribute("src")
+                            if src and not src.startswith("http"):
+                                src = self._normalize_url(src)
+                            style = video.get_attribute("style") or ""
+                            news_item.contentRows.append({"tag": "video", "content": f"<video style='{style}' src='{src}' />"})
+                            continue
+                        except:
+                            pass
+
+                        # Image
+                        try:
+                            img = child.find_element(By.TAG_NAME, "img")
+                            src = img.get_attribute("src")
+                            if src and not src.startswith("http"):
+                                src = self._normalize_url(src)
+                            style = img.get_attribute("style") or ""
+                            news_item.contentRows.append({"tag": "img", "content": f"<img style='{style}' src='{src}' />"})
+                            continue
+                        except:
+                            pass
+
+                        # Plain paragraph
+                        news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
+                    elif tag_name == "img":
+                        src = child.get_attribute("src")
+                        if src and not src.startswith("http"):
+                            src = self._normalize_url(src)
+                        style = child.get_attribute("style") or ""
+                        news_item.contentRows.append({"tag": "img", "content": f"<img style='{style}' src='{src}' />"})
+                    elif tag_name == "video":
+                        src = child.get_attribute("src")
+                        if src and not src.startswith("http"):
+                            src = self._normalize_url(src)
+                        style = child.get_attribute("style") or ""
+                        news_item.contentRows.append({"tag": "video", "content": f"<video style='{style}' src='{src}' />"})
+
+                except Exception as e:
+                    logger.warning(f"Failed to parse paragraph: {e}")
+                    continue
+
+        # Content
+        try:
+            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
+            parse_content(content_div)
+        except Exception as e:
+            logger.warning(f"News content parsing failed: {url}, {e}")
+
+        # Pagination (its own try-except, so a failure cannot discard content already parsed)
+        try:
+            article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+            content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail")
+            page_div = content_div.find_element(By.CSS_SELECTOR, "center.xinhuaPager")
+            page_urls = []
+            if page_div:
+                page_as = page_div.find_elements(By.CSS_SELECTOR, "span#xinhuaPagerBox > a")
+                for page_a in page_as:
+                    page_url = page_a.get_attribute("href")
+                    if page_url and not page_url.startswith("http"):
+                        page_url = self._normalize_url(page_url)
+                    page_urls.append(page_url)
+            for page_url in page_urls:
+                self.driver.get(page_url)
+                time.sleep(2)
+                content_div = self.driver.find_element(By.CSS_SELECTOR, "div#detail")
+                parse_content(content_div)
+        except:
+            # No pagination is the normal case; no warning needed
+            pass
+
+        return news_item
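The sample values in the header comment above show why the strptime format works: the year, day, and time fragments reassemble into a single "%Y/%m/%d %H:%M:%S" string:

    from datetime import datetime

    year, day, time_str = "2021", "01/21", "00:28:02"  # values from the sample markup
    datetimes = f"{year}/{day} {time_str}"             # "2021/01/21 00:28:02"
    print(datetime.strptime(datetimes, "%Y/%m/%d %H:%M:%S"))  # 2021-01-21 00:28:02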