Recommendation crawler

2025-11-20 15:46:53 +08:00
parent 9f56f4fd24
commit 078d86db6e
2 changed files with 148 additions and 15 deletions


@@ -76,6 +76,27 @@ class XhwCrawler(BaseCrawler):
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
),
"commend": UrlConfig(
url="https://xhsz.news.cn/focus_news",
method="GET",
params={},
headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Cache-Control': 'max-age=0',
'Referer': 'https://xhsz.news.cn/',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
)
},
)
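The new "commend" entry points the crawler at https://xhsz.news.cn/focus_news with browser-like headers. A minimal sketch (not part of this commit; it only assumes the fields shown in the hunk above and the requests library) to check that the page answers with those headers before driving it through Selenium:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://xhsz.news.cn/',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
resp = requests.get("https://xhsz.news.cn/focus_news", headers=headers, timeout=10)
print(resp.status_code, len(resp.text))  # expect 200 and a non-empty HTML body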
@@ -585,6 +606,99 @@ class XhwCrawler(BaseCrawler):
resultDomain.success = bool(news_list)
return resultDomain
# Featured recommendations
def commend(self) -> ResultDomain:
# Bail out early if the WebDriver has not been initialized
if not self.driver:
logger.error("WebDriver not initialized; cannot continue crawling")
return ResultDomain(code=1, message="WebDriver not initialized; cannot continue crawling", success=False)
news_urls = []
news_list = []
resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
# Fetch the recommendation URL config (the "commend" entry added above)
commend_config = self.config.urls.get("commend")
if not commend_config:
logger.error("Recommendation URL config not found")
resultDomain.code = 1
resultDomain.message = "Recommendation URL config not found"
resultDomain.success = False
return resultDomain
# Open the recommendation page
try:
self.driver.get(commend_config.url)
time.sleep(2)
except Exception as e:
logger.warning(f"Failed to open recommendation page: {commend_config.url}, {e}")
return resultDomain
try:
# Collect news URLs from the recommendation page
url_base_map = {}  # reserved for per-URL metadata (e.g. title overrides); not populated in this flow
news_div = self.driver.find_element(By.CSS_SELECTOR, "section.wrapper > div.page-news.center-1200")
page_r_div = news_div.find_element(By.CSS_SELECTOR, "div.page-news-r")
commend_jump_divs = page_r_div.find_elements(By.CSS_SELECTOR, "div.page-news-recommend > div.item")
jump_urls = []
for commend_jump_div in commend_jump_divs:
a = commend_jump_div.find_element(By.CSS_SELECTOR, "div.txt > a")
jump_url = self._normalize_url(a.get_attribute("href") or '')
jump_urls.append(jump_url)
for jump_url in jump_urls:
self.driver.get(jump_url)
content_div = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.content")))
# Carousel area
swiper_wrapper_div = content_div.find_element(By.CSS_SELECTOR, "div.part01 > div.swiper-container > div.swiper-wrapper")
if swiper_wrapper_div:
swiper_slides = swiper_wrapper_div.find_elements(By.CSS_SELECTOR, "div.swiper-slide")
# swiper_news_urls = []
for swiper_slide in swiper_slides:
a = swiper_slide.find_element(By.CSS_SELECTOR, "div.tit > a")
news_url = self._normalize_url(a.get_attribute("href") or '')
news_urls.append(news_url)
# swiper_news_urls.append(news_url)
# Focus-news area
news_ul_div = content_div.find_element(By.CSS_SELECTOR, "div.part02 > div.part02_con > ul")
if news_ul_div:
news_li_divs = news_ul_div.find_elements(By.CSS_SELECTOR, "li")
# focus_news_urls = []
for news_li_div in news_li_divs:
a = news_li_div.find_element(By.CSS_SELECTOR, "h3.h3Tit > a")
news_url = self._normalize_url(a.get_attribute("href") or '')
news_urls.append(news_url)
# focus_news_urls.append(news_url)
# Fetch news details from the collected URLs (capped at 5 items)
count = 0
for news_url in news_urls:
try:
news = self.parse_news_detail(news_url)
if news:
news.title = url_base_map.get(news_url, {}).get("title") or news.title
news_list.append(news)
count += 1
if count >= 5:
break
except Exception as e:
logger.warning(f"解析新闻失败: {news_url}, {e}")
continue
except Exception as e:
logger.error(f"搜索过程整体异常: {e}")
resultDomain.success = False
resultDomain.code = 0
resultDomain.message = "爬取失败"
# 最终保证返回 dataList
resultDomain.dataList = news_list
resultDomain.success = bool(news_list)
return resultDomain
def close(self):
if hasattr(self, 'driver') and self.driver:
try:
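A rough usage sketch for the new method (assumptions: XhwCrawler can be constructed with its default config and initializes its WebDriver; the items in dataList expose the title attribute used above). Not part of the commit:

crawler = XhwCrawler()  # assumption: default construction sets up config and driver
try:
    result = crawler.commend()
    if result.success:
        for news in result.dataList:
            print(news.title)
    else:
        print(f"commend failed: code={result.code}, message={result.message}")
finally:
    crawler.close()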