新华网搜索爬虫+新闻内容提取 (Xinhua search crawler + news article content extraction)
schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -1,4 +1,5 @@
 # 新华网爬虫
+from itertools import count
 from typing import List, Optional

 from bs4 import Tag
@@ -84,24 +85,24 @@ class XhwCrawler(BaseCrawler):
 chrome_options.add_argument('--disable-web-security')
 chrome_options.add_argument('--allow-running-insecure-content')
 chrome_options.add_argument('--disable-features=VizDisplayCompositor')
-chrome_options.add_argument('--remote-debugging-port=9222') # 添加调试端口
 service = Service(executable_path=r"chromedriver.exe")

+driver = None
 try:
-self.driver = webdriver.Chrome(service=service, options=chrome_options)
+driver = webdriver.Chrome(service=service, options=chrome_options)
 logger.info("Chrome浏览器初始化成功")
 except Exception as e:
 logger.error(f"Chrome浏览器初始化失败: {str(e)}")
 return None

 # 设置隐式等待时间
-self.driver.implicitly_wait(10)
+# driver.implicitly_wait(10)

 # 访问主页获取初始Cookie
 logger.info("访问主页获取初始Cookie")
 logger.info(f"准备访问URL: {self.config.base_url}")
 try:
-self.driver.get(self.config.base_url)
+driver.get(self.config.base_url)
 logger.info(f"成功访问URL: {self.config.base_url}")
 except Exception as e:
 logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}")
@@ -109,7 +110,7 @@ class XhwCrawler(BaseCrawler):
 time.sleep(random.uniform(2, 4))

 # 检查是否有验证页面
-page_source = self.driver.page_source
+page_source = driver.page_source
 if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
 logger.warning("检测到验证页面,尝试手动处理验证")

@@ -118,318 +119,244 @@ class XhwCrawler(BaseCrawler):
 time.sleep(30)

 # 刷新页面,检查验证是否完成
-self.driver.refresh()
+driver.refresh()
 time.sleep(random.uniform(2, 4))

 # 再次检查验证状态
-page_source = self.driver.page_source
+page_source = driver.page_source
 if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
 logger.error("验证未完成,无法继续爬取")
 # self.driver.quit()
 # self.driver = None
 return None

-return self.driver
+return driver

-def __del__(self):
-"""析构函数,确保关闭浏览器"""
-if hasattr(self, 'driver') and self.driver:
-self.driver.quit()
-logger.info("浏览器已关闭")

-def search(self, key:str, total=10, action="news") -> ResultDomain:
-# 检查driver是否已初始化
-if not self.driver:
-logger.error("WebDriver未初始化,无法继续爬取")
-resultDomain = ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)
-return resultDomain

-# 直接使用self.driver
-news_urls = []
-news_list = []
-resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
-# 获取搜索配置
-search_config = self.config.urls.get("search")
-if not search_config:
-logger.error("未找到搜索URL配置")
-resultDomain.code = 0
-resultDomain.message = "未找到搜索URL配置"
-resultDomain.success = False
-return resultDomain
-pagesize = 10
-# 准备搜索参数
-search_data = search_config.params.copy()
-search_data["k"] = key
-search_data["action"] = action

-# 获取新闻url
-url_base_map = {}
-# 向上取整计算需要的页数
-total_pages = (total + pagesize - 1) // pagesize
-for page in range(1, total_pages + 1):
-search_data["page"] = page
-pageHtml = search_config.url + "?" + urlencode(search_data)
-# 分页的html
-logger.info(f"请求URL: {pageHtml}")

-# 使用Selenium访问页面
-self.driver.get(pageHtml)
-time.sleep(random.uniform(2, 4))

-# 检查是否有验证页面
-if not self.driver:
-logger.error("WebDriver已失效,无法继续爬取")
-resultDomain = ResultDomain(code=1, message="WebDriver已失效,无法继续爬取", success=False)
-return resultDomain

-page_source = self.driver.page_source
-if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
-logger.warning("检测到验证页面,尝试手动处理验证")
-logger.info("请在30秒内手动完成验证...")
-time.sleep(30)

-# 检查driver是否仍然有效
-if not self.driver:
-logger.error("WebDriver已失效,无法继续爬取")
-resultDomain = ResultDomain(code=1, message="WebDriver已失效,无法继续爬取", success=False)
-return resultDomain

-self.driver.refresh()
-time.sleep(random.uniform(2, 4))

-# 再次检查验证状态
-if not self.driver:
-logger.error("WebDriver已失效,无法继续爬取")
-resultDomain = ResultDomain(code=1, message="WebDriver已失效,无法继续爬取", success=False)
-return resultDomain

-page_source = self.driver.page_source
-if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
-logger.error("验证未完成,无法继续爬取")
-resultDomain = ResultDomain(code=1, message="验证未完成,无法继续爬取", success=False)
-return resultDomain

-# 解析页面内容
-pageSoup = self.parse_html(page_source)
-logger.info(f"解析后的HTML内容: {str(pageSoup)[:500]}...") # 只输出前500个字符
-# 从分页中获取新闻url
-searchMainDiv = pageSoup.find("div", class_="page-search-main")
-if not searchMainDiv:
-logger.error("未找到搜索主体部分")
-resultDomain.code = 0
-resultDomain.message = "未找到搜索主体部分"
-resultDomain.success = False
-return resultDomain
-searchGroupDiv = searchMainDiv.find("div", class_="page-search-group")
-if not searchGroupDiv:
-logger.error("未找到搜索组")
-resultDomain.code = 0
-resultDomain.message = "未找到搜索组"
-resultDomain.success = False
-return resultDomain
-newsDiv = searchGroupDiv.find("div", class_="page-search-news")
-if not newsDiv:
-logger.error("未找到新闻列表")
-resultDomain.code = 0
-resultDomain.message = "未找到新闻列表"
-resultDomain.success = False
-return resultDomain
-newsList = newsDiv.find_all("div", class_="group")
-for news in newsList:
-news_info = news.find("div.head")
-news_title = news_info.find("div.title")
-news_date = news_info.find("div.date").text.strip()
-url = news_title.find("a").get("href")
-url_base_map[url] = {"title": news_title.get_text(strip=True), "date": news_date}
-news_urls.append(url)
-# 临时保存url到url.json
-with open("url.json", "w", encoding="utf-8") as f:
-json.dump(url_base_map, f, ensure_ascii=False, indent=4)
-# 从新闻url中获取新闻详情
-for news_url in news_urls:
-news = self.parse_news_detail(news_url)
-news.title = url_base_map.get(news_url, {}).get("title")
-news.publishTime = url_base_map.get(news_url, {}).get("date")
-news_list.append(news)

-# 临时保存新闻到news.json
-with open("news.json", "w", encoding="utf-8") as f:
-json.dump(news_list, f, ensure_ascii=False, indent=4)

-# 关闭浏览器
-if self.driver:
-self.driver.quit()
-logger.info("浏览器已关闭")

-return resultDomain

 def parse_news_detail(self, url: str) -> Optional[NewsItem]:
-return self.parse_xhsz_news_detail_selenium(url)
+return self.parse_xhsz_news_detail(url)

+def parse_xhsz_news_detail(self, url: str) -> NewsItem:
+"""
+使用Selenium解析新华网新闻详情页
+异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败
+"""
+news_item = NewsItem(title="", contentRows=[], url=url)

-def parse_xhsz_news_detail_selenium(self, url: str) -> Optional[NewsItem]:
-# 检查driver是否已初始化
 if not self.driver:
 logger.error("WebDriver未初始化,无法获取新闻详情")
-return None
+return news_item

-newsItem = NewsItem(title="", contentRows=[], url=url)

-# 使用Selenium访问新闻详情页
+try:
 self.driver.get(url)
-time.sleep(random.uniform(2, 4))
+time.sleep(2)
+except Exception as e:
+logger.warning(f"访问新闻详情页失败: {url}, {e}")
+return news_item

+# 滑动验证处理
+try:
+sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+if sliders:
+slider = sliders[0]
+action_chain = ActionChains(self.driver)
+action_chain.click_and_hold(slider).perform()
+distance = 1000
+tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
+for track in tracks:
+action_chain.move_by_offset(int(track), 0).pause(1)
+action_chain.perform()
+action_chain.release().perform()
+time.sleep(2)
+except Exception as e:
+logger.info(f"滑块验证处理失败或未出现: {e}")

-# 检查是否有验证页面
+final_url = self.driver.current_url
-if not self.driver:
+if final_url != url:
-logger.error("WebDriver已失效,无法获取新闻详情")
+news_item = self.parse_xh_news_detail(final_url)
-return None
+news_item.url = url
+return news_item
-page_source = self.driver.page_source
-if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
-logger.warning("检测到验证页面,尝试手动处理验证")
-logger.info("请在30秒内手动完成验证...")
-time.sleep(30)

-# 检查driver是否仍然有效
-if not self.driver:
-logger.error("WebDriver已失效,无法获取新闻详情")
-return None

-self.driver.refresh()
-time.sleep(random.uniform(2, 4))

-# 再次检查验证状态
+# 新闻主体
-if not self.driver:
+try:
-logger.error("WebDriver已失效,无法获取新闻详情")
+main_div = WebDriverWait(self.driver, 10).until(
-return None
+EC.presence_of_element_located((By.CSS_SELECTOR, "div.page-news-detail"))
+)
-page_source = self.driver.page_source
+except Exception as e:
-if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+logger.warning(f"未找到新闻主体: {url}, {e}")
-logger.error("验证未完成,无法获取新闻详情")
+return news_item
-return None

-# 解析页面内容
+try:
-newsDetailSoup = self.parse_html(page_source)
+article_div = main_div.find_element(By.CSS_SELECTOR, "div.page-news-l")
+except:
+logger.warning(f"未找到文章主体: {url}")
+return news_item

-# 查找新闻主体部分
+# 标题
-main_div = newsDetailSoup.find("div.page-news-detail")
+try:
-if not main_div:
+title_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-title")
-logger.error(f"未找到新闻主体部分: {url}")
+news_item.title = title_div.text.strip()
-return None
+except:
+pass

-article_div = main_div.find("div.page-news-l")
+# 新闻元信息
-if not article_div:
+try:
-logger.error(f"未找到新闻文章部分: {url}")
+channal_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-note")
-return None
+channal_items = channal_div.find_elements(By.CSS_SELECTOR, "div.item")
+for item in channal_items:
-# 获取标题
-title_div = article_div.find("div.page-news-detail-title")
-if title_div:
-newsItem.title = title_div.text.strip()

-# 获取新闻元信息
-channal_div = article_div.find("div.page-news-detail-note")
-if channal_div:
-channal_items_div = channal_div.find_all("div.item")
-for item in channal_items_div:
 text = item.text.strip()
 if "来源" in text:
-parts = text.split(":", 1)
+news_item.source = text.split(":", 1)[-1].strip()
-if len(parts) > 1:
-newsItem.source = parts[1].strip()
 elif "发布时间" in text:
-parts = text.split(":", 1)
+news_item.publishTime = text.split(":", 1)[-1].strip()
-if len(parts) > 1:
-newsItem.publishTime = parts[1].strip()
 elif "浏览人数" in text:
-parts = text.split(":", 1)
+try:
-if len(parts) > 1:
+news_item.viewCount = int(text.split(":", 1)[-1].strip())
-newsItem.viewCount = parts[1].strip()
+except:
+pass
+except:
+pass

-# 获取新闻内容
+# 内容
-content_div = article_div.find("div.page-news-detail-content")
+try:
-if content_div:
+content_div = article_div.find_element(By.CSS_SELECTOR, "div.page-news-detail-content")
-# 遍历内容区域中的所有元素
+children = content_div.find_elements(By.XPATH, "./*")
-for child in content_div.children:
+for child in children:
-if not isinstance(child, Tag):
+try:
+tag_name = child.tag_name.lower()
+if tag_name == "p":
+text = child.text.strip().replace("\xa0", "")
+if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
+continue

+# 图片
+try:
+img = child.find_element(By.TAG_NAME, "img")
+src = img.get_attribute("src")
+if src and not src.startswith("http"):
+src = self.config.base_url + src
+news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
+continue
+except:
+pass

+# 视频
+try:
+video = child.find_element(By.TAG_NAME, "video")
+src = video.get_attribute("src")
+if src and not src.startswith("http"):
+src = self.config.base_url + src
+news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
+continue
+except:
+pass

+# 普通段落
+news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
+elif tag_name in ["img", "video"]:
+news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
+except Exception as e:
+logger.warning(f"解析段落失败: {e}")
 continue
+except:
+logger.warning(f"新闻内容解析失败: {url}")

-# 处理图片
+return news_item
-if child.name == "p" and child.find("img"):
-img_tag = child.find("img")
-if img_tag:
-src = str(img_tag.get("src"))
-img_tag["src"] = self._normalize_url(src)
-newsItem.contentRows.append({
-"tag": "img",
-"content": str(img_tag)
-})
-# 处理视频
-elif child.find("video"):
-video_tag = child.find("video")
-if video_tag:
-src = str(video_tag.get("src"))
-video_tag["src"] = self._normalize_url(src)
-newsItem.contentRows.append({
-"tag": "video",
-"content": str(video_tag)
-})
-# 处理普通段落
-elif child.name == "p" and child.get_text(strip=True):
-newsItem.contentRows.append({
-"tag": "p",
-"content": child.get_text(strip=True)
-})

-return newsItem

-def parse_xhsz_news_detail(self, url: str) -> Optional[NewsItem]:
-newsItem = NewsItem(title="", contentRows=[], url=url)
-response = self.fetch(url)
-newsDetailSoup = self.parse_html(response.content)

-main_div = newsDetailSoup.find("div.page-news-detail")
-article_div = main_div.find("div.page-news-l")
-title_div = article_div.find("div.page-news-detail-title")
-channal_div = article_div.find("div.page-news-detail-note")
-content_div = article_div.find("div.page-news-detail-content")

-# 获取新闻标题
-newsItem.title = title_div.text.strip()

-# 获取新闻来源、发布时间、浏览人数
-channal_items_div = channal_div.find_all("div.item")
-if("来源" in channal_items_div[0].text):
-newsItem.source = channal_items_div[0].text.strip().split(":")[1]
-if("发布时间" in channal_items_div[1].text):
-newsItem.publishTime = channal_items_div[1].text.strip().split(":")[1]
-if("浏览人数" in channal_items_div[2].text):
-newsItem.viewCount = channal_items_div[2].text.strip().split(":")[1]

-for child in content_div.children:
-if not isinstance(child, Tag):
-continue

-img_tag = child.find("img")
-video_tag = child.find("video")
-tag = "p"
-content = str(child)
-if img_tag: # 是图片
-tag = "img"
-src = str(img_tag.get("src"))
-if src:
-img_tag["src"] = self._normalize_url(src)
-content = str(img_tag)
-elif video_tag: # 是视频
-tag = "video"
-src = str(video_tag.get("src"))
-if src:
-video_tag["src"] = self._normalize_url(src)
-content = str(video_tag)

-newsItem.contentRows.append({"tag": tag, "content": content})

-return newsItem

+def parse_xh_news_detail(self, url: str) -> NewsItem:
+"""
+使用Selenium解析新华网新闻详情页
+异常局部捕获,保证返回 NewsItem 对象,即使部分内容解析失败
+"""
+news_item = NewsItem(title="", contentRows=[], url=url)

+if not self.driver:
+logger.error("WebDriver未初始化,无法获取新闻详情")
+return news_item

+try:
+self.driver.get(url)
+time.sleep(2)
+except Exception as e:
+logger.warning(f"访问新闻详情页失败: {url}, {e}")
+return news_item

+# 滑动验证处理
+try:
+sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+if sliders:
+slider = sliders[0]
+action_chain = ActionChains(self.driver)
+action_chain.click_and_hold(slider).perform()
+distance = 1000
+tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
+for track in tracks:
+action_chain.move_by_offset(int(track), 0).pause(1)
+action_chain.perform()
+action_chain.release().perform()
+time.sleep(2)
+except Exception as e:
+logger.info(f"滑块验证处理失败或未出现: {e}")

+# head
+head_div = self.driver.find_element(By.CSS_SELECTOR, "div.header.domPC")
+time_div = head_div.find_element(By.CSS_SELECTOR, "div.header-time.left")
+datetimes = time_div.find_element(By.CSS_SELECTOR, "span.year").text+"/"+time_div.find_element(By.CSS_SELECTOR, "span.day").text+" "+time_div.find_element(By.CSS_SELECTOR, "span.time").text
+news_item.publishTime = str(datetime.strptime(datetimes, "%Y/%m/%d %H:%M:%S"))
+source = head_div.find_element(By.CSS_SELECTOR, "div.source").text.split(":")[1]
+news_item.source = source

+title = head_div.find_element(By.CSS_SELECTOR, "h1").text
+news_item.title = title

+# 内容
+try:
+article_div = self.driver.find_element(By.CSS_SELECTOR, "div.main.clearfix")
+content_div = article_div.find_element(By.CSS_SELECTOR, "div#detail span#detailContent")
+children = content_div.find_elements(By.XPATH, "./*")
+for child in children:
+try:
+tag_name = child.tag_name.lower()
+if tag_name == "p":
+text = child.text.strip().replace("\xa0", "")
+if not text and len(child.find_elements(By.TAG_NAME, "img")) == 0:
+continue

+# 图片
+try:
+img = child.find_element(By.TAG_NAME, "img")
+src = img.get_attribute("src")
+if src and not src.startswith("http"):
+src = self.config.base_url + src
+news_item.contentRows.append({"tag": "img", "content": f"<img src='{src}' />"})
+continue
+except:
+pass

+# 视频
+try:
+video = child.find_element(By.TAG_NAME, "video")
+src = video.get_attribute("src")
+if src and not src.startswith("http"):
+src = self.config.base_url + src
+news_item.contentRows.append({"tag": "video", "content": f"<video src='{src}' />"})
+continue
+except:
+pass

+# 普通段落
+news_item.contentRows.append({"tag": "p", "content": child.get_attribute("outerHTML")})
+elif tag_name in ["img", "video"]:
+news_item.contentRows.append({"tag": tag_name, "content": child.get_attribute("outerHTML")})
+except Exception as e:
+logger.warning(f"解析段落失败: {e}")
+continue
+except:
+logger.warning(f"新闻内容解析失败: {url}")

+return news_item

 def _normalize_url(self, url: str) -> str:
 """
 规范化 URL,补全协议和域名
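Note: the slide-captcha handling added in this commit is a single ActionChains drag broken into uneven steps. A minimal standalone sketch of the same pattern, for reference only (the helper name and the fixed 1000 px distance are illustrative, not part of the commit):

    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.webdriver.common.by import By

    def try_drag_slider(driver, distance=1000):
        # Look for the slide-verification handle used on xhsz.news.cn pages.
        sliders = driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
        if not sliders:
            return False  # no slider present, nothing to do
        chain = ActionChains(driver)
        chain.click_and_hold(sliders[0]).perform()
        # Drag in uneven fractions with short pauses so the motion looks less scripted.
        for frac in (0.2, 0.3, 0.25, 0.25):
            chain.move_by_offset(int(distance * frac), 0).pause(1)
        chain.perform()
        chain.release().perform()
        return True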
@@ -453,3 +380,121 @@ class XhwCrawler(BaseCrawler):

 # 相对路径,补全域名
 return self.config.base_url + url

+def search(self, key:str, total=10, action="news") -> ResultDomain:
+# 检查driver是否已初始化
+if not self.driver:
+logger.error("WebDriver未初始化,无法继续爬取")
+return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)

+news_urls = []
+news_list = []
+resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)

+# 获取搜索配置
+search_config = self.config.urls.get("search")
+if not search_config:
+logger.error("未找到搜索URL配置")
+resultDomain.code = 0
+resultDomain.message = "未找到搜索URL配置"
+resultDomain.success = False
+return resultDomain

+pagesize = 10
+search_data = search_config.params.copy()
+search_data["k"] = key
+search_data["action"] = action

+try:
+# 获取新闻url
+url_base_map = {}
+total_pages = (total + pagesize - 1) // pagesize
+for page in range(1, total_pages + 1):
+search_data["page"] = page
+pageHtml = search_config.url + "?" + urlencode(search_data)
+logger.info(f"请求URL: {pageHtml}")

+# 使用Selenium访问页面
+try:
+self.driver.get(pageHtml)
+time.sleep(2)
+except Exception as e:
+logger.warning(f"访问搜索页失败: {pageHtml}, {e}")
+continue

+# 滑动验证处理
+try:
+sliders = self.driver.find_elements(By.CSS_SELECTOR, ".handler.handler_bg")
+if sliders:
+slider = sliders[0]
+action_chain = ActionChains(self.driver)
+action_chain.click_and_hold(slider).perform()
+distance = 1000
+tracks = [distance*0.2, distance*0.3, distance*0.25, distance*0.25]
+for track in tracks:
+action_chain.move_by_offset(int(track), 0).pause(1)
+action_chain.release().perform()
+time.sleep(5)
+except Exception as e:
+logger.info(f"滑动验证处理失败或未出现: {e}")

+# 提取新闻列表
+try:
+search_main = self.driver.find_element(By.CSS_SELECTOR, "div.page-search-main")
+search_group = search_main.find_element(By.CSS_SELECTOR, "div.page-search-group")
+news_div = search_group.find_element(By.CSS_SELECTOR, "div.page-search-news")
+news_items = news_div.find_elements(By.CSS_SELECTOR, "div.group")
+for news in news_items:
+try:
+head = news.find_element(By.CSS_SELECTOR, "div.head")
+title_div = head.find_element(By.CSS_SELECTOR, "div.title")
+date_div = head.find_element(By.CSS_SELECTOR, "div.date")
+a_tag = title_div.find_element(By.TAG_NAME, "a")
+news_url = a_tag.get_attribute("href")
+news_title = a_tag.text.strip()
+news_date = date_div.text.strip()
+url_base_map[news_url] = {"title": news_title, "date": news_date}
+news_urls.append(news_url)
+except Exception as e:
+logger.warning(f"提取单条新闻URL失败: {e}")
+except Exception as e:
+logger.warning(f"提取新闻列表失败: {e}")
+continue

+# 从新闻url中获取新闻详情
+count = 0
+for news_url in news_urls:
+try:
+news = self.parse_news_detail(news_url)
+if news:
+news.title = url_base_map.get(news_url, {}).get("title") or news.title
+news.publishTime = url_base_map.get(news_url, {}).get("date") or news.publishTime
+news_list.append(news)
+count += 1
+if count >= total:
+break
+except Exception as e:
+logger.warning(f"解析新闻失败: {news_url}, {e}")
+continue

+except Exception as e:
+logger.error(f"搜索过程整体异常: {e}")
+resultDomain.success = False
+resultDomain.code = 0
+resultDomain.message = "爬取失败"

+# 最终保证返回 dataList
+resultDomain.dataList = news_list
+resultDomain.success = bool(news_list)
+return resultDomain

+def close(self):
+if hasattr(self, 'driver') and self.driver:
+try:
+self.driver.quit()
+logger.info("浏览器已关闭")
+except Exception as e:
+logger.warning(f"关闭浏览器失败: {str(e)}")
+self.driver = None
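Note: the page count in the new search() uses integer ceiling division. With pagesize = 10, total = 10 gives total_pages = (10 + 10 - 1) // 10 = 1, while total = 11 gives (11 + 10 - 1) // 10 = 2, so a final partially-filled page is still fetched.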
schoolNewsCrawler/crawler/xhw/XhwSearch.py (new file, 106 lines)
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+人民日报搜索爬虫命令行工具
+用法: python RmrbSearch.py --key "关键词" --total 10 --type 0
+"""

+import argparse
+import json
+import sys
+from pathlib import Path
+import time
+# Add project root directory to path to import crawler
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))

+from crawler.xhw.XhwCrawler import XhwCrawler
+from loguru import logger

+def main():
+"""主函数"""
+parser = argparse.ArgumentParser(
+description='人民日报新闻搜索工具',
+formatter_class=argparse.RawDescriptionHelpFormatter,
+epilog="""
+"""
+)

+parser.add_argument(
+'--query', '-q',
+type=str,
+required=True,
+help='搜索关键词'
+)

+parser.add_argument(
+'--total', '-t',
+type=int,
+default=10,
+help='抓取数量 (默认: 10)'
+)

+parser.add_argument(
+'--output', '-o',
+type=str,
+help='输出文件路径'
+)

+args = parser.parse_args()

+# 获取参数
+key = args.query
+total = args.total
+output_file = args.output

+logger.info("使用直接参数模式")

+# 关键校验:key 必须存在
+if not key or not key.strip():
+parser.error("搜索关键词不能为空!")
+try:
+logger.info(f"开始搜索: 关键词='{key}', 数量={total}")
+crawler = XhwCrawler()
+time.sleep(5)
+result = crawler.search(key=key.strip(), total=total)
+# print(result)
+output = {
+"code": result.code,
+"message": result.message,
+"success": result.success,
+"data": None,
+"dataList": [item.model_dump() for item in result.dataList] if result.dataList else []
+}
+# result = None
+# with open("F:\Project\schoolNews\schoolNewsCrawler\output\output.json", "r", encoding="utf-8") as f:
+# result = json.load(f)
+# print(result)
+# output = result

+if output_file:
+output_path = Path(output_file)
+output_path.parent.mkdir(parents=True, exist_ok=True)
+with open(output_path, 'w', encoding='utf-8') as f:
+json.dump(output, f, ensure_ascii=False, indent=2)
+logger.info(f"结果已保存到: {output_file}")

+crawler.close()
+sys.exit(0 if result.success else 1)
+# print(json.dumps(output, ensure_ascii=False, indent=2))
+# sys.exit(0 if result["success"] else 1)
+except Exception as e:
+logger.error(f"执行失败: {str(e)}")
+error_output = {
+"code": 500,
+"message": f"执行失败: {str(e)}",
+"success": False,
+"data": None,
+"dataList": []
+}
+print(json.dumps(error_output, ensure_ascii=False, indent=2))
+sys.exit(1)

+if __name__ == "__main__":
+main()
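Note: a minimal invocation of the new command-line tool, based on the argparse flags it actually defines (--query/-q is required, --total/-t and --output/-o are optional); the keyword and output path here are only examples:

    python XhwSearch.py --query "大学" --total 5 --output output/xhw.json

The module docstring still refers to the 人民日报 tool (RmrbSearch.py with --key/--type), but the parser above accepts --query, --total and --output.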
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 18,
 "id": "948be230",
 "metadata": {},
 "outputs": [
@@ -41,7 +41,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 19,
 "id": "31a8a0dd",
 "metadata": {},
 "outputs": [
@@ -49,11 +49,11 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2025-11-19 19:03:54.324\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
+"\u001b[32m2025-11-20 14:39:07.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
-"\u001b[32m2025-11-19 19:03:55.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
+"\u001b[32m2025-11-20 14:39:08.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
-"\u001b[32m2025-11-19 19:03:55.216\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
+"\u001b[32m2025-11-20 14:39:08.884\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m24\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
-"\u001b[32m2025-11-19 19:03:55.217\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
+"\u001b[32m2025-11-20 14:39:08.885\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
-"\u001b[32m2025-11-19 19:03:57.557\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
+"\u001b[32m2025-11-20 14:39:10.309\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m28\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
 ]
 }
 ],
@@ -63,7 +63,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": null,
 "id": "e5a6e91c",
 "metadata": {},
 "outputs": [
@@ -71,34 +71,42 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2025-11-19 19:04:12.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E4%B9%A0%E8%BF%91%E5%B9%B3&action=news&page=1\u001b[0m\n",
+"\u001b[32m2025-11-20 13:19:51.853\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E5%A4%A7%E5%AD%A6&action=news&page=1\u001b[0m\n",
-"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m188\u001b[0m - \u001b[33m\u001b[1m检测到验证页面,尝试手动处理验证\u001b[0m\n",
+"\u001b[32m2025-11-20 13:20:15.300\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
-"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m189\u001b[0m - \u001b[1m请在30秒内手动完成验证...\u001b[0m\n",
+"\u001b[32m2025-11-20 13:20:20.310\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.8B0C5F90441ED5455E088CF6DF7032DE.e.84\")>\u001b[0m\n",
-"\u001b[32m2025-11-19 19:04:48.814\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m解析后的HTML内容: <html lang=\"en\"><head>\n",
+"\u001b[32m2025-11-20 13:20:36.428\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
-"<meta charset=\"utf-8\"/>\n",
+"\u001b[32m2025-11-20 13:20:41.434\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.D41E40A40777EF2D881878B18F35342A.e.114\")>\u001b[0m\n",
-"<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n",
+"\u001b[32m2025-11-20 13:20:57.656\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
-"<meta content=\"webkit\" name=\"renderer\"/>\n",
+"\u001b[32m2025-11-20 13:21:02.664\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.2BA293A49BA4DA88D492D8BDC1E07365.e.157\")>\u001b[0m\n",
-"<title>新华网新华思政-全国高校课程思政教学资源服务平台</title>\n",
+"\u001b[32m2025-11-20 13:21:18.808\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
-"<meta content=\"新华思政,课程思政,全国高校课程思政教学资源服务平台,新华网,新华教育,思政教育.\" name=\"keywords\"/>\n",
+"\u001b[32m2025-11-20 13:21:23.814\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.DDC416596722BE8B22A5E84011EA59C3.e.198\")>\u001b[0m\n",
-"<meta content=\"新华网作为党和国家重要的网上舆论阵地,适时推出新华思政—全国高校课程思政教学资源服务平台,为全国高校教师针对课程思政建设、交流、学习和共享于一体的教学服务平台,旨在推广课程思政建设先进经验和做法,助力高校课程思政教学资源需求,深入挖掘课程思政元素,助力广泛开展课程思政建设的良好氛围,提升教师开展课程思政建设的意识和能力。\" name=\"description\"/>\n",
+"\u001b[32m2025-11-20 13:22:32.631\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
-"<link href=\"/static/skin4/favicon.ico\" rel...\u001b[0m\n"
+"\u001b[32m2025-11-20 13:22:37.642\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.B9E24DEEF281C700F90635CABAA2B108.e.230\")>\u001b[0m\n",
+"\u001b[32m2025-11-20 13:22:53.636\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
+"\u001b[32m2025-11-20 13:22:58.643\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.EECC90A746E37A0994443791EFF7C402.e.290\")>\u001b[0m\n",
+"\u001b[32m2025-11-20 13:23:15.189\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
+"\u001b[32m2025-11-20 13:23:20.196\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.0188441312BE753DFF48394C16A44F8F.e.330\")>\u001b[0m\n",
+"\u001b[32m2025-11-20 13:23:36.050\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
+"\u001b[32m2025-11-20 13:23:41.057\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.F7A148D8A30D006FFCDAC45B01A2E7B5.e.374\")>\u001b[0m\n",
+"\u001b[32m2025-11-20 13:23:56.819\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
+"\u001b[32m2025-11-20 13:24:01.826\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.5A632E0B79568A5FFC8E29FFD5B09507.e.396\")>\u001b[0m\n",
+"\u001b[32m2025-11-20 13:24:17.976\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m未发现滑动验证,直接继续\u001b[0m\n",
+"\u001b[32m2025-11-20 13:24:22.983\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36mparse_xhsz_news_detail\u001b[0m:\u001b[36m33\u001b[0m - \u001b[1m找到新闻主体部分: <selenium.webdriver.remote.webelement.WebElement (session=\"11360ade0a59af3938c0f8faa9b88abf\", element=\"f.6B13A7AB92BA3CB5CE0964EB246896F9.d.6B5B529215D1C2221EEF1597FF0C3D0A.e.445\")>\u001b[0m\n"
 ]
 },
 {
-"ename": "AttributeError",
+"data": {
-"evalue": "'NoneType' object has no attribute 'find'",
+"text/plain": [
-"output_type": "error",
+"ResultDomain(code=0, message='', success=True, data=None, dataList=[])"
-"traceback": [
+]
-"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+},
-"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
+"execution_count": 16,
-"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mcrawler\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m习近平\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n",
+"metadata": {},
-"\u001b[36mFile \u001b[39m\u001b[32mf:\\Project\\schoolNews\\schoolNewsCrawler\\crawler\\xhw\\XhwCrawler.py:241\u001b[39m, in \u001b[36msearch\u001b[39m\u001b[34m(self, key, total, action)\u001b[39m\n\u001b[32m 239\u001b[39m news_info = news.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.head\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 240\u001b[39m news_title = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.title\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m241\u001b[39m news_date = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.date\u001b[39m\u001b[33m\"\u001b[39m).text.strip()\n\u001b[32m 242\u001b[39m url = news_title.find(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m).get(\u001b[33m\"\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 243\u001b[39m url_base_map[url] = {\u001b[33m\"\u001b[39m\u001b[33mtitle\u001b[39m\u001b[33m\"\u001b[39m: news_title.get_text(strip=\u001b[38;5;28;01mTrue\u001b[39;00m), \u001b[33m\"\u001b[39m\u001b[33mdate\u001b[39m\u001b[33m\"\u001b[39m: news_date}\n",
+"output_type": "execute_result"
-"\u001b[31mAttributeError\u001b[39m: 'NoneType' object has no attribute 'find'"
-]
 }
 ],
 "source": [
-"crawler.search(\"习近平\", 10)\n",
+"crawler.search(\"大学\", 1)\n",
 "# crawler.search(\"中国\", 10, \"xhsz\")\n",
 "# crawler.search(\"中国\", 10, \"news\")\n",
 "# crawler.search(\"中国\", 10, \"xhsz\")\n",
@@ -108,10 +116,41 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 17,
 "id": "7e0f56fa",
 "metadata": {},
 "outputs": [],
+"source": [
+"# crawler.parse_xhsz_news_detail(\"https://xhsz.news.cn/focus_news/detail?id=9752\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 20,
+"id": "47327ebf",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"NewsItem(title='《习近平总书记关于党的建设的重要思想概论》出版座谈会在北京召开', contentRows=[{'tag': 'p', 'content': '<p>\\u3000\\u3000新华社北京2月24日电\\u3000《习近平总书记关于党的建设的重要思想概论》出版座谈会2月24日在京召开。与会代表结合《概论》主要内容,交流学习贯彻习近平总书记关于党的建设的重要思想的认识和体会。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000会议认为,《概论》是广大党员、干部深入学习领会习近平总书记关于党的建设的重要思想的权威辅助读物。习近平总书记关于党的建设的重要思想,是一个逻辑严密、内涵丰富、系统全面、博大精深的科学体系,是对中国化的马克思主义党建理论体系的继承发展,构成习近平新时代中国特色社会主义思想的“党建篇”。在这一重要思想的科学指引下,我们党成功开辟百年大党自我革命新境界,推动党和国家事业取得历史性成就、发生历史性变革,为世界政党建设提供了重要借鉴。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000会议指出,要以学好用好《概论》为契机,进一步把习近平总书记关于党的建设的重要思想领会深、把握准、落到位,深刻领会其科学体系、理论品质和实践指向,更加深刻领悟“两个确立”的决定性意义,增强“四个意识”、坚定“四个自信”、做到“两个维护”。要不断深化体系化研究、学理化阐释,深刻把握这一重要思想蕴含的深刻道理、透彻学理、深邃哲理。要坚持用这一重要思想武装头脑、指导实践、推动工作,把学习成果转化为工作实效,推进党建研究高质量发展,以党建研究新成果推进党的建设和组织工作高质量发展,为以中国式现代化全面推进强国建设、民族复兴伟业提供坚强组织保证。</p>'}, {'tag': 'p', 'content': '<p>\\u3000\\u3000座谈会由全国党建研究会举办,中央和国家机关有关部门,各省区市和新疆生产建设兵团党建研究会(学会),部分中管企业、高校有关负责同志,党史党建专家代表参加座谈会。</p>'}], url='https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html', viewCount=None, publishTime='2025-02-24 22:44:25', author=None, source='新华网', category=None, executeStatus=0, executeMessage=None)"
+]
+},
+"execution_count": 20,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"crawler.parse_xh_news_detail(\"https://www.news.cn/politics/leaders/20250224/5384be3d47c643b3a68e3bb724656152/c.html\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "fa359d5b",
+"metadata": {},
+"outputs": [],
 "source": []
 }
 ],