chromedriver

2025-11-19 19:05:31 +08:00
parent 1ad118b0d3
commit 8462eeed86
11 changed files with 543 additions and 16 deletions

Optional[NewsItem] (new file, empty)

ResultDomain (new file, empty)

Binary file not shown.

@@ -2,7 +2,8 @@
from typing import Callable, Dict, Optional, List, Any, Union
from abc import ABC, abstractmethod
import requests
-from bs4 import BeautifulSoup, NavigableString
+from bs4 import BeautifulSoup
+from bs4.element import Tag, NavigableString
from loguru import logger
from pydantic import BaseModel, Field, HttpUrl
import json
@@ -45,6 +46,7 @@ class NewsItem(BaseModel):
    title: str = Field(..., description="新闻标题")
    contentRows: List[Dict[str, Any]] = Field(..., description="新闻内容")
    url: str = Field(..., description="新闻链接")
+    viewCount: Optional[int] = Field(default=None, description="浏览数")
    publishTime: Optional[str] = Field(default=None, description="发布时间")
    author: Optional[str] = Field(default=None, description="作者")
    source: Optional[str] = Field(default=None, description="来源")
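The new viewCount field is typed Optional[int], while the Selenium detail parser later in this commit assigns it the raw text found after "浏览人数:" (a string). A minimal sketch, not part of this commit and assuming pydantic v2, of a "before" validator that coerces such scraped strings; on pydantic v1 a validator with pre=True would play the same role. The model name here is hypothetical:

    from typing import Optional
    from pydantic import BaseModel, Field, field_validator

    class NewsItemSketch(BaseModel):
        viewCount: Optional[int] = Field(default=None, description="浏览数")

        @field_validator("viewCount", mode="before")
        @classmethod
        def _coerce_view_count(cls, v):
            # Accept raw strings scraped from the page, e.g. "1,234" or " 56 ".
            if isinstance(v, str):
                v = v.replace(",", "").strip()
                return int(v) if v.isdigit() else None
            return v

    # NewsItemSketch(viewCount="1,234").viewCount == 1234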


@@ -1 +0,0 @@


@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -38,7 +38,7 @@
    "    sys.path.insert(0, project_root)\n",
    "\n",
    "# Then import the modules\n",
-   "from crawler.RmrbCrawler import RmrbCrawler\n",
+   "from crawler.rmrb.RmrbCrawler import RmrbCrawler\n",
    "from crawler.BaseCrawler import NewsItem\n",
    "from loguru import logger\n",
    "import json\n",

@@ -1,5 +1,8 @@
# Xinhuanet crawler
from typing import List, Optional
+from bs4 import Tag
+from pydantic import InstanceOf
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
@@ -9,6 +12,17 @@ from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse, urlencode
import json
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.chrome.service import Service
+import time
+import random
+import os
class XhwCrawler(BaseCrawler):
    def __init__(self):
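WebDriverWait, expected_conditions, and By are imported here, although the code below mostly paces itself with fixed time.sleep() calls. A minimal sketch, not part of this commit, of an explicit wait on the search-results container; the "page-search-main" class comes from the parsing code further down, and the helper name is hypothetical:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_for_results(driver, timeout=15):
        # Blocks until the results container is present, or raises TimeoutException.
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.page-search-main"))
        )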
@@ -25,10 +39,20 @@ class XhwCrawler(BaseCrawler):
                "page": 1
            },
            headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
-                'Accept': 'application/json, text/plain, */*',
-                'Accept-Language': 'zh-CN,zh;q=0.9',
-                'Content-Type': 'application/json;charset=UTF-8'
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-Fetch-Dest': 'document',
+                'Sec-Fetch-Mode': 'navigate',
+                'Sec-Fetch-Site': 'none',
+                'Cache-Control': 'max-age=0',
+                'Referer': 'https://xhsz.news.cn/',
+                'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                'sec-ch-ua-mobile': '?0',
+                'sec-ch-ua-platform': '"Windows"'
            }
        ),
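The search endpoint's headers switch from JSON-API style (Accept: application/json, Content-Type: application/json) to browser-like navigation headers, matching the move from calling the search API directly to loading rendered pages. Headers configured here would only affect requests made through the requests library (presumably via self.fetch in parse_xhsz_news_detail below); the Selenium-driven Chrome sends its own headers. A minimal sketch, not part of this commit, of how such a header set is typically applied on the requests side; the session wiring is assumed, not taken from BaseCrawler:

    import requests

    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Referer": "https://xhsz.news.cn/",
    })
    # Every request made through this session now carries the browser-like headers.
    resp = session.get("https://xhsz.news.cn/")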
@@ -40,10 +64,90 @@
            "热点发布": "news"
        }
+        # Create the driver when the crawler is initialized
+        self.driver = self._init_driver()
+    def _init_driver(self):
+        """Initialize and return a Chrome WebDriver instance."""
+        chrome_options = Options()
+        # Keep the browser visible; do not use headless mode
+        # (this line could also be removed entirely, since headed mode is the default)
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option('useAutomationExtension', False)
+        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+        # Make sure the browser window is visible
+        chrome_options.add_argument('--start-maximized')
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--disable-web-security')
+        chrome_options.add_argument('--allow-running-insecure-content')
+        chrome_options.add_argument('--disable-features=VizDisplayCompositor')
+        chrome_options.add_argument('--remote-debugging-port=9222')  # add a debugging port
+        service = Service(executable_path=r"chromedriver.exe")
+        try:
+            self.driver = webdriver.Chrome(service=service, options=chrome_options)
+            logger.info("Chrome浏览器初始化成功")
+        except Exception as e:
+            logger.error(f"Chrome浏览器初始化失败: {str(e)}")
+            return None
+        # Set the implicit wait
+        self.driver.implicitly_wait(10)
+        # Visit the home page to pick up the initial cookies
+        logger.info("访问主页获取初始Cookie")
+        logger.info(f"准备访问URL: {self.config.base_url}")
+        try:
+            self.driver.get(self.config.base_url)
+            logger.info(f"成功访问URL: {self.config.base_url}")
+        except Exception as e:
+            logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}")
+            return None
+        time.sleep(random.uniform(2, 4))
+        # Check for a verification page
+        page_source = self.driver.page_source
+        if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+            logger.warning("检测到验证页面,尝试手动处理验证")
+            # Wait for the user to complete the verification manually
+            logger.info("请在30秒内手动完成验证...")
+            time.sleep(30)
+            # Refresh the page and check whether verification is done
+            self.driver.refresh()
+            time.sleep(random.uniform(2, 4))
+            # Re-check the verification status
+            page_source = self.driver.page_source
+            if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                logger.error("验证未完成,无法继续爬取")
+                # self.driver.quit()
+                # self.driver = None
+                return None
+        return self.driver
+    def __del__(self):
+        """Destructor: make sure the browser gets closed."""
+        if hasattr(self, 'driver') and self.driver:
+            self.driver.quit()
+            logger.info("浏览器已关闭")
    def search(self, key:str, total=10, action="news") -> ResultDomain:
-        resultDomain = ResultDomain()
+        # Make sure the driver is initialized
+        if not self.driver:
+            logger.error("WebDriver未初始化无法继续爬取")
+            resultDomain = ResultDomain(code=1, message="WebDriver未初始化无法继续爬取", success=False)
+            return resultDomain
+        # Use self.driver directly
+        news_urls = []
        news_list = []
-        resultDomain.dataList = news_list
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
        # Get the search config
        search_config = self.config.urls.get("search")
        if not search_config:
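The "wait 30 seconds for a manual verification, then re-check the page" pattern above is repeated again for every search page and every detail page in the code below. A minimal sketch, not part of this commit, of factoring it into one helper; the heuristics ("验证", "captcha", "滑动验证") and the 30-second wait are taken from the code above, while the helper names are hypothetical:

    import random
    import time
    from loguru import logger

    def _looks_like_captcha(page_source: str) -> bool:
        return ("验证" in page_source
                or "captcha" in page_source.lower()
                or "滑动验证" in page_source)

    def wait_for_manual_verification(driver, wait_seconds: int = 30) -> bool:
        """Return True once the page no longer looks like a verification page."""
        if not _looks_like_captcha(driver.page_source):
            return True
        logger.warning("检测到验证页面,尝试手动处理验证")
        time.sleep(wait_seconds)  # give the operator time to solve it
        driver.refresh()
        time.sleep(random.uniform(2, 4))
        return not _looks_like_captcha(driver.page_source)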
@@ -58,11 +162,294 @@
        search_data["k"] = key
        search_data["action"] = action
-        for page in range(1, total//pagesize+1):
+        # Collect the news URLs
+        url_base_map = {}
+        # Number of pages needed, rounded up
+        total_pages = (total + pagesize - 1) // pagesize
+        for page in range(1, total_pages + 1):
            search_data["page"] = page
            pageHtml = search_config.url + "?" + urlencode(search_data)
-            self.parse_html(pageHtml)  # the HTML of this results page
-        resultDomain.code = 0
-        resultDomain.message = "搜索成功"
-        resultDomain.success = True
+            logger.info(f"请求URL: {pageHtml}")
+            # Fetch the page with Selenium
+            self.driver.get(pageHtml)
+            time.sleep(random.uniform(2, 4))
+            # Check for a verification page
+            if not self.driver:
+                logger.error("WebDriver已失效无法继续爬取")
+                resultDomain = ResultDomain(code=1, message="WebDriver已失效无法继续爬取", success=False)
+                return resultDomain
+            page_source = self.driver.page_source
+            if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                logger.warning("检测到验证页面,尝试手动处理验证")
+                logger.info("请在30秒内手动完成验证...")
+                time.sleep(30)
+                # Check whether the driver is still alive
+                if not self.driver:
+                    logger.error("WebDriver已失效无法继续爬取")
+                    resultDomain = ResultDomain(code=1, message="WebDriver已失效无法继续爬取", success=False)
+                    return resultDomain
+                self.driver.refresh()
+                time.sleep(random.uniform(2, 4))
+                # Re-check the verification status
+                if not self.driver:
+                    logger.error("WebDriver已失效无法继续爬取")
+                    resultDomain = ResultDomain(code=1, message="WebDriver已失效无法继续爬取", success=False)
+                    return resultDomain
+                page_source = self.driver.page_source
+                if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                    logger.error("验证未完成,无法继续爬取")
+                    resultDomain = ResultDomain(code=1, message="验证未完成,无法继续爬取", success=False)
+                    return resultDomain
+            # Parse the page content
+            pageSoup = self.parse_html(page_source)
+            logger.info(f"解析后的HTML内容: {str(pageSoup)[:500]}...")  # only log the first 500 characters
+            # Extract the news URLs from this results page
+            searchMainDiv = pageSoup.find("div", class_="page-search-main")
+            if not searchMainDiv:
+                logger.error("未找到搜索主体部分")
+                resultDomain.code = 0
+                resultDomain.message = "未找到搜索主体部分"
+                resultDomain.success = False
+                return resultDomain
+            searchGroupDiv = searchMainDiv.find("div", class_="page-search-group")
+            if not searchGroupDiv:
+                logger.error("未找到搜索组")
+                resultDomain.code = 0
+                resultDomain.message = "未找到搜索组"
+                resultDomain.success = False
+                return resultDomain
+            newsDiv = searchGroupDiv.find("div", class_="page-search-news")
+            if not newsDiv:
+                logger.error("未找到新闻列表")
+                resultDomain.code = 0
+                resultDomain.message = "未找到新闻列表"
+                resultDomain.success = False
+                return resultDomain
+            newsList = newsDiv.find_all("div", class_="group")
+            for news in newsList:
+                news_info = news.find("div.head")
+                news_title = news_info.find("div.title")
+                news_date = news_info.find("div.date").text.strip()
+                url = news_title.find("a").get("href")
+                url_base_map[url] = {"title": news_title.get_text(strip=True), "date": news_date}
+                news_urls.append(url)
+        # Temporarily save the URLs to url.json
+        with open("url.json", "w", encoding="utf-8") as f:
+            json.dump(url_base_map, f, ensure_ascii=False, indent=4)
+        # Fetch the news detail for each collected URL
+        for news_url in news_urls:
+            news = self.parse_news_detail(news_url)
+            news.title = url_base_map.get(news_url, {}).get("title")
+            news.publishTime = url_base_map.get(news_url, {}).get("date")
+            news_list.append(news)
+        # Temporarily save the news items to news.json
+        with open("news.json", "w", encoding="utf-8") as f:
+            json.dump(news_list, f, ensure_ascii=False, indent=4)
+        # Close the browser
+        if self.driver:
+            self.driver.quit()
+            logger.info("浏览器已关闭")
        return resultDomain
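One caveat with the news.json dump above: news_list holds NewsItem pydantic models, which json.dump cannot serialize directly. A minimal sketch, not part of this commit, of converting to plain dicts first; model_dump assumes pydantic v2, and on v1 the equivalent is .dict():

    import json

    def dump_news(news_list, path="news.json"):
        payload = [item.model_dump() for item in news_list]
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=4)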
+    def parse_news_detail(self, url: str) -> Optional[NewsItem]:
+        return self.parse_xhsz_news_detail_selenium(url)
+    def parse_xhsz_news_detail_selenium(self, url: str) -> Optional[NewsItem]:
+        # Make sure the driver is initialized
+        if not self.driver:
+            logger.error("WebDriver未初始化无法获取新闻详情")
+            return None
+        newsItem = NewsItem(title="", contentRows=[], url=url)
+        # Open the news detail page with Selenium
+        self.driver.get(url)
+        time.sleep(random.uniform(2, 4))
+        # Check for a verification page
+        if not self.driver:
+            logger.error("WebDriver已失效无法获取新闻详情")
+            return None
+        page_source = self.driver.page_source
+        if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+            logger.warning("检测到验证页面,尝试手动处理验证")
+            logger.info("请在30秒内手动完成验证...")
+            time.sleep(30)
+            # Check whether the driver is still alive
+            if not self.driver:
+                logger.error("WebDriver已失效无法获取新闻详情")
+                return None
+            self.driver.refresh()
+            time.sleep(random.uniform(2, 4))
+            # Re-check the verification status
+            if not self.driver:
+                logger.error("WebDriver已失效无法获取新闻详情")
+                return None
+            page_source = self.driver.page_source
+            if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                logger.error("验证未完成,无法获取新闻详情")
+                return None
+        # Parse the page content
+        newsDetailSoup = self.parse_html(page_source)
+        # Locate the main news container
+        main_div = newsDetailSoup.find("div.page-news-detail")
+        if not main_div:
+            logger.error(f"未找到新闻主体部分: {url}")
+            return None
+        article_div = main_div.find("div.page-news-l")
+        if not article_div:
+            logger.error(f"未找到新闻文章部分: {url}")
+            return None
+        # Title
+        title_div = article_div.find("div.page-news-detail-title")
+        if title_div:
+            newsItem.title = title_div.text.strip()
+        # News metadata (source, publish time, view count)
+        channal_div = article_div.find("div.page-news-detail-note")
+        if channal_div:
+            channal_items_div = channal_div.find_all("div.item")
+            for item in channal_items_div:
+                text = item.text.strip()
+                if "来源" in text:
+                    parts = text.split(":", 1)
+                    if len(parts) > 1:
+                        newsItem.source = parts[1].strip()
+                elif "发布时间" in text:
+                    parts = text.split(":", 1)
+                    if len(parts) > 1:
+                        newsItem.publishTime = parts[1].strip()
+                elif "浏览人数" in text:
+                    parts = text.split(":", 1)
+                    if len(parts) > 1:
+                        newsItem.viewCount = parts[1].strip()
+        # News body
+        content_div = article_div.find("div.page-news-detail-content")
+        if content_div:
+            # Walk every element in the content area
+            for child in content_div.children:
+                if not isinstance(child, Tag):
+                    continue
+                # Images
+                if child.name == "p" and child.find("img"):
+                    img_tag = child.find("img")
+                    if img_tag:
+                        src = str(img_tag.get("src"))
+                        img_tag["src"] = self._normalize_url(src)
+                        newsItem.contentRows.append({
+                            "tag": "img",
+                            "content": str(img_tag)
+                        })
+                # Videos
+                elif child.find("video"):
+                    video_tag = child.find("video")
+                    if video_tag:
+                        src = str(video_tag.get("src"))
+                        video_tag["src"] = self._normalize_url(src)
+                        newsItem.contentRows.append({
+                            "tag": "video",
+                            "content": str(video_tag)
+                        })
+                # Plain paragraphs
+                elif child.name == "p" and child.get_text(strip=True):
+                    newsItem.contentRows.append({
+                        "tag": "p",
+                        "content": child.get_text(strip=True)
+                    })
+        return newsItem
+    def parse_xhsz_news_detail(self, url: str) -> Optional[NewsItem]:
+        newsItem = NewsItem(title="", contentRows=[], url=url)
+        response = self.fetch(url)
+        newsDetailSoup = self.parse_html(response.content)
+        main_div = newsDetailSoup.find("div.page-news-detail")
+        article_div = main_div.find("div.page-news-l")
+        title_div = article_div.find("div.page-news-detail-title")
+        channal_div = article_div.find("div.page-news-detail-note")
+        content_div = article_div.find("div.page-news-detail-content")
+        # News title
+        newsItem.title = title_div.text.strip()
+        # Source, publish time, view count
+        channal_items_div = channal_div.find_all("div.item")
+        if("来源" in channal_items_div[0].text):
+            newsItem.source = channal_items_div[0].text.strip().split(":")[1]
+        if("发布时间" in channal_items_div[1].text):
+            newsItem.publishTime = channal_items_div[1].text.strip().split(":")[1]
+        if("浏览人数" in channal_items_div[2].text):
+            newsItem.viewCount = channal_items_div[2].text.strip().split(":")[1]
+        for child in content_div.children:
+            if not isinstance(child, Tag):
+                continue
+            img_tag = child.find("img")
+            video_tag = child.find("video")
+            tag = "p"
+            content = str(child)
+            if img_tag:  # image
+                tag = "img"
+                src = str(img_tag.get("src"))
+                if src:
+                    img_tag["src"] = self._normalize_url(src)
+                content = str(img_tag)
+            elif video_tag:  # video
+                tag = "video"
+                src = str(video_tag.get("src"))
+                if src:
+                    video_tag["src"] = self._normalize_url(src)
+                content = str(video_tag)
+            newsItem.contentRows.append({"tag": tag, "content": content})
+        return newsItem
+    def _normalize_url(self, url: str) -> str:
+        """
+        Normalize a URL: fill in the protocol and domain.
+        Args:
+            url: the original URL
+        Returns:
+            the full URL
+        """
+        if not url:
+            return url
+        # Already a full URL
+        if url.startswith("http://") or url.startswith("https://"):
+            return url
+        # Protocol-relative URL: prepend https:
+        if url.startswith("//"):
+            return "https:" + url
+        # Relative path: prepend the base domain
+        return self.config.base_url + url
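Several lookups in this file pass CSS-selector strings to BeautifulSoup's find()/find_all() (for example find("div.head"), find("div.page-news-detail"), find_all("div.item")). find() treats its argument as a literal tag name, so these calls match nothing and return None, which is consistent with the AttributeError the notebook below runs into. A minimal sketch, not part of this commit, of the equivalent lookups that do match by class; the function name is hypothetical:

    from bs4 import BeautifulSoup

    def extract_head(news_html: str):
        news = BeautifulSoup(news_html, "html.parser")
        head = news.select_one("div.head")              # CSS-selector API
        # equivalent: head = news.find("div", class_="head")
        if head is None:
            return None, None
        title = head.select_one("div.title")
        date_div = head.select_one("div.date")
        return (
            title.get_text(strip=True) if title else None,
            date_div.get_text(strip=True) if date_div else None,
        )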


@@ -0,0 +1,139 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "948be230",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"项目根目录: f:\\Project\\schoolNews\n",
"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
]
}
],
"source": [
"# 自动重载模块(当文件修改后自动刷新)\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"\n",
"import sys\n",
"import os\n",
"\n",
"# 先添加项目根目录到路径(必须在导入之前)\n",
"project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
"if project_root not in sys.path:\n",
" sys.path.insert(0, project_root)\n",
"\n",
"# 然后再导入模块\n",
"from crawler.xhw.XhwCrawler import XhwCrawler\n",
"from crawler.BaseCrawler import NewsItem\n",
"from loguru import logger\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"print(f\"项目根目录: {project_root}\")\n",
"print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "31a8a0dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-19 19:03:54.324\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.BaseCrawler\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m71\u001b[0m - \u001b[1m初始化爬虫: XhwCrawler\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:55.214\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m20\u001b[0m - \u001b[1mChrome浏览器初始化成功\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:55.216\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m25\u001b[0m - \u001b[1m访问主页获取初始Cookie\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:55.217\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1m准备访问URL: https://xhsz.news.cn/\u001b[0m\n",
"\u001b[32m2025-11-19 19:03:57.557\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36m_init_driver\u001b[0m:\u001b[36m29\u001b[0m - \u001b[1m成功访问URL: https://xhsz.news.cn/\u001b[0m\n"
]
}
],
"source": [
"crawler = XhwCrawler()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e5a6e91c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2025-11-19 19:04:12.458\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m174\u001b[0m - \u001b[1m请求URL: https://xhsz.news.cn/s?k=%E4%B9%A0%E8%BF%91%E5%B9%B3&action=news&page=1\u001b[0m\n",
"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m188\u001b[0m - \u001b[33m\u001b[1m检测到验证页面尝试手动处理验证\u001b[0m\n",
"\u001b[32m2025-11-19 19:04:15.858\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m189\u001b[0m - \u001b[1m请在30秒内手动完成验证...\u001b[0m\n",
"\u001b[32m2025-11-19 19:04:48.814\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcrawler.xhw.XhwCrawler\u001b[0m:\u001b[36msearch\u001b[0m:\u001b[36m215\u001b[0m - \u001b[1m解析后的HTML内容: <html lang=\"en\"><head>\n",
"<meta charset=\"utf-8\"/>\n",
"<meta content=\"IE=edge\" http-equiv=\"X-UA-Compatible\"/>\n",
"<meta content=\"webkit\" name=\"renderer\"/>\n",
"<title>新华网新华思政-全国高校课程思政教学资源服务平台</title>\n",
"<meta content=\"新华思政,课程思政,全国高校课程思政教学资源服务平台,新华网,新华教育,思政教育.\" name=\"keywords\"/>\n",
"<meta content=\"新华网作为党和国家重要的网上舆论阵地,适时推出新华思政—全国高校课程思政教学资源服务平台,为全国高校教师针对课程思政建设、交流、学习和共享于一体的教学服务平台,旨在推广课程思政建设先进经验和做法,助力高校课程思政教学资源需求,深入挖掘课程思政元素,助力广泛开展课程思政建设的良好氛围,提升教师开展课程思政建设的意识和能力。\" name=\"description\"/>\n",
"<link href=\"/static/skin4/favicon.ico\" rel...\u001b[0m\n"
]
},
{
"ename": "AttributeError",
"evalue": "'NoneType' object has no attribute 'find'",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mcrawler\u001b[49m\u001b[43m.\u001b[49m\u001b[43msearch\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m习近平\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"xhsz\")\u001b[39;00m\n\u001b[32m 5\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n\u001b[32m 6\u001b[39m \u001b[38;5;66;03m# crawler.search(\"中国\", 10, \"news\")\u001b[39;00m\n",
"\u001b[36mFile \u001b[39m\u001b[32mf:\\Project\\schoolNews\\schoolNewsCrawler\\crawler\\xhw\\XhwCrawler.py:241\u001b[39m, in \u001b[36msearch\u001b[39m\u001b[34m(self, key, total, action)\u001b[39m\n\u001b[32m 239\u001b[39m news_info = news.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.head\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 240\u001b[39m news_title = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.title\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m241\u001b[39m news_date = news_info.find(\u001b[33m\"\u001b[39m\u001b[33mdiv.date\u001b[39m\u001b[33m\"\u001b[39m).text.strip()\n\u001b[32m 242\u001b[39m url = news_title.find(\u001b[33m\"\u001b[39m\u001b[33ma\u001b[39m\u001b[33m\"\u001b[39m).get(\u001b[33m\"\u001b[39m\u001b[33mhref\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 243\u001b[39m url_base_map[url] = {\u001b[33m\"\u001b[39m\u001b[33mtitle\u001b[39m\u001b[33m\"\u001b[39m: news_title.get_text(strip=\u001b[38;5;28;01mTrue\u001b[39;00m), \u001b[33m\"\u001b[39m\u001b[33mdate\u001b[39m\u001b[33m\"\u001b[39m: news_date}\n",
"\u001b[31mAttributeError\u001b[39m: 'NoneType' object has no attribute 'find'"
]
}
],
"source": [
"crawler.search(\"习近平\", 10)\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"xhsz\")\n",
"# crawler.search(\"中国\", 10, \"news\")\n",
"# crawler.search(\"中国\", 10, \"news\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e0f56fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "schoolNewsCrawler",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

str (new file, empty)