diff --git a/Optional[NewsItem] b/Optional[NewsItem]
new file mode 100644
index 0000000..e69de29
diff --git a/ResultDomain b/ResultDomain
new file mode 100644
index 0000000..e69de29
diff --git a/schoolNewsCrawler/__init__.py b/schoolNewsCrawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/schoolNewsCrawler/chromedriver.exe b/schoolNewsCrawler/chromedriver.exe
new file mode 100644
index 0000000..40f9872
Binary files /dev/null and b/schoolNewsCrawler/chromedriver.exe differ
diff --git a/schoolNewsCrawler/crawler/BaseCrawler.py b/schoolNewsCrawler/crawler/BaseCrawler.py
index fe26be1..60cd45f 100644
--- a/schoolNewsCrawler/crawler/BaseCrawler.py
+++ b/schoolNewsCrawler/crawler/BaseCrawler.py
@@ -2,7 +2,8 @@
 from typing import Callable, Dict, Optional, List, Any, Union
 from abc import ABC, abstractmethod
 import requests
-from bs4 import BeautifulSoup, NavigableString
+from bs4 import BeautifulSoup
+from bs4.element import Tag, NavigableString
 from loguru import logger
 from pydantic import BaseModel, Field, HttpUrl
 import json
@@ -45,6 +46,7 @@ class NewsItem(BaseModel):
     title: str = Field(..., description="新闻标题")
     contentRows: List[Dict[str, Any]] = Field(..., description="新闻内容")
     url: str = Field(..., description="新闻链接")
+    viewCount: Optional[int] = Field(default=None, description="浏览数")
     publishTime: Optional[str] = Field(default=None, description="发布时间")
     author: Optional[str] = Field(default=None, description="作者")
     source: Optional[str] = Field(default=None, description="来源")
diff --git a/schoolNewsCrawler/crawler/__init__.py b/schoolNewsCrawler/crawler/__init__.py
index 8b13789..e69de29 100644
--- a/schoolNewsCrawler/crawler/__init__.py
+++ b/schoolNewsCrawler/crawler/__init__.py
@@ -1 +0,0 @@
-
diff --git a/schoolNewsCrawler/crawler/test_crawler.ipynb b/schoolNewsCrawler/crawler/rmrb/test_crawler.ipynb
similarity index 98%
rename from schoolNewsCrawler/crawler/test_crawler.ipynb
rename to schoolNewsCrawler/crawler/rmrb/test_crawler.ipynb
index db294c2..dc7ef60 100644
--- a/schoolNewsCrawler/crawler/test_crawler.ipynb
+++ b/schoolNewsCrawler/crawler/rmrb/test_crawler.ipynb
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -38,7 +38,7 @@
    "    sys.path.insert(0, project_root)\n",
    "\n",
    "# 然后再导入模块\n",
-   "from crawler.RmrbCrawler import RmrbCrawler\n",
+   "from crawler.rmrb.RmrbCrawler import RmrbCrawler\n",
    "from crawler.BaseCrawler import NewsItem\n",
    "from loguru import logger\n",
    "import json\n",
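Note on the `NewsItem` change above: `viewCount` is declared `Optional[int]`, so existing callers that never set it keep working and the field defaults to `None`. A minimal sketch (values invented, project-relative import assumed):

```python
# Illustrative construction of the extended model; values are made up.
from crawler.BaseCrawler import NewsItem

item = NewsItem(
    title="示例新闻",
    contentRows=[{"tag": "p", "content": "正文"}],
    url="https://xhsz.news.cn/detail/1.html",
)
print(item.viewCount)  # None — the new field is optional and defaults to None
```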
diff --git a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
index b6b64e4..86e600b 100644
--- a/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
+++ b/schoolNewsCrawler/crawler/xhw/XhwCrawler.py
@@ -1,5 +1,8 @@
 # 新华网爬虫
 from typing import List, Optional
+
+from bs4 import Tag
 from core.ResultDomain import ResultDomain
 from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
 from loguru import logger
@@ -9,6 +12,17 @@
 from datetime import datetime, timedelta
 from bs4.element import NavigableString
 from urllib.parse import urlparse, urlencode
 import json
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.chrome.service import Service
+
+import time
+import random
+import os
 
 class XhwCrawler(BaseCrawler):
     def __init__(self):
@@ -25,10 +39,20 @@ class XhwCrawler(BaseCrawler):
                     "page": 1
                 },
                 headers={
-                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
-                    'Accept': 'application/json, text/plain, */*',
-                    'Accept-Language': 'zh-CN,zh;q=0.9',
-                    'Content-Type': 'application/json;charset=UTF-8'
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+                    'Accept-Encoding': 'gzip, deflate, br',
+                    'Connection': 'keep-alive',
+                    'Upgrade-Insecure-Requests': '1',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Cache-Control': 'max-age=0',
+                    'Referer': 'https://xhsz.news.cn/',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"Windows"'
                 }
             ),
@@ -39,11 +63,91 @@ class XhwCrawler(BaseCrawler):
             "全部": "index",
             "热点发布": "news"
         }
+
+        # 初始化时创建driver
+        self.driver = self._init_driver()
+
+    def _init_driver(self):
+        """初始化并返回Chrome WebDriver实例,失败时返回None"""
+        chrome_options = Options()
+        # 默认即为有界面模式,便于人工处理滑动验证
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        chrome_options.add_experimental_option('useAutomationExtension', False)
+        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
+        chrome_options.add_argument('--start-maximized')
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--disable-web-security')
+        chrome_options.add_argument('--allow-running-insecure-content')
+        chrome_options.add_argument('--disable-features=VizDisplayCompositor')
+        chrome_options.add_argument('--remote-debugging-port=9222')  # 添加调试端口
+        service = Service(executable_path=r"chromedriver.exe")
+
+        try:
+            self.driver = webdriver.Chrome(service=service, options=chrome_options)
+            logger.info("Chrome浏览器初始化成功")
+        except Exception as e:
+            logger.error(f"Chrome浏览器初始化失败: {str(e)}")
+            return None
+
+        # 设置隐式等待时间
+        self.driver.implicitly_wait(10)
+
+        # 访问主页获取初始Cookie
+        logger.info("访问主页获取初始Cookie")
+        logger.info(f"准备访问URL: {self.config.base_url}")
+        try:
+            self.driver.get(self.config.base_url)
+            logger.info(f"成功访问URL: {self.config.base_url}")
+        except Exception as e:
+            logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}")
+            self.driver.quit()
+            return None
+        time.sleep(random.uniform(2, 4))
+
+        # 检查是否有验证页面
+        page_source = self.driver.page_source
+        if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+            logger.warning("检测到验证页面,尝试手动处理验证")
+
+            # 等待用户手动处理验证
+            logger.info("请在30秒内手动完成验证...")
+            time.sleep(30)
+
+            # 刷新页面,检查验证是否完成
+            self.driver.refresh()
+            time.sleep(random.uniform(2, 4))
+
+            # 再次检查验证状态
+            page_source = self.driver.page_source
+            if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                logger.error("验证未完成,无法继续爬取")
+                # 验证失败时必须quit,否则浏览器进程泄漏
+                self.driver.quit()
+                return None
+
+        return self.driver
+
+    def __del__(self):
+        """析构函数,确保关闭浏览器"""
+        if hasattr(self, 'driver') and self.driver:
+            self.driver.quit()
+            logger.info("浏览器已关闭")
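The options in `_init_driver` (`--disable-blink-features=AutomationControlled`, `excludeSwitches`) suppress the most obvious automation banners, but `navigator.webdriver` can still be visible to page scripts. One optional hardening step, sketched here and not part of this diff, injects a mask via the Chrome DevTools Protocol before any page script runs:

```python
# Hypothetical hardening sketch (assumes `driver` is a selenium Chrome instance):
# Page.addScriptToEvaluateOnNewDocument runs this JS before each page loads,
# hiding the navigator.webdriver flag that some verification pages check.
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
)
```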
 
     def search(self, key: str, total=10, action="news") -> ResultDomain:
-        resultDomain = ResultDomain()
+        # 检查driver是否已初始化
+        if not self.driver:
+            logger.error("WebDriver未初始化,无法继续爬取")
+            return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)
+
+        news_urls = []
         news_list = []
-        resultDomain.dataList = news_list
+        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
         # 获取搜索配置
         search_config = self.config.urls.get("search")
         if not search_config:
@@ -58,11 +162,294 @@ class XhwCrawler(BaseCrawler):
         search_data["k"] = key
         search_data["action"] = action
 
-        for page in range(1, total//pagesize+1):
+        # 获取新闻url
+        url_base_map = {}
+        # 向上取整计算需要的页数
+        total_pages = (total + pagesize - 1) // pagesize
+        for page in range(1, total_pages + 1):
             search_data["page"] = page
             pageHtml = search_config.url + "?" + urlencode(search_data)
-            self.parse_html(pageHtml)
-        resultDomain.code = 0
-        resultDomain.message = "搜索成功"
-        resultDomain.success = True
-        return resultDomain
\ No newline at end of file
+            # 分页的html
+            logger.info(f"请求URL: {pageHtml}")
+
+            # 使用Selenium访问页面
+            self.driver.get(pageHtml)
+            time.sleep(random.uniform(2, 4))
+
+            # 检查是否有验证页面
+            page_source = self.driver.page_source
+            if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                logger.warning("检测到验证页面,尝试手动处理验证")
+                logger.info("请在30秒内手动完成验证...")
+                time.sleep(30)
+
+                self.driver.refresh()
+                time.sleep(random.uniform(2, 4))
+
+                # 再次检查验证状态
+                page_source = self.driver.page_source
+                if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
+                    logger.error("验证未完成,无法继续爬取")
+                    return ResultDomain(code=1, message="验证未完成,无法继续爬取", success=False)
+
+            # 解析页面内容
+            pageSoup = self.parse_html(page_source)
+            logger.info(f"解析后的HTML内容: {str(pageSoup)[:500]}...")  # 只输出前500个字符
+            # 从分页中获取新闻url
+            searchMainDiv = pageSoup.find("div", class_="page-search-main")
+            if not searchMainDiv:
+                logger.error("未找到搜索主体部分")
+                return ResultDomain(code=1, message="未找到搜索主体部分", success=False)
+            searchGroupDiv = searchMainDiv.find("div", class_="page-search-group")
+            if not searchGroupDiv:
+                logger.error("未找到搜索组")
+                return ResultDomain(code=1, message="未找到搜索组", success=False)
+            newsDiv = searchGroupDiv.find("div", class_="page-search-news")
+            if not newsDiv:
+                logger.error("未找到新闻列表")
+                return ResultDomain(code=1, message="未找到新闻列表", success=False)
+            newsList = newsDiv.find_all("div", class_="group")
+            for news in newsList:
+                # find()不支持CSS选择器,必须使用select_one
+                news_info = news.select_one("div.head")
+                if not news_info:
+                    continue
+                news_title = news_info.select_one("div.title")
+                news_date_div = news_info.select_one("div.date")
+                if not news_title or not news_date_div:
+                    continue
+                news_date = news_date_div.text.strip()
+                link = news_title.select_one("a")
+                if not link:
+                    continue
+                url = link.get("href")
+                url_base_map[url] = {"title": news_title.get_text(strip=True), "date": news_date}
+                news_urls.append(url)
+
+        # 临时保存url到url.json
+        with open("url.json", "w", encoding="utf-8") as f:
+            json.dump(url_base_map, f, ensure_ascii=False, indent=4)
+        # 从新闻url中获取新闻详情
+        for news_url in news_urls:
+            news = self.parse_news_detail(news_url)
+            if news is None:
+                continue
+            news.title = url_base_map.get(news_url, {}).get("title")
+            news.publishTime = url_base_map.get(news_url, {}).get("date")
+            news_list.append(news)
+
+        # 临时保存新闻到news.json(NewsItem是pydantic模型,先转成dict再序列化)
+        with open("news.json", "w", encoding="utf-8") as f:
+            json.dump([n.model_dump() for n in news_list], f, ensure_ascii=False, indent=4)
+
+        # 关闭浏览器,并置空以避免__del__重复quit
+        if self.driver:
+            self.driver.quit()
+            self.driver = None
+            logger.info("浏览器已关闭")
+
+        return resultDomain
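The fix applied in the `search` loop above is worth spelling out: BeautifulSoup's `find()` matches tag names, so `find("div.head")` looks for a literal `<div.head>` element and returns `None` — which is exactly what produced the `AttributeError` recorded in the test notebook at the end of this diff. CSS selectors require `select()`/`select_one()`. A self-contained illustration (the HTML snippet is invented):

```python
# find() vs CSS selectors in BeautifulSoup — illustrative only.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div class="head"><div class="date">2025-11-19</div></div>', "html.parser")
assert soup.find("div.head") is None             # no tag literally named "div.head"
assert soup.find("div", class_="head") is not None  # tag name plus class filter
assert soup.select_one("div.head") is not None      # CSS selector, equivalent here
```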
{"title": news_title.get_text(strip=True), "date": news_date} + news_urls.append(url) + # 临时保存url到url.json + with open("url.json", "w", encoding="utf-8") as f: + json.dump(url_base_map, f, ensure_ascii=False, indent=4) + # 从新闻url中获取新闻详情 + for news_url in news_urls: + news = self.parse_news_detail(news_url) + news.title = url_base_map.get(news_url, {}).get("title") + news.publishTime = url_base_map.get(news_url, {}).get("date") + news_list.append(news) + + # 临时保存新闻到news.json + with open("news.json", "w", encoding="utf-8") as f: + json.dump(news_list, f, ensure_ascii=False, indent=4) + + # 关闭浏览器 + if self.driver: + self.driver.quit() + logger.info("浏览器已关闭") + + return resultDomain + + def parse_news_detail(self, url: str) -> Optional[NewsItem]: + return self.parse_xhsz_news_detail_selenium(url) + + def parse_xhsz_news_detail_selenium(self, url: str) -> Optional[NewsItem]: + # 检查driver是否已初始化 + if not self.driver: + logger.error("WebDriver未初始化,无法获取新闻详情") + return None + + newsItem = NewsItem(title="", contentRows=[], url=url) + + # 使用Selenium访问新闻详情页 + self.driver.get(url) + time.sleep(random.uniform(2, 4)) + + # 检查是否有验证页面 + if not self.driver: + logger.error("WebDriver已失效,无法获取新闻详情") + return None + + page_source = self.driver.page_source + if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source: + logger.warning("检测到验证页面,尝试手动处理验证") + logger.info("请在30秒内手动完成验证...") + time.sleep(30) + + # 检查driver是否仍然有效 + if not self.driver: + logger.error("WebDriver已失效,无法获取新闻详情") + return None + + self.driver.refresh() + time.sleep(random.uniform(2, 4)) + + # 再次检查验证状态 + if not self.driver: + logger.error("WebDriver已失效,无法获取新闻详情") + return None + + page_source = self.driver.page_source + if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source: + logger.error("验证未完成,无法获取新闻详情") + return None + + # 解析页面内容 + newsDetailSoup = self.parse_html(page_source) + + # 查找新闻主体部分 + main_div = newsDetailSoup.find("div.page-news-detail") + if not main_div: + logger.error(f"未找到新闻主体部分: {url}") + return None + + article_div = main_div.find("div.page-news-l") + if not article_div: + logger.error(f"未找到新闻文章部分: {url}") + return None + + # 获取标题 + title_div = article_div.find("div.page-news-detail-title") + if title_div: + newsItem.title = title_div.text.strip() + + # 获取新闻元信息 + channal_div = article_div.find("div.page-news-detail-note") + if channal_div: + channal_items_div = channal_div.find_all("div.item") + for item in channal_items_div: + text = item.text.strip() + if "来源" in text: + parts = text.split(":", 1) + if len(parts) > 1: + newsItem.source = parts[1].strip() + elif "发布时间" in text: + parts = text.split(":", 1) + if len(parts) > 1: + newsItem.publishTime = parts[1].strip() + elif "浏览人数" in text: + parts = text.split(":", 1) + if len(parts) > 1: + newsItem.viewCount = parts[1].strip() + + # 获取新闻内容 + content_div = article_div.find("div.page-news-detail-content") + if content_div: + # 遍历内容区域中的所有元素 + for child in content_div.children: + if not isinstance(child, Tag): + continue + + # 处理图片 + if child.name == "p" and child.find("img"): + img_tag = child.find("img") + if img_tag: + src = str(img_tag.get("src")) + img_tag["src"] = self._normalize_url(src) + newsItem.contentRows.append({ + "tag": "img", + "content": str(img_tag) + }) + # 处理视频 + elif child.find("video"): + video_tag = child.find("video") + if video_tag: + src = str(video_tag.get("src")) + video_tag["src"] = self._normalize_url(src) + newsItem.contentRows.append({ + "tag": "video", + "content": str(video_tag) + }) + # 
+
+    def parse_xhsz_news_detail(self, url: str) -> Optional[NewsItem]:
+        newsItem = NewsItem(title="", contentRows=[], url=url)
+        response = self.fetch(url)
+        newsDetailSoup = self.parse_html(response.content)
+
+        # CSS选择器需用select_one,而非find
+        main_div = newsDetailSoup.select_one("div.page-news-detail")
+        if not main_div:
+            return None
+        article_div = main_div.select_one("div.page-news-l")
+        if not article_div:
+            return None
+        title_div = article_div.select_one("div.page-news-detail-title")
+        channal_div = article_div.select_one("div.page-news-detail-note")
+        content_div = article_div.select_one("div.page-news-detail-content")
+
+        # 获取新闻标题
+        if title_div:
+            newsItem.title = title_div.text.strip()
+
+        # 获取新闻来源、发布时间、浏览人数
+        channal_items_div = channal_div.select("div.item") if channal_div else []
+        if len(channal_items_div) > 0 and "来源" in channal_items_div[0].text:
+            newsItem.source = channal_items_div[0].text.strip().split(":", 1)[1]
+        if len(channal_items_div) > 1 and "发布时间" in channal_items_div[1].text:
+            newsItem.publishTime = channal_items_div[1].text.strip().split(":", 1)[1]
+        if len(channal_items_div) > 2 and "浏览人数" in channal_items_div[2].text:
+            count = channal_items_div[2].text.strip().split(":", 1)[1]
+            # viewCount声明为Optional[int],需要先转换
+            if count.isdigit():
+                newsItem.viewCount = int(count)
+
+        for child in (content_div.children if content_div else []):
+            if not isinstance(child, Tag):
+                continue
+
+            img_tag = child.find("img")
+            video_tag = child.find("video")
+            tag = "p"
+            content = str(child)
+            if img_tag:  # 是图片
+                tag = "img"
+                src = str(img_tag.get("src"))
+                if src:
+                    img_tag["src"] = self._normalize_url(src)
+                content = str(img_tag)
+            elif video_tag:  # 是视频
+                tag = "video"
+                src = str(video_tag.get("src"))
+                if src:
+                    video_tag["src"] = self._normalize_url(src)
+                content = str(video_tag)
+
+            newsItem.contentRows.append({"tag": tag, "content": content})
+
+        return newsItem
+
+    def _normalize_url(self, url: str) -> str:
+        """
+        规范化 URL,补全协议和域名
+
+        Args:
+            url: 原始 URL
+
+        Returns:
+            完整的 URL
+        """
+        if not url:
+            return url
+
+        # 已经是完整 URL
+        if url.startswith("http://") or url.startswith("https://"):
+            return url
+
+        # 协议相对 URL,补充 https:
+        if url.startswith("//"):
+            return "https:" + url
+
+        # 相对路径,补全域名
+        return self.config.base_url + url
diff --git a/schoolNewsCrawler/crawler/xhw/__init__.py b/schoolNewsCrawler/crawler/xhw/__init__.py
new file mode 100644
index 0000000..e69de29
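`_normalize_url` hand-rolls logic that the standard library already covers: `urllib.parse.urljoin` (from the same module this file already imports `urlparse` from) handles absolute, protocol-relative, and root-relative URLs in one call. An illustrative alternative, not part of this diff:

```python
# urljoin-based alternative to _normalize_url; base URL assumed from the config.
from urllib.parse import urljoin

base = "https://xhsz.news.cn/"  # assumed value of self.config.base_url
assert urljoin(base, "https://example.com/a") == "https://example.com/a"
assert urljoin(base, "//img.news.cn/p.jpg") == "https://img.news.cn/p.jpg"
assert urljoin(base, "/detail/1.html") == "https://xhsz.news.cn/detail/1.html"
```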
diff --git a/schoolNewsCrawler/test.ipynb b/schoolNewsCrawler/test.ipynb
new file mode 100644
index 0000000..01728a5
--- /dev/null
+++ b/schoolNewsCrawler/test.ipynb
@@ -0,0 +1,139 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "948be230",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "项目根目录: f:\\Project\\schoolNews\n",
+      "✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 自动重载模块(当文件修改后自动刷新)\n",
+    "%reload_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "# 先添加项目根目录到路径(必须在导入之前)\n",
+    "project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
+    "if project_root not in sys.path:\n",
+    "    sys.path.insert(0, project_root)\n",
+    "\n",
+    "# 然后再导入模块\n",
+    "from crawler.xhw.XhwCrawler import XhwCrawler\n",
+    "from crawler.BaseCrawler import NewsItem\n",
+    "from loguru import logger\n",
+    "import json\n",
+    "from pprint import pprint\n",
+    "\n",
+    "print(f\"项目根目录: {project_root}\")\n",
+    "print(\"✓ 已启用自动重载模块功能 - 修改 .py 文件后会自动生效\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "31a8a0dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-11-19 19:03:54.324 | INFO     | crawler.BaseCrawler:__init__:71 - 初始化爬虫: XhwCrawler\n",
+      "2025-11-19 19:03:55.214 | INFO     | crawler.xhw.XhwCrawler:_init_driver:20 - Chrome浏览器初始化成功\n",
+      "2025-11-19 19:03:55.216 | INFO     | crawler.xhw.XhwCrawler:_init_driver:25 - 访问主页获取初始Cookie\n",
+      "2025-11-19 19:03:55.217 | INFO     | crawler.xhw.XhwCrawler:_init_driver:26 - 准备访问URL: https://xhsz.news.cn/\n",
+      "2025-11-19 19:03:57.557 | INFO     | crawler.xhw.XhwCrawler:_init_driver:29 - 成功访问URL: https://xhsz.news.cn/\n"
+     ]
+    }
+   ],
+   "source": [
+    "crawler = XhwCrawler()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e5a6e91c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-11-19 19:04:12.458 | INFO     | crawler.xhw.XhwCrawler:search:174 - 请求URL: https://xhsz.news.cn/s?k=%E4%B9%A0%E8%BF%91%E5%B9%B3&action=news&page=1\n",
+      "2025-11-19 19:04:15.858 | WARNING  | crawler.xhw.XhwCrawler:search:188 - 检测到验证页面,尝试手动处理验证\n",
+      "2025-11-19 19:04:15.858 | INFO     | crawler.xhw.XhwCrawler:search:189 - 请在30秒内手动完成验证...\n",
+      "2025-11-19 19:04:48.814 | INFO     | crawler.xhw.XhwCrawler:search:215 - 解析后的HTML内容: 新华网新华思政-全国高校课程思政教学资源服务平台 ...\n"
+     ]
+    },
+    {
+     "ename": "AttributeError",
+     "evalue": "'NoneType' object has no attribute 'find'",
+     "output_type": "error",
+     "traceback": [
+      "Cell In[5], line 1",
+      "----> 1 crawler.search(\"习近平\", 10)",
+      "File f:\\Project\\schoolNews\\schoolNewsCrawler\\crawler\\xhw\\XhwCrawler.py:241, in search(self, key, total, action)",
+      "    239 news_info = news.find(\"div.head\")",
+      "    240 news_title = news_info.find(\"div.title\")",
+      "--> 241 news_date = news_info.find(\"div.date\").text.strip()",
+      "    242 url = news_title.find(\"a\").get(\"href\")",
+      "    243 url_base_map[url] = {\"title\": news_title.get_text(strip=True), \"date\": news_date}",
+      "AttributeError: 'NoneType' object has no attribute 'find'"
+     ]
+    }
+   ],
+   "source": [
+    "crawler.search(\"习近平\", 10)\n",
+    "# crawler.search(\"中国\", 10, \"xhsz\")\n",
+    "# crawler.search(\"中国\", 10, \"news\")\n",
+    "# crawler.search(\"中国\", 10, \"xhsz\")\n",
+    "# crawler.search(\"中国\", 10, \"news\")\n",
+    "# crawler.search(\"中国\", 10, \"news\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e0f56fa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "schoolNewsCrawler",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/str b/str
new file mode 100644
index 0000000..e69de29
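Driver lifecycle in this diff is split across `__init__` (creates), `search` (quits at the end), and `__del__` (quits again unless the reference was cleared). A context-manager wrapper is one way to guarantee exactly one `quit()` even when a crawl raises. A hypothetical sketch, not part of this diff:

```python
# Hypothetical helper: scope a Chrome session to a with-block so quit() always runs.
from contextlib import contextmanager
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

@contextmanager
def chrome_session(executable_path: str = "chromedriver.exe"):
    driver = webdriver.Chrome(service=Service(executable_path=executable_path),
                              options=Options())
    try:
        yield driver
    finally:
        driver.quit()  # runs exactly once, even on exceptions

# Usage:
# with chrome_session() as driver:
#     driver.get("https://xhsz.news.cn/")
```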