Important News

2025-11-21 14:55:50 +08:00
parent 0e7cee3070
commit 7ccec2b624
4 changed files with 2018 additions and 219 deletions

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,140 @@
# Xuexi Qiangguo "Important News" Crawler Usage Guide
## Overview
A new `crawl_important` method has been added to the `XxqgCrawler` class for crawling articles from the "Important News" (重要新闻) channel of Xuexi Qiangguo (学习强国).
## How It Works
The method combines the strengths of the legacy `myQiangguo` crawler and the new Selenium-based crawler (see the sketch after this list):
1. **Fetch the article list**: following the legacy crawler's approach, request a JSON endpoint directly with the `requests` library to obtain the article list
   - JSON endpoint: `https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26095725`
   - The response is a list carrying each article's URL, title, source, and other basic fields
2. **Parse article details**: use the existing `parse_news_detail` method (Selenium-based) to parse each article's full content
   - Extracts the title, publish time, and source
   - Extracts the body content (text, images, videos)
   - Preserves the complete article structure
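For orientation, here is a minimal sketch of step 1. It assumes the endpoint returns a JSON array whose items expose `url`, `title`, and `source` fields; the field names and the `fetch_article_list` helper are illustrative, not part of the crawler itself.
```python
import requests

# Hypothetical helper sketching step 1: fetch the "Important News" article list.
LIST_URL = "https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26095725"

def fetch_article_list(max_count=60):
    resp = requests.get(LIST_URL, timeout=10)
    resp.raise_for_status()
    items = resp.json()  # assumed shape: a JSON array of article objects
    return [
        {
            "url": item.get("url", ""),
            "title": item.get("title", ""),
            "source": item.get("source", ""),
        }
        for item in items[:max_count]
    ]
```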
## Usage
### Basic usage
```python
from crawler.xxqg.XxqgCrawler import XxqgCrawler

# Initialize the crawler
crawler = XxqgCrawler()
# Crawl important news (at most 60 articles by default)
result = crawler.crawl_important()
# Check the result
if result.success:
    print(f"Fetched {len(result.dataList)} articles")
    for news in result.dataList:
        print(f"Title: {news.title}")
        print(f"Source: {news.source}")
        print(f"Publish time: {news.publishTime}")
else:
    print(f"Crawl failed: {result.message}")
# Close the browser
crawler.driver.quit()
```
### Customizing the number of articles
```python
# Only crawl the first 10 articles
result = crawler.crawl_important(max_count=10)
```
### Running the test script
```bash
cd f:\Project\schoolNews\schoolNewsCrawler\crawler\xxqg
python test_important_crawler.py
```
## Output
When crawling finishes, the results are automatically saved to `Xxqg_important_news.json` with the following structure (a sketch for reading the file back follows the example):
```json
[
    {
        "title": "Article title",
        "url": "Article URL",
        "source": "Source",
        "publishTime": "Publish time",
        "contentRows": [
            {
                "type": "text",
                "content": "Paragraph text"
            },
            {
                "type": "img",
                "content": "<img src='image URL' />"
            }
        ]
    }
]
```
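For convenience, a small sketch of loading the saved file back, assuming the structure shown above:
```python
import json

# Load the saved results and print a one-line summary per article.
with open("Xxqg_important_news.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

for article in articles:
    text_rows = [row for row in article["contentRows"] if row["type"] == "text"]
    print(f"{article['title']} ({article['source']}, {article['publishTime']}): "
          f"{len(text_rows)} text paragraphs")
```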
## Parameters
### `crawl_important(max_count=60)`
- **max_count**: maximum number of articles to crawl, 60 by default
- **Return value**: a `ResultDomain` object (illustrative model sketch below)
  - `success`: whether the crawl succeeded
  - `code`: status code (0 = success, 1 = failure)
  - `message`: human-readable message
  - `dataList`: list of news items (`List[NewsItem]`)
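These types are not defined in this document; as a rough guide, the fields above suggest models along these lines. This is a guess expressed with pydantic, since the crawler serializes items via `model_dump()`; the real definitions live elsewhere in the project.
```python
from typing import List, Optional
from pydantic import BaseModel

# Illustrative shapes only, inferred from the fields documented above.
class NewsItem(BaseModel):
    title: str
    url: str = ""
    source: str = ""
    publishTime: str = ""
    contentRows: List[dict] = []  # rows of {"type": ..., "content": ...}

class ResultDomain(BaseModel):
    code: int = 0                 # 0 = success, 1 = failure
    message: str = ""
    success: bool = True
    dataList: Optional[List[NewsItem]] = None
```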
## Notes
1. **Browser initialization**: the first run automatically opens a Chrome browser and visits the Xuexi Qiangguo home page to obtain cookies
2. **Captcha handling**: if a captcha appears, the program pauses for 30 seconds so the user can complete the verification manually
3. **Crawl speed**: a random 1-2 second delay is inserted between articles to avoid being blocked for requesting too quickly
4. **Resource cleanup**: remember to call `crawler.driver.quit()` when you are done to close the browser (see the sketch below)
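A minimal sketch of points 3 and 4, assuming the import path from the basic usage example; the explicit `time.sleep` is only needed if you drive `parse_news_detail` yourself, since `crawl_important` already throttles internally:
```python
import random
import time

from crawler.xxqg.XxqgCrawler import XxqgCrawler

crawler = XxqgCrawler()
try:
    result = crawler.crawl_important()
    if result.success:
        print(f"Fetched {len(result.dataList)} articles")
    # Example of the per-article throttling described in point 3:
    # time.sleep(random.uniform(1, 2))
finally:
    # Point 4: always release the browser, even if crawling raised an exception.
    if crawler.driver:
        crawler.driver.quit()
```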
## Comparison with the Legacy Crawler
### Legacy crawler (myQiangguo)
- Uses `requests` + `BeautifulSoup` to parse static HTML
- Depends on the specific `data+MD5.js` interface format
- Has to handle URLs in different formats (.html and .json)
### New crawler (XxqgCrawler)
- Combines `requests` for the list with `Selenium` for article details
- Can handle dynamically loaded content
- Unified interface and return format
- Better error handling and logging
## Extending to Other Channels
To crawl another channel, follow the implementation of `crawl_important` and swap in the corresponding JSON endpoint URL (see the sketch after this list).
Common channel JSON endpoints:
- Important News: `https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26095725`
- Important Activities: `https://www.xuexi.cn/lgdata/1jpuhp6fn73.json?_st=26095746`
- Important Meetings: `https://www.xuexi.cn/lgdata/19vhj0omh73.json?_st=26095747`
- Important Speeches: `https://www.xuexi.cn/lgdata/132gdqo7l73.json?_st=26095749`
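Building on the list above, a hedged sketch of what a channel-agnostic variant could look like; the `CHANNEL_ENDPOINTS` mapping and `crawl_channel` helper are illustrative and assume the same JSON-array response shape as in the earlier sketch:
```python
import requests

# Channel names mapped to the list endpoints above (names are illustrative).
CHANNEL_ENDPOINTS = {
    "important_news": "https://www.xuexi.cn/lgdata/1jscb6pu1n2.json?_st=26095725",
    "important_activities": "https://www.xuexi.cn/lgdata/1jpuhp6fn73.json?_st=26095746",
    "important_meetings": "https://www.xuexi.cn/lgdata/19vhj0omh73.json?_st=26095747",
    "important_speeches": "https://www.xuexi.cn/lgdata/132gdqo7l73.json?_st=26095749",
}

def crawl_channel(crawler, channel, max_count=10):
    """Hypothetical helper: fetch a channel's article list, then parse each article."""
    resp = requests.get(CHANNEL_ENDPOINTS[channel], timeout=10)
    resp.raise_for_status()
    news = []
    for item in resp.json()[:max_count]:    # assumed: a JSON array of article objects
        detail = crawler.parse_news_detail(item.get("url", ""))
        if not detail.title:                # fall back to list metadata if the page yielded nothing
            detail.title = item.get("title", "")
        news.append(detail)
    return news
```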
## Architecture
```
crawl_important()
├── Fetch the JSON list with requests
│   └── Parse article URLs and basic metadata
├── Iterate over the URL list
│   ├── parse_news_detail() (Selenium)
│   │   ├── Open the article page
│   │   ├── Extract title, time, source
│   │   └── Parse the content (text, images, videos)
│   └── Fill in any missing fields
└── Save the results to a JSON file
```
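Put together, the tree above corresponds roughly to the following skeleton. It is a sketch under the same assumptions as the earlier `fetch_article_list` example, not the shipped implementation:
```python
import json
import random
import time

def crawl_important_sketch(crawler, max_count=60):
    """Illustrative end-to-end flow matching the tree above."""
    articles = fetch_article_list(max_count)           # requests step (see the earlier sketch)
    results = []
    for meta in articles:
        item = crawler.parse_news_detail(meta["url"])  # Selenium step
        # Fill in fields the detail page did not yield from the list metadata.
        item.title = item.title or meta["title"]
        item.source = item.source or meta["source"]
        results.append(item)
        time.sleep(random.uniform(1, 2))               # polite delay between articles
    with open("Xxqg_important_news.json", "w", encoding="utf-8") as f:
        json.dump([n.model_dump() for n in results], f, ensure_ascii=False, indent=4)
    return results
```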

View File

@@ -78,6 +78,27 @@ class XxqgCrawler(BaseCrawler):
                    'sec-ch-ua-mobile': '?0',
                    'sec-ch-ua-platform': '"Windows"'
                }
            ),
            "home": UrlConfig(
                url="https://www.xuexi.cn/",
                method="GET",
                params={},
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Connection': 'keep-alive',
                    'Upgrade-Insecure-Requests': '1',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Cache-Control': 'max-age=0',
                    'Referer': 'https://www.xuexi.cn/',
                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
                    'sec-ch-ua-mobile': '?0',
                    'sec-ch-ua-platform': '"Windows"'
                }
            )
        },
@@ -180,6 +201,127 @@ class XxqgCrawler(BaseCrawler):
        # Relative path: prepend the base domain
        return self.config.base_url + url
    def parse_news_detail(self, url: str) -> NewsItem:
        news_item = NewsItem(title='', contentRows=[], url=url)
        if self.driver is None:
            return news_item
        try:
            self.driver.get(url)
            article_area_div = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.render-detail-article'))
            )
        except Exception as e:
            logger.warning(f"Failed to open the article page or article area not found: {url}, {e}")
            return news_item
        # Basic metadata
        try:
            title_div = article_area_div.find_element(By.CSS_SELECTOR, "div.render-detail-title")
            news_item.title = title_div.text.strip()
        except Exception as e:
            logger.warning(f"Failed to extract the title: {e}")
        try:
            time_div = article_area_div.find_element(By.CSS_SELECTOR, "span.render-detail-time")
            news_item.publishTime = time_div.text.strip()
        except Exception as e:
            logger.warning(f"Failed to extract the publish time: {e}")
        try:
            source_div = article_area_div.find_element(By.CSS_SELECTOR, "span.render-detail-source")
            news_item.source = source_div.text.strip().split("")[1]
        except Exception as e:
            logger.warning(f"Failed to extract the source: {e}")
        # Locate the article content area
        try:
            article_content_div = article_area_div.find_element(By.CSS_SELECTOR, "div.render-detail-article-content")
        except Exception as e:
            logger.warning(f"Article content area not found: {e}")
            return news_item

        # Check whether the article is paginated
        def is_page():
            try:
                page_div = article_content_div.find_element(By.CSS_SELECTOR, "div.detail-pagination-wrap")
                return page_div is not None and page_div.is_displayed()
            except:
                return False

        def get_content_rows():
            """Extract the article content rows"""
            try:
                content_div = article_content_div.find_element(By.CSS_SELECTOR, "div.render-detail-content")
            except Exception as e:
                logger.warning(f"Content area not found: {str(e)}")
                return
            # Iterate over the direct children of the content area
            children = content_div.find_elements(By.XPATH, "./*")
            for child in children:
                try:
                    # The class attribute decides how the element is handled
                    class_name = child.get_attribute("class") or ""
                    # Image element
                    if "article-img" in class_name:
                        try:
                            img = child.find_element(By.TAG_NAME, "img")
                            img_src = img.get_attribute("src")
                            if img_src:
                                # Normalize the URL
                                img_src = self._normalize_url(img_src)
                                # Append as an image tag
                                news_item.contentRows.append({
                                    "type": "img",
                                    "content": f'<img src="{img_src}" />'
                                })
                                logger.debug(f"Extracted image: {img_src}")
                                continue
                        except Exception as e:
                            logger.warning(f"Failed to extract image: {str(e)}")
                    # Video element
                    if "article-video" in class_name:
                        try:
                            video = child.find_element(By.TAG_NAME, "video")
                            video_src = video.get_attribute("src")
                            if video_src:
                                # Normalize the URL
                                video_src = self._normalize_url(video_src)
                                # Append as a video tag
                                news_item.contentRows.append({
                                    "type": "video",
                                    "content": f'<video src="{video_src}" controls></video>'
                                })
                                logger.debug(f"Extracted video: {video_src}")
                                continue
                        except Exception as e:
                            logger.warning(f"Failed to extract video: {str(e)}")
                    # Plain text (fallback case)
                    text_content = child.text.strip()
                    # Skip empty content
                    if text_content:
                        news_item.contentRows.append({
                            "type": "text",
                            "content": text_content
                        })
                        logger.debug(f"Extracted text: {text_content[:50]}...")
                except Exception as e:
                    logger.warning(f"Failed to process a content element: {str(e)}")
                    continue

        get_content_rows()
        if is_page():
            pass
        logger.info(f"Finished parsing article detail: {news_item.model_dump()}")
        return news_item
    def search(self, keyword, total=10) -> ResultDomain:
        """Search for news"""
@@ -334,123 +476,177 @@ class XxqgCrawler(BaseCrawler):
            json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
        return resultDomain

    [removed here: the previous copy of parse_news_detail at this position; the method now lives earlier in the class, as added in the hunk above]

    def crawl_important(self, total=10) -> ResultDomain:
        """
        Crawl the "Important News" channel.
        Follows the legacy myQiangguo crawler's approach: fetch the article list with requests, then parse details with Selenium.
        Args:
            total: maximum number of articles to crawl, 10 by default
        Returns:
            ResultDomain: result object containing the news list
        """
        news_list = []
        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
        if self.driver is None:
            logger.error("WebDriver is not initialized, cannot continue crawling")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = "WebDriver is not initialized"
            return resultDomain
        # Look up the "important" URL config
        important_config = self.config.urls.get("important")
        if not important_config:
            logger.error("No 'important' config found")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = "No 'important' config found"
            return resultDomain
        self.driver.get(important_config.url)
        try:
            if self.driver is None:
                resultDomain.message = "driver is not initialized"
                return resultDomain
            left_div = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div#page-main'))
            )
        except Exception as e:
            logger.exception(f"Failed to open the page: {str(e)}")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = f"Failed to open the channel page: {str(e)}"
            return resultDomain
        # Filter the requests captured by selenium-wire for the ones carrying JSON data
        time.sleep(3)  # wait for all requests to finish
        request_list = self.driver.requests
        json_request = []
        target_path = "98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784"
        target_request = None
        logger.info(f"Looking for the target JSON request among {len(request_list)} captured requests")
        # First find the JSON request containing the full target path
        for request in request_list:
            if ".json" in request.url:
                json_request.append(request)
                if target_path in request.url:
                    target_request = request
        if target_request is None:
            logger.error("Target JSON request not found")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = "Target JSON request not found"
            return resultDomain
        # Parse the meta request response to get the channelId
        try:
            meta_data = json.loads(target_request.response.body)
            logger.info(f"Meta response data: {meta_data}")
            # Extract the channelId
            if 'pageData' in meta_data and 'channel' in meta_data['pageData']:
                meta_id = meta_data['pageData']['channel']['channelId']
                logger.info(f"Got channelId: {meta_id}")
            else:
                logger.error(f"Unexpected meta data structure, channelId not found. Keys: {meta_data.keys()}")
                resultDomain.code = 1
                resultDomain.success = False
                resultDomain.message = "Could not extract channelId from the meta request"
                return resultDomain
        except Exception as e:
            logger.exception(f"Failed to parse the meta request: {str(e)}")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = f"Failed to parse the meta request: {str(e)}"
            return resultDomain
        # Use the channelId to find the article data request
        data_request = None
        for json_item in json_request:
            if meta_id in json_item.url:
                data_request = json_item
                break
        if data_request is None:
            logger.error("Article data JSON request not found")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = "Article data JSON request not found"
            return resultDomain
        # Parse the article data response (it may be gzip-compressed)
        try:
            response_body = data_request.response.body
            # Check for gzip compression
            if response_body[:2] == b'\x1f\x8b':  # gzip magic number
                import gzip
                response_body = gzip.decompress(response_body)
                logger.info("Detected gzip compression, decompressed the body")
            # Decode to a string
            if isinstance(response_body, bytes):
                response_body = response_body.decode('utf-8')
            article_data = json.loads(response_body)
            logger.info(f"Parsed article data, {len(article_data)} items in total")
        except Exception as e:
            logger.exception(f"Failed to parse article data: {str(e)}")
            resultDomain.code = 1
            resultDomain.success = False
            resultDomain.message = f"Failed to parse article data: {str(e)}"
            return resultDomain
        for article in article_data:
            # Keep only yesterday's news; publishTime looks like "2025-11-21 10:04:20"
            publish_date = article['publishTime'].split(" ")[0]
            yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')
            day_before_yesterday = (datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
            if publish_date == yesterday:
                news_item = self.parse_news_detail(article['url'])
                news_item.title = article['title']
                news_item.publishTime = article['publishTime']
                news_item.source = article['source'].split("_")[1]
                news_item.url = article['url']
                news_list.append(news_item)
                logger.info(f"Added yesterday's article: {news_item.title}")
            elif publish_date == day_before_yesterday:
                # Stop once we reach the day before yesterday, since the data is sorted newest first
                logger.info("Reached the day before yesterday, stopping the iteration")
                break
        resultDomain.dataList = news_list
        # with open("Xxqg_important_news_list.json", "w", encoding="utf-8") as f:
        #     json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
        return resultDomain

    def home(self, type="") -> ResultDomain:
        """Fetch home page data"""
        count = 0
        url_base_map = {}
        url_list = []
        news_list = []
        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)
        home_config = self.config.urls.get("home")
        self.driver.get(home_config.url)
        try:
            if self.driver is None:
                resultDomain.message = "driver is not initialized"
                return resultDomain
            home_div = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.page-main > div.grid-cell > div.grid-cell'))
            )
        except Exception as e:
            resultDomain.message = f"Failed to load the home page: {str(e)}"
            return resultDomain
        section_divs = home_div.find_elements(By.CSS_SELECTOR, 'section')
        return resultDomain

View File

@@ -0,0 +1,42 @@
"""
测试学习强国重要新闻爬虫
"""
from XxqgCrawler import XxqgCrawler
from loguru import logger
def test_crawl_important():
"""测试爬取重要新闻"""
try:
# 初始化爬虫
logger.info("初始化学习强国爬虫...")
crawler = XxqgCrawler()
# 爬取重要新闻默认最多60篇
logger.info("开始爬取重要新闻...")
result = crawler.crawl_important(max_count=10) # 测试时只爬取10篇
# 检查结果
if result.success:
logger.info(f"爬取成功!{result.message}")
logger.info(f"共爬取到 {len(result.dataList)} 篇新闻")
# 打印前3篇新闻标题
for idx, news in enumerate(result.dataList[:3], 1):
logger.info(f"{idx}. {news.title}")
logger.info(f" 来源: {news.source}")
logger.info(f" 发布时间: {news.publishTime}")
logger.info(f" 内容行数: {len(news.contentRows)}")
logger.info("")
else:
logger.error(f"爬取失败: {result.message}")
# 关闭浏览器
if crawler.driver:
crawler.driver.quit()
logger.info("浏览器已关闭")
except Exception as e:
logger.exception(f"测试过程中发生错误: {str(e)}")
if __name__ == "__main__":
test_crawl_important()