# Xuexi Qiangguo (xuexi.cn) crawler
from typing import List, Optional

from bs4 import Tag
from core.ResultDomain import ResultDomain
from crawler.BaseCrawler import BaseCrawler, CrawlerConfig, NewsItem, UrlConfig
from loguru import logger
import re
import chardet
from datetime import datetime, timedelta
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
import json
from seleniumwire import webdriver  # note: selenium-wire, not plain selenium
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
import platform

import time
import random
import os


class XxqgCrawler(BaseCrawler):

    def __init__(self):
        """Initialize the Xuexi Qiangguo (xuexi.cn) crawler."""
        config = CrawlerConfig(
            base_url="https://www.xuexi.cn/",
            urls={
                "search": UrlConfig(
                    url="https://static.xuexi.cn/search/online/index.html",
                    apiurl="https://search.xuexi.cn/api/search",
                    method="GET",
                    params={
                        "query": ""
                    },
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Connection': 'keep-alive',
                        'Upgrade-Insecure-Requests': '1',
                        'Sec-Fetch-Dest': 'document',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-Site': 'none',
                        'Cache-Control': 'max-age=0',
                        'Referer': 'https://www.xuexi.cn/',
                        'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"'
                    }
                ),
                "important": UrlConfig(
                    url="https://www.xuexi.cn/98d5ae483720f701144e4dabf99a4a34/5957f69bffab66811b99940516ec8784.html",
                    method="GET",
                    params={},
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Connection': 'keep-alive',
                        'Upgrade-Insecure-Requests': '1',
                        'Sec-Fetch-Dest': 'document',
                        'Sec-Fetch-Mode': 'navigate',
                        'Sec-Fetch-Site': 'none',
                        'Cache-Control': 'max-age=0',
                        'Referer': 'https://www.xuexi.cn/',
                        'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
                        'sec-ch-ua-mobile': '?0',
                        'sec-ch-ua-platform': '"Windows"'
                    }
                )
            },
        )
        super().__init__(config)

        # Create the WebDriver when the crawler is constructed
        self.driver = self._init_driver()

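    # _init_driver below deliberately keeps a visible browser window, because
    # the verification (captcha) step may require a human. On a machine with
    # no display, the usual option would be Chrome's headless switch, e.g.
    # chrome_options.add_argument('--headless=new'), at the cost of making
    # manual captcha handling impossible.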
    def _init_driver(self):
        """Initialize and return a Chrome WebDriver instance."""
        chrome_options = Options()
        # Keep the browser visible: headless mode is intentionally not enabled
        # (windowed mode is the default, so no extra argument is needed).
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        # Start maximized so the window is clearly visible
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-web-security')
        chrome_options.add_argument('--allow-running-insecure-content')
        chrome_options.add_argument('--disable-features=VizDisplayCompositor')
        # Pick the chromedriver path for the current operating system
        chrome_driver_path = 'win/chromedriver.exe'

        if platform.system() == 'Linux':
            chrome_driver_path = 'linux/chromedriver'

        service = Service(executable_path=chrome_driver_path)

        driver = None
        try:
            driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info("Chrome浏览器初始化成功")
        except Exception as e:
            logger.error(f"Chrome浏览器初始化失败: {str(e)}")
            return driver

        # Implicit wait (currently disabled)
        # driver.implicitly_wait(10)

        # Visit the home page to obtain initial cookies
        logger.info("访问主页获取初始Cookie")
        try:
            driver.get(self.config.base_url)
        except Exception as e:
            logger.error(f"访问URL失败: {self.config.base_url}, 错误: {str(e)}")
            return driver
        time.sleep(random.uniform(2, 4))

        # Check whether a verification (captcha) page is shown
        page_source = driver.page_source
        if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
            logger.warning("检测到验证页面,尝试手动处理验证")

            # Wait for the user to complete the verification manually
            logger.info("请在30秒内手动完成验证...")
            time.sleep(30)

            # Refresh the page and check whether the verification passed
            driver.refresh()
            time.sleep(random.uniform(2, 4))

            # Re-check the verification status
            page_source = driver.page_source
            if "验证" in page_source or "captcha" in page_source.lower() or "滑动验证" in page_source:
                logger.error("验证未完成,无法继续爬取")
                # self.driver.quit()
                # self.driver = None
                return driver

        return driver

    def _normalize_url(self, url: str) -> str:
        """
        Normalize a URL by filling in a missing scheme and/or domain.

        Args:
            url: the raw URL

        Returns:
            the fully qualified URL
        """
        if not url:
            return url

        # Already a complete URL
        if url.startswith("http://") or url.startswith("https://"):
            return url

        # Protocol-relative URL: prepend https:
        if url.startswith("//"):
            return "https:" + url

        # Relative path: prepend the site base URL (avoiding a doubled slash)
        return self.config.base_url.rstrip("/") + "/" + url.lstrip("/")

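    # Illustrative behaviour of _normalize_url on example values, assuming
    # base_url is "https://www.xuexi.cn/":
    #   "//example.cn/pic.png"      -> "https://example.cn/pic.png"
    #   "article/abc.html"          -> "https://www.xuexi.cn/article/abc.html"
    #   "https://example.cn/x.html" -> returned unchanged
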
    def search(self, keyword, total=10) -> ResultDomain:
        """Search for news articles matching the keyword."""
        search_config = self.config.urls.get("search")
        if not self.driver:
            logger.error("WebDriver未初始化,无法继续爬取")
            return ResultDomain(code=1, message="WebDriver未初始化,无法继续爬取", success=False)

        count = 0
        url_base_map = {}
        url_list = []
        news_list = []

        resultDomain = ResultDomain(code=0, message="", success=True, dataList=news_list)

        def get_search_url():
            """Extract result URLs and metadata from the current results page."""
            nonlocal count
            try:
                # Wait for the page to finish loading
                # assert self.driver is not None, "WebDriver未初始化"
                if self.driver is None:
                    logger.error("WebDriver未初始化")
                    return
                wait = WebDriverWait(self.driver, 10)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.search-result")))
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.c-card:not(.c-sc)")))

                # Parse the HTML search results
                home = self.driver.find_element(By.CSS_SELECTOR, "div.home")

                search_content = home.find_element(By.CSS_SELECTOR, "div.search-content")

                search_result_div = search_content.find_element(By.CSS_SELECTOR, "div.search-result")

                item_s = search_result_div.find_elements(By.CSS_SELECTOR, "div.c-card:not(.c-sc)")

                for item in item_s:
                    if count >= total:
                        break
                    try:
                        # Get the URL from the <a> tag
                        link = item.find_element(By.CSS_SELECTOR, "a[href]")
                        url = link.get_attribute("href")

                        # Get the title from h3 > span.title
                        title = item.find_element(By.CSS_SELECTOR, "h3 span.title").text

                        # Get the source and publish time from div.time
                        time_element = item.find_element(By.CSS_SELECTOR, "div.time")
                        time_text = time_element.text.strip()

                        # Source and date are separated either by a newline or by spaces
                        if '\n' in time_text:
                            time_lines = time_text.split('\n')
                            source = time_lines[0].strip() if len(time_lines) > 0 else ''
                            publish_time = time_lines[1].strip() if len(time_lines) > 1 else ''
                        else:
                            # Space-separated: pull the date out with a regex
                            date_match = re.search(r'\d{4}-\d{2}-\d{2}', time_text)
                            if date_match:
                                publish_time = date_match.group()
                                source = time_text[:date_match.start()].strip()
                            else:
                                source = ''
                                publish_time = time_text

                        url_base_map[url] = {
                            'title': title,
                            'source': source,
                            'publishTime': publish_time
                        }
                        url_list.append(url)
                        count += 1

                    except Exception as e:
                        logger.warning(f"解析某个搜索结果失败: {str(e)}")
                        continue

                logger.info(f"本页提取到 {len(item_s)} 条搜索结果")

            except Exception as e:
                logger.exception(f"提取URL过程出错: {str(e)}")

        # Step 1: open the search page and trigger the first search through the UI
        logger.info("访问搜索页面并手动点击搜索")
        self.driver.get(search_config.url)
        time.sleep(2)

        home = self.driver.find_element(By.CSS_SELECTOR, "div.home")
        logger.info(home)
        input_wrapper_div = self.driver.find_element(By.CSS_SELECTOR, 'div.search-input-wrapper')
        input_div = input_wrapper_div.find_element(By.CSS_SELECTOR, 'input.search-type-input-compact')
        input_div.send_keys(keyword)

        search_btn = input_wrapper_div.find_element(By.CSS_SELECTOR, 'button[type="submit"]')
        search_btn.click()
        time.sleep(2)

        # Extract the first page of results
        get_search_url()

        # Step 2: visit subsequent pages directly by URL
        while count < total:
            # Remember how many results we had before this page
            count_before = count

            # Build the URL of the next page
            current_url = self.driver.current_url
            qs = urlparse(current_url)
            param = parse_qs(qs.query)
            current_page = int(param.get('page', ['1'])[0])
            param['page'] = [str(current_page + 1)]

            new_url = urlunparse((qs.scheme, qs.netloc, qs.path, qs.params, urlencode(param, doseq=True), qs.fragment))
            logger.info(f"翻页到第 {current_page + 1} 页")

            # Load the next page directly
            self.driver.get(new_url)
            time.sleep(2)

            # Extract results from this page
            get_search_url()

            # If nothing new was extracted, there are no more results
            if count == count_before:
                logger.info("本页没有提取到新数据,结束翻页")
                break

        logger.info(f"共提取 {len(url_list)} 条URL")

        # Parse each article's detail page
        for url in url_list:
            try:
                news_item = self.parse_news_detail(url)
                if news_item:
                    # Fill in missing fields from the search-result metadata
                    if news_item.title is None or news_item.title.strip() == "":
                        news_item.title = url_base_map[url].get("title", "")
                    if news_item.publishTime is None or news_item.publishTime.strip() == "":
                        news_item.publishTime = url_base_map[url].get("publishTime", "")
                    if news_item.source is None or news_item.source.strip() == "":
                        news_item.source = url_base_map[url].get("source", "")

                    news_list.append(news_item)
            except Exception as e:
                logger.warning(f"解析文章详情失败: {str(e)}")
                continue

        resultDomain.dataList = news_list
        with open("Xxqg_news_list.json", "w", encoding="utf-8") as f:
            json.dump([item.model_dump() for item in resultDomain.dataList] if resultDomain.dataList else [], f, ensure_ascii=False, indent=4)
        return resultDomain

    def parse_news_detail(self, url: str) -> NewsItem:
        news_item = NewsItem(title='', contentRows=[], url=url)
        if self.driver is None:
            return news_item

        try:
            self.driver.get(url)
            article_area_div = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.render-detail-article'))
            )
        except Exception as e:
            logger.warning(f"访问文章页失败或未找到文章区域: {url}, {e}")
            return news_item

        # Basic metadata
        try:
            title_div = article_area_div.find_element(By.CSS_SELECTOR, "div.render-detail-title")
            news_item.title = title_div.text.strip()
        except Exception as e:
            logger.warning(f"提取标题失败: {e}")

        try:
            time_div = article_area_div.find_element(By.CSS_SELECTOR, "div.render-detail-time")
            news_item.publishTime = time_div.text.strip()
        except Exception as e:
            logger.warning(f"提取发布时间失败: {e}")

        try:
            source_div = article_area_div.find_element(By.CSS_SELECTOR, "div.render-detail-source")
            # The source line reads "来源:xxx"; accept both ASCII and full-width colons
            news_item.source = re.split(r"[::]", source_div.text.strip(), maxsplit=1)[-1].strip()
        except Exception as e:
            logger.warning(f"提取来源失败: {e}")

        # Locate the article content area
        try:
            article_content_div = article_area_div.find_element(By.CSS_SELECTOR, "div.render-detail-article-content")
        except Exception as e:
            logger.warning(f"未找到文章内容区域: {e}")
            return news_item

        # Check whether the article is paginated
        def is_page():
            try:
                page_div = article_content_div.find_element(By.CSS_SELECTOR, "div.detail-pagination-wrap")
                return page_div is not None and page_div.is_displayed()
            except Exception:
                return False

        def get_content_rows():
            """Extract the article body as a list of content rows."""
            try:
                content_div = article_content_div.find_element(By.CSS_SELECTOR, "div.render-detail-content")
            except Exception as e:
                logger.warning(f"未找到内容区域: {str(e)}")
                return

            # Walk over the direct children of the content div
            children = content_div.find_elements(By.XPATH, "./*")

            for child in children:
                try:
                    # The element's class decides how it is handled
                    class_name = child.get_attribute("class") or ""

                    # Image element
                    if "article-img" in class_name:
                        try:
                            img = child.find_element(By.TAG_NAME, "img")
                            img_src = img.get_attribute("src")
                            if img_src:
                                # Normalize the URL
                                img_src = self._normalize_url(img_src)
                                # Store as an <img> tag
                                news_item.contentRows.append({
                                    "type": "img",
                                    "content": f'<img src="{img_src}" />'
                                })
                                logger.debug(f"提取图片: {img_src}")
                                continue
                        except Exception as e:
                            logger.warning(f"提取图片失败: {str(e)}")

                    # Video element
                    if "article-video" in class_name:
                        try:
                            video = child.find_element(By.TAG_NAME, "video")
                            video_src = video.get_attribute("src")
                            if video_src:
                                # Normalize the URL
                                video_src = self._normalize_url(video_src)
                                # Store as a <video> tag
                                news_item.contentRows.append({
                                    "type": "video",
                                    "content": f'<video src="{video_src}" controls></video>'
                                })
                                logger.debug(f"提取视频: {video_src}")
                                continue
                        except Exception as e:
                            logger.warning(f"提取视频失败: {str(e)}")

                    # Text element (fallback for everything else)
                    text_content = child.text.strip()
                    # Skip empty content
                    if text_content:
                        news_item.contentRows.append({
                            "type": "text",
                            "content": text_content
                        })
                        logger.debug(f"提取文字: {text_content[:50]}...")

                except Exception as e:
                    logger.warning(f"处理内容元素失败: {str(e)}")
                    continue

        get_content_rows()

        if is_page():
            # TODO: the article is paginated; only the first page is extracted for now
            pass
        logger.info(f"解析文章详情完成: {news_item.model_dump()}")
        return news_item
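
# Minimal usage sketch: assumes the project-level BaseCrawler / NewsItem /
# ResultDomain behave as used above and that a chromedriver binary matching the
# installed Chrome exists at win/chromedriver.exe (or linux/chromedriver).
# The search keyword is only an example.
if __name__ == "__main__":
    crawler = XxqgCrawler()
    result = crawler.search("数字经济", total=5)
    if result.success and result.dataList:
        for item in result.dataList:
            print(item.title, item.url)
    if crawler.driver:
        crawler.driver.quit()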