From af9faf26c9423babf48dc8751355ba46ffa92af7 Mon Sep 17 00:00:00 2001 From: User Date: Fri, 17 Apr 2026 09:33:56 +0800 Subject: [PATCH] feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery Dual-channel S2S architecture with full isolation between voice and text links: Backend (Java): - VolcRealtimeProtocol: add createChatTextQueryMessage (event 501) - VoiceSessionState: add textMode / playAudioReply / disableGreeting fields - VoiceWebSocketConfig: register second path /ws/realtime-text (same handler) - VoiceWebSocketHandler: detect text mode from URL path - VoiceGatewayService: * afterConnectionEstablished: overload with textMode flag * handleStart: parse playAudioReply / disableGreeting from client * buildStartSessionPayload: inject input_mod=text for text mode * handleDirectText: text mode sends event 501 directly, skip processReply * handleBinaryMessage: reject client audio in text mode * handleUpstreamBinary: drop S2S audio if text mode + no playback * startAudioKeepalive: skip entirely in text mode (no audio channel) * sendGreeting: skip greeting if disableGreeting=true Frontend (test2 + delivery): - nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting * resolveWebSocketUrl accepts wsPath param * Text mode: no microphone capture, no playback context (unless playAudioReply) * New sendText() method for event 501 payload * handleAudioMessage drops audio in text mode without playback * Export NativeVoiceService class for multi-instance usage - ChatPanel (test2): new useS2S / playAudioReply props * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text * subtitle events drive streaming UI, assistant_pending drives loading state * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode * Voice link code path zero-changed Verification: mvn test VoiceGatewaySmokeTest 20/20 pass, voice link regression-free --- .../client/src/services/nativeVoiceService.js | 90 +++++--- .../service/VoiceGatewayService.java | 75 ++++++- .../javaserver/service/VoiceSessionState.java | 5 + .../websocket/VoiceWebSocketConfig.java | 3 +- .../websocket/VoiceWebSocketHandler.java | 4 +- .../websocket/VolcRealtimeProtocol.java | 11 + test2/client/src/components/ChatPanel.jsx | 204 ++++++++++++++---- .../client/src/services/nativeVoiceService.js | 115 ++++++---- 8 files changed, 399 insertions(+), 108 deletions(-) diff --git a/delivery/client/src/services/nativeVoiceService.js b/delivery/client/src/services/nativeVoiceService.js index 046b894..494a7e0 100644 --- a/delivery/client/src/services/nativeVoiceService.js +++ b/delivery/client/src/services/nativeVoiceService.js @@ -25,7 +25,7 @@ class NativeVoiceService { }; } - resolveWebSocketUrl(sessionId, userId) { + resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') { const query = new URLSearchParams({ sessionId, userId: userId || '', @@ -43,16 +43,16 @@ class NativeVoiceService { } else if (base.endsWith('/api')) { base = base.slice(0, -'/api'.length); } - return `${base}/ws/realtime-dialog?${query.toString()}`; + return `${base}${wsPath}?${query.toString()}`; } const hostname = window.location.hostname; const port = window.location.port; const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1'; if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') { - return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`; + return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`; } const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; - return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`; + return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`; } emitConnectionState(state) { @@ -80,29 +80,52 @@ class NativeVoiceService { } } - async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) { + async connect({ + sessionId, + userId, + botName, + systemRole, + speakingStyle, + modelVersion, + speaker, + greetingText, + clientMode = 'voice', + playAudioReply = false, + disableGreeting = false, + } = {}) { await this.disconnect(); - const wsUrl = this.resolveWebSocketUrl(sessionId, userId); + this.clientMode = clientMode; + this.playAudioReply = playAudioReply; + const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog'; + const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath); this.emitConnectionState('connecting'); - this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); - if (this.playbackContext.state === 'suspended') { - await this.playbackContext.resume().catch(() => {}); - } - this.playbackTime = this.playbackContext.currentTime; - // 并行: 同时预获取麦克风和建立WS连接,节省500ms+ - const micPromise = navigator.mediaDevices.getUserMedia({ - audio: { - channelCount: 1, - noiseSuppression: true, - echoCancellation: true, - autoGainControl: true, - }, - video: false, - }).catch((err) => { - console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message); - return null; - }); + // Audio playback context: only needed if we will receive audio + const needsPlayback = clientMode !== 'text' || playAudioReply; + if (needsPlayback) { + this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); + if (this.playbackContext.state === 'suspended') { + await this.playbackContext.resume().catch(() => {}); + } + this.playbackTime = this.playbackContext.currentTime; + } + + // Microphone: only needed in voice mode + let micPromise = Promise.resolve(null); + if (clientMode !== 'text') { + micPromise = navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + noiseSuppression: true, + echoCancellation: true, + autoGainControl: true, + }, + video: false, + }).catch((err) => { + console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message); + return null; + }); + } const CONNECTION_TIMEOUT_MS = 12000; @@ -137,6 +160,9 @@ class NativeVoiceService { modelVersion, speaker, greetingText, + clientMode, + playAudioReply, + disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting, })); }; @@ -173,11 +199,20 @@ class NativeVoiceService { }; }); - // 使用预获取的mediaStream(已并行获取),避免重复申请 + // 文字模式不启动麦克风 + if (clientMode === 'text') { + return; + } const preFetchedStream = await micPromise; await this.startCapture(preFetchedStream); } + sendText(text) { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') })); + } + } + handleJsonMessage(raw) { try { const msg = JSON.parse(raw); @@ -232,6 +267,10 @@ class NativeVoiceService { } handleAudioMessage(arrayBuffer) { + // Text mode without playAudioReply: drop S2S audio silently + if (this.clientMode === 'text' && !this.playAudioReply) { + return; + } if (!this.playbackContext) { return; } @@ -424,3 +463,4 @@ class NativeVoiceService { const nativeVoiceService = new NativeVoiceService(); export default nativeVoiceService; +export { NativeVoiceService }; diff --git a/java-server/src/main/java/com/bigwo/javaserver/service/VoiceGatewayService.java b/java-server/src/main/java/com/bigwo/javaserver/service/VoiceGatewayService.java index a38fdb2..3ce494d 100644 --- a/java-server/src/main/java/com/bigwo/javaserver/service/VoiceGatewayService.java +++ b/java-server/src/main/java/com/bigwo/javaserver/service/VoiceGatewayService.java @@ -117,6 +117,10 @@ public class VoiceGatewayService { } public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId) { + afterConnectionEstablished(clientSession, sessionId, userId, false); + } + + public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId, boolean textMode) { if (!properties.isEnabled()) { closeClient(clientSession, CloseStatus.NOT_ACCEPTABLE.withReason("voice gateway disabled")); return; @@ -126,12 +130,17 @@ public class VoiceGatewayService { return; } VoiceSessionState state = new VoiceSessionState(clientSession.getId(), sessionId.trim(), clientSession, normalizeNullable(userId)); + state.textMode = textMode; + if (textMode) { + state.disableGreeting = true; + state.playAudioReply = false; + } sessions.put(clientSession.getId(), state); - log.info("[VoiceGateway] client WS connected session={} wsId={} remote={}", state.sessionId, clientSession.getId(), - clientSession.getRemoteAddress()); - chatRepository.createSession(state.sessionId, state.userId, "voice"); + log.info("[VoiceGateway] client WS connected session={} wsId={} mode={} remote={}", + state.sessionId, clientSession.getId(), textMode ? "text" : "voice", clientSession.getRemoteAddress()); + chatRepository.createSession(state.sessionId, state.userId, textMode ? "text" : "voice"); resetIdleTimer(state); - sendJson(state, Map.of("type", "connected", "sessionId", state.sessionId)); + sendJson(state, Map.of("type", "connected", "sessionId", state.sessionId, "mode", textMode ? "text" : "voice")); } public void afterConnectionClosed(WebSocketSession clientSession) { @@ -200,6 +209,10 @@ public class VoiceGatewayService { if (state == null || payload == null || payload.length == 0) { return; } + if (state.textMode) { + // Text mode: reject client audio frames entirely + return; + } WebSocket upstream = state.upstream; if (upstream == null || !state.upstreamReady) { return; @@ -241,6 +254,17 @@ public class VoiceGatewayService { private void handleStart(VoiceSessionState state, JsonNode node) { state.userId = firstNonBlank(textValue(node.path("userId")), state.userId); + // Text mode optional override from start message (text mode may choose to play audio reply) + if (state.textMode) { + JsonNode playNode = node.path("playAudioReply"); + if (!playNode.isMissingNode() && !playNode.isNull()) { + state.playAudioReply = playNode.asBoolean(false); + } + JsonNode greetNode = node.path("disableGreeting"); + if (!greetNode.isMissingNode() && !greetNode.isNull()) { + state.disableGreeting = greetNode.asBoolean(true); + } + } AssistantProfileResult profileResult = assistantProfileService.getAssistantProfile(state.userId, false); state.assistantProfile = voiceAssistantProfileSupport.resolve(profileResult.profile()); state.botName = firstNonBlank(textValue(node.path("botName")), voiceAssistantProfileSupport.getDisplayName(state.assistantProfile), "大沃"); @@ -271,6 +295,23 @@ public class VoiceGatewayService { if (!persistUserSpeech(state, cleanText)) { return; } + if (state.textMode) { + // Text mode: send directly to S2S via event 501 (ChatTextQuery) + // S2S will internally invoke LLM / tool_calls (event 502 external_rag) and stream back + state.blockUpstreamAudio = !state.playAudioReply; // block audio if user doesn't want playback + state.currentTtsType = "default"; + state.awaitingUpstreamReply = true; + state.pendingAssistantSource = "voice_bot"; + state.pendingAssistantToolName = null; + state.pendingAssistantMeta = null; + state.pendingAssistantTurnSeq = state.latestUserTurnSeq; + state.turnCount++; + state.clearAssistantBuffer(); + sendUpstreamChatTextQuery(state, cleanText); + sendJson(state, Map.of("type", "assistant_pending", "active", Boolean.TRUE)); + log.info("[VoiceGateway][text-mode] sent ChatTextQuery session={} len={}", state.sessionId, cleanText.length()); + return; + } sendJson(state, Map.of("type", "tts_reset", "reason", "new_turn")); state.blockUpstreamAudio = true; state.currentTtsType = "default"; @@ -278,6 +319,16 @@ public class VoiceGatewayService { processReplyAsync(state, cleanText, state.latestUserTurnSeq); } + private void sendUpstreamChatTextQuery(VoiceSessionState state, String text) { + if (state.upstream == null || !state.upstreamReady) { + log.warn("[VoiceGateway][text-mode] upstream not ready, drop text session={}", state.sessionId); + sendJson(state, Map.of("type", "error", "error", "语音服务尚未就绪,请稍后重试")); + return; + } + byte[] msg = VolcRealtimeProtocol.createChatTextQueryMessage(state.sessionId, text, objectMapper); + sendUpstreamBinary(state, msg); + } + private void connectUpstream(VoiceSessionState state) { if (!properties.isConfigured()) { sendJson(state, Map.of("type", "error", "error", "VOLC_S2S_APP_ID 或 VOLC_S2S_TOKEN 未配置")); @@ -323,7 +374,8 @@ public class VoiceGatewayService { dialog.put("bot_name", state.botName); dialog.put("system_role", voiceAssistantProfileSupport.normalizeTextForSpeech(ANTI_THINKING_PREFIX + " " + state.systemRole)); dialog.put("speaking_style", voiceAssistantProfileSupport.normalizeTextForSpeech(state.speakingStyle)); - dialog.put("extra", Map.of("input_mod", "audio", "model", state.modelVersion, "strict_audit", false, "audit_response", "抱歉,这个问题我暂时无法回答。")); + String inputMod = state.textMode ? "text" : "audio"; + dialog.put("extra", Map.of("input_mod", inputMod, "model", state.modelVersion, "strict_audit", false, "audit_response", "抱歉,这个问题我暂时无法回答。")); Map payload = new LinkedHashMap<>(); payload.put("asr", asr); payload.put("tts", tts); @@ -351,6 +403,10 @@ public class VoiceGatewayService { return; } if (frame.type() == VolcRealtimeProtocol.TYPE_AUDIO_ONLY_SERVER) { + // Text mode: drop all S2S audio if client doesn't want playback + if (state.textMode && !state.playAudioReply) { + return; + } boolean isDefaultTts = !StringUtils.hasText(state.currentTtsType) || "default".equals(state.currentTtsType); boolean isSuppressing = state.suppressUpstreamUntil > System.currentTimeMillis() && isDefaultTts; boolean isUserJustSpeaking = isDefaultTts && state.lastPartialAt > 0 && (System.currentTimeMillis() - state.lastPartialAt < 800); @@ -1016,6 +1072,11 @@ public class VoiceGatewayService { } private void sendGreeting(VoiceSessionState state) { + if (state.disableGreeting) { + state.hasSentGreeting = true; + sendReady(state); + return; + } if (state.hasSentGreeting || !StringUtils.hasText(state.greetingText)) { sendReady(state); return; @@ -1137,6 +1198,10 @@ public class VoiceGatewayService { private void startAudioKeepalive(VoiceSessionState state) { cancelFuture(state.keepaliveFuture); + // Text mode: skip audio keepalive entirely (no audio channel) + if (state.textMode) { + return; + } long interval = Math.max(properties.getAudioKeepaliveIntervalMs(), 5000L); state.keepaliveFuture = scheduler.scheduleAtFixedRate(() -> { WebSocket upstream = state.upstream; diff --git a/java-server/src/main/java/com/bigwo/javaserver/service/VoiceSessionState.java b/java-server/src/main/java/com/bigwo/javaserver/service/VoiceSessionState.java index fe45e21..5cb12ad 100644 --- a/java-server/src/main/java/com/bigwo/javaserver/service/VoiceSessionState.java +++ b/java-server/src/main/java/com/bigwo/javaserver/service/VoiceSessionState.java @@ -114,6 +114,11 @@ final class VoiceSessionState { // Reply plan: evidence text (raw KB content, never used for subtitle/persistence) volatile String ragEvidenceText = ""; + // Text mode (S2S input_mod=text via /ws/realtime-text) + volatile boolean textMode; + volatile boolean playAudioReply; + volatile boolean disableGreeting; + VoiceSessionState(String clientConnectionId, String sessionId, WebSocketSession clientSession, String userId) { this.clientConnectionId = clientConnectionId; this.sessionId = sessionId; diff --git a/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketConfig.java b/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketConfig.java index 481f192..07805f7 100644 --- a/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketConfig.java +++ b/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketConfig.java @@ -17,6 +17,7 @@ public class VoiceWebSocketConfig implements WebSocketConfigurer { @Override public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) { - registry.addHandler(voiceWebSocketHandler, "/ws/realtime-dialog").setAllowedOriginPatterns("*"); + registry.addHandler(voiceWebSocketHandler, "/ws/realtime-dialog", "/ws/realtime-text") + .setAllowedOriginPatterns("*"); } } diff --git a/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketHandler.java b/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketHandler.java index 971fb12..cd8e7eb 100644 --- a/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketHandler.java +++ b/java-server/src/main/java/com/bigwo/javaserver/websocket/VoiceWebSocketHandler.java @@ -22,7 +22,9 @@ public class VoiceWebSocketHandler extends BinaryWebSocketHandler { public void afterConnectionEstablished(WebSocketSession session) { URI uri = session.getUri(); var queryParams = UriComponentsBuilder.fromUri(uri == null ? URI.create("/") : uri).build(true).getQueryParams(); - voiceGatewayService.afterConnectionEstablished(session, queryParams.getFirst("sessionId"), queryParams.getFirst("userId")); + String path = uri == null ? "" : uri.getPath(); + boolean textMode = path != null && path.contains("realtime-text"); + voiceGatewayService.afterConnectionEstablished(session, queryParams.getFirst("sessionId"), queryParams.getFirst("userId"), textMode); } @Override diff --git a/java-server/src/main/java/com/bigwo/javaserver/websocket/VolcRealtimeProtocol.java b/java-server/src/main/java/com/bigwo/javaserver/websocket/VolcRealtimeProtocol.java index 2e0bc41..b9385ad 100644 --- a/java-server/src/main/java/com/bigwo/javaserver/websocket/VolcRealtimeProtocol.java +++ b/java-server/src/main/java/com/bigwo/javaserver/websocket/VolcRealtimeProtocol.java @@ -51,6 +51,17 @@ public final class VolcRealtimeProtocol { return marshal(TYPE_FULL_CLIENT, MSG_TYPE_FLAG_WITH_EVENT, 500, sessionId, writeJsonBytes(payload, objectMapper), false); } + public static byte[] createChatTextQueryMessage(String sessionId, String content, ObjectMapper objectMapper) { + return marshal( + TYPE_FULL_CLIENT, + MSG_TYPE_FLAG_WITH_EVENT, + 501, + sessionId, + writeJsonBytes(Map.of("content", content == null ? "" : content), objectMapper), + false + ); + } + public static byte[] createChatRagTextMessage(String sessionId, String externalRag, ObjectMapper objectMapper) { return marshal( TYPE_FULL_CLIENT, diff --git a/test2/client/src/components/ChatPanel.jsx b/test2/client/src/components/ChatPanel.jsx index f2f26eb..c893626 100644 --- a/test2/client/src/components/ChatPanel.jsx +++ b/test2/client/src/components/ChatPanel.jsx @@ -2,8 +2,9 @@ import { useState, useRef, useEffect, useCallback } from 'react'; import { Send, Bot, User, Loader2, ArrowLeft, Sparkles, Wrench, StopCircle } from 'lucide-react'; import { startChatSession, sendMessageStream } from '../services/chatApi'; import { getSessionHistory } from '../services/voiceApi'; +import { NativeVoiceService } from '../services/nativeVoiceService'; -export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange }) { +export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange, useS2S = false, playAudioReply = false }) { const [messages, setMessages] = useState([]); const [input, setInput] = useState(''); const [isLoading, setIsLoading] = useState(false); @@ -14,51 +15,143 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, const scrollRef = useRef(null); const inputRef = useRef(null); const abortRef = useRef(null); + const s2sServiceRef = useRef(null); + const s2sStreamingIdRef = useRef(null); + + // S2S text mode: establish WebSocket to /ws/realtime-text + useEffect(() => { + if (!useS2S || !sessionId) { + return undefined; + } + const svc = new NativeVoiceService(); + s2sServiceRef.current = svc; + + svc.on('onSubtitle', (data) => { + if (!data || !data.role) return; + if (data.role === 'user') { + // User subtitle is just an echo of what we already inserted; skip + return; + } + // assistant subtitle: streaming chunks (isFinal=false) or final (isFinal=true) + const assistantId = s2sStreamingIdRef.current; + if (!assistantId) return; + setMessages((prev) => prev.map((m) => ( + m.id === assistantId + ? { ...m, content: data.text || '', streaming: !data.isFinal } + : m + ))); + if (data.isFinal) { + setIsLoading(false); + setStreamingId(null); + s2sStreamingIdRef.current = null; + inputRef.current?.focus(); + } + }); + svc.on('onAssistantPending', (active) => { + setIsLoading(!!active); + }); + svc.on('onError', (err) => { + setError(err?.message || 'S2S 文字模式错误'); + setIsLoading(false); + setStreamingId(null); + s2sStreamingIdRef.current = null; + }); + svc.on('onIdleTimeout', () => { + setError('S2S 连接超时,已断开。请刷新页面重连'); + setIsInitialized(false); + }); + svc.on('onConnectionStateChange', (state) => { + if (state === 'connected') { + // wait for onReady (handled via promise in connect) + } else if (state === 'disconnected' || state === 'error') { + setIsInitialized(false); + } + }); + + (async () => { + try { + await svc.connect({ + sessionId, + userId: settings?.userId || '', + botName: settings?.botName || '大沃', + speaker: settings?.speaker || 'zh_female_vv_jupiter_bigtts', + modelVersion: settings?.modelVersion || 'O', + clientMode: 'text', + playAudioReply: !!playAudioReply, + disableGreeting: true, + }); + setIsInitialized(true); + } catch (e) { + setError(`S2S 连接失败:${e?.message || e}`); + } + })(); + + return () => { + svc.disconnect().catch(() => {}); + s2sServiceRef.current = null; + s2sStreamingIdRef.current = null; + }; + }, [useS2S, sessionId, settings?.userId, settings?.botName, settings?.speaker, settings?.modelVersion, playAudioReply]); // 初始化:创建聊天会话,优先从数据库加载完整历史 useEffect(() => { + if (useS2S) { + // S2S mode handles init in its own effect + return; + } async function init() { + // 1. 从数据库加载历史(独立于 Coze 会话,不受其失败影响) + let historyMsgs = []; try { - // 启动后端聊天会话(后端会从 DB 加载历史注入 Coze 上下文) - await startChatSession(sessionId, voiceSubtitles); - setIsInitialized(true); - - // 从数据库加载完整对话历史(包含语音通话中的工具结果) - let historyMsgs = []; - try { - const historyData = await getSessionHistory(sessionId, 20); - if (historyData?.messages?.length > 0) { - historyMsgs = historyData.messages.map((m, i) => ({ - id: `history-${i}`, - role: m.role, - content: m.content, - fromVoice: true, - })); - console.log(`[ChatPanel] Loaded ${historyMsgs.length} messages from DB`); - } - } catch (e) { - console.warn('[ChatPanel] DB history load failed, falling back to subtitles:', e.message); - } - - // 如果数据库没有历史,回退到 voiceSubtitles - if (historyMsgs.length === 0 && voiceSubtitles && voiceSubtitles.length > 0) { - historyMsgs = voiceSubtitles.map((s, i) => ({ - id: `voice-${i}`, - role: s.role === 'user' ? 'user' : 'assistant', - content: s.text, + const historyData = await getSessionHistory(sessionId, 20); + if (historyData?.messages?.length > 0) { + historyMsgs = historyData.messages.map((m, i) => ({ + id: `history-${i}`, + role: m.role, + content: m.content, fromVoice: true, })); + console.log(`[ChatPanel] Loaded ${historyMsgs.length} messages from DB`); } - - if (historyMsgs.length > 0) { - setMessages(historyMsgs); - } - - inputRef.current?.focus(); - } catch (err) { - console.error('[ChatPanel] Init failed:', err); - setError('聊天会话初始化失败'); + } catch (e) { + console.warn('[ChatPanel] DB history load failed, falling back to subtitles:', e.message); } + + // 如果数据库没有历史,回退到 voiceSubtitles + if (historyMsgs.length === 0 && voiceSubtitles && voiceSubtitles.length > 0) { + historyMsgs = voiceSubtitles.map((s, i) => ({ + id: `voice-${i}`, + role: s.role === 'user' ? 'user' : 'assistant', + content: s.text, + fromVoice: true, + })); + } + + if (historyMsgs.length > 0) { + setMessages(historyMsgs); + } + + // 2. 启动后端聊天会话(自动重试3次,间隔2秒) + let initOk = false; + for (let attempt = 0; attempt < 3 && !initOk; attempt++) { + try { + if (attempt > 0) { + console.log(`[ChatPanel] Retrying init (attempt ${attempt + 1}/3)...`); + await new Promise(r => setTimeout(r, 2000)); + } + await startChatSession(sessionId, voiceSubtitles); + initOk = true; + setIsInitialized(true); + setError(null); + } catch (err) { + console.error(`[ChatPanel] Init attempt ${attempt + 1} failed:`, err.message); + if (attempt === 2) { + setError('聊天会话初始化失败,点击重试'); + } + } + } + + inputRef.current?.focus(); } init(); }, [sessionId, voiceSubtitles]); @@ -108,6 +201,20 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, // 先插入一个空的 assistant 消息用于流式填充 setMessages((prev) => [...prev, { id: assistantId, role: 'assistant', content: '', streaming: true }]); + // S2S text mode: send via WebSocket, subtitle/pending events drive UI + if (useS2S) { + const svc = s2sServiceRef.current; + if (!svc) { + setError('S2S 服务未就绪'); + setIsLoading(false); + setStreamingId(null); + return; + } + s2sStreamingIdRef.current = assistantId; + svc.sendText(text); + return; + } + const abort = sendMessageStream(sessionId, text, { onChunk: (chunk) => { setMessages((prev) => @@ -117,6 +224,12 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onToolCall: (tools) => { setToolsInUse(tools); }, + onStreamReset: () => { + // 内容安全拦截:清空已累积的流式文本,等待 done 事件的安全回复 + setMessages((prev) => + prev.map((m) => (m.id === assistantId ? { ...m, content: '' } : m)) + ); + }, onDone: (fullContent) => { setMessages((prev) => prev.map((m) => (m.id === assistantId ? { ...m, content: fullContent, streaming: false } : m)) @@ -137,7 +250,7 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, }); abortRef.current = abort; - }, [input, isLoading, sessionId]); + }, [input, isLoading, sessionId, useS2S]); const handleKeyDown = (e) => { if (e.key === 'Enter' && !e.shiftKey) { @@ -164,7 +277,9 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,

{settings.botName}

-

文字对话模式 · 方舟 LLM

+

+ {useS2S ? '文字对话模式 · S2S' : '文字对话模式 · 方舟 LLM'} +

@@ -262,6 +377,17 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, {error && (
{error} + {!isInitialized && ( + + )}
)} diff --git a/test2/client/src/services/nativeVoiceService.js b/test2/client/src/services/nativeVoiceService.js index bbb0597..3d30dc7 100644 --- a/test2/client/src/services/nativeVoiceService.js +++ b/test2/client/src/services/nativeVoiceService.js @@ -25,7 +25,7 @@ class NativeVoiceService { }; } - resolveWebSocketUrl(sessionId, userId) { + resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') { const query = new URLSearchParams({ sessionId, userId: userId || '', @@ -43,16 +43,16 @@ class NativeVoiceService { } else if (base.endsWith('/api')) { base = base.slice(0, -'/api'.length); } - return `${base}/ws/realtime-dialog?${query.toString()}`; + return `${base}${wsPath}?${query.toString()}`; } const hostname = window.location.hostname; const port = window.location.port; const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1'; if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3012') { - return `ws://${hostname || '127.0.0.1'}:3012/ws/realtime-dialog?${query.toString()}`; + return `ws://${hostname || '127.0.0.1'}:3012${wsPath}?${query.toString()}`; } const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; - return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`; + return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`; } emitConnectionState(state) { @@ -80,43 +80,66 @@ class NativeVoiceService { } } - async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) { + async connect({ + sessionId, + userId, + botName, + systemRole, + speakingStyle, + modelVersion, + speaker, + greetingText, + clientMode = 'voice', + playAudioReply = false, + disableGreeting = false, + } = {}) { await this.disconnect(); - const wsUrl = this.resolveWebSocketUrl(sessionId, userId); + this.clientMode = clientMode; + this.playAudioReply = playAudioReply; + const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog'; + const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath); this.emitConnectionState('connecting'); - this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); - if (this.playbackContext.state === 'suspended') { - await this.playbackContext.resume().catch(() => {}); - } - this.playbackTime = this.playbackContext.currentTime; - // 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost - if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { - const errMsg = window.isSecureContext === false - ? '麦克风访问需要 HTTPS 连接,请使用 https:// 地址访问' - : '当前浏览器不支持麦克风访问'; - this.emitConnectionState('error', errMsg); - throw new Error(errMsg); - } - - // 并行: 同时预获取麦克风和建立WS连接,节省500ms+ - const micPromise = navigator.mediaDevices.getUserMedia({ - audio: { - channelCount: 1, - noiseSuppression: true, - echoCancellation: true, - autoGainControl: true, - }, - video: false, - }).catch((err) => { - console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.name, err.message); - if (err.name === 'NotAllowedError' || err.message?.includes('Permission denied')) { - const msg = '麦克风权限被拒绝,请在浏览器设置中允许本站访问麦克风后重试'; - this.emitConnectionState('error', msg); - throw new Error(msg); + // Audio playback context: only needed if we will receive audio (voice mode or text+playAudio) + const needsPlayback = clientMode !== 'text' || playAudioReply; + if (needsPlayback) { + this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); + if (this.playbackContext.state === 'suspended') { + await this.playbackContext.resume().catch(() => {}); } - return null; - }); + this.playbackTime = this.playbackContext.currentTime; + } + + // Microphone capture: only needed in voice mode + let micPromise = Promise.resolve(null); + if (clientMode !== 'text') { + // 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost + if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { + const errMsg = window.isSecureContext === false + ? '麦克风访问需要 HTTPS 连接,请使用 https:// 地址访问' + : '当前浏览器不支持麦克风访问'; + this.emitConnectionState('error', errMsg); + throw new Error(errMsg); + } + // 并行: 同时预获取麦克风和建立WS连接,节省500ms+ + micPromise = navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + noiseSuppression: true, + echoCancellation: true, + autoGainControl: true, + }, + video: false, + }).catch((err) => { + console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.name, err.message); + if (err.name === 'NotAllowedError' || err.message?.includes('Permission denied')) { + const msg = '麦克风权限被拒绝,请在浏览器设置中允许本站访问麦克风后重试'; + this.emitConnectionState('error', msg); + throw new Error(msg); + } + return null; + }); + } const CONNECTION_TIMEOUT_MS = 12000; @@ -151,6 +174,9 @@ class NativeVoiceService { modelVersion, speaker, greetingText, + clientMode, + playAudioReply, + disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting, })); }; @@ -187,11 +213,21 @@ class NativeVoiceService { }; }); + // 文字模式:不启动麦克风采集 + if (clientMode === 'text') { + return; + } // 使用预获取的mediaStream(已并行获取),避免重复申请 const preFetchedStream = await micPromise; await this.startCapture(preFetchedStream); } + sendText(text) { + if (this.ws && this.ws.readyState === WebSocket.OPEN) { + this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') })); + } + } + handleJsonMessage(raw) { try { const msg = JSON.parse(raw); @@ -246,6 +282,10 @@ class NativeVoiceService { } handleAudioMessage(arrayBuffer) { + // Text mode without playAudioReply: drop all incoming audio silently + if (this.clientMode === 'text' && !this.playAudioReply) { + return; + } if (!this.playbackContext) { return; } @@ -441,3 +481,4 @@ class NativeVoiceService { const nativeVoiceService = new NativeVoiceService(); export default nativeVoiceService; +export { NativeVoiceService };