feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery

Dual-channel S2S architecture with full isolation between voice and text links:

Backend (Java):
- VolcRealtimeProtocol: add createChatTextQueryMessage (event 501)
- VoiceSessionState: add textMode / playAudioReply / disableGreeting fields
- VoiceWebSocketConfig: register second path /ws/realtime-text (same handler)
- VoiceWebSocketHandler: detect text mode from URL path
- VoiceGatewayService:
  * afterConnectionEstablished: overload with textMode flag
  * handleStart: parse playAudioReply / disableGreeting from client
  * buildStartSessionPayload: inject input_mod=text for text mode
  * handleDirectText: text mode sends event 501 directly, skip processReply
  * handleBinaryMessage: reject client audio in text mode
  * handleUpstreamBinary: drop S2S audio if text mode + no playback
  * startAudioKeepalive: skip entirely in text mode (no audio channel)
  * sendGreeting: skip greeting if disableGreeting=true

Frontend (test2 + delivery):
- nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting
  * resolveWebSocketUrl accepts wsPath param
  * Text mode: no microphone capture, no playback context (unless playAudioReply)
  * New sendText() method for event 501 payload
  * handleAudioMessage drops audio in text mode without playback
  * Export NativeVoiceService class for multi-instance usage
- ChatPanel (test2): new useS2S / playAudioReply props
  * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text
  * subtitle events drive streaming UI, assistant_pending drives loading state
  * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode
  * Voice link code path left unchanged (zero diff)

Verification: mvn test VoiceGatewaySmokeTest — 20/20 tests pass; voice link is regression-free
This commit is contained in:
User
2026-04-17 09:33:56 +08:00
parent ff6a63147b
commit af9faf26c9
8 changed files with 399 additions and 108 deletions

View File

@@ -25,7 +25,7 @@ class NativeVoiceService {
}; };
} }
resolveWebSocketUrl(sessionId, userId) { resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
const query = new URLSearchParams({ const query = new URLSearchParams({
sessionId, sessionId,
userId: userId || '', userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
} else if (base.endsWith('/api')) { } else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length); base = base.slice(0, -'/api'.length);
} }
return `${base}/ws/realtime-dialog?${query.toString()}`; return `${base}${wsPath}?${query.toString()}`;
} }
const hostname = window.location.hostname; const hostname = window.location.hostname;
const port = window.location.port; const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1'; const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') { if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`; return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`;
} }
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`; return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
} }
emitConnectionState(state) { emitConnectionState(state) {
@@ -80,18 +80,40 @@ class NativeVoiceService {
} }
} }
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) { async connect({
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
clientMode = 'voice',
playAudioReply = false,
disableGreeting = false,
} = {}) {
await this.disconnect(); await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId); this.clientMode = clientMode;
this.playAudioReply = playAudioReply;
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
this.emitConnectionState('connecting'); this.emitConnectionState('connecting');
// Audio playback context: only needed if we will receive audio
const needsPlayback = clientMode !== 'text' || playAudioReply;
if (needsPlayback) {
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') { if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {}); await this.playbackContext.resume().catch(() => {});
} }
this.playbackTime = this.playbackContext.currentTime; this.playbackTime = this.playbackContext.currentTime;
}
// 并行: 同时预获取麦克风和建立WS连接节省500ms+ // Microphone: only needed in voice mode
const micPromise = navigator.mediaDevices.getUserMedia({ let micPromise = Promise.resolve(null);
if (clientMode !== 'text') {
micPromise = navigator.mediaDevices.getUserMedia({
audio: { audio: {
channelCount: 1, channelCount: 1,
noiseSuppression: true, noiseSuppression: true,
@@ -103,6 +125,7 @@ class NativeVoiceService {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message); console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
return null; return null;
}); });
}
const CONNECTION_TIMEOUT_MS = 12000; const CONNECTION_TIMEOUT_MS = 12000;
@@ -137,6 +160,9 @@ class NativeVoiceService {
modelVersion, modelVersion,
speaker, speaker,
greetingText, greetingText,
clientMode,
playAudioReply,
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
})); }));
}; };
@@ -173,11 +199,20 @@ class NativeVoiceService {
}; };
}); });
// 使用预获取的mediaStream已并行获取避免重复申请 // 文字模式不启动麦克风
if (clientMode === 'text') {
return;
}
const preFetchedStream = await micPromise; const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream); await this.startCapture(preFetchedStream);
} }
sendText(text) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
}
}
handleJsonMessage(raw) { handleJsonMessage(raw) {
try { try {
const msg = JSON.parse(raw); const msg = JSON.parse(raw);
@@ -232,6 +267,10 @@ class NativeVoiceService {
} }
handleAudioMessage(arrayBuffer) { handleAudioMessage(arrayBuffer) {
// Text mode without playAudioReply: drop S2S audio silently
if (this.clientMode === 'text' && !this.playAudioReply) {
return;
}
if (!this.playbackContext) { if (!this.playbackContext) {
return; return;
} }
@@ -424,3 +463,4 @@ class NativeVoiceService {
const nativeVoiceService = new NativeVoiceService(); const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService; export default nativeVoiceService;
export { NativeVoiceService };

View File

@@ -117,6 +117,10 @@ public class VoiceGatewayService {
} }
public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId) { public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId) {
afterConnectionEstablished(clientSession, sessionId, userId, false);
}
public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId, boolean textMode) {
if (!properties.isEnabled()) { if (!properties.isEnabled()) {
closeClient(clientSession, CloseStatus.NOT_ACCEPTABLE.withReason("voice gateway disabled")); closeClient(clientSession, CloseStatus.NOT_ACCEPTABLE.withReason("voice gateway disabled"));
return; return;
@@ -126,12 +130,17 @@ public class VoiceGatewayService {
return; return;
} }
VoiceSessionState state = new VoiceSessionState(clientSession.getId(), sessionId.trim(), clientSession, normalizeNullable(userId)); VoiceSessionState state = new VoiceSessionState(clientSession.getId(), sessionId.trim(), clientSession, normalizeNullable(userId));
state.textMode = textMode;
if (textMode) {
state.disableGreeting = true;
state.playAudioReply = false;
}
sessions.put(clientSession.getId(), state); sessions.put(clientSession.getId(), state);
log.info("[VoiceGateway] client WS connected session={} wsId={} remote={}", state.sessionId, clientSession.getId(), log.info("[VoiceGateway] client WS connected session={} wsId={} mode={} remote={}",
clientSession.getRemoteAddress()); state.sessionId, clientSession.getId(), textMode ? "text" : "voice", clientSession.getRemoteAddress());
chatRepository.createSession(state.sessionId, state.userId, "voice"); chatRepository.createSession(state.sessionId, state.userId, textMode ? "text" : "voice");
resetIdleTimer(state); resetIdleTimer(state);
sendJson(state, Map.of("type", "connected", "sessionId", state.sessionId)); sendJson(state, Map.of("type", "connected", "sessionId", state.sessionId, "mode", textMode ? "text" : "voice"));
} }
public void afterConnectionClosed(WebSocketSession clientSession) { public void afterConnectionClosed(WebSocketSession clientSession) {
@@ -200,6 +209,10 @@ public class VoiceGatewayService {
if (state == null || payload == null || payload.length == 0) { if (state == null || payload == null || payload.length == 0) {
return; return;
} }
if (state.textMode) {
// Text mode: reject client audio frames entirely
return;
}
WebSocket upstream = state.upstream; WebSocket upstream = state.upstream;
if (upstream == null || !state.upstreamReady) { if (upstream == null || !state.upstreamReady) {
return; return;
@@ -241,6 +254,17 @@ public class VoiceGatewayService {
private void handleStart(VoiceSessionState state, JsonNode node) { private void handleStart(VoiceSessionState state, JsonNode node) {
state.userId = firstNonBlank(textValue(node.path("userId")), state.userId); state.userId = firstNonBlank(textValue(node.path("userId")), state.userId);
// Text mode optional override from start message (text mode may choose to play audio reply)
if (state.textMode) {
JsonNode playNode = node.path("playAudioReply");
if (!playNode.isMissingNode() && !playNode.isNull()) {
state.playAudioReply = playNode.asBoolean(false);
}
JsonNode greetNode = node.path("disableGreeting");
if (!greetNode.isMissingNode() && !greetNode.isNull()) {
state.disableGreeting = greetNode.asBoolean(true);
}
}
AssistantProfileResult profileResult = assistantProfileService.getAssistantProfile(state.userId, false); AssistantProfileResult profileResult = assistantProfileService.getAssistantProfile(state.userId, false);
state.assistantProfile = voiceAssistantProfileSupport.resolve(profileResult.profile()); state.assistantProfile = voiceAssistantProfileSupport.resolve(profileResult.profile());
state.botName = firstNonBlank(textValue(node.path("botName")), voiceAssistantProfileSupport.getDisplayName(state.assistantProfile), "大沃"); state.botName = firstNonBlank(textValue(node.path("botName")), voiceAssistantProfileSupport.getDisplayName(state.assistantProfile), "大沃");
@@ -271,6 +295,23 @@ public class VoiceGatewayService {
if (!persistUserSpeech(state, cleanText)) { if (!persistUserSpeech(state, cleanText)) {
return; return;
} }
if (state.textMode) {
// Text mode: send directly to S2S via event 501 (ChatTextQuery)
// S2S will internally invoke LLM / tool_calls (event 502 external_rag) and stream back
state.blockUpstreamAudio = !state.playAudioReply; // block audio if user doesn't want playback
state.currentTtsType = "default";
state.awaitingUpstreamReply = true;
state.pendingAssistantSource = "voice_bot";
state.pendingAssistantToolName = null;
state.pendingAssistantMeta = null;
state.pendingAssistantTurnSeq = state.latestUserTurnSeq;
state.turnCount++;
state.clearAssistantBuffer();
sendUpstreamChatTextQuery(state, cleanText);
sendJson(state, Map.of("type", "assistant_pending", "active", Boolean.TRUE));
log.info("[VoiceGateway][text-mode] sent ChatTextQuery session={} len={}", state.sessionId, cleanText.length());
return;
}
sendJson(state, Map.of("type", "tts_reset", "reason", "new_turn")); sendJson(state, Map.of("type", "tts_reset", "reason", "new_turn"));
state.blockUpstreamAudio = true; state.blockUpstreamAudio = true;
state.currentTtsType = "default"; state.currentTtsType = "default";
@@ -278,6 +319,16 @@ public class VoiceGatewayService {
processReplyAsync(state, cleanText, state.latestUserTurnSeq); processReplyAsync(state, cleanText, state.latestUserTurnSeq);
} }
private void sendUpstreamChatTextQuery(VoiceSessionState state, String text) {
if (state.upstream == null || !state.upstreamReady) {
log.warn("[VoiceGateway][text-mode] upstream not ready, drop text session={}", state.sessionId);
sendJson(state, Map.of("type", "error", "error", "语音服务尚未就绪,请稍后重试"));
return;
}
byte[] msg = VolcRealtimeProtocol.createChatTextQueryMessage(state.sessionId, text, objectMapper);
sendUpstreamBinary(state, msg);
}
private void connectUpstream(VoiceSessionState state) { private void connectUpstream(VoiceSessionState state) {
if (!properties.isConfigured()) { if (!properties.isConfigured()) {
sendJson(state, Map.of("type", "error", "error", "VOLC_S2S_APP_ID 或 VOLC_S2S_TOKEN 未配置")); sendJson(state, Map.of("type", "error", "error", "VOLC_S2S_APP_ID 或 VOLC_S2S_TOKEN 未配置"));
@@ -323,7 +374,8 @@ public class VoiceGatewayService {
dialog.put("bot_name", state.botName); dialog.put("bot_name", state.botName);
dialog.put("system_role", voiceAssistantProfileSupport.normalizeTextForSpeech(ANTI_THINKING_PREFIX + " " + state.systemRole)); dialog.put("system_role", voiceAssistantProfileSupport.normalizeTextForSpeech(ANTI_THINKING_PREFIX + " " + state.systemRole));
dialog.put("speaking_style", voiceAssistantProfileSupport.normalizeTextForSpeech(state.speakingStyle)); dialog.put("speaking_style", voiceAssistantProfileSupport.normalizeTextForSpeech(state.speakingStyle));
dialog.put("extra", Map.of("input_mod", "audio", "model", state.modelVersion, "strict_audit", false, "audit_response", "抱歉,这个问题我暂时无法回答。")); String inputMod = state.textMode ? "text" : "audio";
dialog.put("extra", Map.of("input_mod", inputMod, "model", state.modelVersion, "strict_audit", false, "audit_response", "抱歉,这个问题我暂时无法回答。"));
Map<String, Object> payload = new LinkedHashMap<>(); Map<String, Object> payload = new LinkedHashMap<>();
payload.put("asr", asr); payload.put("asr", asr);
payload.put("tts", tts); payload.put("tts", tts);
@@ -351,6 +403,10 @@ public class VoiceGatewayService {
return; return;
} }
if (frame.type() == VolcRealtimeProtocol.TYPE_AUDIO_ONLY_SERVER) { if (frame.type() == VolcRealtimeProtocol.TYPE_AUDIO_ONLY_SERVER) {
// Text mode: drop all S2S audio if client doesn't want playback
if (state.textMode && !state.playAudioReply) {
return;
}
boolean isDefaultTts = !StringUtils.hasText(state.currentTtsType) || "default".equals(state.currentTtsType); boolean isDefaultTts = !StringUtils.hasText(state.currentTtsType) || "default".equals(state.currentTtsType);
boolean isSuppressing = state.suppressUpstreamUntil > System.currentTimeMillis() && isDefaultTts; boolean isSuppressing = state.suppressUpstreamUntil > System.currentTimeMillis() && isDefaultTts;
boolean isUserJustSpeaking = isDefaultTts && state.lastPartialAt > 0 && (System.currentTimeMillis() - state.lastPartialAt < 800); boolean isUserJustSpeaking = isDefaultTts && state.lastPartialAt > 0 && (System.currentTimeMillis() - state.lastPartialAt < 800);
@@ -1016,6 +1072,11 @@ public class VoiceGatewayService {
} }
private void sendGreeting(VoiceSessionState state) { private void sendGreeting(VoiceSessionState state) {
if (state.disableGreeting) {
state.hasSentGreeting = true;
sendReady(state);
return;
}
if (state.hasSentGreeting || !StringUtils.hasText(state.greetingText)) { if (state.hasSentGreeting || !StringUtils.hasText(state.greetingText)) {
sendReady(state); sendReady(state);
return; return;
@@ -1137,6 +1198,10 @@ public class VoiceGatewayService {
private void startAudioKeepalive(VoiceSessionState state) { private void startAudioKeepalive(VoiceSessionState state) {
cancelFuture(state.keepaliveFuture); cancelFuture(state.keepaliveFuture);
// Text mode: skip audio keepalive entirely (no audio channel)
if (state.textMode) {
return;
}
long interval = Math.max(properties.getAudioKeepaliveIntervalMs(), 5000L); long interval = Math.max(properties.getAudioKeepaliveIntervalMs(), 5000L);
state.keepaliveFuture = scheduler.scheduleAtFixedRate(() -> { state.keepaliveFuture = scheduler.scheduleAtFixedRate(() -> {
WebSocket upstream = state.upstream; WebSocket upstream = state.upstream;

View File

@@ -114,6 +114,11 @@ final class VoiceSessionState {
// Reply plan: evidence text (raw KB content, never used for subtitle/persistence) // Reply plan: evidence text (raw KB content, never used for subtitle/persistence)
volatile String ragEvidenceText = ""; volatile String ragEvidenceText = "";
// Text mode (S2S input_mod=text via /ws/realtime-text)
volatile boolean textMode;
volatile boolean playAudioReply;
volatile boolean disableGreeting;
VoiceSessionState(String clientConnectionId, String sessionId, WebSocketSession clientSession, String userId) { VoiceSessionState(String clientConnectionId, String sessionId, WebSocketSession clientSession, String userId) {
this.clientConnectionId = clientConnectionId; this.clientConnectionId = clientConnectionId;
this.sessionId = sessionId; this.sessionId = sessionId;

View File

@@ -17,6 +17,7 @@ public class VoiceWebSocketConfig implements WebSocketConfigurer {
@Override @Override
public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) { public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) {
registry.addHandler(voiceWebSocketHandler, "/ws/realtime-dialog").setAllowedOriginPatterns("*"); registry.addHandler(voiceWebSocketHandler, "/ws/realtime-dialog", "/ws/realtime-text")
.setAllowedOriginPatterns("*");
} }
} }

View File

@@ -22,7 +22,9 @@ public class VoiceWebSocketHandler extends BinaryWebSocketHandler {
public void afterConnectionEstablished(WebSocketSession session) { public void afterConnectionEstablished(WebSocketSession session) {
URI uri = session.getUri(); URI uri = session.getUri();
var queryParams = UriComponentsBuilder.fromUri(uri == null ? URI.create("/") : uri).build(true).getQueryParams(); var queryParams = UriComponentsBuilder.fromUri(uri == null ? URI.create("/") : uri).build(true).getQueryParams();
voiceGatewayService.afterConnectionEstablished(session, queryParams.getFirst("sessionId"), queryParams.getFirst("userId")); String path = uri == null ? "" : uri.getPath();
boolean textMode = path != null && path.contains("realtime-text");
voiceGatewayService.afterConnectionEstablished(session, queryParams.getFirst("sessionId"), queryParams.getFirst("userId"), textMode);
} }
@Override @Override

View File

@@ -51,6 +51,17 @@ public final class VolcRealtimeProtocol {
return marshal(TYPE_FULL_CLIENT, MSG_TYPE_FLAG_WITH_EVENT, 500, sessionId, writeJsonBytes(payload, objectMapper), false); return marshal(TYPE_FULL_CLIENT, MSG_TYPE_FLAG_WITH_EVENT, 500, sessionId, writeJsonBytes(payload, objectMapper), false);
} }
public static byte[] createChatTextQueryMessage(String sessionId, String content, ObjectMapper objectMapper) {
return marshal(
TYPE_FULL_CLIENT,
MSG_TYPE_FLAG_WITH_EVENT,
501,
sessionId,
writeJsonBytes(Map.of("content", content == null ? "" : content), objectMapper),
false
);
}
public static byte[] createChatRagTextMessage(String sessionId, String externalRag, ObjectMapper objectMapper) { public static byte[] createChatRagTextMessage(String sessionId, String externalRag, ObjectMapper objectMapper) {
return marshal( return marshal(
TYPE_FULL_CLIENT, TYPE_FULL_CLIENT,

View File

@@ -2,8 +2,9 @@ import { useState, useRef, useEffect, useCallback } from 'react';
import { Send, Bot, User, Loader2, ArrowLeft, Sparkles, Wrench, StopCircle } from 'lucide-react'; import { Send, Bot, User, Loader2, ArrowLeft, Sparkles, Wrench, StopCircle } from 'lucide-react';
import { startChatSession, sendMessageStream } from '../services/chatApi'; import { startChatSession, sendMessageStream } from '../services/chatApi';
import { getSessionHistory } from '../services/voiceApi'; import { getSessionHistory } from '../services/voiceApi';
import { NativeVoiceService } from '../services/nativeVoiceService';
export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange }) { export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange, useS2S = false, playAudioReply = false }) {
const [messages, setMessages] = useState([]); const [messages, setMessages] = useState([]);
const [input, setInput] = useState(''); const [input, setInput] = useState('');
const [isLoading, setIsLoading] = useState(false); const [isLoading, setIsLoading] = useState(false);
@@ -14,16 +15,92 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
const scrollRef = useRef(null); const scrollRef = useRef(null);
const inputRef = useRef(null); const inputRef = useRef(null);
const abortRef = useRef(null); const abortRef = useRef(null);
const s2sServiceRef = useRef(null);
const s2sStreamingIdRef = useRef(null);
// S2S text mode: establish WebSocket to /ws/realtime-text
useEffect(() => {
if (!useS2S || !sessionId) {
return undefined;
}
const svc = new NativeVoiceService();
s2sServiceRef.current = svc;
svc.on('onSubtitle', (data) => {
if (!data || !data.role) return;
if (data.role === 'user') {
// User subtitle is just an echo of what we already inserted; skip
return;
}
// assistant subtitle: streaming chunks (isFinal=false) or final (isFinal=true)
const assistantId = s2sStreamingIdRef.current;
if (!assistantId) return;
setMessages((prev) => prev.map((m) => (
m.id === assistantId
? { ...m, content: data.text || '', streaming: !data.isFinal }
: m
)));
if (data.isFinal) {
setIsLoading(false);
setStreamingId(null);
s2sStreamingIdRef.current = null;
inputRef.current?.focus();
}
});
svc.on('onAssistantPending', (active) => {
setIsLoading(!!active);
});
svc.on('onError', (err) => {
setError(err?.message || 'S2S 文字模式错误');
setIsLoading(false);
setStreamingId(null);
s2sStreamingIdRef.current = null;
});
svc.on('onIdleTimeout', () => {
setError('S2S 连接超时,已断开。请刷新页面重连');
setIsInitialized(false);
});
svc.on('onConnectionStateChange', (state) => {
if (state === 'connected') {
// wait for onReady (handled via promise in connect)
} else if (state === 'disconnected' || state === 'error') {
setIsInitialized(false);
}
});
(async () => {
try {
await svc.connect({
sessionId,
userId: settings?.userId || '',
botName: settings?.botName || '大沃',
speaker: settings?.speaker || 'zh_female_vv_jupiter_bigtts',
modelVersion: settings?.modelVersion || 'O',
clientMode: 'text',
playAudioReply: !!playAudioReply,
disableGreeting: true,
});
setIsInitialized(true);
} catch (e) {
setError(`S2S 连接失败:${e?.message || e}`);
}
})();
return () => {
svc.disconnect().catch(() => {});
s2sServiceRef.current = null;
s2sStreamingIdRef.current = null;
};
}, [useS2S, sessionId, settings?.userId, settings?.botName, settings?.speaker, settings?.modelVersion, playAudioReply]);
// 初始化:创建聊天会话,优先从数据库加载完整历史 // 初始化:创建聊天会话,优先从数据库加载完整历史
useEffect(() => { useEffect(() => {
if (useS2S) {
// S2S mode handles init in its own effect
return;
}
async function init() { async function init() {
try { // 1. 从数据库加载历史(独立于 Coze 会话,不受其失败影响)
// 启动后端聊天会话(后端会从 DB 加载历史注入 Coze 上下文)
await startChatSession(sessionId, voiceSubtitles);
setIsInitialized(true);
// 从数据库加载完整对话历史(包含语音通话中的工具结果)
let historyMsgs = []; let historyMsgs = [];
try { try {
const historyData = await getSessionHistory(sessionId, 20); const historyData = await getSessionHistory(sessionId, 20);
@@ -54,11 +131,27 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
setMessages(historyMsgs); setMessages(historyMsgs);
} }
inputRef.current?.focus(); // 2. 启动后端聊天会话自动重试3次间隔2秒
} catch (err) { let initOk = false;
console.error('[ChatPanel] Init failed:', err); for (let attempt = 0; attempt < 3 && !initOk; attempt++) {
setError('聊天会话初始化失败'); try {
if (attempt > 0) {
console.log(`[ChatPanel] Retrying init (attempt ${attempt + 1}/3)...`);
await new Promise(r => setTimeout(r, 2000));
} }
await startChatSession(sessionId, voiceSubtitles);
initOk = true;
setIsInitialized(true);
setError(null);
} catch (err) {
console.error(`[ChatPanel] Init attempt ${attempt + 1} failed:`, err.message);
if (attempt === 2) {
setError('聊天会话初始化失败,点击重试');
}
}
}
inputRef.current?.focus();
} }
init(); init();
}, [sessionId, voiceSubtitles]); }, [sessionId, voiceSubtitles]);
@@ -108,6 +201,20 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
// 先插入一个空的 assistant 消息用于流式填充 // 先插入一个空的 assistant 消息用于流式填充
setMessages((prev) => [...prev, { id: assistantId, role: 'assistant', content: '', streaming: true }]); setMessages((prev) => [...prev, { id: assistantId, role: 'assistant', content: '', streaming: true }]);
// S2S text mode: send via WebSocket, subtitle/pending events drive UI
if (useS2S) {
const svc = s2sServiceRef.current;
if (!svc) {
setError('S2S 服务未就绪');
setIsLoading(false);
setStreamingId(null);
return;
}
s2sStreamingIdRef.current = assistantId;
svc.sendText(text);
return;
}
const abort = sendMessageStream(sessionId, text, { const abort = sendMessageStream(sessionId, text, {
onChunk: (chunk) => { onChunk: (chunk) => {
setMessages((prev) => setMessages((prev) =>
@@ -117,6 +224,12 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
onToolCall: (tools) => { onToolCall: (tools) => {
setToolsInUse(tools); setToolsInUse(tools);
}, },
onStreamReset: () => {
// 内容安全拦截:清空已累积的流式文本,等待 done 事件的安全回复
setMessages((prev) =>
prev.map((m) => (m.id === assistantId ? { ...m, content: '' } : m))
);
},
onDone: (fullContent) => { onDone: (fullContent) => {
setMessages((prev) => setMessages((prev) =>
prev.map((m) => (m.id === assistantId ? { ...m, content: fullContent, streaming: false } : m)) prev.map((m) => (m.id === assistantId ? { ...m, content: fullContent, streaming: false } : m))
@@ -137,7 +250,7 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
}); });
abortRef.current = abort; abortRef.current = abort;
}, [input, isLoading, sessionId]); }, [input, isLoading, sessionId, useS2S]);
const handleKeyDown = (e) => { const handleKeyDown = (e) => {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
@@ -164,7 +277,9 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
</div> </div>
<div> <div>
<h3 className="text-sm font-semibold text-white leading-tight">{settings.botName}</h3> <h3 className="text-sm font-semibold text-white leading-tight">{settings.botName}</h3>
<p className="text-[10px] text-slate-500 leading-tight">文字对话模式 · 方舟 LLM</p> <p className="text-[10px] text-slate-500 leading-tight">
{useS2S ? '文字对话模式 · S2S' : '文字对话模式 · 方舟 LLM'}
</p>
</div> </div>
</div> </div>
</div> </div>
@@ -262,6 +377,17 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
{error && ( {error && (
<div className="mx-4 mb-2 px-3 py-1.5 rounded-lg bg-red-500/10 border border-red-500/20 text-red-400 text-xs"> <div className="mx-4 mb-2 px-3 py-1.5 rounded-lg bg-red-500/10 border border-red-500/20 text-red-400 text-xs">
{error} {error}
{!isInitialized && (
<button onClick={async () => {
setError(null);
try {
await startChatSession(sessionId, voiceSubtitles);
setIsInitialized(true);
} catch (e) {
setError('重试失败,请检查网络后再试');
}
}} className="ml-2 underline hover:text-red-300 font-medium">重试</button>
)}
<button onClick={() => setError(null)} className="ml-2 underline hover:text-red-300">关闭</button> <button onClick={() => setError(null)} className="ml-2 underline hover:text-red-300">关闭</button>
</div> </div>
)} )}

View File

@@ -25,7 +25,7 @@ class NativeVoiceService {
}; };
} }
resolveWebSocketUrl(sessionId, userId) { resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
const query = new URLSearchParams({ const query = new URLSearchParams({
sessionId, sessionId,
userId: userId || '', userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
} else if (base.endsWith('/api')) { } else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length); base = base.slice(0, -'/api'.length);
} }
return `${base}/ws/realtime-dialog?${query.toString()}`; return `${base}${wsPath}?${query.toString()}`;
} }
const hostname = window.location.hostname; const hostname = window.location.hostname;
const port = window.location.port; const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1'; const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3012') { if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3012') {
return `ws://${hostname || '127.0.0.1'}:3012/ws/realtime-dialog?${query.toString()}`; return `ws://${hostname || '127.0.0.1'}:3012${wsPath}?${query.toString()}`;
} }
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`; return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
} }
emitConnectionState(state) { emitConnectionState(state) {
@@ -80,16 +80,39 @@ class NativeVoiceService {
} }
} }
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) { async connect({
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
clientMode = 'voice',
playAudioReply = false,
disableGreeting = false,
} = {}) {
await this.disconnect(); await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId); this.clientMode = clientMode;
this.playAudioReply = playAudioReply;
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
this.emitConnectionState('connecting'); this.emitConnectionState('connecting');
// Audio playback context: only needed if we will receive audio (voice mode or text+playAudio)
const needsPlayback = clientMode !== 'text' || playAudioReply;
if (needsPlayback) {
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') { if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {}); await this.playbackContext.resume().catch(() => {});
} }
this.playbackTime = this.playbackContext.currentTime; this.playbackTime = this.playbackContext.currentTime;
}
// Microphone capture: only needed in voice mode
let micPromise = Promise.resolve(null);
if (clientMode !== 'text') {
// 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost // 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
const errMsg = window.isSecureContext === false const errMsg = window.isSecureContext === false
@@ -98,9 +121,8 @@ class NativeVoiceService {
this.emitConnectionState('error', errMsg); this.emitConnectionState('error', errMsg);
throw new Error(errMsg); throw new Error(errMsg);
} }
// 并行: 同时预获取麦克风和建立WS连接节省500ms+ // 并行: 同时预获取麦克风和建立WS连接节省500ms+
const micPromise = navigator.mediaDevices.getUserMedia({ micPromise = navigator.mediaDevices.getUserMedia({
audio: { audio: {
channelCount: 1, channelCount: 1,
noiseSuppression: true, noiseSuppression: true,
@@ -117,6 +139,7 @@ class NativeVoiceService {
} }
return null; return null;
}); });
}
const CONNECTION_TIMEOUT_MS = 12000; const CONNECTION_TIMEOUT_MS = 12000;
@@ -151,6 +174,9 @@ class NativeVoiceService {
modelVersion, modelVersion,
speaker, speaker,
greetingText, greetingText,
clientMode,
playAudioReply,
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
})); }));
}; };
@@ -187,11 +213,21 @@ class NativeVoiceService {
}; };
}); });
// 文字模式:不启动麦克风采集
if (clientMode === 'text') {
return;
}
// 使用预获取的mediaStream已并行获取避免重复申请 // 使用预获取的mediaStream已并行获取避免重复申请
const preFetchedStream = await micPromise; const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream); await this.startCapture(preFetchedStream);
} }
sendText(text) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
}
}
handleJsonMessage(raw) { handleJsonMessage(raw) {
try { try {
const msg = JSON.parse(raw); const msg = JSON.parse(raw);
@@ -246,6 +282,10 @@ class NativeVoiceService {
} }
handleAudioMessage(arrayBuffer) { handleAudioMessage(arrayBuffer) {
// Text mode without playAudioReply: drop all incoming audio silently
if (this.clientMode === 'text' && !this.playAudioReply) {
return;
}
if (!this.playbackContext) { if (!this.playbackContext) {
return; return;
} }
@@ -441,3 +481,4 @@ class NativeVoiceService {
const nativeVoiceService = new NativeVoiceService(); const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService; export default nativeVoiceService;
export { NativeVoiceService };