feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery
Dual-channel S2S architecture with full isolation between voice and text links: Backend (Java): - VolcRealtimeProtocol: add createChatTextQueryMessage (event 501) - VoiceSessionState: add textMode / playAudioReply / disableGreeting fields - VoiceWebSocketConfig: register second path /ws/realtime-text (same handler) - VoiceWebSocketHandler: detect text mode from URL path - VoiceGatewayService: * afterConnectionEstablished: overload with textMode flag * handleStart: parse playAudioReply / disableGreeting from client * buildStartSessionPayload: inject input_mod=text for text mode * handleDirectText: text mode sends event 501 directly, skip processReply * handleBinaryMessage: reject client audio in text mode * handleUpstreamBinary: drop S2S audio if text mode + no playback * startAudioKeepalive: skip entirely in text mode (no audio channel) * sendGreeting: skip greeting if disableGreeting=true Frontend (test2 + delivery): - nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting * resolveWebSocketUrl accepts wsPath param * Text mode: no microphone capture, no playback context (unless playAudioReply) * New sendText() method for event 501 payload * handleAudioMessage drops audio in text mode without playback * Export NativeVoiceService class for multi-instance usage - ChatPanel (test2): new useS2S / playAudioReply props * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text * subtitle events drive streaming UI, assistant_pending drives loading state * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode * Voice link code path zero-changed Verification: mvn test VoiceGatewaySmokeTest 20/20 pass, voice link regression-free
This commit is contained in:
@@ -25,7 +25,7 @@ class NativeVoiceService {
|
||||
};
|
||||
}
|
||||
|
||||
resolveWebSocketUrl(sessionId, userId) {
|
||||
resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
|
||||
const query = new URLSearchParams({
|
||||
sessionId,
|
||||
userId: userId || '',
|
||||
@@ -43,16 +43,16 @@ class NativeVoiceService {
|
||||
} else if (base.endsWith('/api')) {
|
||||
base = base.slice(0, -'/api'.length);
|
||||
}
|
||||
return `${base}/ws/realtime-dialog?${query.toString()}`;
|
||||
return `${base}${wsPath}?${query.toString()}`;
|
||||
}
|
||||
const hostname = window.location.hostname;
|
||||
const port = window.location.port;
|
||||
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
|
||||
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
|
||||
return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`;
|
||||
return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`;
|
||||
}
|
||||
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
||||
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`;
|
||||
return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
|
||||
}
|
||||
|
||||
emitConnectionState(state) {
|
||||
@@ -80,29 +80,52 @@ class NativeVoiceService {
|
||||
}
|
||||
}
|
||||
|
||||
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) {
|
||||
async connect({
|
||||
sessionId,
|
||||
userId,
|
||||
botName,
|
||||
systemRole,
|
||||
speakingStyle,
|
||||
modelVersion,
|
||||
speaker,
|
||||
greetingText,
|
||||
clientMode = 'voice',
|
||||
playAudioReply = false,
|
||||
disableGreeting = false,
|
||||
} = {}) {
|
||||
await this.disconnect();
|
||||
const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
|
||||
this.clientMode = clientMode;
|
||||
this.playAudioReply = playAudioReply;
|
||||
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
|
||||
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
|
||||
this.emitConnectionState('connecting');
|
||||
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
if (this.playbackContext.state === 'suspended') {
|
||||
await this.playbackContext.resume().catch(() => {});
|
||||
}
|
||||
this.playbackTime = this.playbackContext.currentTime;
|
||||
|
||||
// 并行: 同时预获取麦克风和建立WS连接,节省500ms+
|
||||
const micPromise = navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
channelCount: 1,
|
||||
noiseSuppression: true,
|
||||
echoCancellation: true,
|
||||
autoGainControl: true,
|
||||
},
|
||||
video: false,
|
||||
}).catch((err) => {
|
||||
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
|
||||
return null;
|
||||
});
|
||||
// Audio playback context: only needed if we will receive audio
|
||||
const needsPlayback = clientMode !== 'text' || playAudioReply;
|
||||
if (needsPlayback) {
|
||||
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
|
||||
if (this.playbackContext.state === 'suspended') {
|
||||
await this.playbackContext.resume().catch(() => {});
|
||||
}
|
||||
this.playbackTime = this.playbackContext.currentTime;
|
||||
}
|
||||
|
||||
// Microphone: only needed in voice mode
|
||||
let micPromise = Promise.resolve(null);
|
||||
if (clientMode !== 'text') {
|
||||
micPromise = navigator.mediaDevices.getUserMedia({
|
||||
audio: {
|
||||
channelCount: 1,
|
||||
noiseSuppression: true,
|
||||
echoCancellation: true,
|
||||
autoGainControl: true,
|
||||
},
|
||||
video: false,
|
||||
}).catch((err) => {
|
||||
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
|
||||
return null;
|
||||
});
|
||||
}
|
||||
|
||||
const CONNECTION_TIMEOUT_MS = 12000;
|
||||
|
||||
@@ -137,6 +160,9 @@ class NativeVoiceService {
|
||||
modelVersion,
|
||||
speaker,
|
||||
greetingText,
|
||||
clientMode,
|
||||
playAudioReply,
|
||||
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
|
||||
}));
|
||||
};
|
||||
|
||||
@@ -173,11 +199,20 @@ class NativeVoiceService {
|
||||
};
|
||||
});
|
||||
|
||||
// 使用预获取的mediaStream(已并行获取),避免重复申请
|
||||
// 文字模式不启动麦克风
|
||||
if (clientMode === 'text') {
|
||||
return;
|
||||
}
|
||||
const preFetchedStream = await micPromise;
|
||||
await this.startCapture(preFetchedStream);
|
||||
}
|
||||
|
||||
sendText(text) {
|
||||
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
|
||||
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
|
||||
}
|
||||
}
|
||||
|
||||
handleJsonMessage(raw) {
|
||||
try {
|
||||
const msg = JSON.parse(raw);
|
||||
@@ -232,6 +267,10 @@ class NativeVoiceService {
|
||||
}
|
||||
|
||||
handleAudioMessage(arrayBuffer) {
|
||||
// Text mode without playAudioReply: drop S2S audio silently
|
||||
if (this.clientMode === 'text' && !this.playAudioReply) {
|
||||
return;
|
||||
}
|
||||
if (!this.playbackContext) {
|
||||
return;
|
||||
}
|
||||
@@ -424,3 +463,4 @@ class NativeVoiceService {
|
||||
|
||||
// Module-level singleton: the default instance backing the primary voice link.
const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService;
// The class itself is also exported so callers can create additional,
// fully isolated instances (e.g. a second text-mode connection).
export { NativeVoiceService };
|
||||
|
||||
Reference in New Issue
Block a user