feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery

Dual-channel S2S architecture with full isolation between voice and text links:

Backend (Java):
- VolcRealtimeProtocol: add createChatTextQueryMessage (event 501)
- VoiceSessionState: add textMode / playAudioReply / disableGreeting fields
- VoiceWebSocketConfig: register second path /ws/realtime-text (same handler)
- VoiceWebSocketHandler: detect text mode from URL path
- VoiceGatewayService:
  * afterConnectionEstablished: overload with textMode flag
  * handleStart: parse playAudioReply / disableGreeting from client
  * buildStartSessionPayload: inject input_mod=text for text mode
  * handleDirectText: text mode sends event 501 directly, skip processReply
  * handleBinaryMessage: reject client audio in text mode
  * handleUpstreamBinary: drop S2S audio if text mode + no playback
  * startAudioKeepalive: skip entirely in text mode (no audio channel)
  * sendGreeting: skip greeting if disableGreeting=true

Frontend (test2 + delivery):
- nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting
  * resolveWebSocketUrl accepts wsPath param
  * Text mode: no microphone capture, no playback context (unless playAudioReply)
  * New sendText() method for event 501 payload
  * handleAudioMessage drops audio in text mode without playback
  * Export NativeVoiceService class for multi-instance usage
- ChatPanel (test2): new useS2S / playAudioReply props
  * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text
  * subtitle events drive streaming UI, assistant_pending drives loading state
  * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode
  * Voice link code path is unchanged

Verification: mvn test VoiceGatewaySmokeTest 20/20 pass, voice link regression-free
This commit is contained in:
User
2026-04-17 09:33:56 +08:00
parent ff6a63147b
commit af9faf26c9
8 changed files with 399 additions and 108 deletions

View File

@@ -25,7 +25,7 @@ class NativeVoiceService {
};
}
resolveWebSocketUrl(sessionId, userId) {
resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
const query = new URLSearchParams({
sessionId,
userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
} else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length);
}
return `${base}/ws/realtime-dialog?${query.toString()}`;
return `${base}${wsPath}?${query.toString()}`;
}
const hostname = window.location.hostname;
const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`;
return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`;
}
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`;
return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
}
emitConnectionState(state) {
@@ -80,29 +80,52 @@ class NativeVoiceService {
}
}
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) {
async connect({
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
clientMode = 'voice',
playAudioReply = false,
disableGreeting = false,
} = {}) {
await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
this.clientMode = clientMode;
this.playAudioReply = playAudioReply;
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
this.emitConnectionState('connecting');
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {});
}
this.playbackTime = this.playbackContext.currentTime;
// 并行: 同时预获取麦克风和建立WS连接节省500ms+
const micPromise = navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
noiseSuppression: true,
echoCancellation: true,
autoGainControl: true,
},
video: false,
}).catch((err) => {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
return null;
});
// Audio playback context: only needed if we will receive audio
const needsPlayback = clientMode !== 'text' || playAudioReply;
if (needsPlayback) {
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {});
}
this.playbackTime = this.playbackContext.currentTime;
}
// Microphone: only needed in voice mode
let micPromise = Promise.resolve(null);
if (clientMode !== 'text') {
micPromise = navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
noiseSuppression: true,
echoCancellation: true,
autoGainControl: true,
},
video: false,
}).catch((err) => {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
return null;
});
}
const CONNECTION_TIMEOUT_MS = 12000;
@@ -137,6 +160,9 @@ class NativeVoiceService {
modelVersion,
speaker,
greetingText,
clientMode,
playAudioReply,
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
}));
};
@@ -173,11 +199,20 @@ class NativeVoiceService {
};
});
// 使用预获取的mediaStream已并行获取避免重复申请
// 文字模式不启动麦克风
if (clientMode === 'text') {
return;
}
const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream);
}
sendText(text) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
}
}
handleJsonMessage(raw) {
try {
const msg = JSON.parse(raw);
@@ -232,6 +267,10 @@ class NativeVoiceService {
}
handleAudioMessage(arrayBuffer) {
// Text mode without playAudioReply: drop S2S audio silently
if (this.clientMode === 'text' && !this.playAudioReply) {
return;
}
if (!this.playbackContext) {
return;
}
@@ -424,3 +463,4 @@ class NativeVoiceService {
const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService;
export { NativeVoiceService };