feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery

Dual-channel S2S architecture with full isolation between the voice and text links.

Backend (Java):
- VolcRealtimeProtocol: add createChatTextQueryMessage (event 501)
- VoiceSessionState: add textMode / playAudioReply / disableGreeting fields
- VoiceWebSocketConfig: register second path /ws/realtime-text (same handler)
- VoiceWebSocketHandler: detect text mode from URL path
- VoiceGatewayService:
  * afterConnectionEstablished: overload with textMode flag
  * handleStart: parse playAudioReply / disableGreeting from client
  * buildStartSessionPayload: inject input_mod=text for text mode
  * handleDirectText: text mode sends event 501 directly, skip processReply
  * handleBinaryMessage: reject client audio in text mode
  * handleUpstreamBinary: drop S2S audio if text mode + no playback
  * startAudioKeepalive: skip entirely in text mode (no audio channel)
  * sendGreeting: skip greeting if disableGreeting=true

Frontend (test2 + delivery):
- nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting
  * resolveWebSocketUrl accepts wsPath param
  * Text mode: no microphone capture, no playback context (unless playAudioReply)
  * New sendText() method for event 501 payload
  * handleAudioMessage drops audio in text mode without playback
  * Export NativeVoiceService class for multi-instance usage
- ChatPanel (test2): new useS2S / playAudioReply props
  * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text
  * subtitle events drive streaming UI, assistant_pending drives loading state
  * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode
  * Voice link code path unchanged

Verification: mvn test VoiceGatewaySmokeTest 20/20 pass, voice link regression-free
2026-04-17 09:33:56 +08:00
parent ff6a63147b
commit af9faf26c9
8 changed files with 399 additions and 108 deletions
--- a/delivery/client/src/services/nativeVoiceService.js
+++ b/delivery/client/src/services/nativeVoiceService.js
@@ -25,7 +25,7 @@ class NativeVoiceService {
    };
  }

-  resolveWebSocketUrl(sessionId, userId) {
+  resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
    const query = new URLSearchParams({
      sessionId,
      userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
      } else if (base.endsWith('/api')) {
        base = base.slice(0, -'/api'.length);
      }
-      return `${base}/ws/realtime-dialog?${query.toString()}`;
+      return `${base}${wsPath}?${query.toString()}`;
    }
    const hostname = window.location.hostname;
    const port = window.location.port;
    const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
    if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
-      return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`;
+      return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`;
    }
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
-    return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`;
+    return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
  }

  emitConnectionState(state) {
@@ -80,29 +80,52 @@ class NativeVoiceService {
    }
  }

-  async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) {
+  async connect({
+    sessionId,
+    userId,
+    botName,
+    systemRole,
+    speakingStyle,
+    modelVersion,
+    speaker,
+    greetingText,
+    clientMode = 'voice',
+    playAudioReply = false,
+    disableGreeting = false,
+  } = {}) {
    await this.disconnect();
-    const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
+    this.clientMode = clientMode;
+    this.playAudioReply = playAudioReply;
+    const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
+    const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
    this.emitConnectionState('connecting');
-    this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
-    if (this.playbackContext.state === 'suspended') {
-      await this.playbackContext.resume().catch(() => {});
-    }
-    this.playbackTime = this.playbackContext.currentTime;

-    // 并行: 同时预获取麦克风和建立WS连接，节省500ms+
-    const micPromise = navigator.mediaDevices.getUserMedia({
-      audio: {
-        channelCount: 1,
-        noiseSuppression: true,
-        echoCancellation: true,
-        autoGainControl: true,
-      },
-      video: false,
-    }).catch((err) => {
-      console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
-      return null;
-    });
+    // Audio playback context: only needed if we will receive audio
+    const needsPlayback = clientMode !== 'text' || playAudioReply;
+    if (needsPlayback) {
+      this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
+      if (this.playbackContext.state === 'suspended') {
+        await this.playbackContext.resume().catch(() => {});
+      }
+      this.playbackTime = this.playbackContext.currentTime;
+    }
+
+    // Microphone: only needed in voice mode
+    let micPromise = Promise.resolve(null);
+    if (clientMode !== 'text') {
+      micPromise = navigator.mediaDevices.getUserMedia({
+        audio: {
+          channelCount: 1,
+          noiseSuppression: true,
+          echoCancellation: true,
+          autoGainControl: true,
+        },
+        video: false,
+      }).catch((err) => {
+        console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
+        return null;
+      });
+    }

    const CONNECTION_TIMEOUT_MS = 12000;

@@ -137,6 +160,9 @@ class NativeVoiceService {
          modelVersion,
          speaker,
          greetingText,
+          clientMode,
+          playAudioReply,
+          disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
        }));
      };

@@ -173,11 +199,20 @@ class NativeVoiceService {
      };
    });

-    // 使用预获取的mediaStream（已并行获取），避免重复申请
+    // 文字模式不启动麦克风
+    if (clientMode === 'text') {
+      return;
+    }
    const preFetchedStream = await micPromise;
    await this.startCapture(preFetchedStream);
  }

+  sendText(text) {
+    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+      this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
+    }
+  }
+
  handleJsonMessage(raw) {
    try {
      const msg = JSON.parse(raw);
@@ -232,6 +267,10 @@ class NativeVoiceService {
  }

  handleAudioMessage(arrayBuffer) {
+    // Text mode without playAudioReply: drop S2S audio silently
+    if (this.clientMode === 'text' && !this.playAudioReply) {
+      return;
+    }
    if (!this.playbackContext) {
      return;
    }
@@ -424,3 +463,4 @@ class NativeVoiceService {

 const nativeVoiceService = new NativeVoiceService();
 export default nativeVoiceService;
+export { NativeVoiceService };