feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery

Dual-channel S2S architecture with full isolation between voice and text links:

Backend (Java):
- VolcRealtimeProtocol: add createChatTextQueryMessage (event 501)
- VoiceSessionState: add textMode / playAudioReply / disableGreeting fields
- VoiceWebSocketConfig: register second path /ws/realtime-text (same handler)
- VoiceWebSocketHandler: detect text mode from URL path
- VoiceGatewayService:
  * afterConnectionEstablished: overload with textMode flag
  * handleStart: parse playAudioReply / disableGreeting from client
  * buildStartSessionPayload: inject input_mod=text for text mode
  * handleDirectText: text mode sends event 501 directly, skip processReply
  * handleBinaryMessage: reject client audio in text mode
  * handleUpstreamBinary: drop S2S audio if text mode + no playback
  * startAudioKeepalive: skip entirely in text mode (no audio channel)
  * sendGreeting: skip greeting if disableGreeting=true

Frontend (test2 + delivery):
- nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting
  * resolveWebSocketUrl accepts wsPath param
  * Text mode: no microphone capture, no playback context (unless playAudioReply)
  * New sendText() method for event 501 payload
  * handleAudioMessage drops audio in text mode without playback
  * Export NativeVoiceService class for multi-instance usage
- ChatPanel (test2): new useS2S / playAudioReply props
  * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text
  * subtitle events drive streaming UI, assistant_pending drives loading state
  * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode
  * Voice link code path left unchanged (zero diff)

Verification: mvn test VoiceGatewaySmokeTest 20/20 pass, voice link regression-free
This commit is contained in:
User
2026-04-17 09:33:56 +08:00
parent ff6a63147b
commit af9faf26c9
8 changed files with 399 additions and 108 deletions

View File

@@ -2,8 +2,9 @@ import { useState, useRef, useEffect, useCallback } from 'react';
import { Send, Bot, User, Loader2, ArrowLeft, Sparkles, Wrench, StopCircle } from 'lucide-react';
import { startChatSession, sendMessageStream } from '../services/chatApi';
import { getSessionHistory } from '../services/voiceApi';
import { NativeVoiceService } from '../services/nativeVoiceService';
export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange }) {
export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange, useS2S = false, playAudioReply = false }) {
const [messages, setMessages] = useState([]);
const [input, setInput] = useState('');
const [isLoading, setIsLoading] = useState(false);
@@ -14,51 +15,143 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
const scrollRef = useRef(null);
const inputRef = useRef(null);
const abortRef = useRef(null);
const s2sServiceRef = useRef(null);
const s2sStreamingIdRef = useRef(null);
// S2S text mode: establish WebSocket to /ws/realtime-text.
// Owns one dedicated NativeVoiceService instance per (useS2S, sessionId, settings)
// combination; the cleanup below tears the link down on unmount or dep change.
useEffect(() => {
// Guard: effect is a no-op unless S2S mode is on and a session exists.
if (!useS2S || !sessionId) {
return undefined;
}
const svc = new NativeVoiceService();
s2sServiceRef.current = svc;
// Subtitle events carry both user echoes and assistant output; only the
// assistant side drives the streaming message bubble.
svc.on('onSubtitle', (data) => {
if (!data || !data.role) return;
if (data.role === 'user') {
// User subtitle is just an echo of what we already inserted; skip
return;
}
// assistant subtitle: streaming chunks (isFinal=false) or final (isFinal=true)
const assistantId = s2sStreamingIdRef.current;
if (!assistantId) return;
setMessages((prev) => prev.map((m) => (
m.id === assistantId
? { ...m, content: data.text || '', streaming: !data.isFinal }
: m
)));
// Final chunk: close out loading/streaming state and return focus to input.
if (data.isFinal) {
setIsLoading(false);
setStreamingId(null);
s2sStreamingIdRef.current = null;
inputRef.current?.focus();
}
});
// Backend "assistant is thinking" signal toggles the loading spinner.
svc.on('onAssistantPending', (active) => {
setIsLoading(!!active);
});
// Any service error aborts the in-flight streaming message.
svc.on('onError', (err) => {
setError(err?.message || 'S2S 文字模式错误');
setIsLoading(false);
setStreamingId(null);
s2sStreamingIdRef.current = null;
});
// Idle timeout: server dropped the link; user must refresh to reconnect.
svc.on('onIdleTimeout', () => {
setError('S2S 连接超时,已断开。请刷新页面重连');
setIsInitialized(false);
});
svc.on('onConnectionStateChange', (state) => {
if (state === 'connected') {
// wait for onReady (handled via promise in connect)
} else if (state === 'disconnected' || state === 'error') {
setIsInitialized(false);
}
});
// Kick off the async connect; clientMode:'text' routes to /ws/realtime-text
// and greeting is always suppressed for the text channel.
(async () => {
try {
await svc.connect({
sessionId,
userId: settings?.userId || '',
botName: settings?.botName || '大沃',
speaker: settings?.speaker || 'zh_female_vv_jupiter_bigtts',
modelVersion: settings?.modelVersion || 'O',
clientMode: 'text',
playAudioReply: !!playAudioReply,
disableGreeting: true,
});
setIsInitialized(true);
} catch (e) {
setError(`S2S 连接失败:${e?.message || e}`);
}
})();
// Cleanup: best-effort disconnect (errors ignored) and ref reset so a
// re-run of the effect starts from a clean slate.
return () => {
svc.disconnect().catch(() => {});
s2sServiceRef.current = null;
s2sStreamingIdRef.current = null;
};
}, [useS2S, sessionId, settings?.userId, settings?.botName, settings?.speaker, settings?.modelVersion, playAudioReply]);
// 初始化:创建聊天会话,优先从数据库加载完整历史
useEffect(() => {
if (useS2S) {
// S2S mode handles init in its own effect
return;
}
async function init() {
// 1. 从数据库加载历史(独立于 Coze 会话,不受其失败影响)
let historyMsgs = [];
try {
// 启动后端聊天会话(后端会从 DB 加载历史注入 Coze 上下文)
await startChatSession(sessionId, voiceSubtitles);
setIsInitialized(true);
// 从数据库加载完整对话历史(包含语音通话中的工具结果)
let historyMsgs = [];
try {
const historyData = await getSessionHistory(sessionId, 20);
if (historyData?.messages?.length > 0) {
historyMsgs = historyData.messages.map((m, i) => ({
id: `history-${i}`,
role: m.role,
content: m.content,
fromVoice: true,
}));
console.log(`[ChatPanel] Loaded ${historyMsgs.length} messages from DB`);
}
} catch (e) {
console.warn('[ChatPanel] DB history load failed, falling back to subtitles:', e.message);
}
// 如果数据库没有历史,回退到 voiceSubtitles
if (historyMsgs.length === 0 && voiceSubtitles && voiceSubtitles.length > 0) {
historyMsgs = voiceSubtitles.map((s, i) => ({
id: `voice-${i}`,
role: s.role === 'user' ? 'user' : 'assistant',
content: s.text,
const historyData = await getSessionHistory(sessionId, 20);
if (historyData?.messages?.length > 0) {
historyMsgs = historyData.messages.map((m, i) => ({
id: `history-${i}`,
role: m.role,
content: m.content,
fromVoice: true,
}));
console.log(`[ChatPanel] Loaded ${historyMsgs.length} messages from DB`);
}
if (historyMsgs.length > 0) {
setMessages(historyMsgs);
}
inputRef.current?.focus();
} catch (err) {
console.error('[ChatPanel] Init failed:', err);
setError('聊天会话初始化失败');
} catch (e) {
console.warn('[ChatPanel] DB history load failed, falling back to subtitles:', e.message);
}
// 如果数据库没有历史,回退到 voiceSubtitles
if (historyMsgs.length === 0 && voiceSubtitles && voiceSubtitles.length > 0) {
historyMsgs = voiceSubtitles.map((s, i) => ({
id: `voice-${i}`,
role: s.role === 'user' ? 'user' : 'assistant',
content: s.text,
fromVoice: true,
}));
}
if (historyMsgs.length > 0) {
setMessages(historyMsgs);
}
// 2. 启动后端聊天会话自动重试3次间隔2秒
let initOk = false;
for (let attempt = 0; attempt < 3 && !initOk; attempt++) {
try {
if (attempt > 0) {
console.log(`[ChatPanel] Retrying init (attempt ${attempt + 1}/3)...`);
await new Promise(r => setTimeout(r, 2000));
}
await startChatSession(sessionId, voiceSubtitles);
initOk = true;
setIsInitialized(true);
setError(null);
} catch (err) {
console.error(`[ChatPanel] Init attempt ${attempt + 1} failed:`, err.message);
if (attempt === 2) {
setError('聊天会话初始化失败,点击重试');
}
}
}
inputRef.current?.focus();
}
init();
}, [sessionId, voiceSubtitles]);
@@ -108,6 +201,20 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
// 先插入一个空的 assistant 消息用于流式填充
setMessages((prev) => [...prev, { id: assistantId, role: 'assistant', content: '', streaming: true }]);
// S2S text mode: send via WebSocket, subtitle/pending events drive UI
if (useS2S) {
const svc = s2sServiceRef.current;
if (!svc) {
setError('S2S 服务未就绪');
setIsLoading(false);
setStreamingId(null);
return;
}
s2sStreamingIdRef.current = assistantId;
svc.sendText(text);
return;
}
const abort = sendMessageStream(sessionId, text, {
onChunk: (chunk) => {
setMessages((prev) =>
@@ -117,6 +224,12 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
onToolCall: (tools) => {
setToolsInUse(tools);
},
onStreamReset: () => {
// 内容安全拦截:清空已累积的流式文本,等待 done 事件的安全回复
setMessages((prev) =>
prev.map((m) => (m.id === assistantId ? { ...m, content: '' } : m))
);
},
onDone: (fullContent) => {
setMessages((prev) =>
prev.map((m) => (m.id === assistantId ? { ...m, content: fullContent, streaming: false } : m))
@@ -137,7 +250,7 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
});
abortRef.current = abort;
}, [input, isLoading, sessionId]);
}, [input, isLoading, sessionId, useS2S]);
const handleKeyDown = (e) => {
if (e.key === 'Enter' && !e.shiftKey) {
@@ -164,7 +277,9 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
</div>
<div>
<h3 className="text-sm font-semibold text-white leading-tight">{settings.botName}</h3>
<p className="text-[10px] text-slate-500 leading-tight">文字对话模式 · 方舟 LLM</p>
<p className="text-[10px] text-slate-500 leading-tight">
{useS2S ? '文字对话模式 · S2S' : '文字对话模式 · 方舟 LLM'}
</p>
</div>
</div>
</div>
@@ -262,6 +377,17 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
{error && (
<div className="mx-4 mb-2 px-3 py-1.5 rounded-lg bg-red-500/10 border border-red-500/20 text-red-400 text-xs">
{error}
{!isInitialized && (
<button onClick={async () => {
setError(null);
try {
await startChatSession(sessionId, voiceSubtitles);
setIsInitialized(true);
} catch (e) {
setError('重试失败,请检查网络后再试');
}
}} className="ml-2 underline hover:text-red-300 font-medium">重试</button>
)}
<button onClick={() => setError(null)} className="ml-2 underline hover:text-red-300">关闭</button>
</div>
)}

View File

@@ -25,7 +25,7 @@ class NativeVoiceService {
};
}
resolveWebSocketUrl(sessionId, userId) {
resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
const query = new URLSearchParams({
sessionId,
userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
} else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length);
}
return `${base}/ws/realtime-dialog?${query.toString()}`;
return `${base}${wsPath}?${query.toString()}`;
}
const hostname = window.location.hostname;
const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3012') {
return `ws://${hostname || '127.0.0.1'}:3012/ws/realtime-dialog?${query.toString()}`;
return `ws://${hostname || '127.0.0.1'}:3012${wsPath}?${query.toString()}`;
}
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`;
return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
}
emitConnectionState(state) {
@@ -80,43 +80,66 @@ class NativeVoiceService {
}
}
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) {
async connect({
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
clientMode = 'voice',
playAudioReply = false,
disableGreeting = false,
} = {}) {
await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
this.clientMode = clientMode;
this.playAudioReply = playAudioReply;
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
this.emitConnectionState('connecting');
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {});
}
this.playbackTime = this.playbackContext.currentTime;
// 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
const errMsg = window.isSecureContext === false
? '麦克风访问需要 HTTPS 连接,请使用 https:// 地址访问'
: '当前浏览器不支持麦克风访问';
this.emitConnectionState('error', errMsg);
throw new Error(errMsg);
}
// 并行: 同时预获取麦克风和建立WS连接节省500ms+
const micPromise = navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
noiseSuppression: true,
echoCancellation: true,
autoGainControl: true,
},
video: false,
}).catch((err) => {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.name, err.message);
if (err.name === 'NotAllowedError' || err.message?.includes('Permission denied')) {
const msg = '麦克风权限被拒绝,请在浏览器设置中允许本站访问麦克风后重试';
this.emitConnectionState('error', msg);
throw new Error(msg);
// Audio playback context: only needed if we will receive audio (voice mode or text+playAudio)
const needsPlayback = clientMode !== 'text' || playAudioReply;
if (needsPlayback) {
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {});
}
return null;
});
this.playbackTime = this.playbackContext.currentTime;
}
// Microphone capture: only needed in voice mode
let micPromise = Promise.resolve(null);
if (clientMode !== 'text') {
// 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
const errMsg = window.isSecureContext === false
? '麦克风访问需要 HTTPS 连接,请使用 https:// 地址访问'
: '当前浏览器不支持麦克风访问';
this.emitConnectionState('error', errMsg);
throw new Error(errMsg);
}
// 并行: 同时预获取麦克风和建立WS连接节省500ms+
micPromise = navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
noiseSuppression: true,
echoCancellation: true,
autoGainControl: true,
},
video: false,
}).catch((err) => {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.name, err.message);
if (err.name === 'NotAllowedError' || err.message?.includes('Permission denied')) {
const msg = '麦克风权限被拒绝,请在浏览器设置中允许本站访问麦克风后重试';
this.emitConnectionState('error', msg);
throw new Error(msg);
}
return null;
});
}
const CONNECTION_TIMEOUT_MS = 12000;
@@ -151,6 +174,9 @@ class NativeVoiceService {
modelVersion,
speaker,
greetingText,
clientMode,
playAudioReply,
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
}));
};
@@ -187,11 +213,21 @@ class NativeVoiceService {
};
});
// 文字模式:不启动麦克风采集
if (clientMode === 'text') {
return;
}
// 使用预获取的mediaStream已并行获取避免重复申请
const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream);
}
sendText(text) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
}
}
handleJsonMessage(raw) {
try {
const msg = JSON.parse(raw);
@@ -246,6 +282,10 @@ class NativeVoiceService {
}
handleAudioMessage(arrayBuffer) {
// Text mode without playAudioReply: drop all incoming audio silently
if (this.clientMode === 'text' && !this.playAudioReply) {
return;
}
if (!this.playbackContext) {
return;
}
@@ -441,3 +481,4 @@ class NativeVoiceService {
// Default export: shared singleton instance used by the existing voice link.
const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService;
// Named class export allows callers to create additional isolated instances
// (e.g. the S2S text channel in ChatPanel) without touching the singleton.
export { NativeVoiceService };