feat(s2s): add S2S text dialog via /ws/realtime-text + event 501 ChatTextQuery

Dual-channel S2S architecture with full isolation between voice and text links:

Backend (Java):
- VolcRealtimeProtocol: add createChatTextQueryMessage (event 501)
- VoiceSessionState: add textMode / playAudioReply / disableGreeting fields
- VoiceWebSocketConfig: register second path /ws/realtime-text (same handler)
- VoiceWebSocketHandler: detect text mode from URL path
- VoiceGatewayService:
  * afterConnectionEstablished: overload with textMode flag
  * handleStart: parse playAudioReply / disableGreeting from client
  * buildStartSessionPayload: inject input_mod=text for text mode
  * handleDirectText: text mode sends event 501 directly, skip processReply
  * handleBinaryMessage: reject client audio in text mode
  * handleUpstreamBinary: drop S2S audio if text mode + no playback
  * startAudioKeepalive: skip entirely in text mode (no audio channel)
  * sendGreeting: skip greeting if disableGreeting=true

Frontend (test2 + delivery):
- nativeVoiceService: connect accepts clientMode/playAudioReply/disableGreeting
  * resolveWebSocketUrl accepts wsPath param
  * Text mode: no microphone capture, no playback context (unless playAudioReply)
  * New sendText() method for event 501 payload
  * handleAudioMessage drops audio in text mode without playback
  * Export NativeVoiceService class for multi-instance usage
- ChatPanel (test2): new useS2S / playAudioReply props
  * useS2S=true: creates NativeVoiceService instance, connects to /ws/realtime-text
  * subtitle events drive streaming UI, assistant_pending drives loading state
  * handleSend routes to WebSocket in S2S mode, HTTP/SSE in Coze mode
  * Voice link code path left unchanged (zero diff)

Verification: mvn test VoiceGatewaySmokeTest — 20/20 tests pass; voice link is regression-free
This commit is contained in:
User
2026-04-17 09:33:56 +08:00
parent ff6a63147b
commit af9faf26c9
8 changed files with 399 additions and 108 deletions

View File

@@ -25,7 +25,7 @@ class NativeVoiceService {
}; };
} }
resolveWebSocketUrl(sessionId, userId) { resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
const query = new URLSearchParams({ const query = new URLSearchParams({
sessionId, sessionId,
userId: userId || '', userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
} else if (base.endsWith('/api')) { } else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length); base = base.slice(0, -'/api'.length);
} }
return `${base}/ws/realtime-dialog?${query.toString()}`; return `${base}${wsPath}?${query.toString()}`;
} }
const hostname = window.location.hostname; const hostname = window.location.hostname;
const port = window.location.port; const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1'; const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') { if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`; return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`;
} }
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`; return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
} }
emitConnectionState(state) { emitConnectionState(state) {
@@ -80,18 +80,40 @@ class NativeVoiceService {
} }
} }
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) { async connect({
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
clientMode = 'voice',
playAudioReply = false,
disableGreeting = false,
} = {}) {
await this.disconnect(); await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId); this.clientMode = clientMode;
this.playAudioReply = playAudioReply;
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
this.emitConnectionState('connecting'); this.emitConnectionState('connecting');
// Audio playback context: only needed if we will receive audio
const needsPlayback = clientMode !== 'text' || playAudioReply;
if (needsPlayback) {
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') { if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {}); await this.playbackContext.resume().catch(() => {});
} }
this.playbackTime = this.playbackContext.currentTime; this.playbackTime = this.playbackContext.currentTime;
}
// 并行: 同时预获取麦克风和建立WS连接节省500ms+ // Microphone: only needed in voice mode
const micPromise = navigator.mediaDevices.getUserMedia({ let micPromise = Promise.resolve(null);
if (clientMode !== 'text') {
micPromise = navigator.mediaDevices.getUserMedia({
audio: { audio: {
channelCount: 1, channelCount: 1,
noiseSuppression: true, noiseSuppression: true,
@@ -103,6 +125,7 @@ class NativeVoiceService {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message); console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
return null; return null;
}); });
}
const CONNECTION_TIMEOUT_MS = 12000; const CONNECTION_TIMEOUT_MS = 12000;
@@ -137,6 +160,9 @@ class NativeVoiceService {
modelVersion, modelVersion,
speaker, speaker,
greetingText, greetingText,
clientMode,
playAudioReply,
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
})); }));
}; };
@@ -173,11 +199,20 @@ class NativeVoiceService {
}; };
}); });
// 使用预获取的mediaStream已并行获取避免重复申请 // 文字模式不启动麦克风
if (clientMode === 'text') {
return;
}
const preFetchedStream = await micPromise; const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream); await this.startCapture(preFetchedStream);
} }
sendText(text) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
}
}
handleJsonMessage(raw) { handleJsonMessage(raw) {
try { try {
const msg = JSON.parse(raw); const msg = JSON.parse(raw);
@@ -232,6 +267,10 @@ class NativeVoiceService {
} }
handleAudioMessage(arrayBuffer) { handleAudioMessage(arrayBuffer) {
// Text mode without playAudioReply: drop S2S audio silently
if (this.clientMode === 'text' && !this.playAudioReply) {
return;
}
if (!this.playbackContext) { if (!this.playbackContext) {
return; return;
} }
@@ -424,3 +463,4 @@ class NativeVoiceService {
const nativeVoiceService = new NativeVoiceService(); const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService; export default nativeVoiceService;
export { NativeVoiceService };

View File

@@ -117,6 +117,10 @@ public class VoiceGatewayService {
} }
public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId) { public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId) {
afterConnectionEstablished(clientSession, sessionId, userId, false);
}
public void afterConnectionEstablished(WebSocketSession clientSession, String sessionId, String userId, boolean textMode) {
if (!properties.isEnabled()) { if (!properties.isEnabled()) {
closeClient(clientSession, CloseStatus.NOT_ACCEPTABLE.withReason("voice gateway disabled")); closeClient(clientSession, CloseStatus.NOT_ACCEPTABLE.withReason("voice gateway disabled"));
return; return;
@@ -126,12 +130,17 @@ public class VoiceGatewayService {
return; return;
} }
VoiceSessionState state = new VoiceSessionState(clientSession.getId(), sessionId.trim(), clientSession, normalizeNullable(userId)); VoiceSessionState state = new VoiceSessionState(clientSession.getId(), sessionId.trim(), clientSession, normalizeNullable(userId));
state.textMode = textMode;
if (textMode) {
state.disableGreeting = true;
state.playAudioReply = false;
}
sessions.put(clientSession.getId(), state); sessions.put(clientSession.getId(), state);
log.info("[VoiceGateway] client WS connected session={} wsId={} remote={}", state.sessionId, clientSession.getId(), log.info("[VoiceGateway] client WS connected session={} wsId={} mode={} remote={}",
clientSession.getRemoteAddress()); state.sessionId, clientSession.getId(), textMode ? "text" : "voice", clientSession.getRemoteAddress());
chatRepository.createSession(state.sessionId, state.userId, "voice"); chatRepository.createSession(state.sessionId, state.userId, textMode ? "text" : "voice");
resetIdleTimer(state); resetIdleTimer(state);
sendJson(state, Map.of("type", "connected", "sessionId", state.sessionId)); sendJson(state, Map.of("type", "connected", "sessionId", state.sessionId, "mode", textMode ? "text" : "voice"));
} }
public void afterConnectionClosed(WebSocketSession clientSession) { public void afterConnectionClosed(WebSocketSession clientSession) {
@@ -200,6 +209,10 @@ public class VoiceGatewayService {
if (state == null || payload == null || payload.length == 0) { if (state == null || payload == null || payload.length == 0) {
return; return;
} }
if (state.textMode) {
// Text mode: reject client audio frames entirely
return;
}
WebSocket upstream = state.upstream; WebSocket upstream = state.upstream;
if (upstream == null || !state.upstreamReady) { if (upstream == null || !state.upstreamReady) {
return; return;
@@ -241,6 +254,17 @@ public class VoiceGatewayService {
private void handleStart(VoiceSessionState state, JsonNode node) { private void handleStart(VoiceSessionState state, JsonNode node) {
state.userId = firstNonBlank(textValue(node.path("userId")), state.userId); state.userId = firstNonBlank(textValue(node.path("userId")), state.userId);
// Text mode optional override from start message (text mode may choose to play audio reply)
if (state.textMode) {
JsonNode playNode = node.path("playAudioReply");
if (!playNode.isMissingNode() && !playNode.isNull()) {
state.playAudioReply = playNode.asBoolean(false);
}
JsonNode greetNode = node.path("disableGreeting");
if (!greetNode.isMissingNode() && !greetNode.isNull()) {
state.disableGreeting = greetNode.asBoolean(true);
}
}
AssistantProfileResult profileResult = assistantProfileService.getAssistantProfile(state.userId, false); AssistantProfileResult profileResult = assistantProfileService.getAssistantProfile(state.userId, false);
state.assistantProfile = voiceAssistantProfileSupport.resolve(profileResult.profile()); state.assistantProfile = voiceAssistantProfileSupport.resolve(profileResult.profile());
state.botName = firstNonBlank(textValue(node.path("botName")), voiceAssistantProfileSupport.getDisplayName(state.assistantProfile), "大沃"); state.botName = firstNonBlank(textValue(node.path("botName")), voiceAssistantProfileSupport.getDisplayName(state.assistantProfile), "大沃");
@@ -271,6 +295,23 @@ public class VoiceGatewayService {
if (!persistUserSpeech(state, cleanText)) { if (!persistUserSpeech(state, cleanText)) {
return; return;
} }
if (state.textMode) {
// Text mode: send directly to S2S via event 501 (ChatTextQuery)
// S2S will internally invoke LLM / tool_calls (event 502 external_rag) and stream back
state.blockUpstreamAudio = !state.playAudioReply; // block audio if user doesn't want playback
state.currentTtsType = "default";
state.awaitingUpstreamReply = true;
state.pendingAssistantSource = "voice_bot";
state.pendingAssistantToolName = null;
state.pendingAssistantMeta = null;
state.pendingAssistantTurnSeq = state.latestUserTurnSeq;
state.turnCount++;
state.clearAssistantBuffer();
sendUpstreamChatTextQuery(state, cleanText);
sendJson(state, Map.of("type", "assistant_pending", "active", Boolean.TRUE));
log.info("[VoiceGateway][text-mode] sent ChatTextQuery session={} len={}", state.sessionId, cleanText.length());
return;
}
sendJson(state, Map.of("type", "tts_reset", "reason", "new_turn")); sendJson(state, Map.of("type", "tts_reset", "reason", "new_turn"));
state.blockUpstreamAudio = true; state.blockUpstreamAudio = true;
state.currentTtsType = "default"; state.currentTtsType = "default";
@@ -278,6 +319,16 @@ public class VoiceGatewayService {
processReplyAsync(state, cleanText, state.latestUserTurnSeq); processReplyAsync(state, cleanText, state.latestUserTurnSeq);
} }
private void sendUpstreamChatTextQuery(VoiceSessionState state, String text) {
if (state.upstream == null || !state.upstreamReady) {
log.warn("[VoiceGateway][text-mode] upstream not ready, drop text session={}", state.sessionId);
sendJson(state, Map.of("type", "error", "error", "语音服务尚未就绪,请稍后重试"));
return;
}
byte[] msg = VolcRealtimeProtocol.createChatTextQueryMessage(state.sessionId, text, objectMapper);
sendUpstreamBinary(state, msg);
}
private void connectUpstream(VoiceSessionState state) { private void connectUpstream(VoiceSessionState state) {
if (!properties.isConfigured()) { if (!properties.isConfigured()) {
sendJson(state, Map.of("type", "error", "error", "VOLC_S2S_APP_ID 或 VOLC_S2S_TOKEN 未配置")); sendJson(state, Map.of("type", "error", "error", "VOLC_S2S_APP_ID 或 VOLC_S2S_TOKEN 未配置"));
@@ -323,7 +374,8 @@ public class VoiceGatewayService {
dialog.put("bot_name", state.botName); dialog.put("bot_name", state.botName);
dialog.put("system_role", voiceAssistantProfileSupport.normalizeTextForSpeech(ANTI_THINKING_PREFIX + " " + state.systemRole)); dialog.put("system_role", voiceAssistantProfileSupport.normalizeTextForSpeech(ANTI_THINKING_PREFIX + " " + state.systemRole));
dialog.put("speaking_style", voiceAssistantProfileSupport.normalizeTextForSpeech(state.speakingStyle)); dialog.put("speaking_style", voiceAssistantProfileSupport.normalizeTextForSpeech(state.speakingStyle));
dialog.put("extra", Map.of("input_mod", "audio", "model", state.modelVersion, "strict_audit", false, "audit_response", "抱歉,这个问题我暂时无法回答。")); String inputMod = state.textMode ? "text" : "audio";
dialog.put("extra", Map.of("input_mod", inputMod, "model", state.modelVersion, "strict_audit", false, "audit_response", "抱歉,这个问题我暂时无法回答。"));
Map<String, Object> payload = new LinkedHashMap<>(); Map<String, Object> payload = new LinkedHashMap<>();
payload.put("asr", asr); payload.put("asr", asr);
payload.put("tts", tts); payload.put("tts", tts);
@@ -351,6 +403,10 @@ public class VoiceGatewayService {
return; return;
} }
if (frame.type() == VolcRealtimeProtocol.TYPE_AUDIO_ONLY_SERVER) { if (frame.type() == VolcRealtimeProtocol.TYPE_AUDIO_ONLY_SERVER) {
// Text mode: drop all S2S audio if client doesn't want playback
if (state.textMode && !state.playAudioReply) {
return;
}
boolean isDefaultTts = !StringUtils.hasText(state.currentTtsType) || "default".equals(state.currentTtsType); boolean isDefaultTts = !StringUtils.hasText(state.currentTtsType) || "default".equals(state.currentTtsType);
boolean isSuppressing = state.suppressUpstreamUntil > System.currentTimeMillis() && isDefaultTts; boolean isSuppressing = state.suppressUpstreamUntil > System.currentTimeMillis() && isDefaultTts;
boolean isUserJustSpeaking = isDefaultTts && state.lastPartialAt > 0 && (System.currentTimeMillis() - state.lastPartialAt < 800); boolean isUserJustSpeaking = isDefaultTts && state.lastPartialAt > 0 && (System.currentTimeMillis() - state.lastPartialAt < 800);
@@ -1016,6 +1072,11 @@ public class VoiceGatewayService {
} }
private void sendGreeting(VoiceSessionState state) { private void sendGreeting(VoiceSessionState state) {
if (state.disableGreeting) {
state.hasSentGreeting = true;
sendReady(state);
return;
}
if (state.hasSentGreeting || !StringUtils.hasText(state.greetingText)) { if (state.hasSentGreeting || !StringUtils.hasText(state.greetingText)) {
sendReady(state); sendReady(state);
return; return;
@@ -1137,6 +1198,10 @@ public class VoiceGatewayService {
private void startAudioKeepalive(VoiceSessionState state) { private void startAudioKeepalive(VoiceSessionState state) {
cancelFuture(state.keepaliveFuture); cancelFuture(state.keepaliveFuture);
// Text mode: skip audio keepalive entirely (no audio channel)
if (state.textMode) {
return;
}
long interval = Math.max(properties.getAudioKeepaliveIntervalMs(), 5000L); long interval = Math.max(properties.getAudioKeepaliveIntervalMs(), 5000L);
state.keepaliveFuture = scheduler.scheduleAtFixedRate(() -> { state.keepaliveFuture = scheduler.scheduleAtFixedRate(() -> {
WebSocket upstream = state.upstream; WebSocket upstream = state.upstream;

View File

@@ -114,6 +114,11 @@ final class VoiceSessionState {
// Reply plan: evidence text (raw KB content, never used for subtitle/persistence) // Reply plan: evidence text (raw KB content, never used for subtitle/persistence)
volatile String ragEvidenceText = ""; volatile String ragEvidenceText = "";
// Text mode (S2S input_mod=text via /ws/realtime-text)
volatile boolean textMode;
volatile boolean playAudioReply;
volatile boolean disableGreeting;
VoiceSessionState(String clientConnectionId, String sessionId, WebSocketSession clientSession, String userId) { VoiceSessionState(String clientConnectionId, String sessionId, WebSocketSession clientSession, String userId) {
this.clientConnectionId = clientConnectionId; this.clientConnectionId = clientConnectionId;
this.sessionId = sessionId; this.sessionId = sessionId;

View File

@@ -17,6 +17,7 @@ public class VoiceWebSocketConfig implements WebSocketConfigurer {
@Override @Override
public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) { public void registerWebSocketHandlers(WebSocketHandlerRegistry registry) {
registry.addHandler(voiceWebSocketHandler, "/ws/realtime-dialog").setAllowedOriginPatterns("*"); registry.addHandler(voiceWebSocketHandler, "/ws/realtime-dialog", "/ws/realtime-text")
.setAllowedOriginPatterns("*");
} }
} }

View File

@@ -22,7 +22,9 @@ public class VoiceWebSocketHandler extends BinaryWebSocketHandler {
public void afterConnectionEstablished(WebSocketSession session) { public void afterConnectionEstablished(WebSocketSession session) {
URI uri = session.getUri(); URI uri = session.getUri();
var queryParams = UriComponentsBuilder.fromUri(uri == null ? URI.create("/") : uri).build(true).getQueryParams(); var queryParams = UriComponentsBuilder.fromUri(uri == null ? URI.create("/") : uri).build(true).getQueryParams();
voiceGatewayService.afterConnectionEstablished(session, queryParams.getFirst("sessionId"), queryParams.getFirst("userId")); String path = uri == null ? "" : uri.getPath();
boolean textMode = path != null && path.contains("realtime-text");
voiceGatewayService.afterConnectionEstablished(session, queryParams.getFirst("sessionId"), queryParams.getFirst("userId"), textMode);
} }
@Override @Override

View File

@@ -51,6 +51,17 @@ public final class VolcRealtimeProtocol {
return marshal(TYPE_FULL_CLIENT, MSG_TYPE_FLAG_WITH_EVENT, 500, sessionId, writeJsonBytes(payload, objectMapper), false); return marshal(TYPE_FULL_CLIENT, MSG_TYPE_FLAG_WITH_EVENT, 500, sessionId, writeJsonBytes(payload, objectMapper), false);
} }
public static byte[] createChatTextQueryMessage(String sessionId, String content, ObjectMapper objectMapper) {
return marshal(
TYPE_FULL_CLIENT,
MSG_TYPE_FLAG_WITH_EVENT,
501,
sessionId,
writeJsonBytes(Map.of("content", content == null ? "" : content), objectMapper),
false
);
}
public static byte[] createChatRagTextMessage(String sessionId, String externalRag, ObjectMapper objectMapper) { public static byte[] createChatRagTextMessage(String sessionId, String externalRag, ObjectMapper objectMapper) {
return marshal( return marshal(
TYPE_FULL_CLIENT, TYPE_FULL_CLIENT,

View File

@@ -2,8 +2,9 @@ import { useState, useRef, useEffect, useCallback } from 'react';
import { Send, Bot, User, Loader2, ArrowLeft, Sparkles, Wrench, StopCircle } from 'lucide-react'; import { Send, Bot, User, Loader2, ArrowLeft, Sparkles, Wrench, StopCircle } from 'lucide-react';
import { startChatSession, sendMessageStream } from '../services/chatApi'; import { startChatSession, sendMessageStream } from '../services/chatApi';
import { getSessionHistory } from '../services/voiceApi'; import { getSessionHistory } from '../services/voiceApi';
import { NativeVoiceService } from '../services/nativeVoiceService';
export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange }) { export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack, onMessagesChange, useS2S = false, playAudioReply = false }) {
const [messages, setMessages] = useState([]); const [messages, setMessages] = useState([]);
const [input, setInput] = useState(''); const [input, setInput] = useState('');
const [isLoading, setIsLoading] = useState(false); const [isLoading, setIsLoading] = useState(false);
@@ -14,16 +15,92 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
const scrollRef = useRef(null); const scrollRef = useRef(null);
const inputRef = useRef(null); const inputRef = useRef(null);
const abortRef = useRef(null); const abortRef = useRef(null);
const s2sServiceRef = useRef(null);
const s2sStreamingIdRef = useRef(null);
// S2S text mode: establish WebSocket to /ws/realtime-text
useEffect(() => {
if (!useS2S || !sessionId) {
return undefined;
}
const svc = new NativeVoiceService();
s2sServiceRef.current = svc;
svc.on('onSubtitle', (data) => {
if (!data || !data.role) return;
if (data.role === 'user') {
// User subtitle is just an echo of what we already inserted; skip
return;
}
// assistant subtitle: streaming chunks (isFinal=false) or final (isFinal=true)
const assistantId = s2sStreamingIdRef.current;
if (!assistantId) return;
setMessages((prev) => prev.map((m) => (
m.id === assistantId
? { ...m, content: data.text || '', streaming: !data.isFinal }
: m
)));
if (data.isFinal) {
setIsLoading(false);
setStreamingId(null);
s2sStreamingIdRef.current = null;
inputRef.current?.focus();
}
});
svc.on('onAssistantPending', (active) => {
setIsLoading(!!active);
});
svc.on('onError', (err) => {
setError(err?.message || 'S2S 文字模式错误');
setIsLoading(false);
setStreamingId(null);
s2sStreamingIdRef.current = null;
});
svc.on('onIdleTimeout', () => {
setError('S2S 连接超时,已断开。请刷新页面重连');
setIsInitialized(false);
});
svc.on('onConnectionStateChange', (state) => {
if (state === 'connected') {
// wait for onReady (handled via promise in connect)
} else if (state === 'disconnected' || state === 'error') {
setIsInitialized(false);
}
});
(async () => {
try {
await svc.connect({
sessionId,
userId: settings?.userId || '',
botName: settings?.botName || '大沃',
speaker: settings?.speaker || 'zh_female_vv_jupiter_bigtts',
modelVersion: settings?.modelVersion || 'O',
clientMode: 'text',
playAudioReply: !!playAudioReply,
disableGreeting: true,
});
setIsInitialized(true);
} catch (e) {
setError(`S2S 连接失败:${e?.message || e}`);
}
})();
return () => {
svc.disconnect().catch(() => {});
s2sServiceRef.current = null;
s2sStreamingIdRef.current = null;
};
}, [useS2S, sessionId, settings?.userId, settings?.botName, settings?.speaker, settings?.modelVersion, playAudioReply]);
// 初始化:创建聊天会话,优先从数据库加载完整历史 // 初始化:创建聊天会话,优先从数据库加载完整历史
useEffect(() => { useEffect(() => {
if (useS2S) {
// S2S mode handles init in its own effect
return;
}
async function init() { async function init() {
try { // 1. 从数据库加载历史(独立于 Coze 会话,不受其失败影响)
// 启动后端聊天会话(后端会从 DB 加载历史注入 Coze 上下文)
await startChatSession(sessionId, voiceSubtitles);
setIsInitialized(true);
// 从数据库加载完整对话历史(包含语音通话中的工具结果)
let historyMsgs = []; let historyMsgs = [];
try { try {
const historyData = await getSessionHistory(sessionId, 20); const historyData = await getSessionHistory(sessionId, 20);
@@ -54,11 +131,27 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
setMessages(historyMsgs); setMessages(historyMsgs);
} }
inputRef.current?.focus(); // 2. 启动后端聊天会话自动重试3次间隔2秒
} catch (err) { let initOk = false;
console.error('[ChatPanel] Init failed:', err); for (let attempt = 0; attempt < 3 && !initOk; attempt++) {
setError('聊天会话初始化失败'); try {
if (attempt > 0) {
console.log(`[ChatPanel] Retrying init (attempt ${attempt + 1}/3)...`);
await new Promise(r => setTimeout(r, 2000));
} }
await startChatSession(sessionId, voiceSubtitles);
initOk = true;
setIsInitialized(true);
setError(null);
} catch (err) {
console.error(`[ChatPanel] Init attempt ${attempt + 1} failed:`, err.message);
if (attempt === 2) {
setError('聊天会话初始化失败,点击重试');
}
}
}
inputRef.current?.focus();
} }
init(); init();
}, [sessionId, voiceSubtitles]); }, [sessionId, voiceSubtitles]);
@@ -108,6 +201,20 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
// 先插入一个空的 assistant 消息用于流式填充 // 先插入一个空的 assistant 消息用于流式填充
setMessages((prev) => [...prev, { id: assistantId, role: 'assistant', content: '', streaming: true }]); setMessages((prev) => [...prev, { id: assistantId, role: 'assistant', content: '', streaming: true }]);
// S2S text mode: send via WebSocket, subtitle/pending events drive UI
if (useS2S) {
const svc = s2sServiceRef.current;
if (!svc) {
setError('S2S 服务未就绪');
setIsLoading(false);
setStreamingId(null);
return;
}
s2sStreamingIdRef.current = assistantId;
svc.sendText(text);
return;
}
const abort = sendMessageStream(sessionId, text, { const abort = sendMessageStream(sessionId, text, {
onChunk: (chunk) => { onChunk: (chunk) => {
setMessages((prev) => setMessages((prev) =>
@@ -117,6 +224,12 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
onToolCall: (tools) => { onToolCall: (tools) => {
setToolsInUse(tools); setToolsInUse(tools);
}, },
onStreamReset: () => {
// 内容安全拦截:清空已累积的流式文本,等待 done 事件的安全回复
setMessages((prev) =>
prev.map((m) => (m.id === assistantId ? { ...m, content: '' } : m))
);
},
onDone: (fullContent) => { onDone: (fullContent) => {
setMessages((prev) => setMessages((prev) =>
prev.map((m) => (m.id === assistantId ? { ...m, content: fullContent, streaming: false } : m)) prev.map((m) => (m.id === assistantId ? { ...m, content: fullContent, streaming: false } : m))
@@ -137,7 +250,7 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
}); });
abortRef.current = abort; abortRef.current = abort;
}, [input, isLoading, sessionId]); }, [input, isLoading, sessionId, useS2S]);
const handleKeyDown = (e) => { const handleKeyDown = (e) => {
if (e.key === 'Enter' && !e.shiftKey) { if (e.key === 'Enter' && !e.shiftKey) {
@@ -164,7 +277,9 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
</div> </div>
<div> <div>
<h3 className="text-sm font-semibold text-white leading-tight">{settings.botName}</h3> <h3 className="text-sm font-semibold text-white leading-tight">{settings.botName}</h3>
<p className="text-[10px] text-slate-500 leading-tight">文字对话模式 · 方舟 LLM</p> <p className="text-[10px] text-slate-500 leading-tight">
{useS2S ? '文字对话模式 · S2S' : '文字对话模式 · 方舟 LLM'}
</p>
</div> </div>
</div> </div>
</div> </div>
@@ -262,6 +377,17 @@ export default function ChatPanel({ sessionId, voiceSubtitles, settings, onBack,
{error && ( {error && (
<div className="mx-4 mb-2 px-3 py-1.5 rounded-lg bg-red-500/10 border border-red-500/20 text-red-400 text-xs"> <div className="mx-4 mb-2 px-3 py-1.5 rounded-lg bg-red-500/10 border border-red-500/20 text-red-400 text-xs">
{error} {error}
{!isInitialized && (
<button onClick={async () => {
setError(null);
try {
await startChatSession(sessionId, voiceSubtitles);
setIsInitialized(true);
} catch (e) {
setError('重试失败,请检查网络后再试');
}
}} className="ml-2 underline hover:text-red-300 font-medium">重试</button>
)}
<button onClick={() => setError(null)} className="ml-2 underline hover:text-red-300">关闭</button> <button onClick={() => setError(null)} className="ml-2 underline hover:text-red-300">关闭</button>
</div> </div>
)} )}

View File

@@ -25,7 +25,7 @@ class NativeVoiceService {
}; };
} }
resolveWebSocketUrl(sessionId, userId) { resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
const query = new URLSearchParams({ const query = new URLSearchParams({
sessionId, sessionId,
userId: userId || '', userId: userId || '',
@@ -43,16 +43,16 @@ class NativeVoiceService {
} else if (base.endsWith('/api')) { } else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length); base = base.slice(0, -'/api'.length);
} }
return `${base}/ws/realtime-dialog?${query.toString()}`; return `${base}${wsPath}?${query.toString()}`;
} }
const hostname = window.location.hostname; const hostname = window.location.hostname;
const port = window.location.port; const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1'; const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3012') { if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3012') {
return `ws://${hostname || '127.0.0.1'}:3012/ws/realtime-dialog?${query.toString()}`; return `ws://${hostname || '127.0.0.1'}:3012${wsPath}?${query.toString()}`;
} }
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`; return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
} }
emitConnectionState(state) { emitConnectionState(state) {
@@ -80,16 +80,39 @@ class NativeVoiceService {
} }
} }
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) { async connect({
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
clientMode = 'voice',
playAudioReply = false,
disableGreeting = false,
} = {}) {
await this.disconnect(); await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId); this.clientMode = clientMode;
this.playAudioReply = playAudioReply;
const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
this.emitConnectionState('connecting'); this.emitConnectionState('connecting');
// Audio playback context: only needed if we will receive audio (voice mode or text+playAudio)
const needsPlayback = clientMode !== 'text' || playAudioReply;
if (needsPlayback) {
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)(); this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') { if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {}); await this.playbackContext.resume().catch(() => {});
} }
this.playbackTime = this.playbackContext.currentTime; this.playbackTime = this.playbackContext.currentTime;
}
// Microphone capture: only needed in voice mode
let micPromise = Promise.resolve(null);
if (clientMode !== 'text') {
// 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost // 安全上下文检查: getUserMedia 需要 HTTPS 或 localhost
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
const errMsg = window.isSecureContext === false const errMsg = window.isSecureContext === false
@@ -98,9 +121,8 @@ class NativeVoiceService {
this.emitConnectionState('error', errMsg); this.emitConnectionState('error', errMsg);
throw new Error(errMsg); throw new Error(errMsg);
} }
// 并行: 同时预获取麦克风和建立WS连接节省500ms+ // 并行: 同时预获取麦克风和建立WS连接节省500ms+
const micPromise = navigator.mediaDevices.getUserMedia({ micPromise = navigator.mediaDevices.getUserMedia({
audio: { audio: {
channelCount: 1, channelCount: 1,
noiseSuppression: true, noiseSuppression: true,
@@ -117,6 +139,7 @@ class NativeVoiceService {
} }
return null; return null;
}); });
}
const CONNECTION_TIMEOUT_MS = 12000; const CONNECTION_TIMEOUT_MS = 12000;
@@ -151,6 +174,9 @@ class NativeVoiceService {
modelVersion, modelVersion,
speaker, speaker,
greetingText, greetingText,
clientMode,
playAudioReply,
disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
})); }));
}; };
@@ -187,11 +213,21 @@ class NativeVoiceService {
}; };
}); });
// 文字模式:不启动麦克风采集
if (clientMode === 'text') {
return;
}
// 使用预获取的mediaStream已并行获取避免重复申请 // 使用预获取的mediaStream已并行获取避免重复申请
const preFetchedStream = await micPromise; const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream); await this.startCapture(preFetchedStream);
} }
sendText(text) {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
}
}
handleJsonMessage(raw) { handleJsonMessage(raw) {
try { try {
const msg = JSON.parse(raw); const msg = JSON.parse(raw);
@@ -246,6 +282,10 @@ class NativeVoiceService {
} }
handleAudioMessage(arrayBuffer) { handleAudioMessage(arrayBuffer) {
// Text mode without playAudioReply: drop all incoming audio silently
if (this.clientMode === 'text' && !this.playAudioReply) {
return;
}
if (!this.playbackContext) { if (!this.playbackContext) {
return; return;
} }
@@ -441,3 +481,4 @@ class NativeVoiceService {
const nativeVoiceService = new NativeVoiceService(); const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService; export default nativeVoiceService;
export { NativeVoiceService };