/**
 * NativeVoiceService
 *
 * Browser-side client for a realtime voice/text dialog backend. It manages:
 *  - one WebSocket carrying JSON control messages plus binary PCM16 audio,
 *  - microphone capture (ScriptProcessorNode, downsampled to 16 kHz PCM16,
 *    sent upstream in 320-sample frames),
 *  - gapless scheduled playback of 24 kHz PCM16 reply audio via Web Audio.
 *
 * Consumers register listeners with `on(event, callback)` for the events
 * declared in `this.callbacks` (onSubtitle, onConnectionStateChange,
 * onError, onAssistantPending, onDiagnostic, onIdleTimeout, onProductLink).
 */
class NativeVoiceService {
  constructor() {
    // WebSocket + microphone capture pipeline handles.
    this.ws = null;
    this.mediaStream = null;
    this.captureContext = null;
    this.captureSource = null;
    this.captureProcessor = null;
    this.captureSilenceGain = null;

    // Playback pipeline state.
    this.playbackContext = null;
    this.playbackTime = 0;          // next scheduled start time (playback context clock)
    this.activeSources = new Set(); // buffer sources currently scheduled or playing

    // Sample/chunk buffers.
    this.pendingSamples = [];       // mic samples awaiting 320-sample framing
    this.pendingAudioChunks = [];   // reply audio queued while the context is suspended
    this._resuming = false;         // guards concurrent _tryResumePlayback() calls

    // Fix: initialize mode flags read by handleAudioMessage() so they have
    // sane values even before the first connect() call.
    this.clientMode = 'voice';
    this.playAudioReply = false;

    // Resolve/reject hooks for the "ready" promise created in connect().
    this.readyResolver = null;
    this.readyRejector = null;

    this.callbacks = {
      onSubtitle: null,
      onConnectionStateChange: null,
      onError: null,
      onAssistantPending: null,
      onDiagnostic: null,
      onIdleTimeout: null,
      onProductLink: null,
    };
  }

  /**
   * Build the WebSocket URL for the dialog endpoint.
   *
   * Resolution order:
   *  1. An absolute VITE_VOICE_WS_BASE_URL / VITE_VOICE_API_BASE_URL base
   *     (http(s) scheme rewritten to ws(s), REST `/api[/voice]` suffix stripped
   *     because the WS endpoint is mounted at the server root).
   *  2. Local development (file: protocol or localhost not already on 3013):
   *     hard-coded dev server on port 3013.
   *  3. Same-origin, with the protocol matched to the page (wss for https).
   *
   * @param {string} sessionId
   * @param {string|undefined} userId - empty string is sent when absent
   * @param {string} [wsPath='/ws/realtime-dialog']
   * @returns {string} fully-qualified ws:// or wss:// URL with query string
   */
  resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
    const query = new URLSearchParams({ sessionId, userId: userId || '' });
    const configuredBase =
      import.meta.env.VITE_VOICE_WS_BASE_URL ||
      import.meta.env.VITE_VOICE_API_BASE_URL ||
      '';
    if (configuredBase && !configuredBase.startsWith('/')) {
      let base = configuredBase.replace(/\/$/, '');
      if (base.startsWith('https://')) {
        base = `wss://${base.slice('https://'.length)}`;
      } else if (base.startsWith('http://')) {
        base = `ws://${base.slice('http://'.length)}`;
      }
      if (base.endsWith('/api/voice')) {
        base = base.slice(0, -'/api/voice'.length);
      } else if (base.endsWith('/api')) {
        base = base.slice(0, -'/api'.length);
      }
      return `${base}${wsPath}?${query.toString()}`;
    }
    const hostname = window.location.hostname;
    const port = window.location.port;
    const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
    if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
      return `ws://${hostname || '127.0.0.1'}:3013${wsPath}?${query.toString()}`;
    }
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    return `${protocol}//${window.location.host}${wsPath}?${query.toString()}`;
  }

  /** Notify the registered listener (if any) of a connection state change. */
  emitConnectionState(state) {
    this.callbacks.onConnectionStateChange?.(state);
  }

  /** Emit a timestamped diagnostic event to the registered listener (if any). */
  emitDiagnostic(type, payload) {
    this.callbacks.onDiagnostic?.({ type, payload, timestamp: Date.now() });
  }

  /**
   * Stop and drop every scheduled playback source (e.g. on barge-in / tts_reset),
   * then rebase the scheduling clock slightly ahead of "now".
   */
  resetPlaybackQueue() {
    this.activeSources.forEach((source) => {
      try { source.stop(); } catch (_) {}
      try { source.disconnect(); } catch (_) {}
    });
    this.activeSources.clear();
    if (this.playbackContext) {
      this.playbackTime = this.playbackContext.currentTime + 0.02;
    } else {
      this.playbackTime = 0;
    }
  }

  /**
   * Open a dialog session. Tears down any previous session first, then:
   *  - creates the playback AudioContext when reply audio is expected,
   *  - pre-fetches the microphone (voice mode) so the permission prompt
   *    overlaps the WebSocket handshake,
   *  - connects the WebSocket, sends the `start` message, and waits for the
   *    server `ready` message (with a 12 s forced-ready timeout fallback),
   *  - starts microphone capture (voice mode only).
   *
   * @param {object} [options] - session parameters forwarded in the `start`
   *   message; `clientMode` is 'voice' (default) or 'text'; `playAudioReply`
   *   enables reply audio in text mode; in text mode the greeting is disabled
   *   unless `disableGreeting` is explicitly `false`.
   * @throws {Error} when the WebSocket fails or closes before `ready`.
   */
  async connect({
    sessionId,
    userId,
    botName,
    systemRole,
    speakingStyle,
    modelVersion,
    speaker,
    greetingText,
    clientMode = 'voice',
    playAudioReply = false,
    disableGreeting = false,
  } = {}) {
    await this.disconnect();
    this.clientMode = clientMode;
    this.playAudioReply = playAudioReply;
    const wsPath = clientMode === 'text' ? '/ws/realtime-text' : '/ws/realtime-dialog';
    const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
    this.emitConnectionState('connecting');

    // Audio playback context: only needed if we will receive audio.
    const needsPlayback = clientMode !== 'text' || playAudioReply;
    if (needsPlayback) {
      this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
      if (this.playbackContext.state === 'suspended') {
        await this.playbackContext.resume().catch(() => {});
      }
      this.playbackTime = this.playbackContext.currentTime;
    }

    // Microphone: only needed in voice mode. Failure is tolerated here;
    // startCapture() will retry getUserMedia itself.
    let micPromise = Promise.resolve(null);
    if (clientMode !== 'text') {
      micPromise = navigator.mediaDevices.getUserMedia({
        audio: {
          channelCount: 1,
          noiseSuppression: true,
          echoCancellation: true,
          autoGainControl: true,
        },
        video: false,
      }).catch((err) => {
        console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
        return null;
      });
    }

    const CONNECTION_TIMEOUT_MS = 12000;
    await new Promise((resolve, reject) => {
      this.readyResolver = resolve;
      this.readyRejector = reject;
      const ws = new WebSocket(wsUrl);
      ws.binaryType = 'arraybuffer';
      this.ws = ws;
      // Timeout fallback: never wait forever for the server `ready` message.
      const timeoutId = setTimeout(() => {
        if (this.readyResolver) {
          console.warn(`[NativeVoice] Connection timeout (${CONNECTION_TIMEOUT_MS}ms), forcing ready`);
          this.readyResolver();
          this.readyResolver = null;
          this.readyRejector = null;
        }
      }, CONNECTION_TIMEOUT_MS);
      const clearTimeoutOnSettle = () => clearTimeout(timeoutId);
      ws.onopen = () => {
        this.emitConnectionState('connected');
        ws.send(JSON.stringify({
          type: 'start',
          sessionId,
          userId,
          botName,
          systemRole,
          speakingStyle,
          modelVersion,
          speaker,
          greetingText,
          clientMode,
          playAudioReply,
          // Text mode suppresses the greeting unless explicitly re-enabled.
          disableGreeting: clientMode === 'text' ? (disableGreeting !== false) : disableGreeting,
        }));
      };
      ws.onerror = () => {
        clearTimeoutOnSettle();
        const error = new Error('WebSocket connection failed');
        this.callbacks.onError?.(error);
        // Fix: readyRejector IS this promise's reject — the original also
        // called reject(error) again afterwards, a redundant double-settle.
        this.readyRejector?.(error);
        this.readyResolver = null;
        this.readyRejector = null;
      };
      ws.onclose = () => {
        clearTimeoutOnSettle();
        this.emitConnectionState('disconnected');
        if (this.readyRejector) {
          this.readyRejector(new Error('WebSocket closed before ready'));
          this.readyResolver = null;
          this.readyRejector = null;
        }
      };
      ws.onmessage = (event) => {
        if (typeof event.data === 'string') {
          const peek = event.data;
          // Cheap pre-parse peek so the timeout is cleared even if JSON
          // handling below changes; handleJsonMessage resolves the promise.
          if (peek.includes('"ready"')) {
            clearTimeoutOnSettle();
          }
          this.handleJsonMessage(peek);
          return;
        }
        this.handleAudioMessage(event.data);
      };
    });

    // Text mode never starts the microphone.
    if (clientMode === 'text') {
      return;
    }
    const preFetchedStream = await micPromise;
    await this.startCapture(preFetchedStream);
  }

  /** Send a user text message; silently dropped when the socket is not open. */
  sendText(text) {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
    }
  }

  /**
   * Dispatch one JSON control message from the server to the matching
   * callback. Unknown message types are surfaced as diagnostics; unparsable
   * payloads are surfaced as `ws_raw_text` diagnostics rather than thrown.
   */
  handleJsonMessage(raw) {
    try {
      const msg = JSON.parse(raw);
      if (msg.type === 'ready') {
        this.readyResolver?.();
        this.readyResolver = null;
        this.readyRejector = null;
        return;
      }
      if (msg.type === 'subtitle') {
        this.callbacks.onSubtitle?.({
          text: msg.text,
          role: msg.role,
          isFinal: !!msg.isFinal,
          sequence: msg.sequence,
        });
        return;
      }
      if (msg.type === 'tts_reset') {
        // Server-side barge-in: flush everything queued for playback.
        this.resetPlaybackQueue();
        this.emitDiagnostic('tts_reset', msg);
        return;
      }
      if (msg.type === 'assistant_pending') {
        this.callbacks.onAssistantPending?.(!!msg.active);
        return;
      }
      if (msg.type === 'idle_timeout') {
        this.callbacks.onIdleTimeout?.(msg.timeout || 300000);
        return;
      }
      if (msg.type === 'product_link') {
        this.callbacks.onProductLink?.({
          product: msg.product,
          link: msg.link,
          description: msg.description,
        });
        return;
      }
      if (msg.type === 'upstream_closed') {
        this.callbacks.onError?.(new Error('语音服务已断开,请重新开始通话'));
        return;
      }
      if (msg.type === 'error') {
        this.callbacks.onError?.(new Error(msg.error || 'native voice error'));
        return;
      }
      this.emitDiagnostic('ws_message', msg);
    } catch (error) {
      this.emitDiagnostic('ws_raw_text', raw);
    }
  }

  /**
   * Handle one binary PCM16 audio chunk from the server. Dropped in text mode
   * without playAudioReply; queued (and a resume attempted) while the playback
   * context is suspended by autoplay policy; otherwise scheduled immediately.
   */
  handleAudioMessage(arrayBuffer) {
    if (this.clientMode === 'text' && !this.playAudioReply) {
      return;
    }
    if (!this.playbackContext) {
      return;
    }
    if (this.playbackContext.state === 'suspended') {
      this.pendingAudioChunks.push(arrayBuffer);
      this._tryResumePlayback();
      return;
    }
    this._playPcm(arrayBuffer);
  }

  /**
   * Convert one PCM16 mono chunk (24 kHz) to float samples and schedule it
   * back-to-back after the previously queued chunk for gapless playback.
   */
  _playPcm(arrayBuffer) {
    try {
      const pcm16 = new Int16Array(arrayBuffer);
      if (!pcm16.length) {
        return;
      }
      const audioBuffer = this.playbackContext.createBuffer(1, pcm16.length, 24000);
      const channel = audioBuffer.getChannelData(0);
      for (let i = 0; i < pcm16.length; i += 1) {
        channel[i] = pcm16[i] / 32768;
      }
      const source = this.playbackContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.playbackContext.destination);
      this.activeSources.add(source);
      source.onended = () => {
        this.activeSources.delete(source);
        try { source.disconnect(); } catch (_) {}
      };
      const now = this.playbackContext.currentTime;
      // If the queue drained, restart slightly ahead of "now" to avoid a
      // start time in the past (which would clip the chunk's beginning).
      if (this.playbackTime < now) {
        this.playbackTime = now + 0.02;
      }
      source.start(this.playbackTime);
      this.playbackTime += audioBuffer.duration;
      this.emitDiagnostic('audio_chunk', { samples: pcm16.length, duration: audioBuffer.duration });
    } catch (err) {
      console.warn('[NativeVoice] playPcm failed:', err.message);
    }
  }

  /**
   * Resume a suspended playback context (requires a prior user gesture) and
   * flush any chunks queued meanwhile. `_resuming` prevents re-entry.
   */
  async _tryResumePlayback() {
    if (this._resuming) return;
    this._resuming = true;
    try {
      await this.playbackContext.resume();
      while (this.pendingAudioChunks.length > 0) {
        this._playPcm(this.pendingAudioChunks.shift());
      }
    } catch (e) {
      console.warn('[NativeVoice] resume failed:', e.message);
    } finally {
      this._resuming = false;
    }
  }

  /**
   * Start microphone capture and stream 16 kHz PCM16 frames (320 samples =
   * 20 ms) over the WebSocket. The processor output is routed through a
   * zero-gain node to the destination: ScriptProcessorNode only fires when
   * connected to an output, and the gain mutes local monitoring.
   *
   * @param {MediaStream|null} preFetchedStream - stream obtained during
   *   connect(); when null, getUserMedia is requested here (and may throw).
   */
  async startCapture(preFetchedStream) {
    this.mediaStream = preFetchedStream || await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        noiseSuppression: true,
        echoCancellation: true,
        autoGainControl: true,
      },
      video: false,
    });
    this.captureContext = new (window.AudioContext || window.webkitAudioContext)();
    this.captureSource = this.captureContext.createMediaStreamSource(this.mediaStream);
    this.captureProcessor = this.captureContext.createScriptProcessor(4096, 1, 1);
    this.captureSilenceGain = this.captureContext.createGain();
    this.captureSilenceGain.gain.value = 0;
    this.captureProcessor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0);
      const downsampled = this.downsampleBuffer(input, this.captureContext.sampleRate, 16000);
      for (let i = 0; i < downsampled.length; i += 1) {
        this.pendingSamples.push(downsampled[i]);
      }
      while (this.pendingSamples.length >= 320) {
        const chunk = this.pendingSamples.splice(0, 320);
        const pcm = new Int16Array(chunk.length);
        for (let i = 0; i < chunk.length; i += 1) {
          // Clamp, then scale asymmetrically: int16 range is -32768..32767.
          const sample = Math.max(-1, Math.min(1, chunk[i]));
          pcm[i] = sample < 0 ? sample * 32768 : sample * 32767;
        }
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
          this.ws.send(pcm.buffer);
        }
      }
    };
    this.captureSource.connect(this.captureProcessor);
    this.captureProcessor.connect(this.captureSilenceGain);
    this.captureSilenceGain.connect(this.captureContext.destination);
  }

  /**
   * Downsample a float sample buffer by averaging each input window
   * (a simple box filter). Returns the input copied to a plain Array when no
   * downsampling is needed (outputRate >= inputRate).
   *
   * @param {Float32Array|number[]} buffer
   * @param {number} inputRate - source sample rate in Hz
   * @param {number} outputRate - target sample rate in Hz
   * @returns {number[]} downsampled samples
   */
  downsampleBuffer(buffer, inputRate, outputRate) {
    if (outputRate >= inputRate) {
      return Array.from(buffer);
    }
    const sampleRateRatio = inputRate / outputRate;
    const newLength = Math.round(buffer.length / sampleRateRatio);
    const result = new Array(newLength);
    let offsetResult = 0;
    let offsetBuffer = 0;
    while (offsetResult < result.length) {
      const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
      let accum = 0;
      let count = 0;
      for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i += 1) {
        accum += buffer[i];
        count += 1;
      }
      result[offsetResult] = count > 0 ? accum / count : 0;
      offsetResult += 1;
      offsetBuffer = nextOffsetBuffer;
    }
    return result;
  }

  /** Mute/unmute the microphone by toggling the audio tracks' enabled flag. */
  async setMuted(muted) {
    this.mediaStream?.getAudioTracks().forEach((track) => {
      track.enabled = !muted;
    });
  }

  /** Ask the server to replay the greeting; no-op when the socket is not open. */
  requestGreetingReplay() {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ type: 'replay_greeting' }));
      this.emitDiagnostic('replay_greeting', { sent: true });
    }
  }

  /**
   * Tear down the full session: capture graph, microphone tracks, WebSocket
   * (sending a best-effort `stop` first), playback context, and all buffers.
   * Safe to call repeatedly or when nothing is connected.
   */
  async disconnect() {
    if (this.captureProcessor) {
      this.captureProcessor.disconnect();
      this.captureProcessor.onaudioprocess = null;
      this.captureProcessor = null;
    }
    if (this.captureSource) {
      this.captureSource.disconnect();
      this.captureSource = null;
    }
    if (this.captureSilenceGain) {
      this.captureSilenceGain.disconnect();
      this.captureSilenceGain = null;
    }
    if (this.captureContext) {
      await this.captureContext.close().catch(() => {});
      this.captureContext = null;
    }
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach((track) => track.stop());
      this.mediaStream = null;
    }
    if (this.ws) {
      try {
        if (this.ws.readyState === WebSocket.OPEN) {
          this.ws.send(JSON.stringify({ type: 'stop' }));
          this.ws.close();
        }
      } catch (_) {}
      this.ws = null;
    }
    if (this.playbackContext) {
      this.resetPlaybackQueue();
      await this.playbackContext.close().catch(() => {});
      this.playbackContext = null;
    }
    this.playbackTime = 0;
    this.pendingSamples = [];
    this.pendingAudioChunks = [];
    this._resuming = false;
    this.emitConnectionState('disconnected');
  }

  /** Register a callback for one of the events declared in `this.callbacks`. */
  on(event, callback) {
    if (event in this.callbacks) {
      this.callbacks[event] = callback;
    }
  }

  /** Unregister the callback for the given event. */
  off(event) {
    if (event in this.callbacks) {
      this.callbacks[event] = null;
    }
  }
}

// Module-level singleton: the app shares one service instance.
const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService;
export { NativeVoiceService };