/** Sample rate, in Hz, that microphone audio is downsampled to before it is sent. */
const CAPTURE_SAMPLE_RATE = 16000;

/** Samples per outgoing PCM frame (320 samples at 16 kHz is 20 ms of audio). */
const CAPTURE_FRAME_SAMPLES = 320;

/** Buffer size, in sample frames, requested from the capture ScriptProcessorNode. */
const CAPTURE_BUFFER_SIZE = 4096;

/**
 * Sample rate, in Hz, used for playback buffers.
 * NOTE(review): assumes the server streams 24 kHz mono PCM16 — confirm against the backend.
 */
const PLAYBACK_SAMPLE_RATE = 24000;

/** Lead time, in seconds, added whenever the playback schedule is (re)started. */
const PLAYBACK_LEAD_SECONDS = 0.02;

/** Path of the realtime dialog WebSocket endpoint. */
const REALTIME_DIALOG_PATH = '/ws/realtime-dialog';

/** Port used for the WebSocket backend when the page runs from localhost or file://. */
const LOCAL_BACKEND_PORT = '3012';

/**
 * Browser-side client for the realtime voice dialog WebSocket.
 *
 * Streams microphone audio to the server as 16-bit PCM frames, schedules the
 * binary audio chunks it receives for gapless playback, and forwards JSON
 * events (subtitles, pending state, errors, diagnostics) to registered callbacks.
 */
class NativeVoiceService {
  constructor() {
    this.ws = null;
    this.mediaStream = null;
    this.captureContext = null;
    this.captureSource = null;
    this.captureProcessor = null;
    this.captureSilenceGain = null;
    this.playbackContext = null;
    this.playbackTime = 0;
    this.activeSources = new Set();
    this.pendingSamples = [];
    this.readyResolver = null;
    this.readyRejector = null;
    // Incremented by every disconnect(); lets in-flight async steps notice
    // that the session they belong to has been torn down.
    this.sessionGeneration = 0;
    this.callbacks = {
      onSubtitle: null,
      onConnectionStateChange: null,
      onError: null,
      onAssistantPending: null,
      onDiagnostic: null,
    };
  }

  /**
   * Builds the WebSocket URL for a dialog session.
   *
   * Resolution order:
   * 1. An absolute base from VITE_VOICE_WS_BASE_URL / VITE_VOICE_API_BASE_URL
   *    (http(s) is mapped to ws(s); a trailing `/api/voice` or `/api` is removed).
   * 2. `ws://<host>:3012` when the page is served from localhost or file://
   *    on a port other than 3012.
   * 3. The page's own host, using wss: for https: pages and ws: otherwise.
   *
   * @param {string} sessionId - Session identifier sent as a query parameter.
   * @param {string} [userId] - User identifier; sent as an empty string when falsy.
   * @returns {string} Fully qualified WebSocket URL.
   */
  resolveWebSocketUrl(sessionId, userId) {
    const query = new URLSearchParams({
      sessionId,
      userId: userId || '',
    });
    const endpoint = `${REALTIME_DIALOG_PATH}?${query.toString()}`;

    // Keep the full static `import.meta.env.VITE_*` references: bundlers
    // replace these exact strings at build time.
    const configuredBase =
      import.meta.env.VITE_VOICE_WS_BASE_URL || import.meta.env.VITE_VOICE_API_BASE_URL || '';

    // A base starting with '/' is a same-origin path, handled by the fallbacks below.
    if (configuredBase && !configuredBase.startsWith('/')) {
      let base = configuredBase.replace(/\/$/, '');
      if (base.startsWith('https://')) {
        base = `wss://${base.slice('https://'.length)}`;
      } else if (base.startsWith('http://')) {
        base = `ws://${base.slice('http://'.length)}`;
      }
      if (base.endsWith('/api/voice')) {
        base = base.slice(0, -'/api/voice'.length);
      } else if (base.endsWith('/api')) {
        base = base.slice(0, -'/api'.length);
      }
      return `${base}${endpoint}`;
    }

    const { hostname, port, protocol, host } = window.location;
    const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
    if ((protocol === 'file:' || isLocalHost) && port !== LOCAL_BACKEND_PORT) {
      // file:// pages have an empty hostname, so fall back to the loopback address.
      return `ws://${hostname || '127.0.0.1'}:${LOCAL_BACKEND_PORT}${endpoint}`;
    }

    const wsProtocol = protocol === 'https:' ? 'wss:' : 'ws:';
    return `${wsProtocol}//${host}${endpoint}`;
  }

  /**
   * Notifies the onConnectionStateChange callback, if one is registered.
   * @param {string} state - 'connecting', 'connected' or 'disconnected'.
   */
  emitConnectionState(state) {
    this.callbacks.onConnectionStateChange?.(state);
  }

  /**
   * Notifies the onDiagnostic callback, if one is registered.
   * @param {string} type - Diagnostic event name.
   * @param {*} payload - Event-specific data.
   */
  emitDiagnostic(type, payload) {
    this.callbacks.onDiagnostic?.({ type, payload, timestamp: Date.now() });
  }

  /**
   * Stops every scheduled playback source and restarts the playback schedule
   * slightly ahead of the context's current time (or at 0 without a context).
   */
  resetPlaybackQueue() {
    this.activeSources.forEach((source) => {
      try {
        source.stop();
      } catch {
        // Best effort: the source may already have finished.
      }
      try {
        source.disconnect();
      } catch {
        // Best effort: the source may already be disconnected.
      }
    });
    this.activeSources.clear();

    this.playbackTime = this.playbackContext
      ? this.playbackContext.currentTime + PLAYBACK_LEAD_SECONDS
      : 0;
  }

  /**
   * Opens a new dialog session: tears down any previous session, creates the
   * playback context, opens the WebSocket, sends the `start` message, waits
   * for the server's `ready` message and then starts microphone capture.
   *
   * If any step fails, everything acquired so far (socket, audio contexts,
   * microphone) is released before the error is rethrown.
   *
   * @param {object} options - Session parameters forwarded in the `start` message.
   * @param {string} options.sessionId
   * @param {string} [options.userId]
   * @param {string} [options.botName]
   * @param {string} [options.systemRole]
   * @param {string} [options.speakingStyle]
   * @param {string} [options.modelVersion]
   * @param {string} [options.speaker]
   * @returns {Promise<void>} Resolves once capture is running.
   * @throws {Error} When the socket fails or closes before `ready`, or when
   *   microphone capture cannot be started.
   */
  async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker }) {
    await this.disconnect();
    const generation = this.sessionGeneration;
    const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
    this.emitConnectionState('connecting');

    try {
      const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
      const playbackContext = new AudioContextCtor();
      this.playbackContext = playbackContext;
      if (playbackContext.state === 'suspended') {
        // Resuming is best effort; it can be refused without a user gesture.
        await playbackContext.resume().catch(() => {});
      }
      this.playbackTime = playbackContext.currentTime;

      await this.openSocket(wsUrl, {
        type: 'start',
        sessionId,
        userId,
        botName,
        systemRole,
        speakingStyle,
        modelVersion,
        speaker,
      });
      await this.startCapture();
    } catch (error) {
      // Only clean up if this attempt still owns the session; a newer
      // connect()/disconnect() call has already torn it down otherwise.
      if (generation === this.sessionGeneration) {
        // A cleanup failure must not mask the original error.
        await this.disconnect().catch(() => {});
      }
      throw error;
    }
  }

  /**
   * Creates the WebSocket, wires its handlers and sends `startMessage` on open.
   * Internal helper for connect().
   *
   * @param {string} wsUrl - URL to connect to.
   * @param {object} startMessage - JSON-serialisable message sent once open.
   * @returns {Promise<void>} Resolves when the server sends `ready`; rejects
   *   if the socket errors or closes first.
   */
  openSocket(wsUrl, startMessage) {
    return new Promise((resolve, reject) => {
      this.readyResolver = resolve;
      this.readyRejector = reject;

      const ws = new WebSocket(wsUrl);
      ws.binaryType = 'arraybuffer';
      this.ws = ws;

      // Events from a socket that has been replaced or torn down must not
      // touch the state (or the pending ready promise) of a newer session.
      const isCurrent = () => this.ws === ws;

      ws.onopen = () => {
        if (!isCurrent()) {
          return;
        }
        this.emitConnectionState('connected');
        ws.send(JSON.stringify(startMessage));
      };

      ws.onerror = () => {
        if (!isCurrent()) {
          return;
        }
        const error = new Error('WebSocket connection failed');
        this.callbacks.onError?.(error);
        this.rejectReady(error);
      };

      ws.onclose = () => {
        if (!isCurrent()) {
          return;
        }
        this.emitConnectionState('disconnected');
        this.rejectReady(new Error('WebSocket closed before ready'));
      };

      ws.onmessage = (event) => {
        if (!isCurrent()) {
          return;
        }
        if (typeof event.data === 'string') {
          this.handleJsonMessage(event.data);
          return;
        }
        this.handleAudioMessage(event.data);
      };
    });
  }

  /**
   * Rejects the pending ready promise, if any, and clears both settle handles.
   * @param {Error} error - Rejection reason.
   */
  rejectReady(error) {
    const reject = this.readyRejector;
    this.readyResolver = null;
    this.readyRejector = null;
    reject?.(error);
  }

  /**
   * Parses and dispatches a text frame from the server.
   *
   * Anything that throws while handling the frame (including invalid JSON)
   * is reported as a `ws_raw_text` diagnostic carrying the raw text.
   *
   * @param {string} raw - Text frame payload.
   */
  handleJsonMessage(raw) {
    try {
      const msg = JSON.parse(raw);
      switch (msg.type) {
        case 'ready':
          this.readyResolver?.();
          this.readyResolver = null;
          this.readyRejector = null;
          break;
        case 'subtitle':
          this.callbacks.onSubtitle?.({
            text: msg.text,
            role: msg.role,
            isFinal: !!msg.isFinal,
            sequence: msg.sequence,
          });
          break;
        case 'tts_reset':
          this.resetPlaybackQueue();
          this.emitDiagnostic('tts_reset', msg);
          break;
        case 'assistant_pending':
          this.callbacks.onAssistantPending?.(!!msg.active);
          break;
        case 'error':
          this.callbacks.onError?.(new Error(msg.error || 'native voice error'));
          break;
        default:
          this.emitDiagnostic('ws_message', msg);
      }
    } catch (error) {
      this.emitDiagnostic('ws_raw_text', raw);
    }
  }

  /**
   * Schedules a binary audio frame for playback directly after the audio
   * that is already queued.
   *
   * The payload is read as 16-bit signed PCM in platform byte order
   * (NOTE(review): presumably little-endian mono from the server — confirm).
   * A trailing odd byte is ignored rather than raising a RangeError.
   *
   * @param {ArrayBuffer} arrayBuffer - Raw PCM16 payload.
   */
  handleAudioMessage(arrayBuffer) {
    const context = this.playbackContext;
    if (!context) {
      return;
    }

    // Int16Array requires an even byte length; also rejects non-buffer data (NaN).
    const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
    if (!(sampleCount > 0)) {
      return;
    }
    const pcm16 = new Int16Array(arrayBuffer, 0, sampleCount);

    const audioBuffer = context.createBuffer(1, pcm16.length, PLAYBACK_SAMPLE_RATE);
    const channel = audioBuffer.getChannelData(0);
    for (let i = 0; i < pcm16.length; i += 1) {
      channel[i] = pcm16[i] / 32768;
    }

    const source = context.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(context.destination);
    this.activeSources.add(source);
    source.onended = () => {
      this.activeSources.delete(source);
      try {
        source.disconnect();
      } catch {
        // Best effort: the source may already be disconnected.
      }
    };

    // If the schedule has fallen behind real time, restart it slightly ahead of now.
    const now = context.currentTime;
    if (this.playbackTime < now) {
      this.playbackTime = now + PLAYBACK_LEAD_SECONDS;
    }
    source.start(this.playbackTime);
    this.playbackTime += audioBuffer.duration;

    this.emitDiagnostic('audio_chunk', {
      samples: pcm16.length,
      duration: audioBuffer.duration,
    });
  }

  /**
   * Requests the microphone and starts streaming it to the socket as
   * 320-sample PCM16 frames at 16 kHz.
   *
   * If the session is torn down while the permission request is pending, the
   * freshly acquired stream is stopped again and an error is thrown.
   *
   * NOTE: ScriptProcessorNode is deprecated in the Web Audio API in favour of
   * AudioWorklet; it is kept here because a worklet needs a separate module file.
   *
   * @returns {Promise<void>}
   * @throws {Error} When getUserMedia rejects or the session was closed meanwhile.
   */
  async startCapture() {
    const generation = this.sessionGeneration;
    const stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        noiseSuppression: true,
        echoCancellation: true,
        autoGainControl: true,
      },
      video: false,
    });

    if (generation !== this.sessionGeneration) {
      // disconnect() ran while the prompt was open: do not keep the microphone.
      stream.getTracks().forEach((track) => track.stop());
      throw new Error('Voice session was closed before audio capture started');
    }
    this.mediaStream = stream;

    const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
    const context = new AudioContextCtor();
    this.captureContext = context;
    this.captureSource = context.createMediaStreamSource(stream);
    this.captureProcessor = context.createScriptProcessor(CAPTURE_BUFFER_SIZE, 1, 1);
    this.captureSilenceGain = context.createGain();
    // Zero gain keeps the microphone signal inaudible on the output.
    this.captureSilenceGain.gain.value = 0;

    this.captureProcessor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0);
      const downsampled = this.downsampleBuffer(input, context.sampleRate, CAPTURE_SAMPLE_RATE);
      for (let i = 0; i < downsampled.length; i += 1) {
        this.pendingSamples.push(downsampled[i]);
      }

      // Emit fixed-size frames; leftover samples wait for the next callback.
      // Frames produced while the socket is not open are dropped.
      while (this.pendingSamples.length >= CAPTURE_FRAME_SAMPLES) {
        const frame = this.pendingSamples.splice(0, CAPTURE_FRAME_SAMPLES);
        const pcm = this.encodePcm16(frame);
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
          this.ws.send(pcm.buffer);
        }
      }
    };

    // NOTE(review): the processor is routed to the destination through the
    // muted gain node, presumably so the graph keeps pulling audio — confirm.
    this.captureSource.connect(this.captureProcessor);
    this.captureProcessor.connect(this.captureSilenceGain);
    this.captureSilenceGain.connect(context.destination);
  }

  /**
   * Converts float samples to 16-bit signed PCM, clamping to [-1, 1] first.
   *
   * @param {ArrayLike<number>} samples - Float samples.
   * @returns {Int16Array} Encoded samples (fractions truncated toward zero).
   */
  encodePcm16(samples) {
    const pcm = new Int16Array(samples.length);
    for (let i = 0; i < samples.length; i += 1) {
      const clamped = Math.max(-1, Math.min(1, samples[i]));
      // Asymmetric scaling maps -1 to -32768 and +1 to +32767.
      pcm[i] = clamped < 0 ? clamped * 32768 : clamped * 32767;
    }
    return pcm;
  }

  /**
   * Downsamples by averaging each run of input samples that maps onto one
   * output sample. Returns a plain copy when no downsampling is needed.
   *
   * @param {ArrayLike<number>} buffer - Input samples.
   * @param {number} inputRate - Sample rate of `buffer`, in Hz.
   * @param {number} outputRate - Desired sample rate, in Hz.
   * @returns {number[]} Resampled samples.
   */
  downsampleBuffer(buffer, inputRate, outputRate) {
    if (outputRate >= inputRate) {
      return Array.from(buffer);
    }

    const ratio = inputRate / outputRate;
    const outputLength = Math.round(buffer.length / ratio);
    const result = new Array(outputLength);

    let windowStart = 0;
    for (let outIndex = 0; outIndex < outputLength; outIndex += 1) {
      const windowEnd = Math.round((outIndex + 1) * ratio);
      const limit = Math.min(windowEnd, buffer.length);
      let sum = 0;
      let count = 0;
      for (let i = windowStart; i < limit; i += 1) {
        sum += buffer[i];
        count += 1;
      }
      result[outIndex] = count > 0 ? sum / count : 0;
      windowStart = windowEnd;
    }
    return result;
  }

  /**
   * Enables or disables the microphone tracks without closing the session.
   * @param {boolean} muted - True to mute the microphone.
   * @returns {Promise<void>}
   */
  async setMuted(muted) {
    this.mediaStream?.getAudioTracks().forEach((track) => {
      track.enabled = !muted;
    });
  }

  /**
   * Detaches and closes the socket and rejects a pending ready wait.
   *
   * Handlers are removed first so late events from this socket cannot affect
   * a later session. A `stop` message is sent only on an open socket; both
   * open and still-connecting sockets are closed.
   */
  closeSocket() {
    const ws = this.ws;
    this.ws = null;

    if (ws) {
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;

      if (ws.readyState === WebSocket.OPEN) {
        try {
          ws.send(JSON.stringify({ type: 'stop' }));
        } catch {
          // Best effort: the close below must still run.
        }
      }
      if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
        try {
          ws.close();
        } catch {
          // Best effort: nothing more can be done with this socket.
        }
      }
    }

    this.rejectReady(new Error('WebSocket closed before ready'));
  }

  /**
   * Tears down the session: stops capture and the microphone, closes the
   * socket, stops playback, closes both audio contexts and finally emits
   * 'disconnected'. Safe to call when nothing is connected.
   *
   * All synchronous teardown happens before the first await, so the
   * microphone and socket are released immediately.
   *
   * @returns {Promise<void>}
   */
  async disconnect() {
    this.sessionGeneration += 1;

    if (this.captureProcessor) {
      this.captureProcessor.disconnect();
      this.captureProcessor.onaudioprocess = null;
      this.captureProcessor = null;
    }
    if (this.captureSource) {
      this.captureSource.disconnect();
      this.captureSource = null;
    }
    if (this.captureSilenceGain) {
      this.captureSilenceGain.disconnect();
      this.captureSilenceGain = null;
    }
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach((track) => track.stop());
      this.mediaStream = null;
    }

    this.closeSocket();

    if (this.playbackContext) {
      this.resetPlaybackQueue();
    }
    const contexts = [this.captureContext, this.playbackContext].filter(Boolean);
    this.captureContext = null;
    this.playbackContext = null;
    this.playbackTime = 0;
    this.pendingSamples = [];

    // Closing is best effort; a context may already be closed.
    await Promise.all(contexts.map((context) => context.close().catch(() => {})));

    this.emitConnectionState('disconnected');
  }

  /**
   * Registers the callback for a known event name; unknown names are ignored.
   * @param {string} event - One of the keys of `this.callbacks`.
   * @param {Function} callback - Handler to install (replaces any previous one).
   */
  on(event, callback) {
    // Own-property check so inherited keys such as 'toString' are not accepted.
    if (Object.prototype.hasOwnProperty.call(this.callbacks, event)) {
      this.callbacks[event] = callback;
    }
  }

  /**
   * Removes the callback for a known event name; unknown names are ignored.
   * @param {string} event - One of the keys of `this.callbacks`.
   */
  off(event) {
    if (Object.prototype.hasOwnProperty.call(this.callbacks, event)) {
      this.callbacks[event] = null;
    }
  }
}

const nativeVoiceService = new NativeVoiceService();

export default nativeVoiceService;