// bigwo/delivery/client/src/services/nativeVoiceService.js

/**
 * Browser client for the realtime voice/text dialog WebSocket.
 *
 * Responsibilities visible in this class:
 *  - open the WebSocket and send the initial `start` message;
 *  - capture microphone audio, downsample it to 16 kHz and stream it as
 *    16-bit PCM frames of 320 samples;
 *  - schedule incoming binary frames (interpreted as 16-bit PCM at 24 kHz)
 *    for gapless playback through an AudioContext;
 *  - translate JSON control messages into the callbacks registered via `on()`.
 */
class NativeVoiceService {
  constructor() {
    this.ws = null;
    this.mediaStream = null;
    this.captureContext = null;
    this.captureSource = null;
    this.captureProcessor = null;
    this.captureSilenceGain = null;
    this.playbackContext = null;
    // AudioContext time at which the next playback chunk should start.
    this.playbackTime = 0;
    // Buffer sources that are scheduled or currently playing.
    this.activeSources = new Set();
    // Downsampled microphone samples waiting to be framed.
    this.pendingSamples = [];
    // Binary frames received while the playback context was suspended.
    this.pendingAudioChunks = [];
    this._resuming = false;
    // Settlers of the promise `connect()` awaits until the server is ready.
    this.readyResolver = null;
    this.readyRejector = null;
    // Handle of the "force ready" fallback timer armed while connecting.
    this.connectTimeoutId = null;
    this.clientMode = 'voice';
    this.playAudioReply = false;
    this.callbacks = {
      onSubtitle: null,
      onConnectionStateChange: null,
      onError: null,
      onAssistantPending: null,
      onDiagnostic: null,
      onIdleTimeout: null,
      onProductLink: null,
    };
  }

  /**
   * Builds the WebSocket URL for a session.
   *
   * Resolution order:
   *  1. An absolute base from `VITE_VOICE_WS_BASE_URL` / `VITE_VOICE_API_BASE_URL`
   *     (http(s) is mapped to ws(s); a trailing `/api/voice` or `/api` is removed).
   *  2. On `file:` or localhost pages not served from port 3013: `ws://<host>:3013`.
   *  3. Otherwise the page's own host, with `wss:` on https pages.
   *
   * @param {string} sessionId
   * @param {string} [userId]
   * @param {string} [wsPath='/ws/realtime-dialog']
   * @returns {string} Full WebSocket URL including the query string.
   */
  resolveWebSocketUrl(sessionId, userId, wsPath = '/ws/realtime-dialog') {
    const query = new URLSearchParams({
      sessionId,
      userId: userId || '',
    });
    const suffix = `${wsPath}?${query.toString()}`;
    const configuredBase = import.meta.env.VITE_VOICE_WS_BASE_URL || import.meta.env.VITE_VOICE_API_BASE_URL || '';
    if (configuredBase && !configuredBase.startsWith('/')) {
      let base = configuredBase.replace(/\/$/, '');
      if (base.startsWith('https://')) {
        base = `wss://${base.slice('https://'.length)}`;
      } else if (base.startsWith('http://')) {
        base = `ws://${base.slice('http://'.length)}`;
      }
      if (base.endsWith('/api/voice')) {
        base = base.slice(0, -'/api/voice'.length);
      } else if (base.endsWith('/api')) {
        base = base.slice(0, -'/api'.length);
      }
      return `${base}${suffix}`;
    }
    const { hostname, port, protocol: pageProtocol, host } = window.location;
    const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
    if ((pageProtocol === 'file:' || isLocalHost) && port !== '3013') {
      return `ws://${hostname || '127.0.0.1'}:3013${suffix}`;
    }
    const protocol = pageProtocol === 'https:' ? 'wss:' : 'ws:';
    return `${protocol}//${host}${suffix}`;
  }

  /** Notifies the `onConnectionStateChange` callback, if one is registered. */
  emitConnectionState(state) {
    this.callbacks.onConnectionStateChange?.(state);
  }

  /** Sends a timestamped diagnostic record to the `onDiagnostic` callback. */
  emitDiagnostic(type, payload) {
    this.callbacks.onDiagnostic?.({ type, payload, timestamp: Date.now() });
  }

  /**
   * Stops every scheduled/playing source, discards audio that was queued
   * while the context was suspended, and rewinds the playback cursor.
   */
  resetPlaybackQueue() {
    this.activeSources.forEach((source) => {
      // Best effort: stop() throws if the source never started and
      // disconnect() throws if it is already detached.
      try {
        source.stop();
      } catch (_) {}
      try {
        source.disconnect();
      } catch (_) {}
    });
    this.activeSources.clear();
    // Chunks buffered before the reset belong to the interrupted reply; they
    // must not start playing once the context resumes.
    this.pendingAudioChunks = [];
    this.playbackTime = this.playbackContext
      ? this.playbackContext.currentTime + 0.02
      : 0;
  }

  /**
   * Opens a new session: tears down any previous one, prepares playback,
   * pre-fetches the microphone (voice mode), waits for the server `ready`
   * message (or the timeout fallback) and then starts capturing.
   *
   * @param {object} [options] Session parameters forwarded in the `start` message.
   * @param {string} [options.clientMode='voice'] `'text'` selects `/ws/realtime-text`
   *   and skips the microphone.
   * @param {boolean} [options.playAudioReply=false] Play audio replies in text mode.
   * @param {boolean} [options.disableGreeting=false] In text mode anything other
   *   than an explicit `false` is sent as `true`.
   * @returns {Promise<void>}
   * @throws {Error} When the socket errors or closes before the server is ready,
   *   or when microphone capture cannot be started.
   */
  async connect({
    sessionId,
    userId,
    botName,
    systemRole,
    speakingStyle,
    modelVersion,
    speaker,
    greetingText,
    clientMode = 'voice',
    playAudioReply = false,
    disableGreeting = false,
  } = {}) {
    await this.disconnect();
    this.clientMode = clientMode;
    this.playAudioReply = playAudioReply;
    const isTextMode = clientMode === 'text';
    const wsPath = isTextMode ? '/ws/realtime-text' : '/ws/realtime-dialog';
    const wsUrl = this.resolveWebSocketUrl(sessionId, userId, wsPath);
    this.emitConnectionState('connecting');

    // Audio playback context: only needed if we will receive audio.
    if (!isTextMode || playAudioReply) {
      this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
      if (this.playbackContext.state === 'suspended') {
        await this.playbackContext.resume().catch(() => {});
      }
      this.playbackTime = this.playbackContext.currentTime;
    }

    // Microphone: only needed in voice mode. Requested in parallel with the
    // socket handshake; a failure here is retried later by startCapture().
    let micPromise = Promise.resolve(null);
    if (!isTextMode) {
      micPromise = navigator.mediaDevices.getUserMedia({
        audio: {
          channelCount: 1,
          noiseSuppression: true,
          echoCancellation: true,
          autoGainControl: true,
        },
        video: false,
      }).catch((err) => {
        console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
        return null;
      });
    }

    try {
      await this._openSocket(wsUrl, {
        type: 'start',
        sessionId,
        userId,
        botName,
        systemRole,
        speakingStyle,
        modelVersion,
        speaker,
        greetingText,
        clientMode,
        playAudioReply,
        disableGreeting: isTextMode ? (disableGreeting !== false) : disableGreeting,
      });
    } catch (error) {
      // The connection never became ready: release the pre-fetched microphone
      // so it does not stay live with nothing consuming it.
      this._stopStream(await micPromise);
      throw error;
    }

    // Text mode does not start the microphone.
    if (isTextMode) {
      return;
    }
    const socket = this.ws;
    const preFetchedStream = await micPromise;
    if (this.ws !== socket) {
      // disconnect() (or a newer connect()) ran while we were waiting for the
      // microphone; do not start capturing for a session that is gone.
      this._stopStream(preFetchedStream);
      return;
    }
    await this.startCapture(preFetchedStream);
  }

  /**
   * Creates the WebSocket, wires its handlers and returns a promise that
   * settles when the server reports `ready`, the socket fails/closes first,
   * or the timeout fallback forces readiness.
   *
   * @param {string} wsUrl
   * @param {object} startMessage Payload sent as JSON once the socket opens.
   * @returns {Promise<void>}
   */
  _openSocket(wsUrl, startMessage) {
    const CONNECTION_TIMEOUT_MS = 12000;
    return new Promise((resolve, reject) => {
      this.readyResolver = resolve;
      this.readyRejector = reject;
      const ws = new WebSocket(wsUrl);
      ws.binaryType = 'arraybuffer';
      this.ws = ws;

      // Timeout fallback: avoid waiting forever for a `ready` message.
      this.connectTimeoutId = setTimeout(() => {
        this.connectTimeoutId = null;
        if (this.ws === ws && this.readyResolver) {
          console.warn(`[NativeVoice] Connection timeout (${CONNECTION_TIMEOUT_MS}ms), forcing ready`);
          this._settleReady();
        }
      }, CONNECTION_TIMEOUT_MS);

      ws.onopen = () => {
        this.emitConnectionState('connected');
        ws.send(JSON.stringify(startMessage));
      };

      ws.onerror = () => {
        const error = new Error('WebSocket connection failed');
        this.callbacks.onError?.(error);
        this._settleReady(error);
      };

      ws.onclose = () => {
        this.emitConnectionState('disconnected');
        // No-op when the connection was already ready.
        this._settleReady(new Error('WebSocket closed before ready'));
      };

      ws.onmessage = (event) => {
        if (typeof event.data === 'string') {
          this.handleJsonMessage(event.data);
          return;
        }
        this.handleAudioMessage(event.data);
      };
    });
  }

  /**
   * Settles the pending "ready" promise (if any) exactly once and cancels the
   * timeout fallback.
   *
   * @param {Error} [error] Reject with this error; resolve when omitted.
   */
  _settleReady(error) {
    if (this.connectTimeoutId !== null) {
      clearTimeout(this.connectTimeoutId);
      this.connectTimeoutId = null;
    }
    const resolve = this.readyResolver;
    const reject = this.readyRejector;
    this.readyResolver = null;
    this.readyRejector = null;
    if (error) {
      reject?.(error);
    } else {
      resolve?.();
    }
  }

  /** Stops every track of `stream`; accepts null/undefined. */
  _stopStream(stream) {
    stream?.getTracks().forEach((track) => track.stop());
  }

  /** Sends a text message to the server; silently ignored unless the socket is open. */
  sendText(text) {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ type: 'text', text: String(text || '') }));
    }
  }

  /**
   * Parses a text frame and routes it by `type`.
   *
   * Frames that are not valid JSON (or are JSON `null`) are reported as a
   * `ws_raw_text` diagnostic. Exceptions thrown while handling a valid
   * message (for example by a registered callback) are logged and reported
   * as a `ws_handler_error` diagnostic instead of being mistaken for a
   * malformed frame.
   *
   * @param {string} raw
   */
  handleJsonMessage(raw) {
    let msg;
    try {
      msg = JSON.parse(raw);
    } catch (error) {
      this.emitDiagnostic('ws_raw_text', raw);
      return;
    }
    if (msg === null) {
      this.emitDiagnostic('ws_raw_text', raw);
      return;
    }
    try {
      this._dispatchMessage(msg);
    } catch (error) {
      console.warn('[NativeVoice] message handler failed:', error);
      this.emitDiagnostic('ws_handler_error', {
        messageType: msg.type,
        error: String(error?.message ?? error),
      });
    }
  }

  /** Routes one parsed control message to the matching callback or action. */
  _dispatchMessage(msg) {
    switch (msg.type) {
      case 'ready':
        this._settleReady();
        return;
      case 'subtitle':
        this.callbacks.onSubtitle?.({
          text: msg.text,
          role: msg.role,
          isFinal: !!msg.isFinal,
          sequence: msg.sequence,
        });
        return;
      case 'tts_reset':
        this.resetPlaybackQueue();
        this.emitDiagnostic('tts_reset', msg);
        return;
      case 'assistant_pending':
        this.callbacks.onAssistantPending?.(!!msg.active);
        return;
      case 'idle_timeout':
        this.callbacks.onIdleTimeout?.(msg.timeout || 300000);
        return;
      case 'product_link':
        this.callbacks.onProductLink?.({
          product: msg.product,
          link: msg.link,
          description: msg.description,
        });
        return;
      case 'upstream_closed':
        this.callbacks.onError?.(new Error('语音服务已断开，请重新开始通话'));
        return;
      case 'error':
        this.callbacks.onError?.(new Error(msg.error || 'native voice error'));
        return;
      default:
        this.emitDiagnostic('ws_message', msg);
    }
  }

  /**
   * Handles one binary frame: plays it, queues it while the playback context
   * is suspended, or drops it when audio replies are not wanted.
   *
   * @param {ArrayBuffer} arrayBuffer
   */
  handleAudioMessage(arrayBuffer) {
    // Text mode without playAudioReply: drop S2S audio silently.
    if (this.clientMode === 'text' && !this.playAudioReply) {
      return;
    }
    if (!this.playbackContext) {
      return;
    }
    if (this.playbackContext.state === 'suspended') {
      this.pendingAudioChunks.push(arrayBuffer);
      this._tryResumePlayback();
      return;
    }
    this._playPcm(arrayBuffer);
  }

  /**
   * Converts a 16-bit PCM frame to a mono 24 kHz AudioBuffer and schedules it
   * directly after the previously scheduled chunk. Failures are logged and
   * the chunk is dropped.
   *
   * @param {ArrayBuffer} arrayBuffer
   */
  _playPcm(arrayBuffer) {
    try {
      const pcm16 = new Int16Array(arrayBuffer);
      if (!pcm16.length) {
        return;
      }
      const context = this.playbackContext;
      const audioBuffer = context.createBuffer(1, pcm16.length, 24000);
      const channel = audioBuffer.getChannelData(0);
      for (let i = 0; i < pcm16.length; i += 1) {
        channel[i] = pcm16[i] / 32768;
      }
      const source = context.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(context.destination);
      this.activeSources.add(source);
      source.onended = () => {
        this.activeSources.delete(source);
        try {
          source.disconnect();
        } catch (_) {}
      };
      // If the cursor fell behind real time, restart slightly in the future.
      const now = context.currentTime;
      if (this.playbackTime < now) {
        this.playbackTime = now + 0.02;
      }
      source.start(this.playbackTime);
      this.playbackTime += audioBuffer.duration;
      this.emitDiagnostic('audio_chunk', { samples: pcm16.length, duration: audioBuffer.duration });
    } catch (err) {
      console.warn('[NativeVoice] playPcm failed:', err.message);
    }
  }

  /**
   * Resumes the playback context and flushes the chunks queued meanwhile.
   * Re-entrant calls while a resume is in flight are ignored.
   */
  async _tryResumePlayback() {
    if (this._resuming) return;
    this._resuming = true;
    try {
      await this.playbackContext.resume();
      while (this.pendingAudioChunks.length > 0) {
        this._playPcm(this.pendingAudioChunks.shift());
      }
    } catch (e) {
      console.warn('[NativeVoice] resume failed:', e.message);
    } finally {
      this._resuming = false;
    }
  }

  /**
   * Starts microphone capture and streams it to the server as 16 kHz,
   * 16-bit PCM frames of 320 samples (20 ms each).
   *
   * @param {MediaStream|null} [preFetchedStream] Stream obtained earlier; when
   *   falsy the microphone is requested here.
   * @returns {Promise<void>}
   * @throws Whatever `getUserMedia` rejects with when no stream was supplied.
   */
  async startCapture(preFetchedStream) {
    this.mediaStream = preFetchedStream || await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        noiseSuppression: true,
        echoCancellation: true,
        autoGainControl: true,
      },
      video: false,
    });
    this.captureContext = new (window.AudioContext || window.webkitAudioContext)();
    this.captureSource = this.captureContext.createMediaStreamSource(this.mediaStream);
    this.captureProcessor = this.captureContext.createScriptProcessor(4096, 1, 1);
    // The processor is routed to the destination through a zero-gain node so
    // the captured audio is not audible locally.
    this.captureSilenceGain = this.captureContext.createGain();
    this.captureSilenceGain.gain.value = 0;
    this.captureProcessor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0);
      const downsampled = this.downsampleBuffer(input, this.captureContext.sampleRate, 16000);
      for (let i = 0; i < downsampled.length; i += 1) {
        this.pendingSamples.push(downsampled[i]);
      }
      while (this.pendingSamples.length >= 320) {
        const chunk = this.pendingSamples.splice(0, 320);
        const pcm = new Int16Array(chunk.length);
        for (let i = 0; i < chunk.length; i += 1) {
          // Clamp to [-1, 1], then scale to the asymmetric int16 range.
          const sample = Math.max(-1, Math.min(1, chunk[i]));
          pcm[i] = sample < 0 ? sample * 32768 : sample * 32767;
        }
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
          this.ws.send(pcm.buffer);
        }
      }
    };
    this.captureSource.connect(this.captureProcessor);
    this.captureProcessor.connect(this.captureSilenceGain);
    this.captureSilenceGain.connect(this.captureContext.destination);
  }

  /**
   * Downsamples by averaging the input samples that fall into each output
   * slot. When `outputRate >= inputRate` the input is returned as a plain
   * array copy (no upsampling is performed).
   *
   * @param {ArrayLike<number>} buffer Input samples.
   * @param {number} inputRate Sample rate of `buffer` in Hz.
   * @param {number} outputRate Target sample rate in Hz.
   * @returns {number[]} Samples at the target rate.
   */
  downsampleBuffer(buffer, inputRate, outputRate) {
    if (outputRate >= inputRate) {
      return Array.from(buffer);
    }
    const sampleRateRatio = inputRate / outputRate;
    const newLength = Math.round(buffer.length / sampleRateRatio);
    const result = new Array(newLength);
    let offsetBuffer = 0;
    for (let offsetResult = 0; offsetResult < newLength; offsetResult += 1) {
      const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
      const end = Math.min(nextOffsetBuffer, buffer.length);
      let accum = 0;
      let count = 0;
      for (let i = offsetBuffer; i < end; i += 1) {
        accum += buffer[i];
        count += 1;
      }
      result[offsetResult] = count > 0 ? accum / count : 0;
      offsetBuffer = nextOffsetBuffer;
    }
    return result;
  }

  /** Enables or disables the captured audio tracks; a no-op without a stream. */
  async setMuted(muted) {
    this.mediaStream?.getAudioTracks().forEach((track) => {
      track.enabled = !muted;
    });
  }

  /** Asks the server to replay the greeting; ignored unless the socket is open. */
  requestGreetingReplay() {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ type: 'replay_greeting' }));
      this.emitDiagnostic('replay_greeting', { sent: true });
    }
  }

  /**
   * Tears the session down: fails a still-pending `connect()`, stops capture
   * and the microphone, closes the socket (sending `stop` first when it is
   * open), stops playback, and emits `'disconnected'`.
   *
   * The socket's handlers are detached before closing so a socket that is
   * being discarded can no longer deliver messages or state changes into
   * this instance (for example while a newer connection is being set up).
   *
   * @returns {Promise<void>}
   */
  async disconnect() {
    // A connect() still waiting for `ready` must not hang or be forced ready
    // by its timeout after the session is gone.
    this._settleReady(new Error('WebSocket closed before ready'));

    if (this.captureProcessor) {
      this.captureProcessor.disconnect();
      this.captureProcessor.onaudioprocess = null;
      this.captureProcessor = null;
    }
    if (this.captureSource) {
      this.captureSource.disconnect();
      this.captureSource = null;
    }
    if (this.captureSilenceGain) {
      this.captureSilenceGain.disconnect();
      this.captureSilenceGain = null;
    }
    if (this.captureContext) {
      await this.captureContext.close().catch(() => {});
      this.captureContext = null;
    }
    if (this.mediaStream) {
      this._stopStream(this.mediaStream);
      this.mediaStream = null;
    }
    if (this.ws) {
      const ws = this.ws;
      this.ws = null;
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;
      try {
        if (ws.readyState === WebSocket.OPEN) {
          ws.send(JSON.stringify({ type: 'stop' }));
        }
        // Also close sockets that are still connecting; otherwise they would
        // stay alive after being discarded.
        if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) {
          ws.close();
        }
      } catch (_) {
        // Best effort: the socket is being discarded either way.
      }
    }
    if (this.playbackContext) {
      this.resetPlaybackQueue();
      await this.playbackContext.close().catch(() => {});
      this.playbackContext = null;
    }
    this.playbackTime = 0;
    this.pendingSamples = [];
    this.pendingAudioChunks = [];
    this._resuming = false;
    this.emitConnectionState('disconnected');
  }

  /**
   * Registers `callback` for one of the known events (the keys of
   * `this.callbacks`); unknown event names are ignored.
   */
  on(event, callback) {
    if (event in this.callbacks) {
      this.callbacks[event] = callback;
    }
  }

  /** Removes the callback of a known event; unknown event names are ignored. */
  off(event) {
    if (event in this.callbacks) {
      this.callbacks[event] = null;
    }
  }
}

const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService;
export { NativeVoiceService };