Files
bigwo/delivery/client/src/services/nativeVoiceService.js
User ff6a63147b fix(voice-gateway): S2S idle timeout + upstream send lock + iOS AudioContext suspended + port 3012→3013
- P0: S2S DialogAudioIdleTimeoutError now notifies client instead of force-closing, sets upstreamReady=false and cancels keepalive
- P0: Reduce audioKeepaliveIntervalMs from 20s to 8s to prevent S2S idle timeout
- P1: Add upstreamSendLock to prevent concurrent IllegalStateException: Send pending
- P1: iOS AudioContext suspended handling - buffer audio chunks and try resume after user interaction
- P1: disconnect() clears pendingAudioChunks and _resuming to prevent memory leak
- Fix: Frontend hardcoded port 3012→3013 in videoApi.js and vite.config.js
- Add complete Java backend source code to git tracking
2026-04-16 19:16:11 +08:00

427 lines
13 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

class NativeVoiceService {
constructor() {
this.ws = null;
this.mediaStream = null;
this.captureContext = null;
this.captureSource = null;
this.captureProcessor = null;
this.captureSilenceGain = null;
this.playbackContext = null;
this.playbackTime = 0;
this.activeSources = new Set();
this.pendingSamples = [];
this.pendingAudioChunks = [];
this._resuming = false;
this.readyResolver = null;
this.readyRejector = null;
this.callbacks = {
onSubtitle: null,
onConnectionStateChange: null,
onError: null,
onAssistantPending: null,
onDiagnostic: null,
onIdleTimeout: null,
onProductLink: null,
};
}
resolveWebSocketUrl(sessionId, userId) {
const query = new URLSearchParams({
sessionId,
userId: userId || '',
});
const configuredBase = import.meta.env.VITE_VOICE_WS_BASE_URL || import.meta.env.VITE_VOICE_API_BASE_URL || '';
if (configuredBase && !configuredBase.startsWith('/')) {
let base = configuredBase.replace(/\/$/, '');
if (base.startsWith('https://')) {
base = `wss://${base.slice('https://'.length)}`;
} else if (base.startsWith('http://')) {
base = `ws://${base.slice('http://'.length)}`;
}
if (base.endsWith('/api/voice')) {
base = base.slice(0, -'/api/voice'.length);
} else if (base.endsWith('/api')) {
base = base.slice(0, -'/api'.length);
}
return `${base}/ws/realtime-dialog?${query.toString()}`;
}
const hostname = window.location.hostname;
const port = window.location.port;
const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
if ((window.location.protocol === 'file:' || isLocalHost) && port !== '3013') {
return `ws://${hostname || '127.0.0.1'}:3013/ws/realtime-dialog?${query.toString()}`;
}
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
return `${protocol}//${window.location.host}/ws/realtime-dialog?${query.toString()}`;
}
emitConnectionState(state) {
this.callbacks.onConnectionStateChange?.(state);
}
emitDiagnostic(type, payload) {
this.callbacks.onDiagnostic?.({ type, payload, timestamp: Date.now() });
}
resetPlaybackQueue() {
this.activeSources.forEach((source) => {
try {
source.stop();
} catch (_) {}
try {
source.disconnect();
} catch (_) {}
});
this.activeSources.clear();
if (this.playbackContext) {
this.playbackTime = this.playbackContext.currentTime + 0.02;
} else {
this.playbackTime = 0;
}
}
async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) {
await this.disconnect();
const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
this.emitConnectionState('connecting');
this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
if (this.playbackContext.state === 'suspended') {
await this.playbackContext.resume().catch(() => {});
}
this.playbackTime = this.playbackContext.currentTime;
// 并行: 同时预获取麦克风和建立WS连接节省500ms+
const micPromise = navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
noiseSuppression: true,
echoCancellation: true,
autoGainControl: true,
},
video: false,
}).catch((err) => {
console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
return null;
});
const CONNECTION_TIMEOUT_MS = 12000;
await new Promise((resolve, reject) => {
this.readyResolver = resolve;
this.readyRejector = reject;
const ws = new WebSocket(wsUrl);
ws.binaryType = 'arraybuffer';
this.ws = ws;
// 超时兜底:避免无限等待
const timeoutId = setTimeout(() => {
if (this.readyResolver) {
console.warn(`[NativeVoice] Connection timeout (${CONNECTION_TIMEOUT_MS}ms), forcing ready`);
this.readyResolver();
this.readyResolver = null;
this.readyRejector = null;
}
}, CONNECTION_TIMEOUT_MS);
const clearTimeoutOnSettle = () => clearTimeout(timeoutId);
ws.onopen = () => {
this.emitConnectionState('connected');
ws.send(JSON.stringify({
type: 'start',
sessionId,
userId,
botName,
systemRole,
speakingStyle,
modelVersion,
speaker,
greetingText,
}));
};
ws.onerror = () => {
clearTimeoutOnSettle();
const error = new Error('WebSocket connection failed');
this.callbacks.onError?.(error);
this.readyRejector?.(error);
this.readyResolver = null;
this.readyRejector = null;
reject(error);
};
ws.onclose = () => {
clearTimeoutOnSettle();
this.emitConnectionState('disconnected');
if (this.readyRejector) {
this.readyRejector(new Error('WebSocket closed before ready'));
this.readyResolver = null;
this.readyRejector = null;
}
};
ws.onmessage = (event) => {
if (typeof event.data === 'string') {
const peek = event.data;
if (peek.includes('"ready"')) {
clearTimeoutOnSettle();
}
this.handleJsonMessage(peek);
return;
}
this.handleAudioMessage(event.data);
};
});
// 使用预获取的mediaStream已并行获取避免重复申请
const preFetchedStream = await micPromise;
await this.startCapture(preFetchedStream);
}
handleJsonMessage(raw) {
try {
const msg = JSON.parse(raw);
if (msg.type === 'ready') {
this.readyResolver?.();
this.readyResolver = null;
this.readyRejector = null;
return;
}
if (msg.type === 'subtitle') {
this.callbacks.onSubtitle?.({
text: msg.text,
role: msg.role,
isFinal: !!msg.isFinal,
sequence: msg.sequence,
});
return;
}
if (msg.type === 'tts_reset') {
this.resetPlaybackQueue();
this.emitDiagnostic('tts_reset', msg);
return;
}
if (msg.type === 'assistant_pending') {
this.callbacks.onAssistantPending?.(!!msg.active);
return;
}
if (msg.type === 'idle_timeout') {
this.callbacks.onIdleTimeout?.(msg.timeout || 300000);
return;
}
if (msg.type === 'product_link') {
this.callbacks.onProductLink?.({
product: msg.product,
link: msg.link,
description: msg.description,
});
return;
}
if (msg.type === 'upstream_closed') {
this.callbacks.onError?.(new Error('语音服务已断开,请重新开始通话'));
return;
}
if (msg.type === 'error') {
this.callbacks.onError?.(new Error(msg.error || 'native voice error'));
return;
}
this.emitDiagnostic('ws_message', msg);
} catch (error) {
this.emitDiagnostic('ws_raw_text', raw);
}
}
handleAudioMessage(arrayBuffer) {
if (!this.playbackContext) {
return;
}
if (this.playbackContext.state === 'suspended') {
this.pendingAudioChunks.push(arrayBuffer);
this._tryResumePlayback();
return;
}
this._playPcm(arrayBuffer);
}
_playPcm(arrayBuffer) {
try {
const pcm16 = new Int16Array(arrayBuffer);
if (!pcm16.length) {
return;
}
const audioBuffer = this.playbackContext.createBuffer(1, pcm16.length, 24000);
const channel = audioBuffer.getChannelData(0);
for (let i = 0; i < pcm16.length; i += 1) {
channel[i] = pcm16[i] / 32768;
}
const source = this.playbackContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(this.playbackContext.destination);
this.activeSources.add(source);
source.onended = () => {
this.activeSources.delete(source);
try {
source.disconnect();
} catch (_) {}
};
const now = this.playbackContext.currentTime;
if (this.playbackTime < now) {
this.playbackTime = now + 0.02;
}
source.start(this.playbackTime);
this.playbackTime += audioBuffer.duration;
this.emitDiagnostic('audio_chunk', { samples: pcm16.length, duration: audioBuffer.duration });
} catch (err) {
console.warn('[NativeVoice] playPcm failed:', err.message);
}
}
async _tryResumePlayback() {
if (this._resuming) return;
this._resuming = true;
try {
await this.playbackContext.resume();
while (this.pendingAudioChunks.length > 0) {
this._playPcm(this.pendingAudioChunks.shift());
}
} catch (e) {
console.warn('[NativeVoice] resume failed:', e.message);
} finally {
this._resuming = false;
}
}
async startCapture(preFetchedStream) {
this.mediaStream = preFetchedStream || await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
noiseSuppression: true,
echoCancellation: true,
autoGainControl: true,
},
video: false,
});
this.captureContext = new (window.AudioContext || window.webkitAudioContext)();
this.captureSource = this.captureContext.createMediaStreamSource(this.mediaStream);
this.captureProcessor = this.captureContext.createScriptProcessor(4096, 1, 1);
this.captureSilenceGain = this.captureContext.createGain();
this.captureSilenceGain.gain.value = 0;
this.captureProcessor.onaudioprocess = (event) => {
const input = event.inputBuffer.getChannelData(0);
const downsampled = this.downsampleBuffer(input, this.captureContext.sampleRate, 16000);
for (let i = 0; i < downsampled.length; i += 1) {
this.pendingSamples.push(downsampled[i]);
}
while (this.pendingSamples.length >= 320) {
const chunk = this.pendingSamples.splice(0, 320);
const pcm = new Int16Array(chunk.length);
for (let i = 0; i < chunk.length; i += 1) {
const sample = Math.max(-1, Math.min(1, chunk[i]));
pcm[i] = sample < 0 ? sample * 32768 : sample * 32767;
}
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(pcm.buffer);
}
}
};
this.captureSource.connect(this.captureProcessor);
this.captureProcessor.connect(this.captureSilenceGain);
this.captureSilenceGain.connect(this.captureContext.destination);
}
downsampleBuffer(buffer, inputRate, outputRate) {
if (outputRate >= inputRate) {
return Array.from(buffer);
}
const sampleRateRatio = inputRate / outputRate;
const newLength = Math.round(buffer.length / sampleRateRatio);
const result = new Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i += 1) {
accum += buffer[i];
count += 1;
}
result[offsetResult] = count > 0 ? accum / count : 0;
offsetResult += 1;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
async setMuted(muted) {
this.mediaStream?.getAudioTracks().forEach((track) => {
track.enabled = !muted;
});
}
requestGreetingReplay() {
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'replay_greeting' }));
this.emitDiagnostic('replay_greeting', { sent: true });
}
}
async disconnect() {
if (this.captureProcessor) {
this.captureProcessor.disconnect();
this.captureProcessor.onaudioprocess = null;
this.captureProcessor = null;
}
if (this.captureSource) {
this.captureSource.disconnect();
this.captureSource = null;
}
if (this.captureSilenceGain) {
this.captureSilenceGain.disconnect();
this.captureSilenceGain = null;
}
if (this.captureContext) {
await this.captureContext.close().catch(() => {});
this.captureContext = null;
}
if (this.mediaStream) {
this.mediaStream.getTracks().forEach((track) => track.stop());
this.mediaStream = null;
}
if (this.ws) {
try {
if (this.ws.readyState === WebSocket.OPEN) {
this.ws.send(JSON.stringify({ type: 'stop' }));
this.ws.close();
}
} catch (_) {}
this.ws = null;
}
if (this.playbackContext) {
this.resetPlaybackQueue();
await this.playbackContext.close().catch(() => {});
this.playbackContext = null;
}
this.playbackTime = 0;
this.pendingSamples = [];
this.pendingAudioChunks = [];
this._resuming = false;
this.emitConnectionState('disconnected');
}
on(event, callback) {
if (event in this.callbacks) {
this.callbacks[event] = callback;
}
}
off(event) {
if (event in this.callbacks) {
this.callbacks[event] = null;
}
}
}
const nativeVoiceService = new NativeVoiceService();
export default nativeVoiceService;