/*
 * - P0: S2S DialogAudioIdleTimeoutError now notifies the client instead of force-closing, sets upstreamReady=false, and cancels keepalive
 * - P0: Reduce audioKeepaliveIntervalMs from 20s to 8s to prevent S2S idle timeout
 * - P1: Add upstreamSendLock to prevent concurrent "IllegalStateException: Send pending"
 * - P1: iOS AudioContext suspended handling: buffer audio chunks and try to resume after user interaction
 * - P1: disconnect() clears pendingAudioChunks and _resuming to prevent memory leak
 * - Fix: Frontend hardcoded port 3012→3013 in videoApi.js and vite.config.js
 * - Add complete Java backend source code to git tracking
 *
 * 427 lines, 13 KiB, JavaScript
 */
/**
 * Browser-side client for the realtime voice dialog WebSocket.
 *
 * Captures microphone audio, downsamples it to 16 kHz mono PCM16 and sends it
 * to the server in 320-sample frames; plays back the binary frames it receives
 * as 24 kHz mono PCM16; and forwards JSON control messages (subtitles, idle
 * timeout, product links, errors, ...) to the listeners registered via `on()`.
 */
class NativeVoiceService {
  constructor() {
    this.ws = null;

    // Capture graph: mediaStream -> captureSource -> captureProcessor -> captureSilenceGain -> destination.
    this.mediaStream = null;
    this.captureContext = null;
    this.captureSource = null;
    this.captureProcessor = null;
    this.captureSilenceGain = null;

    // Playback state: chunks are scheduled back-to-back starting at `playbackTime`.
    this.playbackContext = null;
    this.playbackTime = 0;
    this.activeSources = new Set();

    // Downsampled microphone samples not yet packed into a full frame.
    this.pendingSamples = [];
    // Incoming audio buffered while the playback context is suspended.
    this.pendingAudioChunks = [];
    this._resuming = false;

    // Settle functions of the connection attempt that is still waiting for "ready".
    this.readyResolver = null;
    this.readyRejector = null;

    this.callbacks = {
      onSubtitle: null,
      onConnectionStateChange: null,
      onError: null,
      onAssistantPending: null,
      onDiagnostic: null,
      onIdleTimeout: null,
      onProductLink: null,
    };
  }

  /**
   * Builds the WebSocket URL for a dialog session.
   *
   * Resolution order:
   *  1. An absolute base from VITE_VOICE_WS_BASE_URL / VITE_VOICE_API_BASE_URL
   *     (http(s) is mapped to ws(s); a trailing `/api/voice` or `/api` is stripped).
   *  2. On localhost or `file:` pages not served from port 3013: `ws://<host>:3013`.
   *  3. Otherwise the page's own host, using `wss:` when the page is `https:`.
   *
   * @param {string} sessionId
   * @param {string} [userId] - sent as an empty string when falsy.
   * @returns {string}
   */
  resolveWebSocketUrl(sessionId, userId) {
    const query = new URLSearchParams({
      sessionId,
      userId: userId || '',
    });
    const path = `/ws/realtime-dialog?${query.toString()}`;

    const configuredBase = import.meta.env.VITE_VOICE_WS_BASE_URL || import.meta.env.VITE_VOICE_API_BASE_URL || '';
    // A base starting with "/" is a relative path and falls through to the host-based rules below.
    if (configuredBase && !configuredBase.startsWith('/')) {
      let base = configuredBase.replace(/\/$/, '');
      if (base.startsWith('https://')) {
        base = `wss://${base.slice('https://'.length)}`;
      } else if (base.startsWith('http://')) {
        base = `ws://${base.slice('http://'.length)}`;
      }
      if (base.endsWith('/api/voice')) {
        base = base.slice(0, -'/api/voice'.length);
      } else if (base.endsWith('/api')) {
        base = base.slice(0, -'/api'.length);
      }
      return `${base}${path}`;
    }

    const { hostname, port, protocol, host } = window.location;
    const isLocalHost = hostname === 'localhost' || hostname === '127.0.0.1';
    if ((protocol === 'file:' || isLocalHost) && port !== '3013') {
      return `ws://${hostname || '127.0.0.1'}:3013${path}`;
    }
    const wsProtocol = protocol === 'https:' ? 'wss:' : 'ws:';
    return `${wsProtocol}//${host}${path}`;
  }

  /** Notifies the `onConnectionStateChange` listener, if any. */
  emitConnectionState(state) {
    this.callbacks.onConnectionStateChange?.(state);
  }

  /** Notifies the `onDiagnostic` listener, if any, with a timestamped event. */
  emitDiagnostic(type, payload) {
    this.callbacks.onDiagnostic?.({ type, payload, timestamp: Date.now() });
  }

  /**
   * Stops everything that is scheduled or waiting to be played and restarts
   * the playback clock slightly ahead of "now".
   */
  resetPlaybackQueue() {
    this.activeSources.forEach((source) => {
      try {
        source.stop();
      } catch (_) {
        // Best effort: the source may already have finished.
      }
      try {
        source.disconnect();
      } catch (_) {
        // Best effort: the source may already be disconnected.
      }
    });
    this.activeSources.clear();
    // Chunks buffered while the context was suspended belong to the utterance
    // being reset; without this they would play once the context resumes.
    this.pendingAudioChunks.length = 0;
    this.playbackTime = this.playbackContext ? this.playbackContext.currentTime + 0.02 : 0;
  }

  /**
   * Tears down any previous session, opens the dialog WebSocket, sends the
   * `start` message and begins microphone capture once the server is ready.
   *
   * @param {object} options - session parameters forwarded verbatim in the `start` message.
   * @returns {Promise<void>} rejects when the socket fails or closes before
   *   readiness, or when microphone capture cannot be started.
   */
  async connect({ sessionId, userId, botName, systemRole, speakingStyle, modelVersion, speaker, greetingText }) {
    await this.disconnect();
    const wsUrl = this.resolveWebSocketUrl(sessionId, userId);
    this.emitConnectionState('connecting');

    this.playbackContext = new (window.AudioContext || window.webkitAudioContext)();
    if (this.playbackContext.state === 'suspended') {
      await this.playbackContext.resume().catch(() => {});
    }
    this.playbackTime = this.playbackContext.currentTime;

    // Request the microphone in parallel with the WebSocket handshake so the
    // two waits overlap. A failure resolves to null and is retried in startCapture().
    const micPromise = navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        noiseSuppression: true,
        echoCancellation: true,
        autoGainControl: true,
      },
      video: false,
    }).catch((err) => {
      console.warn('[NativeVoice] Pre-fetch getUserMedia failed:', err.message);
      return null;
    });

    try {
      await this._openSocket(wsUrl, {
        type: 'start',
        sessionId,
        userId,
        botName,
        systemRole,
        speakingStyle,
        modelVersion,
        speaker,
        greetingText,
      });
    } catch (error) {
      // The pre-fetched stream was never handed to startCapture(), so nothing
      // else would stop it: release the microphone before reporting the failure.
      const orphanStream = await micPromise;
      orphanStream?.getTracks().forEach((track) => track.stop());
      throw error;
    }

    // Reuse the pre-fetched stream to avoid asking for the microphone twice.
    await this.startCapture(await micPromise);
  }

  /**
   * Opens the WebSocket, wires its handlers and resolves once the server
   * reports `ready` (or the timeout forces readiness).
   *
   * The settle functions are closures private to this attempt, so events from
   * a socket that has since been replaced cannot settle a newer attempt.
   *
   * @param {string} wsUrl
   * @param {object} startMessage - JSON payload sent as soon as the socket opens.
   * @returns {Promise<void>}
   */
  _openSocket(wsUrl, startMessage) {
    const CONNECTION_TIMEOUT_MS = 12000;

    return new Promise((resolve, reject) => {
      const ws = new WebSocket(wsUrl);
      ws.binaryType = 'arraybuffer';
      this.ws = ws;

      let timeoutId = null;
      let settled = false;
      const settle = (finish) => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeoutId);
        // Only clear the shared slots if they still belong to this attempt.
        if (this.readyResolver === resolveReady) {
          this.readyResolver = null;
          this.readyRejector = null;
        }
        finish();
      };
      const resolveReady = () => settle(() => resolve());
      const rejectReady = (error) => settle(() => reject(error));
      this.readyResolver = resolveReady;
      this.readyRejector = rejectReady;

      // Safety net: do not wait forever for the server's "ready" message.
      timeoutId = setTimeout(() => {
        if (!settled) {
          console.warn(`[NativeVoice] Connection timeout (${CONNECTION_TIMEOUT_MS}ms), forcing ready`);
          resolveReady();
        }
      }, CONNECTION_TIMEOUT_MS);

      ws.onopen = () => {
        this.emitConnectionState('connected');
        ws.send(JSON.stringify(startMessage));
      };

      ws.onerror = () => {
        const error = new Error('WebSocket connection failed');
        this.callbacks.onError?.(error);
        rejectReady(error);
      };

      ws.onclose = () => {
        this.emitConnectionState('disconnected');
        rejectReady(new Error('WebSocket closed before ready'));
      };

      ws.onmessage = (event) => {
        if (typeof event.data === 'string') {
          this.handleJsonMessage(event.data);
          return;
        }
        this.handleAudioMessage(event.data);
      };
    });
  }

  /**
   * Parses a text frame and routes it to the matching listener.
   * Text that is not a JSON object is reported as a `ws_raw_text` diagnostic;
   * unknown message types are reported as `ws_message`.
   *
   * @param {string} raw
   */
  handleJsonMessage(raw) {
    let msg;
    try {
      msg = JSON.parse(raw);
    } catch (error) {
      this.emitDiagnostic('ws_raw_text', raw);
      return;
    }
    if (msg === null || typeof msg !== 'object') {
      this.emitDiagnostic('ws_raw_text', raw);
      return;
    }
    try {
      this._dispatchMessage(msg);
    } catch (error) {
      // A throwing listener must not look like a malformed frame, and must not
      // break the socket's message loop.
      console.error('[NativeVoice] message handler failed:', error);
    }
  }

  /** Routes one parsed control message to its listener. */
  _dispatchMessage(msg) {
    switch (msg.type) {
      case 'ready':
        this.readyResolver?.();
        this.readyResolver = null;
        this.readyRejector = null;
        return;
      case 'subtitle':
        this.callbacks.onSubtitle?.({
          text: msg.text,
          role: msg.role,
          isFinal: !!msg.isFinal,
          sequence: msg.sequence,
        });
        return;
      case 'tts_reset':
        this.resetPlaybackQueue();
        this.emitDiagnostic('tts_reset', msg);
        return;
      case 'assistant_pending':
        this.callbacks.onAssistantPending?.(!!msg.active);
        return;
      case 'idle_timeout':
        this.callbacks.onIdleTimeout?.(msg.timeout || 300000);
        return;
      case 'product_link':
        this.callbacks.onProductLink?.({
          product: msg.product,
          link: msg.link,
          description: msg.description,
        });
        return;
      case 'upstream_closed':
        this.callbacks.onError?.(new Error('语音服务已断开,请重新开始通话'));
        return;
      case 'error':
        this.callbacks.onError?.(new Error(msg.error || 'native voice error'));
        return;
      default:
        this.emitDiagnostic('ws_message', msg);
    }
  }

  /**
   * Plays a binary audio frame, or buffers it and attempts to resume the
   * playback context when that context is suspended.
   *
   * @param {ArrayBuffer} arrayBuffer
   */
  handleAudioMessage(arrayBuffer) {
    if (!this.playbackContext) {
      return;
    }
    if (this.playbackContext.state === 'suspended') {
      this.pendingAudioChunks.push(arrayBuffer);
      this._tryResumePlayback();
      return;
    }
    this._playPcm(arrayBuffer);
  }

  /**
   * Schedules one PCM16 chunk directly after the previously scheduled one.
   * Failures are logged and the chunk is dropped.
   *
   * @param {ArrayBuffer} arrayBuffer - interpreted as 16-bit samples at 24 kHz, mono.
   */
  _playPcm(arrayBuffer) {
    try {
      const pcm16 = new Int16Array(arrayBuffer);
      if (!pcm16.length) {
        return;
      }
      const audioBuffer = this.playbackContext.createBuffer(1, pcm16.length, 24000);
      const channel = audioBuffer.getChannelData(0);
      for (let i = 0; i < pcm16.length; i += 1) {
        channel[i] = pcm16[i] / 32768;
      }

      const source = this.playbackContext.createBufferSource();
      source.buffer = audioBuffer;
      source.connect(this.playbackContext.destination);
      this.activeSources.add(source);
      source.onended = () => {
        this.activeSources.delete(source);
        try {
          source.disconnect();
        } catch (_) {
          // Best effort: the source may already be disconnected.
        }
      };

      // If playback fell behind, restart the schedule slightly ahead of "now".
      const now = this.playbackContext.currentTime;
      if (this.playbackTime < now) {
        this.playbackTime = now + 0.02;
      }
      source.start(this.playbackTime);
      this.playbackTime += audioBuffer.duration;
      this.emitDiagnostic('audio_chunk', { samples: pcm16.length, duration: audioBuffer.duration });
    } catch (err) {
      console.warn('[NativeVoice] playPcm failed:', err.message);
    }
  }

  /**
   * Resumes the suspended playback context and flushes the buffered chunks.
   * Only one attempt runs at a time.
   */
  async _tryResumePlayback() {
    if (this._resuming) return;
    const context = this.playbackContext;
    if (!context) return;
    this._resuming = true;
    try {
      await context.resume();
      // The session may have been torn down or replaced while waiting.
      if (this.playbackContext !== context) {
        return;
      }
      while (this.pendingAudioChunks.length > 0) {
        this._playPcm(this.pendingAudioChunks.shift());
      }
    } catch (e) {
      console.warn('[NativeVoice] resume failed:', e.message);
    } finally {
      // disconnect() already reset the flag if the session changed.
      if (this.playbackContext === context) {
        this._resuming = false;
      }
    }
  }

  /**
   * Starts streaming the microphone to the server.
   *
   * @param {MediaStream|null} [preFetchedStream] - stream to reuse; when falsy
   *   a new one is requested via getUserMedia (which may reject).
   */
  async startCapture(preFetchedStream) {
    this.mediaStream = preFetchedStream || await navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        noiseSuppression: true,
        echoCancellation: true,
        autoGainControl: true,
      },
      video: false,
    });

    this.captureContext = new (window.AudioContext || window.webkitAudioContext)();
    this.captureSource = this.captureContext.createMediaStreamSource(this.mediaStream);
    this.captureProcessor = this.captureContext.createScriptProcessor(4096, 1, 1);
    // The processor is routed to the destination through a zero-gain node, so
    // the captured audio is never audible.
    this.captureSilenceGain = this.captureContext.createGain();
    this.captureSilenceGain.gain.value = 0;

    this.captureProcessor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0);
      const downsampled = this.downsampleBuffer(input, this.captureContext.sampleRate, 16000);
      for (let i = 0; i < downsampled.length; i += 1) {
        this.pendingSamples.push(downsampled[i]);
      }
      // Emit fixed 320-sample frames; any remainder waits for the next callback.
      while (this.pendingSamples.length >= 320) {
        const pcm = this._floatToPcm16(this.pendingSamples.splice(0, 320));
        if (this.ws && this.ws.readyState === WebSocket.OPEN) {
          this.ws.send(pcm.buffer);
        }
      }
    };

    this.captureSource.connect(this.captureProcessor);
    this.captureProcessor.connect(this.captureSilenceGain);
    this.captureSilenceGain.connect(this.captureContext.destination);
  }

  /**
   * Converts float samples to 16-bit PCM, clamping to [-1, 1] first.
   *
   * @param {ArrayLike<number>} samples
   * @returns {Int16Array}
   */
  _floatToPcm16(samples) {
    const pcm = new Int16Array(samples.length);
    for (let i = 0; i < samples.length; i += 1) {
      const sample = Math.max(-1, Math.min(1, samples[i]));
      pcm[i] = sample < 0 ? sample * 32768 : sample * 32767;
    }
    return pcm;
  }

  /**
   * Downsamples by averaging each group of input samples that maps onto one
   * output sample. Returns a plain copy when no downsampling is needed.
   *
   * @param {ArrayLike<number>} buffer
   * @param {number} inputRate
   * @param {number} outputRate
   * @returns {number[]}
   */
  downsampleBuffer(buffer, inputRate, outputRate) {
    if (outputRate >= inputRate) {
      return Array.from(buffer);
    }
    const sampleRateRatio = inputRate / outputRate;
    const result = new Array(Math.round(buffer.length / sampleRateRatio));
    let offsetBuffer = 0;
    for (let offsetResult = 0; offsetResult < result.length; offsetResult += 1) {
      const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
      let accum = 0;
      let count = 0;
      for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i += 1) {
        accum += buffer[i];
        count += 1;
      }
      result[offsetResult] = count > 0 ? accum / count : 0;
      offsetBuffer = nextOffsetBuffer;
    }
    return result;
  }

  /** Enables or disables the microphone tracks without tearing the session down. */
  async setMuted(muted) {
    this.mediaStream?.getAudioTracks().forEach((track) => {
      track.enabled = !muted;
    });
  }

  /** Sends a `replay_greeting` request when the socket is open. */
  requestGreetingReplay() {
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      this.ws.send(JSON.stringify({ type: 'replay_greeting' }));
      this.emitDiagnostic('replay_greeting', { sent: true });
    }
  }

  /**
   * Releases capture, socket and playback resources, rejects a connection
   * attempt that is still waiting for readiness, and emits `disconnected`.
   * Safe to call when nothing is connected.
   */
  async disconnect() {
    if (this.captureProcessor) {
      this.captureProcessor.disconnect();
      this.captureProcessor.onaudioprocess = null;
      this.captureProcessor = null;
    }
    if (this.captureSource) {
      this.captureSource.disconnect();
      this.captureSource = null;
    }
    if (this.captureSilenceGain) {
      this.captureSilenceGain.disconnect();
      this.captureSilenceGain = null;
    }
    if (this.captureContext) {
      await this.captureContext.close().catch(() => {});
      this.captureContext = null;
    }
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach((track) => track.stop());
      this.mediaStream = null;
    }

    if (this.ws) {
      const ws = this.ws;
      this.ws = null;
      // Detach handlers so late events from this socket cannot emit state
      // changes or feed audio into whatever session comes next.
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;
      try {
        if (ws.readyState === WebSocket.OPEN) {
          ws.send(JSON.stringify({ type: 'stop' }));
          ws.close();
        } else if (ws.readyState === WebSocket.CONNECTING) {
          // Abort the handshake; otherwise the socket would still open later.
          ws.close();
        }
      } catch (_) {
        // Best effort: the socket is being discarded either way.
      }
    }

    // With the handlers detached nothing else would settle a pending connect().
    const rejectPending = this.readyRejector;
    this.readyResolver = null;
    this.readyRejector = null;
    rejectPending?.(new Error('WebSocket closed before ready'));

    if (this.playbackContext) {
      this.resetPlaybackQueue();
      await this.playbackContext.close().catch(() => {});
      this.playbackContext = null;
    }
    this.playbackTime = 0;
    this.pendingSamples = [];
    this.pendingAudioChunks = [];
    this._resuming = false;
    this.emitConnectionState('disconnected');
  }

  /**
   * Registers the listener for one of the keys declared in `callbacks`
   * (e.g. `onSubtitle`). Unknown names are ignored.
   */
  on(event, callback) {
    // Own-property check: `in` would also accept inherited names such as "toString".
    if (Object.prototype.hasOwnProperty.call(this.callbacks, event)) {
      this.callbacks[event] = callback;
    }
  }

  /** Removes the listener for one of the keys declared in `callbacks`. */
  off(event) {
    if (Object.prototype.hasOwnProperty.call(this.callbacks, event)) {
      this.callbacks[event] = null;
    }
  }
}

// Module-level singleton: the default export is this one shared instance.
const nativeVoiceService = new NativeVoiceService();

export default nativeVoiceService;