Files
bigwo/test2/server/routes/voice.js
2026-03-12 12:47:56 +08:00

560 lines
23 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const express = require('express');
const router = express.Router();
const { v4: uuidv4 } = require('uuid');
const volcengine = require('../services/volcengine');
const VoiceChatConfigBuilder = require('../config/voiceChatConfig');
const ToolExecutor = require('../services/toolExecutor');
const DEFAULT_TOOLS = require('../config/tools');
const db = require('../db');
// ---- Module-level in-memory state shared by the route handlers below ----
// sessionId -> session record created by POST /prepare:
// { roomId, taskId, userId, startTime, subtitles, started }; POST /start also sets botUserId.
const activeSessions = new Map();
// sessionId -> { subtitles, duration, endTime }; written by POST /stop when the session
// has subtitles, and deleted again by a 30-minute timer.
const completedSessions = new Map();
// roomId -> bot user id returned by VoiceChatConfigBuilder.build (set in POST /start).
const roomToBotUserId = new Map();
// roomId -> user id of the human participant (set in POST /start).
const roomToHumanUserId = new Map();
// roomId -> sessionId (set in POST /prepare).
const roomToSessionId = new Map();
// roomId -> task id; set in POST /prepare and overwritten in POST /start when the
// start response carries a different TaskId.
const roomToTaskId = new Map();
// roomId -> { text, timestamp, source? }: the most recent user utterance seen for the room,
// written by POST /subtitle, POST /fc_callback and POST /room_message.
const latestUserSpeech = new Map();
// callback task id -> buffer used by POST /fc_callback to collect tool-call chunks:
// { id, name, chunks, triggered, RoomID, AppID, S2STaskID, createdAt, timer } (+ resultSentAt).
const toolCallBuffers = new Map();
/**
 * GET /config — returns the selectable model versions, the speaker voices and
 * the name/description of every entry in DEFAULT_TOOLS.
 */
router.get('/config', (req, res) => {
  const models = [
    { value: '1.2.1.0', label: 'O2.0(推荐,精品音质)' },
    { value: 'O', label: 'O基础版' },
    { value: '2.2.0.0', label: 'SC2.0(推荐,声音复刻)' },
    { value: 'SC', label: 'SC基础版' },
  ];
  const speakers = [
    { value: 'zh_female_vv_jupiter_bigtts', label: 'VV活泼女声', series: 'O' },
    { value: 'zh_female_xiaohe_jupiter_bigtts', label: '小禾(甜美女声·台湾口音)', series: 'O' },
    { value: 'zh_male_yunzhou_jupiter_bigtts', label: '云舟(沉稳男声)', series: 'O' },
    { value: 'zh_male_xiaotian_jupiter_bigtts', label: '小天(磁性男声)', series: 'O' },
    { value: 'saturn_common_female_1', label: 'Saturn 女声1', series: 'SC2.0' },
    { value: 'saturn_common_male_1', label: 'Saturn 男声1', series: 'SC2.0' },
    { value: 'ICL_common_female_1', label: 'ICL 女声1', series: 'SC' },
    { value: 'ICL_common_male_1', label: 'ICL 男声1', series: 'SC' },
  ];
  // Only the public-facing fields of each tool definition are exposed.
  const tools = DEFAULT_TOOLS.map((tool) => {
    const fn = tool.function;
    return { name: fn.name, description: fn.description };
  });
  res.json({ success: true, data: { models, speakers, tools } });
});
/**
 * POST /prepare — allocates a new voice session for `userId`.
 *
 * Generates a session id, derives the room and task ids from it, requests an
 * RTC token, registers the session in the in-memory maps and records it in the
 * database (best effort). Responds with the ids, the token and the RTC app id.
 * Responds 400 when `userId` is missing and 500 on any other failure.
 */
router.post('/prepare', async (req, res) => {
  try {
    const { userId } = req.body;
    if (!userId) {
      res.status(400).json({ success: false, error: 'userId is required' });
      return;
    }

    const sessionId = uuidv4();
    const shortId = sessionId.slice(0, 8);
    const roomId = `room_${shortId}`;
    const taskId = `task_${shortId}_${Date.now()}`;
    const rtcToken = volcengine.generateRTCToken(roomId, userId);

    const sessionRecord = {
      roomId,
      taskId,
      userId,
      startTime: Date.now(),
      subtitles: [],
      started: false,
    };
    activeSessions.set(sessionId, sessionRecord);
    roomToTaskId.set(roomId, taskId);
    roomToSessionId.set(roomId, sessionId);
    console.log(`[Voice] Session prepared: ${sessionId}, room: ${roomId}, user: ${userId}`);

    // A database failure is logged but does not fail the request.
    try {
      await db.createSession(sessionId, userId, 'voice');
    } catch (dbErr) {
      console.warn('[DB] createSession failed:', dbErr.message);
    }

    const data = {
      sessionId,
      roomId,
      taskId,
      rtcToken,
      rtcAppId: process.env.VOLC_RTC_APP_ID,
    };
    res.json({ success: true, data });
  } catch (error) {
    console.error('[Voice] Prepare failed:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /start — starts the voice chat for a session created by POST /prepare.
 *
 * Responds 400 without `sessionId`, 404 for an unknown session, and a success
 * message when the session is already started. When the request carries no
 * chat history, up to 20 messages are loaded from the database instead. On
 * failure a remote stop is attempted for the session and 500 is returned.
 */
router.post('/start', async (req, res) => {
  // Declared outside the try so the catch block can still reach the session.
  let session = null;
  try {
    const {
      sessionId,
      botName,
      systemRole,
      speakingStyle,
      modelVersion,
      speaker,
      enableWebSearch,
      chatHistory,
    } = req.body;

    if (!sessionId) {
      res.status(400).json({ success: false, error: 'sessionId is required' });
      return;
    }
    session = activeSessions.get(sessionId);
    if (!session) {
      res.status(404).json({ success: false, error: 'Session not found' });
      return;
    }
    if (session.started) {
      res.json({ success: true, data: { message: 'Already started' } });
      return;
    }

    // Prefer the history sent by the client; otherwise try the stored one.
    let effectiveChatHistory = chatHistory;
    const clientHistoryEmpty = !chatHistory || chatHistory.length === 0;
    if (clientHistoryEmpty) {
      try {
        const storedHistory = await db.getHistoryForLLM(sessionId, 20);
        if (storedHistory.length > 0) {
          effectiveChatHistory = storedHistory;
          console.log(`[Voice] Loaded ${storedHistory.length} messages from DB for session ${sessionId}`);
        }
      } catch (dbErr) {
        console.warn('[DB] getHistoryForLLM failed:', dbErr.message);
      }
    }
    const historyCount = effectiveChatHistory ? effectiveChatHistory.length : 'undefined';
    console.log(`[Voice] chatHistory: ${historyCount} messages`);

    const buildOptions = {
      roomId: session.roomId,
      taskId: session.taskId,
      userId: session.userId,
      botName,
      systemRole,
      speakingStyle,
      modelVersion,
      speaker,
      tools: DEFAULT_TOOLS,
      enableWebSearch,
      chatHistory: effectiveChatHistory,
    };
    const { config, botUserId } = VoiceChatConfigBuilder.build(buildOptions);

    session.botUserId = botUserId;
    roomToBotUserId.set(session.roomId, botUserId);
    roomToHumanUserId.set(session.roomId, session.userId);
    console.log(`[Voice] room=${session.roomId} botUserId=${botUserId} humanUserId=${session.userId}`);

    const result = await volcengine.startVoiceChat(config);
    session.started = true;

    // Adopt the TaskId from the start response when it differs from ours.
    const serverResult = result?.Result;
    const serverTaskId = serverResult?.TaskId || serverResult?.task_id;
    if (serverTaskId && serverTaskId !== session.taskId) {
      console.log(`[Voice] Server assigned different TaskId: ${serverTaskId} (ours: ${session.taskId})`);
      roomToTaskId.set(session.roomId, serverTaskId);
      session.taskId = serverTaskId;
    }

    console.log(`[Voice] Session started: ${sessionId}, TaskId=${session.taskId}`);
    res.json({ success: true, data: { startResult: result } });
  } catch (error) {
    const detail = error.response?.data || error.message;
    console.error('[Voice] Start failed:', JSON.stringify(detail, null, 2));

    // Best-effort remote stop so a half-started task is not left running.
    if (session) {
      try {
        await volcengine.stopVoiceChat({
          AppId: process.env.VOLC_RTC_APP_ID,
          RoomId: session.roomId,
          TaskId: session.taskId,
        });
        console.log(`[Voice] Stopped failed session`);
      } catch (stopErr) {
        console.warn('[Voice] Stop failed during error handling:', stopErr.message);
      }
    }

    const errorText = typeof detail === 'string' ? detail : JSON.stringify(detail);
    res.status(500).json({ success: false, error: errorText });
  }
});
/**
 * POST /stop — stops the voice chat of an active session and releases its
 * in-memory state.
 *
 * Body: { sessionId }. For an unknown (or already stopped) session a success
 * message is returned. Otherwise the remote task is stopped, the subtitles are
 * kept in `completedSessions` for 30 minutes (only when there are any), every
 * per-room map entry is removed and the duration plus subtitles are returned.
 * When the remote stop fails the session is left in place and 500 is returned.
 */
router.post('/stop', async (req, res) => {
  try {
    const { sessionId } = req.body;
    const session = activeSessions.get(sessionId);
    if (!session) {
      res.json({ success: true, data: { message: 'Session not found or already stopped' } });
      return;
    }

    await volcengine.stopVoiceChat({
      AppId: process.env.VOLC_RTC_APP_ID,
      RoomId: session.roomId,
      TaskId: session.taskId,
    });

    const { roomId, subtitles } = session;
    const duration = Math.floor((Date.now() - session.startTime) / 1000);
    console.log(`[Voice] Session stopped: ${sessionId}, duration: ${duration}s, subtitles: ${subtitles.length}`);

    if (subtitles.length > 0) {
      completedSessions.set(sessionId, {
        subtitles,
        duration,
        endTime: Date.now(),
      });
      setTimeout(() => completedSessions.delete(sessionId), 30 * 60 * 1000);
    }

    activeSessions.delete(sessionId);
    roomToTaskId.delete(roomId);
    roomToSessionId.delete(roomId);
    roomToBotUserId.delete(roomId);
    roomToHumanUserId.delete(roomId);
    // Also release the per-room speech cache and any tool-call buffers created
    // for this room; otherwise they accumulate for every session ever stopped.
    latestUserSpeech.delete(roomId);
    for (const [bufferKey, buffer] of toolCallBuffers) {
      if (buffer.RoomID === roomId) {
        if (buffer.timer) clearTimeout(buffer.timer);
        toolCallBuffers.delete(bufferKey);
      }
    }

    res.json({
      success: true,
      data: {
        duration,
        subtitleCount: subtitles.length,
        subtitles,
      },
    });
  } catch (error) {
    console.error('[Voice] Stop failed:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /subtitle — records one subtitle line reported by the client.
 *
 * Body: { sessionId, roomId, text, role, definite, sequence }. Only subtitles
 * with `definite` set and a non-empty `text` are processed. Any role other
 * than 'user' is stored as 'assistant'. The line is appended to the active
 * session (if any), persisted through `db.addMessage` (fire-and-forget), and
 * user lines additionally update `latestUserSpeech` for the room.
 * Always answers `{ success: true }` unless an exception occurs (500).
 */
router.post('/subtitle', (req, res) => {
  try {
    const { sessionId, roomId, text, role, definite, sequence } = req.body;
    // When the client omits sessionId, resolve it from the room mapping so the
    // subtitle is still attached to the right session and persisted.
    const sid = sessionId || (roomId ? roomToSessionId.get(roomId) : undefined);
    const session = sid ? activeSessions.get(sid) : undefined;

    if (definite && text) {
      const subtitleRole = role === 'user' ? 'user' : 'assistant';
      if (session) {
        session.subtitles.push({ text, role: subtitleRole, timestamp: Date.now(), sequence });
      }
      if (sid) {
        const source = subtitleRole === 'user' ? 'voice_asr' : 'voice_bot';
        // Persistence is best effort: a failure is logged, the request still succeeds.
        db.addMessage(sid, subtitleRole, text, source).catch((e) => console.warn('[DB] addMessage failed:', e.message));
      }
      if (subtitleRole === 'user') {
        const rid = roomId || (session && session.roomId) || '';
        if (rid) {
          latestUserSpeech.set(rid, { text, timestamp: Date.now() });
          console.log(`[Subtitle][user][${rid}] "${text}"`);
        }
      } else {
        console.log(`[Subtitle][assistant] ${text}`);
      }
    }
    res.json({ success: true });
  } catch (error) {
    console.error('[Subtitle] Error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * GET /subtitles/:sessionId — returns the subtitles collected for a session.
 *
 * Looks at the active session first and then at the record POST /stop keeps in
 * `completedSessions`, so subtitles stay readable after the session has been
 * stopped (until that record is removed). Unknown ids yield an empty array.
 */
router.get('/subtitles/:sessionId', (req, res) => {
  const { sessionId } = req.params;
  const record = activeSessions.get(sessionId) ?? completedSessions.get(sessionId);
  res.json({
    success: true,
    data: record ? record.subtitles : [],
  });
});
/**
 * Salvages human-readable text from raw (possibly malformed) argument chunks.
 *
 * The chunks are concatenated, `\\uXXXX` and `\uXXXX` escape sequences are
 * decoded, and then every run of CJK ideographs plus every ASCII word of two
 * or more letters is collected. A few JSON/tool-schema keywords are dropped.
 *
 * @param {string[]} chunks - Raw argument fragments, in order.
 * @returns {string} CJK runs followed by the remaining words, space-separated.
 */
function extractReadableText(chunks) {
  const fromHex = (_match, hex) => String.fromCharCode(parseInt(hex, 16));
  // Double-escaped sequences are decoded first, then single-escaped ones.
  const decoded = chunks
    .join('')
    .replace(/\\\\u([0-9a-fA-F]{4})/g, fromHex)
    .replace(/\\u([0-9a-fA-F]{4})/g, fromHex);

  const chineseChars = decoded.match(/[\u4e00-\u9fff\u3400-\u4dbf]+/g) ?? [];

  const skipWords = new Set(['id', 'type', 'function', 'name', 'arguments', 'query', 'object', 'string']);
  const englishWords = [];
  for (const word of decoded.match(/[a-zA-Z]{2,}/g) ?? []) {
    if (!skipWords.has(word.toLowerCase())) {
      englishWords.push(word);
    }
  }

  const result = chineseChars.concat(englishWords).join(' ').trim();
  console.log(`[FC] extractReadableText: chinese=[${chineseChars.join(',')}] english=[${englishWords.join(',')}] → "${result}"`);
  return result;
}
// Fallback sequence number for callbacks whose body carries no `_seq`; used for logging only.
let fcCallbackSeq = 0;
/**
 * POST /fc_callback — receives function-calling callbacks for a voice chat task.
 *
 * Three message shapes are handled:
 *  - Type "tool_calls" with an array Message ("FormatA"): chunks are buffered
 *    per task id; 500 ms after the last chunk the tool is executed and its
 *    result is sent back with `Command: 'function'`. The HTTP response is sent
 *    immediately, before the tool runs.
 *  - An object Message with event_type "function_calling" ("FormatB"): only
 *    logged and acknowledged.
 *  - Any other object Message: treated as possible conversation state; a user
 *    utterance found in it is stored in `latestUserSpeech` for the room.
 *
 * Responds 400 for an empty body, 500 on unexpected errors and
 * `{ success: true }` otherwise (including when Message cannot be parsed).
 */
router.post('/fc_callback', async (req, res) => {
  try {
    const body = req.body;
    if (!body || typeof body !== 'object' || Object.keys(body).length === 0) {
      console.error('[FC] Empty body');
      return res.status(400).json({ success: false, error: 'Empty body' });
    }
    // NOTE(review): the body may also carry a `Signature` field; it is not verified here.
    const { Message, Type, RoomID, TaskID, TaskType, AppID, AppId, room_id, task_id, roomId, taskId } = body;
    // The same identifiers may arrive under several spellings; take the first one present.
    const effectiveRoomId = RoomID || room_id || roomId;
    const effectiveTaskId = TaskID || task_id || taskId;
    const effectiveAppId = AppID || AppId || process.env.VOLC_RTC_APP_ID;
    const seq = body._seq || ++fcCallbackSeq;
    console.log(`[FC] >>> Callback received: seq=${seq} Type="${Type}" Room=${effectiveRoomId} Task=${effectiveTaskId} TaskType=${TaskType}`);

    let msgObj = null;
    try {
      msgObj = typeof Message === 'string' ? JSON.parse(Message) : Message;
    } catch (e) {
      console.error('[FC] Failed to parse Message:', e.message);
      return res.json({ success: true });
    }

    if (Type === 'tool_calls' && Array.isArray(msgObj) && msgObj.length > 0) {
      const tc = msgObj[0];
      const chunkId = tc.id || '';
      const chunkName = tc.function?.name || '';
      const chunkArgs = tc.function?.arguments || '';

      // A buffer that already fired acts as a cooldown marker for repeated calls.
      const existing = toolCallBuffers.get(effectiveTaskId);
      if (existing && existing.triggered) {
        const userSpeech = latestUserSpeech.get(effectiveRoomId);
        const hasNewInput = userSpeech && (Date.now() - userSpeech.timestamp < 10000);
        if (hasNewInput) {
          console.log(`[FC] [FormatA] New user input detected, clearing cooldown for room=${effectiveRoomId}`);
          toolCallBuffers.delete(effectiveTaskId);
        } else {
          // Cooldown is 30 s once a result has been sent (15 s otherwise) so the
          // LLM cannot retry endlessly while a knowledge-base query is running.
          const cooldownMs = existing.resultSentAt ? 30000 : 15000;
          const elapsed = existing.resultSentAt
            ? (Date.now() - existing.resultSentAt)
            : (Date.now() - existing.createdAt);
          if (elapsed < cooldownMs) {
            console.log(`[FC] [FormatA] Cooldown active (${elapsed}ms < ${cooldownMs}ms), ignoring retry for TaskID=${effectiveTaskId}`);
            res.json({ success: true });
            return;
          }
          console.log(`[FC] [FormatA] Cooldown expired (${elapsed}ms >= ${cooldownMs}ms), allowing new call for TaskID=${effectiveTaskId}`);
          toolCallBuffers.delete(effectiveTaskId);
        }
      }

      if (!toolCallBuffers.has(effectiveTaskId)) {
        toolCallBuffers.set(effectiveTaskId, {
          id: '', name: '', chunks: [], triggered: false,
          RoomID: effectiveRoomId, AppID: effectiveAppId, S2STaskID: effectiveTaskId, createdAt: Date.now(), timer: null,
        });
        console.log(`[FC] [FormatA] New buffer created for TaskID=${effectiveTaskId}, room=${effectiveRoomId}`);
      }
      const buf = toolCallBuffers.get(effectiveTaskId);
      if (chunkId && !buf.id) buf.id = chunkId;
      if (chunkName && !buf.name) buf.name = chunkName;
      if (chunkArgs) {
        buf.chunks.push({ seq: tc.seq || 0, args: chunkArgs });
      }

      // Acknowledge right away; the tool runs later from the timer below.
      res.json({ success: true });

      // Each new chunk restarts the 500 ms collection window.
      if (buf.timer) clearTimeout(buf.timer);
      buf.timer = setTimeout(async () => {
        const b = toolCallBuffers.get(effectiveTaskId);
        if (!b || b.triggered) return;
        b.triggered = true;
        const toolName = b.name || 'search_knowledge';
        // Everything below runs detached from the HTTP request, so any failure
        // (including the tool execution itself) must be caught here; otherwise
        // it would surface as an unhandled promise rejection.
        try {
          const sortedChunks = b.chunks.sort((left, right) => left.seq - right.seq);
          const allArgs = sortedChunks.map((c) => c.args).join('');
          console.log(`[FC] [FormatA] 500ms timeout, ${b.chunks.length} chunks collected, name="${toolName}"`);
          // Prefer the task id registered for the room over the one from the callback.
          const s2sTaskId = roomToTaskId.get(b.RoomID) || b.S2STaskID || effectiveTaskId;
          console.log(`[FC] TaskId resolution: roomToTaskId=${roomToTaskId.get(b.RoomID)} callback=${b.S2STaskID} → using=${s2sTaskId}`);

          // No separate interrupt command is sent before the query.
          let parsedArgs = null;
          try {
            parsedArgs = JSON.parse(allArgs);
            console.log(`[FC] [FormatA] JSON.parse succeeded: ${JSON.stringify(parsedArgs)}`);
          } catch (e) {
            // Arguments are not valid JSON: fall back to the latest user speech
            // (if recent), otherwise to whatever text can be salvaged from the chunks.
            const userSpeech = latestUserSpeech.get(b.RoomID);
            if (userSpeech && (Date.now() - userSpeech.timestamp < 30000)) {
              console.log(`[FC] [FormatA] Using ASR user speech: "${userSpeech.text}"`);
              parsedArgs = { query: userSpeech.text };
            } else {
              const extractedText = extractReadableText(b.chunks.map((c) => c.args));
              console.log(`[FC] [FormatA] No ASR text, extracted from chunks: "${extractedText}"`);
              parsedArgs = { query: extractedText || '' };
            }
          }

          console.log('[FC] ⚡ Starting KB query (no pre-query interrupt)');
          const result = await ToolExecutor.execute(toolName, parsedArgs);
          const resultStr = JSON.stringify(result);
          console.log(`[FC] Tool result (${toolName}): ${resultStr.substring(0, 500)}`);

          let contentText = resultStr;
          try {
            if (result && result.results && Array.isArray(result.results)) {
              contentText = result.results.map((r) => r.content || JSON.stringify(r)).join('\n');
            } else if (result && result.error) {
              contentText = result.error;
            } else if (typeof result === 'string') {
              contentText = result;
            }
          } catch (e) {
            // Unexpected result shape: keep the raw JSON string as the content.
          }

          const dbSessionId = roomToSessionId.get(b.RoomID);
          if (dbSessionId) {
            db.addMessage(dbSessionId, 'assistant', contentText, 'voice_tool', toolName)
              .catch((e) => console.warn('[DB] addMessage(tool) failed:', e.message));
          }
          console.log(`[FC] Knowledge base content (${contentText.length} chars): ${contentText.substring(0, 200)}${contentText.length > 200 ? '...' : ''}`);
          b.resultSentAt = Date.now();

          // Strategy: return the result to the LLM with Command:function only.
          // Rationale recorded by the original author:
          //  1. ExternalTextToSpeech produces no audible audio in S2S end-to-end mode.
          //  2. ExternalTextToSpeech with InterruptMode=1 cuts off the S2S reply being played.
          //  3. Command:function is the proper way to return results in custom FC mode.
          // Flow: Command:function -> LLM receives the tool result -> LLM replies -> S2S speaks it.
          const toolCallId = b.id || 'unknown_call_id';
          const functionContent = contentText.length > 1500
            ? contentText.substring(0, 1500) + '……(内容较长,以上为主要部分)'
            : contentText;
          const funcMsg = JSON.stringify({
            ToolCallID: toolCallId,
            Content: functionContent,
          });
          let activeTaskId = s2sTaskId;
          try {
            console.log(`[FC] ★ Sending Command:function (ToolCallID=${toolCallId}, content=${functionContent.length} chars)`);
            await volcengine.updateVoiceChat({
              AppId: effectiveAppId,
              RoomId: b.RoomID,
              TaskId: activeTaskId,
              Command: 'function',
              Message: funcMsg,
            });
            console.log('[FC] ✅ Command:function sent OK → LLM will generate S2S response with KB content');
          } catch (funcErr) {
            console.error('[FC] ✖ Command:function failed:', funcErr.message);
            // If the registered task id failed, retry once with the callback's task id.
            if (activeTaskId !== b.S2STaskID) {
              try {
                console.log(`[FC] Retrying Command:function with callback TaskID=${b.S2STaskID}`);
                activeTaskId = b.S2STaskID;
                await volcengine.updateVoiceChat({
                  AppId: effectiveAppId,
                  RoomId: b.RoomID,
                  TaskId: activeTaskId,
                  Command: 'function',
                  Message: funcMsg,
                });
                console.log('[FC] ✅ Command:function retry OK');
              } catch (retryErr) {
                console.error('[FC] ✖ Command:function retry also failed:', retryErr.message);
              }
            }
          }
          console.log(`[FC] Final result: Command:function sent (${functionContent.length} chars)`);
        } catch (err) {
          console.error(`[FC] Tool execution failed:`, err.message);
          console.error(`[FC] Error details:`, err);
        }
      }, 500);
      return;
    }

    if (msgObj && typeof msgObj === 'object' && !Array.isArray(msgObj)) {
      const eventType = msgObj.event_type;
      console.log(`[FC] [FormatB] event_type="${eventType}"`);
      if (eventType === 'function_calling') {
        const funcName = msgObj.function || '';
        const toolCallId = msgObj.tool_call_id || '';
        const responseId = msgObj.response_id || '';
        console.log(`[FC] [Information] FC notification: func=${funcName} toolCallId=${toolCallId} responseId=${responseId}`);
        res.json({ success: true });
        // Nothing is spoken here; the tool itself runs in the tool_calls branch
        // above, which returns its result through Command:function.
        console.log(`[FC] [Information] FC notification received, waiting for tool_calls`);
        return;
      }
    }

    if (msgObj && typeof msgObj === 'object') {
      const asrText = msgObj.text || msgObj.asr_text || msgObj.content ||
        msgObj.user_text || msgObj.transcript ||
        (msgObj.data && (msgObj.data.text || msgObj.data.asr_text || msgObj.data.content));
      const role = msgObj.role || msgObj.speaker || msgObj.data?.role || '';
      // A message without a role is treated as user speech.
      const isUser = !role || role === 'user' || role === 'human';
      // Use the resolved room id so callbacks that send `room_id`/`roomId`
      // populate the same cache the tool_calls branch reads from.
      if (asrText && isUser && effectiveRoomId) {
        latestUserSpeech.set(effectiveRoomId, { text: asrText, timestamp: Date.now() });
        console.log(`[FC] [ConvState] Stored user speech for ${effectiveRoomId}: "${asrText}"`);
      }
    }
    res.json({ success: true });
  } catch (error) {
    console.error('[FC] Error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});
/**
 * POST /room_message — inspects a raw room message and caches user speech.
 *
 * Body: { roomId, uid, text }. The JSON payload is taken from the first `[` or
 * `{` in `text`. Depending on what it contains, the latest user utterance for
 * the room is stored in `latestUserSpeech` with a `source` tag:
 *  - unparsable JSON: regex extraction of "text"/"userId" ('room_regex');
 *  - `{ data: [...] }`: every definite entry from a non-bot user ('room_subtitle');
 *  - any other object: the first text-like field, unless `uid` is a bot ('room_object').
 * The endpoint always answers `{ success: true }`, even on errors.
 */
router.post('/room_message', (req, res) => {
  const acknowledge = () => res.json({ success: true });
  const rememberSpeech = (roomId, text, source) => {
    latestUserSpeech.set(roomId, { text, timestamp: Date.now(), source });
  };

  try {
    const { roomId, uid, text } = req.body;
    if (!roomId || !text) {
      return acknowledge();
    }
    const payloadStart = text.search(/[\[{]/);
    if (payloadStart < 0) {
      return acknowledge();
    }
    const payload = text.substring(payloadStart);

    let parsed = null;
    try {
      parsed = JSON.parse(payload);
    } catch (parseErr) {
      // Not valid JSON: pull the fields out with regular expressions instead.
      const textMatch = payload.match(/"text"\s*:\s*"([^"]+)"/);
      if (textMatch && textMatch[1]) {
        const extractedText = textMatch[1];
        const userIdMatch = payload.match(/"userId"\s*:\s*"([^"]+)"/);
        const speakerId = userIdMatch ? userIdMatch[1] : '';
        const fromUser = speakerId && !speakerId.startsWith('bot_');
        if (fromUser && extractedText) {
          rememberSpeech(roomId, extractedText, 'room_regex');
          console.log(`[RoomMsg] ✅ Stored user speech (regex) for ${roomId}: "${extractedText}"`);
        }
      }
      return acknowledge();
    }

    if (Array.isArray(parsed?.data)) {
      for (const entry of parsed.data) {
        const entryText = entry.text || '';
        const entryUserId = entry.userId || entry.user_id || '';
        const finalised = entry.definite === true;
        const fromUser = entryUserId && !entryUserId.startsWith('bot_');
        if (entryText && fromUser && finalised) {
          rememberSpeech(roomId, entryText, 'room_subtitle');
          console.log(`[RoomMsg] ✅ Stored user speech for ${roomId}: "${entryText}"`);
        }
      }
      acknowledge();
      return;
    }

    if (parsed && typeof parsed === 'object') {
      const asrText = parsed.text || parsed.asr_text || parsed.content ||
        parsed.user_text || parsed.transcript ||
        (typeof parsed.data === 'string' && parsed.data ? parsed.data : null);
      const isBot = uid && uid.startsWith('bot_');
      if (asrText && !isBot) {
        rememberSpeech(roomId, asrText, 'room_object');
        console.log(`[RoomMsg] ✅ Stored user speech (obj) for ${roomId}: "${asrText}"`);
      }
    }
    acknowledge();
  } catch (error) {
    console.error('[RoomMsg] Error:', error.message);
    acknowledge();
  }
});
/**
 * POST /tool-callback — legacy endpoint; logs the body and tells the caller to
 * use fc_callback instead.
 */
router.post('/tool-callback', async (req, res) => {
  const receivedBody = JSON.stringify(req.body);
  console.log('[ToolCallback] Legacy callback received:', receivedBody);
  const reply = { success: true, message: 'deprecated, use fc_callback instead' };
  res.json(reply);
});
/**
 * GET /sessions — lists every active session with its room, user, elapsed
 * duration in whole seconds and number of collected subtitles.
 */
router.get('/sessions', (req, res) => {
  const data = Array.from(activeSessions, ([sessionId, info]) => ({
    sessionId,
    roomId: info.roomId,
    userId: info.userId,
    duration: Math.floor((Date.now() - info.startTime) / 1000),
    subtitleCount: info.subtitles.length,
  }));
  res.json({ success: true, data });
});
module.exports = router;