/**
 * Voice chat HTTP routes.
 *
 * Exposes endpoints to prepare/start/stop a voice session, to ingest subtitles
 * and room messages, and to receive function-calling callbacks (`/fc_callback`)
 * whose tool results are sent back through `volcengine.updateVoiceChat` with
 * `Command: 'function'`.
 *
 * All state below is kept in process memory (plain Maps), so it is lost on
 * restart and is not shared between processes.
 */
const express = require('express');
const router = express.Router();
const { v4: uuidv4 } = require('uuid');
const volcengine = require('../services/volcengine');
const VoiceChatConfigBuilder = require('../config/voiceChatConfig');
const ToolExecutor = require('../services/toolExecutor');
const DEFAULT_TOOLS = require('../config/tools');
const db = require('../db');

// sessionId -> { roomId, taskId, userId, startTime, subtitles, started, botUserId? }
const activeSessions = new Map();
// sessionId -> { subtitles, duration, endTime }; entries expire 30 minutes after /stop.
const completedSessions = new Map();
// roomId -> bot user id returned by VoiceChatConfigBuilder.build
const roomToBotUserId = new Map();
// roomId -> human user id
const roomToHumanUserId = new Map();
// roomId -> sessionId
const roomToSessionId = new Map();
// roomId -> task id (replaced by the server-assigned id when /start returns one)
const roomToTaskId = new Map();
// roomId -> { text, timestamp, source? } of the most recent user utterance
const latestUserSpeech = new Map();
// callback task id -> buffer collecting streamed tool-call chunks
const toolCallBuffers = new Map();

/**
 * Drops every piece of per-room state held by this module, including pending
 * tool-call buffers (and their timers) that belong to the room.
 *
 * @param {string} roomId - Room whose state should be released.
 */
function releaseRoomState(roomId) {
  roomToTaskId.delete(roomId);
  roomToSessionId.delete(roomId);
  roomToBotUserId.delete(roomId);
  roomToHumanUserId.delete(roomId);
  latestUserSpeech.delete(roomId);
  for (const [taskKey, buffer] of toolCallBuffers) {
    if (buffer.RoomID === roomId) {
      if (buffer.timer) clearTimeout(buffer.timer);
      toolCallBuffers.delete(taskKey);
    }
  }
}

// GET /config — static lists of models and speakers plus the configured tools.
router.get('/config', (req, res) => {
  res.json({
    success: true,
    data: {
      models: [
        { value: '1.2.1.0', label: 'O2.0(推荐,精品音质)' },
        { value: 'O', label: 'O(基础版)' },
        { value: '2.2.0.0', label: 'SC2.0(推荐,声音复刻)' },
        { value: 'SC', label: 'SC(基础版)' },
      ],
      speakers: [
        { value: 'zh_female_vv_jupiter_bigtts', label: 'VV(活泼女声)', series: 'O' },
        { value: 'zh_female_xiaohe_jupiter_bigtts', label: '小禾(甜美女声·台湾口音)', series: 'O' },
        { value: 'zh_male_yunzhou_jupiter_bigtts', label: '云舟(沉稳男声)', series: 'O' },
        { value: 'zh_male_xiaotian_jupiter_bigtts', label: '小天(磁性男声)', series: 'O' },
        { value: 'saturn_common_female_1', label: 'Saturn 女声1', series: 'SC2.0' },
        { value: 'saturn_common_male_1', label: 'Saturn 男声1', series: 'SC2.0' },
        { value: 'ICL_common_female_1', label: 'ICL 女声1', series: 'SC' },
        { value: 'ICL_common_male_1', label: 'ICL 男声1', series: 'SC' },
      ],
      tools: DEFAULT_TOOLS.map((t) => ({
        name: t.function.name,
        description: t.function.description,
      })),
    },
  });
});

// POST /prepare — allocates session/room/task ids and an RTC token for `userId`.
router.post('/prepare', async (req, res) => {
  try {
    const { userId } = req.body;
    if (!userId) {
      return res.status(400).json({ success: false, error: 'userId is required' });
    }

    const sessionId = uuidv4();
    const roomId = `room_${sessionId.slice(0, 8)}`;
    const taskId = `task_${sessionId.slice(0, 8)}_${Date.now()}`;
    const rtcToken = volcengine.generateRTCToken(roomId, userId);

    activeSessions.set(sessionId, {
      roomId,
      taskId,
      userId,
      startTime: Date.now(),
      subtitles: [],
      started: false,
    });
    roomToTaskId.set(roomId, taskId);
    roomToSessionId.set(roomId, sessionId);
    console.log(`[Voice] Session prepared: ${sessionId}, room: ${roomId}, user: ${userId}`);

    // Persistence is best-effort: a DB failure must not block the call setup.
    try {
      await db.createSession(sessionId, userId, 'voice');
    } catch (e) {
      console.warn('[DB] createSession failed:', e.message);
    }

    res.json({
      success: true,
      data: {
        sessionId,
        roomId,
        taskId,
        rtcToken,
        rtcAppId: process.env.VOLC_RTC_APP_ID,
      },
    });
  } catch (error) {
    console.error('[Voice] Prepare failed:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

// POST /start — builds the voice chat config for a prepared session and starts it.
router.post('/start', async (req, res) => {
  let session = null;
  try {
    const {
      sessionId,
      botName,
      systemRole,
      speakingStyle,
      modelVersion,
      speaker,
      enableWebSearch,
      chatHistory,
    } = req.body;

    if (!sessionId) {
      return res.status(400).json({ success: false, error: 'sessionId is required' });
    }
    session = activeSessions.get(sessionId);
    if (!session) {
      return res.status(404).json({ success: false, error: 'Session not found' });
    }
    if (session.started) {
      return res.json({ success: true, data: { message: 'Already started' } });
    }

    // When the client sends no history, fall back to what the DB has stored.
    let effectiveChatHistory = chatHistory;
    if ((!chatHistory || chatHistory.length === 0) && sessionId) {
      try {
        const dbHistory = await db.getHistoryForLLM(sessionId, 20);
        if (dbHistory.length > 0) {
          effectiveChatHistory = dbHistory;
          console.log(`[Voice] Loaded ${dbHistory.length} messages from DB for session ${sessionId}`);
        }
      } catch (e) {
        console.warn('[DB] getHistoryForLLM failed:', e.message);
      }
    }
    console.log(`[Voice] chatHistory: ${effectiveChatHistory ? effectiveChatHistory.length : 'undefined'} messages`);

    const { config, botUserId } = VoiceChatConfigBuilder.build({
      roomId: session.roomId,
      taskId: session.taskId,
      userId: session.userId,
      botName,
      systemRole,
      speakingStyle,
      modelVersion,
      speaker,
      tools: DEFAULT_TOOLS,
      enableWebSearch,
      chatHistory: effectiveChatHistory,
    });

    session.botUserId = botUserId;
    roomToBotUserId.set(session.roomId, botUserId);
    roomToHumanUserId.set(session.roomId, session.userId);
    console.log(`[Voice] room=${session.roomId} botUserId=${botUserId} humanUserId=${session.userId}`);

    const result = await volcengine.startVoiceChat(config);
    session.started = true;

    // Pick up a different TaskId if the server assigned one.
    const serverTaskId = result?.Result?.TaskId || result?.Result?.task_id;
    if (serverTaskId && serverTaskId !== session.taskId) {
      console.log(`[Voice] Server assigned different TaskId: ${serverTaskId} (ours: ${session.taskId})`);
      roomToTaskId.set(session.roomId, serverTaskId);
      session.taskId = serverTaskId;
    }
    console.log(`[Voice] Session started: ${sessionId}, TaskId=${session.taskId}`);

    res.json({
      success: true,
      data: { startResult: result },
    });
  } catch (error) {
    const detail = error.response?.data || error.message;
    console.error('[Voice] Start failed:', JSON.stringify(detail, null, 2));

    // Try to stop whatever may have been started remotely before reporting the failure.
    if (session) {
      try {
        await volcengine.stopVoiceChat({
          AppId: process.env.VOLC_RTC_APP_ID,
          RoomId: session.roomId,
          TaskId: session.taskId,
        });
        console.log(`[Voice] Stopped failed session`);
      } catch (stopErr) {
        console.warn('[Voice] Stop failed during error handling:', stopErr.message);
      }
    }
    res.status(500).json({ success: false, error: typeof detail === 'string' ? detail : JSON.stringify(detail) });
  }
});

// POST /stop — stops the remote voice chat and releases all local state for the session.
router.post('/stop', async (req, res) => {
  try {
    const { sessionId } = req.body;
    const session = activeSessions.get(sessionId);

    if (!session) {
      res.json({ success: true, data: { message: 'Session not found or already stopped' } });
      return;
    }

    // If this throws, the session is intentionally left in place so the client can retry.
    await volcengine.stopVoiceChat({
      AppId: process.env.VOLC_RTC_APP_ID,
      RoomId: session.roomId,
      TaskId: session.taskId,
    });

    const duration = Math.floor((Date.now() - session.startTime) / 1000);
    console.log(`[Voice] Session stopped: ${sessionId}, duration: ${duration}s, subtitles: ${session.subtitles.length}`);

    if (session.subtitles.length > 0) {
      completedSessions.set(sessionId, {
        subtitles: session.subtitles,
        duration,
        endTime: Date.now(),
      });
      setTimeout(() => completedSessions.delete(sessionId), 30 * 60 * 1000);
    }

    activeSessions.delete(sessionId);
    // Also clears latestUserSpeech and tool-call buffers, which would otherwise leak.
    releaseRoomState(session.roomId);

    res.json({
      success: true,
      data: {
        duration,
        subtitleCount: session.subtitles.length,
        subtitles: session.subtitles,
      },
    });
  } catch (error) {
    console.error('[Voice] Stop failed:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

// POST /subtitle — records a final ("definite") subtitle line for a session.
router.post('/subtitle', (req, res) => {
  try {
    const { sessionId, roomId, text, role, definite, sequence } = req.body;

    // Resolve the session id from the room when the client did not send one.
    const sid = sessionId || (roomId ? roomToSessionId.get(roomId) : undefined);
    const session = sid ? activeSessions.get(sid) : undefined;

    if (definite && text) {
      const subtitleRole = role === 'user' ? 'user' : 'assistant';

      if (session) {
        session.subtitles.push({ text, role: subtitleRole, timestamp: Date.now(), sequence });
      }

      if (sid) {
        const source = subtitleRole === 'user' ? 'voice_asr' : 'voice_bot';
        db.addMessage(sid, subtitleRole, text, source)
          .catch((e) => console.warn('[DB] addMessage failed:', e.message));
      }

      if (subtitleRole === 'user') {
        const rid = roomId || (session && session.roomId) || '';
        if (rid) {
          latestUserSpeech.set(rid, { text, timestamp: Date.now() });
          console.log(`[Subtitle][user][${rid}] "${text}"`);
        }
      } else {
        console.log(`[Subtitle][assistant] ${text}`);
      }
    }

    res.json({ success: true });
  } catch (error) {
    console.error('[Subtitle] Error:', error.message);
    res.status(500).json({ success: false, error: error.message });
  }
});

// GET /subtitles/:sessionId — subtitles of an active session, or of a session
// stopped within the last 30 minutes; an empty list when neither is known.
router.get('/subtitles/:sessionId', (req, res) => {
  const { sessionId } = req.params;
  const record = activeSessions.get(sessionId) || completedSessions.get(sessionId);
  res.json({
    success: true,
    data: record ? record.subtitles : [],
  });
});

/**
 * Salvages human-readable text from tool-call argument fragments that could
 * not be parsed as JSON.
 *
 * Decodes `\\uXXXX` and `\uXXXX` escape sequences, then collects runs of CJK
 * characters followed by ASCII words of two or more letters, skipping common
 * JSON-schema keywords.
 *
 * @param {string[]} chunks - Raw argument fragments, in order.
 * @returns {string} Extracted parts joined by single spaces ('' if none).
 */
function extractReadableText(chunks) {
  const raw = chunks.join('');
  const toChar = (_, hex) => String.fromCharCode(parseInt(hex, 16));

  // Two passes on purpose: double-escaped sequences first, then single-escaped ones.
  const decoded = raw
    .replace(/\\\\u([0-9a-fA-F]{4})/g, toChar)
    .replace(/\\u([0-9a-fA-F]{4})/g, toChar);

  const chineseChars = decoded.match(/[\u4e00-\u9fff\u3400-\u4dbf]+/g) || [];
  const skipWords = new Set(['id', 'type', 'function', 'name', 'arguments', 'query', 'object', 'string']);
  const englishWords = (decoded.match(/[a-zA-Z]{2,}/g) || [])
    .filter((w) => !skipWords.has(w.toLowerCase()));

  const result = [...chineseChars, ...englishWords].join(' ').trim();
  console.log(`[FC] extractReadableText: chinese=[${chineseChars.join(',')}] english=[${englishWords.join(',')}] → "${result}"`);
  return result;
}

/**
 * Builds the argument object for a buffered tool call.
 *
 * Tries to parse the concatenated chunks as JSON. On failure it falls back to
 * the room's most recent user utterance (if younger than 30 s), and finally to
 * text salvaged from the raw chunks.
 *
 * @param {Array<{seq: number, args: string}>} sortedChunks - Chunks ordered by seq.
 * @param {string} roomId - Room used to look up the latest user speech.
 * @returns {*} Parsed arguments, or `{ query }` built from a fallback.
 */
function resolveToolArguments(sortedChunks, roomId) {
  const allArgs = sortedChunks.map((c) => c.args).join('');
  try {
    const parsedArgs = JSON.parse(allArgs);
    console.log(`[FC] [FormatA] JSON.parse succeeded: ${JSON.stringify(parsedArgs)}`);
    return parsedArgs;
  } catch (e) {
    const userSpeech = latestUserSpeech.get(roomId);
    if (userSpeech && (Date.now() - userSpeech.timestamp < 30000)) {
      console.log(`[FC] [FormatA] Using ASR user speech: "${userSpeech.text}"`);
      return { query: userSpeech.text };
    }
    const extractedText = extractReadableText(sortedChunks.map((c) => c.args));
    console.log(`[FC] [FormatA] No ASR text, extracted from chunks: "${extractedText}"`);
    return { query: extractedText || '' };
  }
}

/**
 * Flattens a tool result into the plain text handed back to the LLM.
 *
 * @param {*} result - Value returned by ToolExecutor.execute.
 * @param {string} resultStr - JSON serialization of `result`, used as the default.
 * @returns {string} `results[].content` joined by newlines, `result.error`,
 *   the string itself, or `resultStr` when none of those shapes match.
 */
function toContentText(result, resultStr) {
  try {
    if (result && result.results && Array.isArray(result.results)) {
      return result.results.map((r) => r.content || JSON.stringify(r)).join('\n');
    }
    if (result && result.error) {
      return result.error;
    }
    if (typeof result === 'string') {
      return result;
    }
  } catch (e) {
    // Best-effort flattening: fall back to the raw JSON, but leave a trace.
    console.warn('[FC] Failed to flatten tool result, using raw JSON:', e.message);
  }
  return resultStr;
}

/**
 * Sends a tool result back with `Command: 'function'`. If the call fails and
 * the callback's own task id differs from the one tried first, retries once
 * with the callback task id. Failures are logged, never thrown.
 *
 * @param {object} target
 * @param {string} target.appId - Application id for the update call.
 * @param {string} target.roomId - Room the tool call belongs to.
 * @param {string} target.taskId - Task id to try first.
 * @param {string} target.callbackTaskId - Task id reported by the callback.
 * @param {string} toolCallId - Id of the tool call being answered.
 * @param {string} functionContent - Text content of the tool result.
 */
async function sendFunctionResult({ appId, roomId, taskId, callbackTaskId }, toolCallId, functionContent) {
  const funcMsg = JSON.stringify({
    ToolCallID: toolCallId,
    Content: functionContent,
  });
  const send = (activeTaskId) => volcengine.updateVoiceChat({
    AppId: appId,
    RoomId: roomId,
    TaskId: activeTaskId,
    Command: 'function',
    Message: funcMsg,
  });

  try {
    console.log(`[FC] ★ Sending Command:function (ToolCallID=${toolCallId}, content=${functionContent.length} chars)`);
    await send(taskId);
    console.log('[FC] ✅ Command:function sent OK → LLM will generate S2S response with KB content');
  } catch (funcErr) {
    console.error('[FC] ✖ Command:function failed:', funcErr.message);
    // If the primary TaskId failed, try the TaskId that came with the callback.
    if (taskId !== callbackTaskId) {
      try {
        console.log(`[FC] Retrying Command:function with callback TaskID=${callbackTaskId}`);
        await send(callbackTaskId);
        console.log('[FC] ✅ Command:function retry OK');
      } catch (retryErr) {
        console.error('[FC] ✖ Command:function retry also failed:', retryErr.message);
      }
    }
  }
}

/**
 * Executes the tool call accumulated in a buffer once chunk collection has
 * settled, and returns the result to the LLM. Marks the buffer as triggered so
 * it runs at most once. Never rejects: every failure is logged.
 *
 * @param {string} bufferKey - Key of the buffer in `toolCallBuffers`.
 * @param {string} appId - Application id taken from the latest callback.
 */
async function runBufferedToolCall(bufferKey, appId) {
  const buffer = toolCallBuffers.get(bufferKey);
  if (!buffer || buffer.triggered) return;
  buffer.triggered = true;

  const toolName = buffer.name || 'search_knowledge';
  try {
    // Sort a copy so the buffer's own array is not reordered in place.
    const sortedChunks = [...buffer.chunks].sort((x, y) => x.seq - y.seq);
    console.log(`[FC] [FormatA] 500ms timeout, ${buffer.chunks.length} chunks collected, name="${toolName}"`);

    const s2sTaskId = roomToTaskId.get(buffer.RoomID) || buffer.S2STaskID || bufferKey;
    console.log(`[FC] TaskId resolution: roomToTaskId=${roomToTaskId.get(buffer.RoomID)} callback=${buffer.S2STaskID} → using=${s2sTaskId}`);

    // No separate interrupt command is sent any more; ExternalTextToSpeech with
    // InterruptMode:1 already includes interruption.
    const parsedArgs = resolveToolArguments(sortedChunks, buffer.RoomID);

    console.log('[FC] ⚡ Starting KB query (no pre-query interrupt)');
    // Awaited inside the try so a rejecting tool is logged instead of becoming
    // an unhandled rejection in a timer callback.
    const result = await ToolExecutor.execute(toolName, parsedArgs);
    const resultStr = JSON.stringify(result);
    console.log(`[FC] Tool result (${toolName}): ${resultStr.substring(0, 500)}`);

    const contentText = toContentText(result, resultStr);

    const dbSessionId = roomToSessionId.get(buffer.RoomID);
    if (dbSessionId) {
      db.addMessage(dbSessionId, 'assistant', contentText, 'voice_tool', toolName)
        .catch((e) => console.warn('[DB] addMessage(tool) failed:', e.message));
    }
    console.log(`[FC] Knowledge base content (${contentText.length} chars): ${contentText.substring(0, 200)}${contentText.length > 200 ? '...' : ''}`);

    buffer.resultSentAt = Date.now();

    // === Strategy: return the result to the LLM only via Command:function ===
    // Root-cause analysis (recorded by the original author):
    // 1. ExternalTextToSpeech produces no audible audio in S2S end-to-end mode
    //    (the API returns ok but nothing is heard).
    // 2. ExternalTextToSpeech with InterruptMode=1 interrupts the S2S reply that
    //    is currently playing, so the user hears a cut-off.
    // 3. Command:function is the proper return path for the official custom FC mode.
    // Flow: Command:function → LLM receives tool result → LLM generates reply → S2S reads it.
    const toolCallId = buffer.id || 'unknown_call_id';
    const functionContent = contentText.length > 1500
      ? contentText.substring(0, 1500) + '……(内容较长,以上为主要部分)'
      : contentText;

    await sendFunctionResult(
      { appId, roomId: buffer.RoomID, taskId: s2sTaskId, callbackTaskId: buffer.S2STaskID },
      toolCallId,
      functionContent,
    );
    console.log(`[FC] Final result: Command:function sent (${functionContent.length} chars)`);
  } catch (err) {
    console.error(`[FC] Tool execution failed:`, err.message);
    console.error(`[FC] Error details:`, err);
  }
}

let fcCallbackSeq = 0;

// POST /fc_callback — receives function-calling callbacks.
//   Format A: Type === 'tool_calls' with an array Message; chunks are buffered
//             per task id and executed 500 ms after the last chunk.
//   Format B: object Message with event_type === 'function_calling'; logged only.
//   Anything else: scanned for user speech text, which is cached per room.
router.post('/fc_callback', async (req, res) => {
  try {
    const body = req.body;
    if (!body || typeof body !== 'object' || Object.keys(body).length === 0) {
      console.error('[FC] Empty body');
      return res.status(400).json({ success: false, error: 'Empty body' });
    }

    const { Message, Type, RoomID, TaskID, TaskType, AppID, AppId, room_id, task_id, roomId, taskId } = body;
    const effectiveRoomId = RoomID || room_id || roomId;
    const effectiveTaskId = TaskID || task_id || taskId;
    const effectiveAppId = AppID || AppId || process.env.VOLC_RTC_APP_ID;
    const seq = body._seq || ++fcCallbackSeq;
    console.log(`[FC] >>> Callback received: seq=${seq} Type="${Type}" Room=${effectiveRoomId} Task=${effectiveTaskId} TaskType=${TaskType}`);

    let msgObj = null;
    try {
      msgObj = typeof Message === 'string' ? JSON.parse(Message) : Message;
    } catch (e) {
      console.error('[FC] Failed to parse Message:', e.message);
      return res.json({ success: true });
    }

    if (Type === 'tool_calls' && Array.isArray(msgObj) && msgObj.length > 0) {
      // Only the first element is processed; tolerate a null entry.
      const tc = msgObj[0] || {};
      const chunkId = tc.id || '';
      const chunkName = tc.function?.name || '';
      const chunkArgs = tc.function?.arguments || '';

      const existing = toolCallBuffers.get(effectiveTaskId);
      if (existing && existing.triggered) {
        const userSpeech = latestUserSpeech.get(effectiveRoomId);
        // NOTE(review): "new input" means any user speech in the last 10 s, which
        // may be the same utterance that triggered the previous call — confirm intent.
        const hasNewInput = userSpeech && (Date.now() - userSpeech.timestamp < 10000);
        if (hasNewInput) {
          console.log(`[FC] [FormatA] New user input detected, clearing cooldown for room=${effectiveRoomId}`);
          toolCallBuffers.delete(effectiveTaskId);
        } else {
          // Cooldown is 30 s once a result was sent (15 s otherwise) so the LLM
          // cannot retry endlessly while a KB query is in progress.
          const cooldownMs = existing.resultSentAt ? 30000 : 15000;
          const elapsed = existing.resultSentAt
            ? (Date.now() - existing.resultSentAt)
            : (Date.now() - existing.createdAt);
          if (elapsed < cooldownMs) {
            console.log(`[FC] [FormatA] Cooldown active (${elapsed}ms < ${cooldownMs}ms), ignoring retry for TaskID=${effectiveTaskId}`);
            res.json({ success: true });
            return;
          }
          console.log(`[FC] [FormatA] Cooldown expired (${elapsed}ms >= ${cooldownMs}ms), allowing new call for TaskID=${effectiveTaskId}`);
          toolCallBuffers.delete(effectiveTaskId);
        }
      }

      if (!toolCallBuffers.has(effectiveTaskId)) {
        toolCallBuffers.set(effectiveTaskId, {
          id: '',
          name: '',
          chunks: [],
          triggered: false,
          RoomID: effectiveRoomId,
          AppID: effectiveAppId,
          S2STaskID: effectiveTaskId,
          createdAt: Date.now(),
          timer: null,
        });
        console.log(`[FC] [FormatA] New buffer created for TaskID=${effectiveTaskId}, room=${effectiveRoomId}`);
      }

      const buf = toolCallBuffers.get(effectiveTaskId);
      if (chunkId && !buf.id) buf.id = chunkId;
      if (chunkName && !buf.name) buf.name = chunkName;
      if (chunkArgs) {
        buf.chunks.push({ seq: tc.seq || 0, args: chunkArgs });
      }

      // Acknowledge immediately; the tool runs after the response has been sent.
      res.json({ success: true });

      // Restart the 500 ms collection window on every chunk (reduced from 1 s
      // to cut waiting time).
      if (buf.timer) clearTimeout(buf.timer);
      buf.timer = setTimeout(() => {
        // runBufferedToolCall handles its own errors and never rejects.
        runBufferedToolCall(effectiveTaskId, effectiveAppId);
      }, 500);
      return;
    }

    if (msgObj && typeof msgObj === 'object' && !Array.isArray(msgObj)) {
      const eventType = msgObj.event_type;
      console.log(`[FC] [FormatB] event_type="${eventType}"`);

      if (eventType === 'function_calling') {
        const funcName = msgObj.function || '';
        const toolCallId = msgObj.tool_call_id || '';
        const responseId = msgObj.response_id || '';
        console.log(`[FC] [Information] FC notification: func=${funcName} toolCallId=${toolCallId} responseId=${responseId}`);
        res.json({ success: true });
        // ExternalTextToSpeech produces no audio in S2S mode, so no filler phrase
        // is sent. The LLM's tool_calls trigger the Format A branch, which runs
        // the tool and returns the result via Command:function.
        console.log(`[FC] [Information] FC notification received, waiting for tool_calls`);
        return;
      }
    }

    if (msgObj && typeof msgObj === 'object') {
      const asrText = msgObj.text || msgObj.asr_text || msgObj.content || msgObj.user_text || msgObj.transcript
        || (msgObj.data && (msgObj.data.text || msgObj.data.asr_text || msgObj.data.content));
      const role = msgObj.role || msgObj.speaker || msgObj.data?.role || '';
      const isUser = !role || role === 'user' || role === 'human';
      // Keyed by the resolved room id so callbacks using room_id / roomId are
      // stored under the same key the tool-call branch reads from.
      if (asrText && isUser && effectiveRoomId) {
        latestUserSpeech.set(effectiveRoomId, { text: asrText, timestamp: Date.now() });
        console.log(`[FC] [ConvState] Stored user speech for ${effectiveRoomId}: "${asrText}"`);
      }
    }

    res.json({ success: true });
  } catch (error) {
    console.error('[FC] Error:', error.message);
    // The tool-call branch may already have replied before the failure.
    if (!res.headersSent) {
      res.status(500).json({ success: false, error: error.message });
    }
  }
});

// POST /room_message — inspects a raw room message for user speech and caches
// it per room. Always answers `{ success: true }`, even on malformed input.
router.post('/room_message', (req, res) => {
  try {
    const { roomId, uid, text } = req.body;
    if (!roomId || !text) {
      return res.json({ success: true });
    }

    // The JSON payload may be preceded by other characters; skip to the first '[' or '{'.
    const jsonStart = text.search(/[\[{]/);
    if (jsonStart < 0) {
      return res.json({ success: true });
    }
    const jsonStr = text.substring(jsonStart);

    let parsed = null;
    try {
      parsed = JSON.parse(jsonStr);
    } catch (e) {
      // Not valid JSON: pull "text" and "userId" out with regexes instead.
      const textMatch = jsonStr.match(/"text"\s*:\s*"([^"]+)"/);
      if (textMatch && textMatch[1]) {
        const extractedText = textMatch[1];
        const userIdMatch = jsonStr.match(/"userId"\s*:\s*"([^"]+)"/);
        const subtitleUserId = userIdMatch ? userIdMatch[1] : '';
        const isUserSpeech = subtitleUserId && !subtitleUserId.startsWith('bot_');
        if (isUserSpeech && extractedText) {
          latestUserSpeech.set(roomId, { text: extractedText, timestamp: Date.now(), source: 'room_regex' });
          console.log(`[RoomMsg] ✅ Stored user speech (regex) for ${roomId}: "${extractedText}"`);
        }
      }
      return res.json({ success: true });
    }

    if (parsed && parsed.data && Array.isArray(parsed.data)) {
      // Subtitle batch: keep definite lines from users whose id does not start with 'bot_'.
      parsed.data.forEach((sub) => {
        const subText = sub.text || '';
        const subUserId = sub.userId || sub.user_id || '';
        const isDefinite = sub.definite === true;
        const isUserSpeech = subUserId && !subUserId.startsWith('bot_');
        if (subText && isUserSpeech && isDefinite) {
          latestUserSpeech.set(roomId, { text: subText, timestamp: Date.now(), source: 'room_subtitle' });
          console.log(`[RoomMsg] ✅ Stored user speech for ${roomId}: "${subText}"`);
        }
      });
      res.json({ success: true });
      return;
    }

    if (parsed && typeof parsed === 'object') {
      const asrText = parsed.text || parsed.asr_text || parsed.content || parsed.user_text || parsed.transcript
        || (parsed.data && typeof parsed.data === 'string' ? parsed.data : null);
      const isBot = uid && uid.startsWith('bot_');
      if (asrText && !isBot) {
        latestUserSpeech.set(roomId, { text: asrText, timestamp: Date.now(), source: 'room_object' });
        console.log(`[RoomMsg] ✅ Stored user speech (obj) for ${roomId}: "${asrText}"`);
      }
    }

    res.json({ success: true });
  } catch (error) {
    console.error('[RoomMsg] Error:', error.message);
    res.json({ success: true });
  }
});

// POST /tool-callback — deprecated endpoint kept for old clients; logs and acknowledges.
router.post('/tool-callback', async (req, res) => {
  console.log('[ToolCallback] Legacy callback received:', JSON.stringify(req.body));
  res.json({ success: true, message: 'deprecated, use fc_callback instead' });
});

// GET /sessions — summary of every active session.
router.get('/sessions', (req, res) => {
  const sessions = [];
  for (const [id, session] of activeSessions) {
    sessions.push({
      sessionId: id,
      roomId: session.roomId,
      userId: session.userId,
      duration: Math.floor((Date.now() - session.startTime) / 1000),
      subtitleCount: session.subtitles.length,
    });
  }
  res.json({ success: true, data: sessions });
});

module.exports = router;