dify

2025-12-01 17:21:38 +08:00
parent 32fee2b8ab
commit fab8c13cb3
7511 changed files with 996300 additions and 0 deletions
--- a/dify/web/app/components/base/voice-input/index.module.css
+++ b/dify/web/app/components/base/voice-input/index.module.css
@@ -0,0 +1,10 @@
+/* Outer frame of the voice-input widget: a blue gradient with a soft two-layer drop shadow. */
+.wrapper {
+  background: linear-gradient(131deg, #2250F2 0%, #0EBCF3 100%);
+  box-shadow: 0px 4px 6px -2px rgba(16, 24, 40, 0.03), 0px 12px 16px -4px rgba(16, 24, 40, 0.08);
+}
+
+/* Gradient-filled text: the background is clipped to the glyphs and the text colour itself is made transparent. */
+.convert {
+  background: linear-gradient(91.92deg, #104AE1 -1.74%, #0098EE 75.74%);
+  background-clip: text;
+  color: transparent;
+}
--- a/dify/web/app/components/base/voice-input/index.stories.tsx
+++ b/dify/web/app/components/base/voice-input/index.stories.tsx
@@ -0,0 +1,499 @@
+import type { Meta, StoryObj } from '@storybook/nextjs'
+import { useEffect, useRef, useState } from 'react'
+
+// Mock component since VoiceInput requires browser APIs and service dependencies
+type VoiceInputMockProps = {
+  /** Receives the simulated transcription once the fake conversion finishes. */
+  onConverted?: (text: string) => void
+  /** Invoked when the user dismisses the mock while it is converting. */
+  onCancel?: () => void
+}
+
+/**
+ * Storybook stand-in for the voice input widget.
+ *
+ * Starts in the "recording" state with a one-second timer. Pressing stop switches
+ * to "converting" and, two seconds later, reports a canned transcription through
+ * `onConverted`. Both timers are released when the mock is cancelled or unmounted,
+ * so no callback fires after the component has gone away.
+ */
+const VoiceInputMock = ({ onConverted, onCancel }: VoiceInputMockProps) => {
+  const [state, setState] = useState<'idle' | 'recording' | 'converting'>('recording')
+  const [duration, setDuration] = useState(0)
+  // Handle of the pending fake-conversion timeout, if any.
+  const convertTimer = useRef<ReturnType<typeof setTimeout> | null>(null)
+
+  const clearConvertTimer = () => {
+    if (convertTimer.current !== null) {
+      clearTimeout(convertTimer.current)
+      convertTimer.current = null
+    }
+  }
+
+  // Simulate recording: tick once per second, but only while recording.
+  useEffect(() => {
+    if (state !== 'recording')
+      return undefined
+
+    const interval = setInterval(() => {
+      setDuration(d => d + 1)
+    }, 1000)
+    return () => clearInterval(interval)
+  }, [state])
+
+  // Drop a pending conversion when the mock unmounts.
+  useEffect(() => {
+    return () => {
+      if (convertTimer.current !== null) {
+        clearTimeout(convertTimer.current)
+        convertTimer.current = null
+      }
+    }
+  }, [])
+
+  const handleStop = () => {
+    setState('converting')
+    clearConvertTimer()
+    convertTimer.current = setTimeout(() => {
+      convertTimer.current = null
+      onConverted?.('This is simulated transcribed text from voice input.')
+    }, 2000)
+  }
+
+  const handleCancel = () => {
+    // Cancelling must also abort the simulated conversion.
+    clearConvertTimer()
+    onCancel?.()
+  }
+
+  const minutes = Math.floor(duration / 60)
+  const seconds = duration % 60
+
+  return (
+    <div className="relative h-16 w-full overflow-hidden rounded-xl border-2 border-primary-600">
+      <div className="absolute inset-[1.5px] flex items-center overflow-hidden rounded-[10.5px] bg-primary-25 py-[14px] pl-[14.5px] pr-[6.5px]">
+        {/* Waveform visualization placeholder */}
+        <div className="absolute bottom-0 left-0 flex h-4 w-full items-end gap-[3px] px-2">
+          {Array.from({ length: 40 }).map((_, i) => (
+            <div
+              key={i}
+              className="w-[2px] rounded-t bg-blue-200"
+              style={{
+                height: `${Math.random() * 100}%`,
+                animation: state === 'recording' ? 'pulse 1s infinite' : 'none',
+              }}
+            />
+          ))}
+        </div>
+
+        {state === 'converting' && (
+          <div className="mr-2 h-4 w-4 animate-spin rounded-full border-2 border-primary-700 border-t-transparent" />
+        )}
+
+        <div className="z-10 grow">
+          {state === 'recording' && (
+            <div className="text-sm text-gray-500">Speaking...</div>
+          )}
+          {state === 'converting' && (
+            <div className="text-sm text-gray-500">Converting to text...</div>
+          )}
+        </div>
+
+        {state === 'recording' && (
+          <div
+            className="mr-1 flex h-8 w-8 cursor-pointer items-center justify-center rounded-lg hover:bg-primary-100"
+            onClick={handleStop}
+          >
+            <div className="h-5 w-5 rounded bg-primary-600" />
+          </div>
+        )}
+
+        {state === 'converting' && (
+          <div
+            className="mr-1 flex h-8 w-8 cursor-pointer items-center justify-center rounded-lg hover:bg-gray-200"
+            onClick={handleCancel}
+          >
+            <span className="text-lg text-gray-500">×</span>
+          </div>
+        )}
+
+        <div className={`w-[45px] pl-1 text-xs font-medium ${duration > 500 ? 'text-red-600' : 'text-gray-700'}`}>
+          {`0${minutes}:${seconds >= 10 ? seconds : `0${seconds}`}`}
+        </div>
+      </div>
+    </div>
+  )
+}
+
+// Text shown at the top of the generated docs page for this component.
+const componentDescription = 'Voice input component for recording audio and converting speech to text. Features waveform visualization, recording timer (max 10 minutes), and audio-to-text conversion using js-audio-recorder.\n\n**Note:** This is a simplified mock for Storybook. The actual component requires microphone permissions and audio-to-text API.'
+
+// Storybook metadata: stories are rendered centered and documented via autodocs.
+const meta = {
+  title: 'Base/Data Entry/VoiceInput',
+  component: VoiceInputMock,
+  parameters: {
+    layout: 'centered',
+    docs: {
+      description: { component: componentDescription },
+    },
+  },
+  tags: ['autodocs'],
+} satisfies Meta<typeof VoiceInputMock>
+
+export default meta
+
+// Shape shared by every story exported from this file.
+type Story = StoryObj<typeof meta>
+
+// Basic demo: a start button that swaps to the recorder, plus the last transcription.
+const VoiceInputDemo = () => {
+  const [transcription, setTranscription] = useState('')
+  const [isRecording, setIsRecording] = useState(false)
+
+  // Begin a new session and forget the previous result.
+  const beginRecording = () => {
+    setTranscription('')
+    setIsRecording(true)
+  }
+
+  // Keep the converted text and leave recording mode.
+  const acceptTranscription = (text: string) => {
+    setIsRecording(false)
+    setTranscription(text)
+  }
+
+  // Leave recording mode without keeping any text.
+  const abortRecording = () => {
+    setTranscription('')
+    setIsRecording(false)
+  }
+
+  const recorderOrButton = isRecording
+    ? (
+      <VoiceInputMock
+        onConverted={acceptTranscription}
+        onCancel={abortRecording}
+      />
+    )
+    : (
+      <button
+        className="w-full rounded-lg bg-blue-600 px-4 py-3 font-medium text-white hover:bg-blue-700"
+        onClick={beginRecording}
+      >
+        🎤 Start Voice Recording
+      </button>
+    )
+
+  const transcriptionPanel = transcription
+    ? (
+      <div className="mt-4 rounded-lg bg-gray-50 p-4">
+        <div className="mb-2 text-xs font-medium text-gray-600">Transcription:</div>
+        <div className="text-sm text-gray-800">{transcription}</div>
+      </div>
+    )
+    : null
+
+  return (
+    <div style={{ width: '600px' }}>
+      {recorderOrButton}
+      {transcriptionPanel}
+    </div>
+  )
+}
+
+// Default state
+export const Default: Story = {
+  render: () => {
+    return <VoiceInputDemo />
+  },
+}
+
+// Recording state: the mock rendered on its own, logging its callbacks to the console.
+export const RecordingState: Story = {
+  render: () => {
+    const logConverted = () => console.log('Converted')
+    const logCancelled = () => console.log('Cancelled')
+
+    return (
+      <div style={{ width: '600px' }}>
+        <VoiceInputMock onConverted={logConverted} onCancel={logCancelled} />
+        <div className="mt-3 text-xs text-gray-500">
+          Recording in progress with live waveform visualization
+        </div>
+      </div>
+    )
+  },
+}
+
+// Real-world example - Chat input with voice
+const ChatInputWithVoiceDemo = () => {
+  const [draft, setDraft] = useState('')
+  const [voiceActive, setVoiceActive] = useState(false)
+
+  const openVoiceInput = () => setVoiceActive(true)
+  const closeVoiceInput = () => setVoiceActive(false)
+  // Put the transcription into the text box and return to typing mode.
+  const acceptTranscript = (text: string) => {
+    setDraft(text)
+    setVoiceActive(false)
+  }
+
+  // Static sample conversation shown above the input row.
+  const history = (
+    <div className="mb-4 h-64 space-y-3 overflow-y-auto">
+      <div className="flex gap-3">
+        <div className="flex h-8 w-8 items-center justify-center rounded-full bg-blue-500 text-sm text-white">
+          U
+        </div>
+        <div className="flex-1">
+          <div className="rounded-lg bg-gray-100 p-3 text-sm">
+            Hello! How can I help you today?
+          </div>
+        </div>
+      </div>
+      <div className="flex gap-3">
+        <div className="flex h-8 w-8 items-center justify-center rounded-full bg-green-500 text-sm text-white">
+          A
+        </div>
+        <div className="flex-1">
+          <div className="rounded-lg bg-blue-50 p-3 text-sm">
+            I can assist you with various tasks. What would you like to know?
+          </div>
+        </div>
+      </div>
+    </div>
+  )
+
+  // Either the recorder or the regular text row, never both.
+  const composer = voiceActive
+    ? (
+      <VoiceInputMock
+        onConverted={acceptTranscript}
+        onCancel={closeVoiceInput}
+      />
+    )
+    : (
+      <div className="flex gap-2">
+        <input
+          type="text"
+          className="flex-1 rounded-lg border border-gray-300 px-4 py-3 text-sm"
+          placeholder="Type a message..."
+          value={draft}
+          onChange={event => setDraft(event.target.value)}
+        />
+        <button
+          className="rounded-lg bg-gray-100 px-4 py-3 hover:bg-gray-200"
+          onClick={openVoiceInput}
+          title="Voice input"
+        >
+          🎤
+        </button>
+        <button className="rounded-lg bg-blue-600 px-6 py-3 text-white hover:bg-blue-700">
+          Send
+        </button>
+      </div>
+    )
+
+  return (
+    <div style={{ width: '700px' }} className="rounded-lg border border-gray-200 bg-white p-6">
+      <h3 className="mb-4 text-lg font-semibold">Chat Interface</h3>
+
+      {/* Existing messages */}
+      {history}
+
+      {/* Input area */}
+      <div className="space-y-3">
+        {composer}
+      </div>
+    </div>
+  )
+}
+
+export const ChatInputWithVoice: Story = {
+  render: () => {
+    return <ChatInputWithVoiceDemo />
+  },
+}
+
+// Real-world example - Search with voice
+const SearchWithVoiceDemo = () => {
+  const [query, setQuery] = useState('')
+  const [listening, setListening] = useState(false)
+
+  const startListening = () => setListening(true)
+  const stopListening = () => setListening(false)
+  // Use the transcription as the search query and go back to the search box.
+  const applySpokenQuery = (text: string) => {
+    setQuery(text)
+    setListening(false)
+  }
+
+  // The "Searching for" banner only appears for a non-empty query outside recording.
+  const showBanner = Boolean(query) && !listening
+
+  return (
+    <div style={{ width: '700px' }} className="rounded-lg border border-gray-200 bg-white p-6">
+      <h3 className="mb-4 text-lg font-semibold">Voice Search</h3>
+
+      {listening
+        ? (
+          <VoiceInputMock
+            onConverted={applySpokenQuery}
+            onCancel={stopListening}
+          />
+        )
+        : (
+          <div className="flex gap-2">
+            <div className="relative flex-1">
+              <input
+                type="text"
+                className="w-full rounded-lg border border-gray-300 px-4 py-3 pl-10 text-sm"
+                placeholder="Search or use voice..."
+                value={query}
+                onChange={event => setQuery(event.target.value)}
+              />
+              <span className="absolute left-3 top-1/2 -translate-y-1/2 text-gray-400">
+                🔍
+              </span>
+            </div>
+            <button
+              className="rounded-lg bg-blue-600 px-4 py-3 text-white hover:bg-blue-700"
+              onClick={startListening}
+            >
+              🎤 Voice Search
+            </button>
+          </div>
+        )}
+
+      {showBanner && (
+        <div className="mt-4 rounded-lg bg-blue-50 p-4">
+          <div className="mb-2 text-xs font-medium text-blue-900">
+            Searching for: <strong>{query}</strong>
+          </div>
+        </div>
+      )}
+    </div>
+  )
+}
+
+export const SearchWithVoice: Story = {
+  render: () => {
+    return <SearchWithVoiceDemo />
+  },
+}
+
+// Real-world example - Note taking
+// Each finished recording is appended to a list of notes; notes can be removed individually.
+const NoteTakingDemo = () => {
+  const [notes, setNotes] = useState<string[]>([])
+  const [isRecording, setIsRecording] = useState(false)
+
+  // The conversion callback fires some time after the recorder was rendered, and the
+  // list stays editable in the meantime. Functional updates make both operations work
+  // on the current list instead of the snapshot captured by an older render.
+  const appendNote = (text: string) => {
+    setNotes(current => [...current, text])
+    setIsRecording(false)
+  }
+  const removeNoteAt = (position: number) => {
+    setNotes(current => current.filter((_, i) => i !== position))
+  }
+
+  return (
+    <div style={{ width: '700px' }} className="rounded-lg border border-gray-200 bg-white p-6">
+      <div className="mb-4 flex items-center justify-between">
+        <h3 className="text-lg font-semibold">Voice Notes</h3>
+        <span className="text-sm text-gray-500">{notes.length} notes</span>
+      </div>
+
+      <div className="mb-4">
+        {!isRecording ? (
+          <button
+            className="flex w-full items-center justify-center gap-2 rounded-lg bg-red-500 px-4 py-3 font-medium text-white hover:bg-red-600"
+            onClick={() => setIsRecording(true)}
+          >
+            <span className="text-xl">🎤</span>
+            Record Voice Note
+          </button>
+        ) : (
+          <VoiceInputMock
+            onConverted={appendNote}
+            onCancel={() => setIsRecording(false)}
+          />
+        )}
+      </div>
+
+      <div className="max-h-80 space-y-2 overflow-y-auto">
+        {notes.length === 0 ? (
+          <div className="py-12 text-center text-gray-400">
+            No notes yet. Click the button above to start recording.
+          </div>
+        ) : (
+          notes.map((note, index) => (
+            <div key={index} className="rounded-lg border border-gray-200 bg-gray-50 p-3">
+              <div className="flex items-start justify-between">
+                <div className="flex-1">
+                  <div className="mb-1 text-xs text-gray-500">Note {index + 1}</div>
+                  <div className="text-sm text-gray-800">{note}</div>
+                </div>
+                <button
+                  className="text-gray-400 hover:text-red-500"
+                  onClick={() => removeNoteAt(index)}
+                >
+                  ×
+                </button>
+              </div>
+            </div>
+          ))
+        )}
+      </div>
+    </div>
+  )
+}
+
+export const NoteTaking: Story = {
+  render: () => <NoteTakingDemo />,
+}
+
+// Real-world example - Form with voice
+// A two-field form where either field can be filled by typing or by voice.
+const FormWithVoiceDemo = () => {
+  const [formData, setFormData] = useState({
+    name: '',
+    description: '',
+  })
+  // Which field, if any, is currently showing the voice recorder.
+  const [activeField, setActiveField] = useState<'name' | 'description' | null>(null)
+
+  // Writes one field on top of the current form state. The voice callback fires some
+  // time after the recorder was rendered, while the other field remains editable, so
+  // a functional update is needed to avoid overwriting those edits with an old snapshot.
+  const updateField = (field: 'name' | 'description', value: string) => {
+    setFormData(current => ({ ...current, [field]: value }))
+  }
+
+  return (
+    <div style={{ width: '600px' }} className="rounded-lg border border-gray-200 bg-white p-6">
+      <h3 className="mb-4 text-lg font-semibold">Create Product</h3>
+
+      <div className="space-y-4">
+        <div>
+          <label className="mb-2 block text-sm font-medium text-gray-700">
+            Product Name
+          </label>
+          {activeField === 'name' ? (
+            <VoiceInputMock
+              onConverted={(text: string) => {
+                updateField('name', text)
+                setActiveField(null)
+              }}
+              onCancel={() => setActiveField(null)}
+            />
+          ) : (
+            <div className="flex gap-2">
+              <input
+                type="text"
+                className="flex-1 rounded-lg border border-gray-300 px-3 py-2 text-sm"
+                placeholder="Enter product name..."
+                value={formData.name}
+                onChange={e => updateField('name', e.target.value)}
+              />
+              <button
+                className="rounded-lg bg-gray-100 px-3 py-2 hover:bg-gray-200"
+                onClick={() => setActiveField('name')}
+              >
+                🎤
+              </button>
+            </div>
+          )}
+        </div>
+
+        <div>
+          <label className="mb-2 block text-sm font-medium text-gray-700">
+            Description
+          </label>
+          {activeField === 'description' ? (
+            <VoiceInputMock
+              onConverted={(text: string) => {
+                updateField('description', text)
+                setActiveField(null)
+              }}
+              onCancel={() => setActiveField(null)}
+            />
+          ) : (
+            <div className="space-y-2">
+              <textarea
+                className="w-full rounded-lg border border-gray-300 px-3 py-2 text-sm"
+                rows={4}
+                placeholder="Enter product description..."
+                value={formData.description}
+                onChange={e => updateField('description', e.target.value)}
+              />
+              <button
+                className="w-full rounded-lg bg-gray-100 px-3 py-2 text-sm hover:bg-gray-200"
+                onClick={() => setActiveField('description')}
+              >
+                🎤 Use Voice Input
+              </button>
+            </div>
+          )}
+        </div>
+
+        <button className="w-full rounded-lg bg-blue-600 px-4 py-2 text-white hover:bg-blue-700">
+          Create Product
+        </button>
+      </div>
+    </div>
+  )
+}
+
+export const FormWithVoice: Story = {
+  render: () => <FormWithVoiceDemo />,
+}
+
+// Content of the coloured feature cards. Class names are spelled out in full so
+// every card keeps its own colour scheme.
+const featureCards = [
+  {
+    cardClass: 'rounded-lg bg-blue-50 p-4',
+    titleClass: 'mb-2 text-sm font-medium text-blue-900',
+    listClass: 'space-y-1 text-xs text-blue-800',
+    title: '🎤 Audio Recording',
+    points: [
+      '• Uses js-audio-recorder for browser-based recording',
+      '• 16kHz sample rate, 16-bit, mono channel',
+      '• Converts to MP3 format for transmission',
+    ],
+  },
+  {
+    cardClass: 'rounded-lg bg-green-50 p-4',
+    titleClass: 'mb-2 text-sm font-medium text-green-900',
+    listClass: 'space-y-1 text-xs text-green-800',
+    title: '📊 Waveform Visualization',
+    points: [
+      '• Real-time audio level display using Canvas API',
+      '• Animated bars showing voice amplitude',
+      '• Visual feedback during recording',
+    ],
+  },
+  {
+    cardClass: 'rounded-lg bg-purple-50 p-4',
+    titleClass: 'mb-2 text-sm font-medium text-purple-900',
+    listClass: 'space-y-1 text-xs text-purple-800',
+    title: '⏱️ Time Limits',
+    points: [
+      '• Maximum recording duration: 10 minutes (600 seconds)',
+      '• Timer turns red after 8:20 (500 seconds)',
+      '• Automatic stop at max duration',
+    ],
+  },
+  {
+    cardClass: 'rounded-lg bg-orange-50 p-4',
+    titleClass: 'mb-2 text-sm font-medium text-orange-900',
+    listClass: 'space-y-1 text-xs text-orange-800',
+    title: '🔄 Audio-to-Text Conversion',
+    points: [
+      '• Server-side speech-to-text processing',
+      '• Optional word timestamps support',
+      '• Loading state during conversion',
+    ],
+  },
+]
+
+// Features showcase: a live mock followed by one card per feature group.
+export const FeaturesShowcase: Story = {
+  render: () => {
+    const ignore = () => undefined
+
+    return (
+      <div style={{ width: '700px' }} className="rounded-lg border border-gray-200 bg-white p-6">
+        <h3 className="mb-4 text-lg font-semibold">Voice Input Features</h3>
+
+        <div className="mb-6">
+          <VoiceInputMock onConverted={ignore} onCancel={ignore} />
+        </div>
+
+        <div className="space-y-4">
+          {featureCards.map(card => (
+            <div key={card.title} className={card.cardClass}>
+              <div className={card.titleClass}>{card.title}</div>
+              <ul className={card.listClass}>
+                {card.points.map(point => (
+                  <li key={point}>{point}</li>
+                ))}
+              </ul>
+            </div>
+          ))}
+        </div>
+      </div>
+    )
+  },
+}
--- a/dify/web/app/components/base/voice-input/index.tsx
+++ b/dify/web/app/components/base/voice-input/index.tsx
@@ -0,0 +1,217 @@
+import { useCallback, useEffect, useRef, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import { useParams, usePathname } from 'next/navigation'
+import {
+  RiCloseLine,
+  RiLoader2Line,
+} from '@remixicon/react'
+import Recorder from 'js-audio-recorder'
+import { useRafInterval } from 'ahooks'
+import { convertToMp3 } from './utils'
+import s from './index.module.css'
+import cn from '@/utils/classnames'
+import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
+import { audioToText } from '@/service/share'
+
+/** Props accepted by the VoiceInput component. */
+type VoiceInputTypes = {
+  /** Called with the transcribed text, or with an empty string when the audio-to-text request fails. */
+  onConverted: (text: string) => void
+  /** Called when the user dismisses the widget, when recording cannot start, and after every conversion attempt. */
+  onCancel: () => void
+  /** Sent as the `word_timestamps` form field; 'disabled' is sent when omitted or empty. */
+  wordTimestamps?: string
+}
+
+/**
+ * Records audio from the microphone, draws a live level meter on a canvas, and on
+ * stop encodes the recording to MP3 and posts it to the audio-to-text endpoint.
+ *
+ * Recording starts as soon as the component mounts and stops automatically once the
+ * timer reaches 600 seconds. The endpoint is chosen from the route: a `token` param
+ * selects the public API, an `appId` param selects the app (or installed-app) API.
+ *
+ * Callbacks are only invoked while the component is mounted, so a conversion that
+ * finishes after the widget has been dismissed is discarded.
+ *
+ * @param onCancel - invoked on dismissal, on start failure, and after each conversion attempt.
+ * @param onConverted - receives the transcribed text, or '' if the request failed.
+ * @param wordTimestamps - forwarded as `word_timestamps`; defaults to 'disabled'.
+ */
+const VoiceInput = ({
+  onCancel,
+  onConverted,
+  wordTimestamps,
+}: VoiceInputTypes) => {
+  const { t } = useTranslation()
+  const recorder = useRef(new Recorder({
+    sampleBits: 16,
+    sampleRate: 16000,
+    numChannels: 1,
+    compiling: false,
+  }))
+  const canvasRef = useRef<HTMLCanvasElement | null>(null)
+  const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
+  // Id of the pending animation frame of the level-meter loop, or null when idle.
+  const drawRecordId = useRef<number | null>(null)
+  // Set once the component has been torn down; async continuations check it before
+  // touching state or calling back into the parent.
+  const unmountedRef = useRef(false)
+  const [originDuration, setOriginDuration] = useState(0)
+  const [startRecord, setStartRecord] = useState(false)
+  const [startConvert, setStartConvert] = useState(false)
+  const pathname = usePathname()
+  const params = useParams()
+  const clearInterval = useRafInterval(() => {
+    setOriginDuration(duration => duration + 1)
+  }, 1000)
+
+  // Draws one frame of the level meter and schedules the next one.
+  const drawRecord = useCallback(() => {
+    const canvas = canvasRef.current
+    const ctx = ctxRef.current
+    if (!canvas || !ctx || unmountedRef.current) {
+      // Nothing to draw on any more: let the loop end here.
+      drawRecordId.current = null
+      return
+    }
+    drawRecordId.current = requestAnimationFrame(drawRecord)
+    const dataUnit8Array = recorder.current.getRecordAnalyseData()
+    const dataArray: number[] = Array.from(dataUnit8Array as ArrayLike<number>)
+    const lineLength = Math.floor(canvas.width / 3)
+    // At least one sample per bar, otherwise the average below would be 0 / 0.
+    const gap = Math.max(1, Math.floor(1024 / lineLength))
+
+    ctx.clearRect(0, 0, canvas.width, canvas.height)
+    ctx.beginPath()
+    let x = 0
+    for (let i = 0; i < lineLength; i++) {
+      let v = dataArray.slice(i * gap, i * gap + gap).reduce((prev: number, next: number) => {
+        return prev + next
+      }, 0) / gap
+
+      // Clamp the averaged sample to [128, 178] before mapping it to a bar height.
+      if (v < 128)
+        v = 128
+      if (v > 178)
+        v = 178
+      const y = (v - 128) / 50 * canvas.height
+
+      ctx.moveTo(x, 16)
+      if (ctx.roundRect)
+        ctx.roundRect(x, 16 - y, 2, y, [1, 1, 0, 0])
+      else
+        ctx.rect(x, 16 - y, 2, y)
+      ctx.fill()
+      x += 3
+    }
+    ctx.closePath()
+  }, [])
+
+  // Stops recording, uploads the MP3 and reports the transcription to the parent.
+  const handleStopRecorder = useCallback(async () => {
+    clearInterval()
+    setStartRecord(false)
+    setStartConvert(true)
+    recorder.current.stop()
+    if (drawRecordId.current !== null)
+      cancelAnimationFrame(drawRecordId.current)
+    drawRecordId.current = null
+    const canvas = canvasRef.current
+    const ctx = ctxRef.current
+    if (canvas && ctx)
+      ctx.clearRect(0, 0, canvas.width, canvas.height)
+    const mp3Blob = convertToMp3(recorder.current)
+    const mp3File = new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' })
+    const formData = new FormData()
+    formData.append('file', mp3File)
+    formData.append('word_timestamps', wordTimestamps || 'disabled')
+
+    let url = ''
+    let isPublic = false
+
+    if (params.token) {
+      url = '/audio-to-text'
+      isPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        url = `/installed-apps/${params.appId}/audio-to-text`
+      else
+        url = `/apps/${params.appId}/audio-to-text`
+    }
+
+    try {
+      const audioResponse = await audioToText(url, isPublic, formData)
+      // The widget was dismissed while the request was in flight: drop the result.
+      if (unmountedRef.current)
+        return
+      onConverted(audioResponse.text)
+      onCancel()
+    }
+    catch {
+      if (unmountedRef.current)
+        return
+      onConverted('')
+      onCancel()
+    }
+  }, [clearInterval, onCancel, onConverted, params.appId, params.token, pathname, wordTimestamps])
+
+  // Starts the recorder and, once it is running, the level-meter loop.
+  const handleStartRecord = async () => {
+    try {
+      await recorder.current.start()
+      if (unmountedRef.current) {
+        // start() resolved after teardown (the cleanup's stop() ran too early to
+        // have any effect), so stop again to avoid recording in the background.
+        recorder.current.stop()
+        return
+      }
+      setStartRecord(true)
+      setStartConvert(false)
+
+      // Only start a loop if none is running, so frames are never scheduled twice.
+      if (canvasRef.current && ctxRef.current && drawRecordId.current === null)
+        drawRecord()
+    }
+    catch {
+      if (!unmountedRef.current)
+        onCancel()
+    }
+  }
+
+  // Sizes the canvas backing store for the device pixel ratio and caches its 2D context.
+  const initCanvas = () => {
+    const dpr = window.devicePixelRatio || 1
+    const canvas = canvasRef.current
+
+    if (canvas) {
+      const { width: cssWidth, height: cssHeight } = canvas.getBoundingClientRect()
+
+      canvas.width = dpr * cssWidth
+      canvas.height = dpr * cssHeight
+
+      const ctx = canvas.getContext('2d')
+      if (ctx) {
+        ctx.scale(dpr, dpr)
+        ctx.fillStyle = 'rgba(209, 224, 255, 1)'
+        ctxRef.current = ctx
+      }
+    }
+  }
+
+  // Auto-stop at the 600 second limit. Done in an effect because stopping updates
+  // state and starts a network request, neither of which belongs in render.
+  useEffect(() => {
+    if (originDuration >= 600 && startRecord)
+      handleStopRecorder()
+  }, [originDuration, startRecord, handleStopRecorder])
+
+  // Mount: prepare the canvas and start recording. Unmount: stop everything.
+  useEffect(() => {
+    // Reset in case the effect is re-run on the same instance.
+    unmountedRef.current = false
+    initCanvas()
+    handleStartRecord()
+    const recorderRef = recorder?.current
+    return () => {
+      unmountedRef.current = true
+      if (drawRecordId.current !== null) {
+        cancelAnimationFrame(drawRecordId.current)
+        drawRecordId.current = null
+      }
+      recorderRef?.stop()
+    }
+  }, [])
+
+  const minutes = Math.floor(originDuration / 60)
+  const seconds = originDuration % 60
+
+  return (
+    <div className={cn(s.wrapper, 'absolute inset-0 rounded-xl')}>
+      <div className='absolute inset-[1.5px] flex items-center overflow-hidden rounded-[10.5px] bg-primary-25 py-[14px] pl-[14.5px] pr-[6.5px]'>
+        <canvas id='voice-input-record' ref={canvasRef} className='absolute bottom-0 left-0 h-4 w-full' />
+        {
+          startConvert && <RiLoader2Line className='mr-2 h-4 w-4 animate-spin text-primary-700' />
+        }
+        <div className='grow'>
+          {
+            startRecord && (
+              <div className='text-sm text-gray-500'>
+                {t('common.voiceInput.speaking')}
+              </div>
+            )
+          }
+          {
+            startConvert && (
+              <div className={cn(s.convert, 'text-sm')}>
+                {t('common.voiceInput.converting')}
+              </div>
+            )
+          }
+        </div>
+        {
+          startRecord && (
+            <div
+              className='mr-1 flex h-8 w-8 cursor-pointer items-center justify-center rounded-lg  hover:bg-primary-100'
+              onClick={handleStopRecorder}
+            >
+              <StopCircle className='h-5 w-5 text-primary-600' />
+            </div>
+          )
+        }
+        {
+          startConvert && (
+            <div
+              className='mr-1 flex h-8 w-8 cursor-pointer items-center justify-center rounded-lg  hover:bg-gray-200'
+              onClick={onCancel}
+            >
+              <RiCloseLine className='h-4 w-4 text-gray-500' />
+            </div>
+          )
+        }
+        <div className={`w-[45px] pl-1 text-xs font-medium ${originDuration > 500 ? 'text-[#F04438]' : 'text-gray-700'}`}>{`0${minutes}:${seconds >= 10 ? seconds : `0${seconds}`}`}</div>
+      </div>
+    </div>
+  )
+}
+
+export default VoiceInput
--- a/dify/web/app/components/base/voice-input/utils.ts
+++ b/dify/web/app/components/base/voice-input/utils.ts
@@ -0,0 +1,53 @@
+import lamejs from 'lamejs'
+import MPEGMode from 'lamejs/src/js/MPEGMode'
+import Lame from 'lamejs/src/js/Lame'
+import BitStream from 'lamejs/src/js/BitStream'
+
+// Expose three lamejs internals as globals.
+// NOTE(review): presumably lamejs refers to MPEGMode, Lame and BitStream as free
+// global identifiers at runtime; confirm against the lamejs version in use.
+if (globalThis) {
+  (globalThis as any).MPEGMode = MPEGMode
+  ;(globalThis as any).Lame = Lame
+  ;(globalThis as any).BitStream = BitStream
+}
+
+/**
+ * Encodes the audio held by a recorder as a 128 kbps MP3.
+ *
+ * The channel count and sample rate are read from the WAV header returned by
+ * `recorder.getWAV()`; the samples come from `recorder.getChannelData()` and are
+ * fed to the encoder in chunks of 1152 samples per channel.
+ *
+ * @param recorder - object exposing `getWAV()` and `getChannelData()`; the latter is
+ *   expected to return `{ left, right }` buffer views of 16-bit samples, with `right`
+ *   absent for mono.
+ * @returns a Blob of type `audio/mp3` containing the encoded stream.
+ */
+export const convertToMp3 = (recorder: any) => {
+  const wav = lamejs.WavHeader.readHeader(recorder.getWAV())
+  const { channels, sampleRate } = wav
+  const mp3enc = new lamejs.Mp3Encoder(channels, sampleRate, 128)
+  const result = recorder.getChannelData()
+  const buffer: BlobPart[] = []
+
+  // Reinterpret each channel as 16-bit samples, honouring the view's own offset
+  // into its underlying buffer.
+  const leftData = result.left && new Int16Array(result.left.buffer, result.left.byteOffset, result.left.byteLength / 2)
+  const rightData = result.right && new Int16Array(result.right.buffer, result.right.byteOffset, result.right.byteLength / 2)
+  // Both channels are consumed in lockstep, so one channel's length bounds the loop.
+  const sampleCount = leftData.length
+
+  const maxSamples = 1152
+  // Copy the encoder output into a fresh ArrayBuffer so it can be used as a Blob part.
+  const toArrayBuffer = (bytes: Int8Array) => {
+    const arrayBuffer = new ArrayBuffer(bytes.length)
+    new Uint8Array(arrayBuffer).set(bytes)
+    return arrayBuffer
+  }
+
+  for (let i = 0; i < sampleCount; i += maxSamples) {
+    const left = leftData.subarray(i, i + maxSamples)
+    let mp3buf = null
+
+    if (channels === 2) {
+      const right = rightData.subarray(i, i + maxSamples)
+      mp3buf = mp3enc.encodeBuffer(left, right)
+    }
+    else {
+      mp3buf = mp3enc.encodeBuffer(left)
+    }
+
+    if (mp3buf.length > 0)
+      buffer.push(toArrayBuffer(mp3buf))
+  }
+
+  // Flush whatever the encoder is still holding back.
+  const enc = mp3enc.flush()
+
+  if (enc.length > 0)
+    buffer.push(toArrayBuffer(enc))
+
+  return new Blob(buffer, { type: 'audio/mp3' })
+}