import { WSServer, WSSOptions } from '@kevisual/video-tools/asr/ws.ts';

type Options = {
  model?: string;
  token?: string;
  /** Whether to enable server-side VAD; defaults to true. */
  enableServerVad?: boolean;
} & Partial<WSSOptions>;

/**
 * Aliyun DashScope realtime speech recognition (ASR) service.
 *
 * new AsrRelatime({
 *   token: 'your_token',
 *   model: 'general_16k',
 *   enableServerVad: true,
 *   onConnect: async () => {
 *     await asr.sendSessionUpdate();
 *   }
 * });
 */
export class AsrRelatime extends WSServer {
  static baseURL = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime';
  /**
   * Whether server-side VAD is enabled: true = VAD mode (the server detects
   * end of speech), false = Manual mode (the client commits the buffer).
   */
  enableServerVad: boolean = true;

  constructor(options: Options) {
    const { url: _, wsOptions: _wsOptions, enableServerVad, ...rest } = options;
    const wsOptions: WSSOptions['wsOptions'] = {
      ..._wsOptions,
      headers: {
        Authorization: `Bearer ${options.token}`,
        'OpenAI-Beta': 'realtime=v1',
        ..._wsOptions?.headers,
      },
    };
    const model = options.model || 'qwen3-asr-flash-realtime';
    const url = `${AsrRelatime.baseURL}?model=${model}`;
    super({ ...rest, url, wsOptions, onConnect: options.onConnect });
    this.enableServerVad = enableServerVad ?? true;
  }

  /**
   * Send a session.update event to configure the transcription session
   * (PCM16 input at 16 kHz, Chinese transcription, optional server VAD).
   */
  async sendSessionUpdate() {
    const { ws, enableServerVad } = this;
    const connected = await this.checkConnected();
    if (!connected) {
      this.reconnect({ timeout: 60 * 1000 });
      return;
    }
    const event = {
      event_id: 'event_123',
      type: 'session.update',
      session: {
        modalities: ['text'],
        input_audio_format: 'pcm',
        sample_rate: 16000,
        input_audio_transcription: { language: 'zh' },
        // VAD mode: the server segments turns; Manual mode: turn_detection stays null.
        turn_detection: enableServerVad
          ? { type: 'server_vad', threshold: 0.2, silence_duration_ms: 800 }
          : null,
      },
    };
    ws.send(JSON.stringify(event));
  }

  async start() {
    await this.sendSessionUpdate();
  }

  /**
   * Stream a PCM16 buffer to the server in base64-encoded chunks.
   * In Manual mode, a commit event is sent after the final chunk.
   */
  async sendBuffer(buffer: Buffer) {
    const { ws, enableServerVad } = this;
    const connected = await this.checkConnected();
    if (!connected) {
      this.reconnect({ timeout: 60 * 1000 });
      return;
    }
    let offset = 0;
    const bufferLength = Buffer.byteLength(buffer);
    const chunkSize = 3200; // ~0.1 s of 16 kHz PCM16 audio
    // max length 262144
    while (offset < bufferLength) {
      const chunkBuffer = buffer.subarray(offset, offset + chunkSize);
      offset += chunkSize;
      const encoded = chunkBuffer.toString('base64');
      const appendEvent = {
        event_id: `event_${Date.now()}`,
        type: 'input_audio_buffer.append',
        audio: encoded,
      };
      ws.send(JSON.stringify(appendEvent));
    }
    if (!enableServerVad) {
      const commitEvent = {
        event_id: 'event_789',
        type: 'input_audio_buffer.commit',
      };
      ws.send(JSON.stringify(commitEvent));
    }
  }

  async onMessage(event: MessageEvent) {
    super.onMessage(event);
    const data = event.data;
    try {
      const result = JSON.parse(data.toString());
      const isEnd = await this.isEnd(result.type);
      if (isEnd && result?.transcript) {
        const text = result.transcript;
        this.emitter.emit('result', { text: text, raw: result });
      }
    } catch (error) {
      console.log('error', error);
    }
  }

  /**
   * Only the `completed` event marks a finished transcript; the `.text`
   * event (types[0]) carries intermediate partial results and is ignored.
   */
  async isEnd(type: string) {
    const types = [
      'conversation.item.input_audio_transcription.text',
      'conversation.item.input_audio_transcription.completed',
    ];
    return type === types[1];
  }

  /**
   * Send silence to the recognizer, e.g. to flush a pending utterance.
   */
  async sendBlank(buffer?: Buffer): Promise<void> {
    await this.sendBuffer(buffer || this.generateSilence(2));
  }
}
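
/**
 * End-to-end usage sketch. Assumptions: the WebSocket lifecycle (connect,
 * checkConnected, reconnect, emitter) is provided by WSServer as used above;
 * `DASHSCOPE_API_KEY` and `pcmChunk` are placeholders, not real values.
 *
 *   const asr = new AsrRelatime({
 *     token: process.env.DASHSCOPE_API_KEY,
 *     enableServerVad: true,
 *     onConnect: async () => {
 *       // Configure the session as soon as the socket opens.
 *       await asr.sendSessionUpdate();
 *     },
 *   });
 *
 *   // Final transcripts are emitted from onMessage above.
 *   asr.emitter.on('result', ({ text, raw }) => {
 *     console.log('transcript:', text);
 *   });
 *
 *   // Stream 16 kHz mono PCM16 audio as it arrives. In Manual mode
 *   // (enableServerVad: false) each call also commits the buffer:
 *   // await asr.sendBuffer(pcmChunk);
 *
 *   // Optionally push ~2 s of silence to force an end-of-turn:
 *   // await asr.sendBlank();
 */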