video-tools/src/asr/provider/aliyun/base.ts
xiongxiao 58b27b86fe Refactor ASR module and remove deprecated AliAsrServer
- Introduced AsrRelatime class for real-time ASR using WebSocket.
- Removed AliAsrServer and related files from the aliyun provider.
- Updated base class for ASR to use WSServer for WebSocket connections.
- Added new test cases for the updated ASR functionality.
- Cleaned up unused imports and files across the project.
- Adjusted TypeScript configuration for better module resolution.
- Implemented silence generation for audio streaming.
2025-12-21 18:56:32 +08:00
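
A minimal end-to-end sketch of how the new class is meant to be driven, pieced together from this file (sendSessionUpdate, sendBuffer, and the 'result' event all appear below; the import path and the emitter.on call are assumptions based on the package layout and the emitter usage in onMessage):

import { AsrRelatime } from '@kevisual/video-tools/asr/provider/aliyun/base.ts'; // assumed path

declare const pcmBuffer: Buffer; // 16 kHz mono PCM16 audio from your capture pipeline

const asr = new AsrRelatime({
  token: 'your_token',
  enableServerVad: true,
  // Configure the session as soon as the socket is up.
  onConnect: async () => {
    await asr.sendSessionUpdate();
  },
});

// Final transcripts are emitted once the server reports a completed transcription.
asr.emitter.on('result', ({ text }) => {
  console.log('transcript:', text);
});

// Stream audio; sendBuffer splits it into ~0.1 s chunks for the server.
await asr.sendBuffer(pcmBuffer);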

import { WSServer, WSSOptions } from '@kevisual/video-tools/asr/ws.ts';

type Options = {
  model?: string;
  token?: string;
} & Partial<WSSOptions>;
/**
 * Aliyun real-time speech recognition (ASR) service over the DashScope realtime WebSocket API.
 *
 * @example
 * const asr = new AsrRelatime({
 *   token: 'your_token',
 *   model: 'general_16k',
 *   enableServerVad: true,
 *   onConnect: async () => {
 *     await asr.sendSessionUpdate();
 *   },
 * });
 */
export class AsrRelatime extends WSServer {
  static baseURL = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime';
  /**
   * Whether to enable server-side VAD: true for VAD mode, false for Manual mode.
   */
  enableServerVad: boolean = true;
  constructor(options: Options) {
    // Drop any caller-supplied url; the realtime endpoint is fixed by baseURL.
    const { url: _, wsOptions: _wsOptions, ...rest } = options;
    const wsOptions: WSSOptions['wsOptions'] = {
      ..._wsOptions,
      headers: {
        Authorization: `Bearer ${options.token}`,
        'OpenAI-Beta': 'realtime=v1',
        ..._wsOptions?.headers,
      },
    };
    const model = options.model || 'qwen3-asr-flash-realtime';
    const url = `${AsrRelatime.baseURL}?model=${model}`;
    super({ ...rest, url, wsOptions, onConnect: options.onConnect });
  }
  async sendSessionUpdate() {
    const { ws, enableServerVad } = this;
    const connected = await this.checkConnected();
    if (!connected) {
      this.reconnect({ timeout: 60 * 1000 });
      return;
    }
    // In VAD mode the server segments speech itself; in Manual mode
    // turn_detection stays null and the client commits buffers explicitly.
    const turnDetection = enableServerVad
      ? { type: 'server_vad', threshold: 0.2, silence_duration_ms: 800 }
      : null;
    const event = {
      event_id: 'event_123',
      type: 'session.update',
      session: {
        modalities: ['text'],
        input_audio_format: 'pcm',
        sample_rate: 16000,
        input_audio_transcription: {
          language: 'zh',
        },
        turn_detection: turnDetection,
      },
    };
    ws.send(JSON.stringify(event));
  }
  async start() {
    await this.sendSessionUpdate();
  }
  async sendBuffer(buffer: Buffer) {
    const { ws, enableServerVad } = this;
    const connected = await this.checkConnected();
    if (!connected) {
      this.reconnect({ timeout: 60 * 1000 });
      return;
    }
    let offset = 0;
    const bufferLength = Buffer.byteLength(buffer);
    const chunkSize = 3200; // 3200 bytes ≈ 0.1 s of 16 kHz mono PCM16; server max length 262144
    while (offset < bufferLength) {
      const chunkBuffer = buffer.subarray(offset, offset + chunkSize);
      offset += chunkSize;
      const encoded = chunkBuffer.toString('base64');
      const appendEvent = {
        event_id: `event_${Date.now()}`,
        type: 'input_audio_buffer.append',
        audio: encoded,
      };
      ws.send(JSON.stringify(appendEvent));
    }
    // In Manual mode the buffered audio must be committed explicitly.
    if (!enableServerVad) {
      const commitEvent = {
        event_id: 'event_789',
        type: 'input_audio_buffer.commit',
      };
      ws.send(JSON.stringify(commitEvent));
    }
  }
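
  // Expected completed-transcription payload (assumed shape, inferred from the
  // fields read in onMessage/isEnd below):
  //   { type: 'conversation.item.input_audio_transcription.completed', transcript: '...' }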
  async onMessage(event: MessageEvent) {
    super.onMessage(event);
    const data = event.data;
    try {
      const result = JSON.parse(data.toString());
      const isEnd = await this.isEnd(result.type);
      if (isEnd && result?.transcript) {
        this.emitter.emit('result', {
          text: result.transcript,
          raw: result,
        });
      }
    } catch (error) {
      console.error('error', error);
    }
  }
  async isEnd(type: string) {
    // Intermediate 'conversation.item.input_audio_transcription.text' events
    // carry partial results; only the 'completed' event holds the final transcript.
    return type === 'conversation.item.input_audio_transcription.completed';
  }
  async sendBlank(buffer?: Buffer): Promise<void> {
    // Without an explicit buffer, fall back to generated silence
    // (helper provided by the base class, per the commit notes).
    await this.sendBuffer(buffer || this.generateSilence(2));
  }
}
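
The sendBlank fallback relies on generateSilence, which per the commit notes lives in the WSServer base class and is not shown in this file. A plausible sketch, assuming 16 kHz mono PCM16 to match the session config above:

// Hypothetical sketch, not part of this file: zero-filled PCM16 is digital silence.
function generateSilence(seconds: number): Buffer {
  const sampleRate = 16000; // matches session.sample_rate above
  const bytesPerSample = 2; // 16-bit samples
  return Buffer.alloc(Math.round(seconds * sampleRate * bytesPerSample));
}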