Refactor ASR module and remove deprecated AliAsrServer

- Introduced AsrRelatime class for real-time ASR using WebSocket (see the usage sketch below).
- Removed AliAsrServer and related files from the aliyun provider.
- Updated base class for ASR to use WSServer for WebSocket connections.
- Added new test cases for the updated ASR functionality.
- Cleaned up unused imports and files across the project.
- Adjusted TypeScript configuration for better module resolution.
- Implemented silence generation for audio streaming.
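
A minimal usage sketch, assuming the WSServer base connects on construction, that its emitter exposes the usual on() method, and that generateSilence takes a duration in seconds (as the sendBlank call in the diff suggests); the token is a placeholder:

const asr = new AsrRelatime({
  token: process.env.DASHSCOPE_TOKEN, // placeholder: a valid DashScope token
  onConnect: async () => {
    // Configure the session as soon as the socket is open.
    await asr.sendSessionUpdate();
  },
});

// 'result' is emitted for completed transcriptions (see onMessage in the diff).
asr.emitter.on('result', ({ text }) => {
  console.log('transcript:', text);
});

// Placeholder audio: one second of generated silence stands in for real
// 16 kHz PCM16 microphone data.
const pcmBuffer = asr.generateSilence(1);
await asr.sendBuffer(pcmBuffer);
await asr.sendBlank(); // flush with two seconds of generated silence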
2025-12-21 18:56:32 +08:00
parent 9e94a4d898
commit 58b27b86fe
20 changed files with 858 additions and 3626 deletions


@@ -1,42 +1,128 @@
-import RPCClient from '@alicloud/pop-core';
-interface TokenResponse {
-  Token: {
-    Id: string;
-    ExpireTime: number;
-  };
-}
-type AliCommonOptions = {
-  accessKeyId: string;
-  accessKeySecret: string;
-};
-export class AliCommon {
-  private accessKeyId: string;
-  private accessKeySecret: string;
-  private endpoint: string;
-  private apiVersion: string;
-  token = '';
-  expireTime = 0;
-  constructor(opts?: AliCommonOptions) {
-    this.accessKeyId = opts?.accessKeyId || process.env.ALIYUN_AK_ID || '';
-    this.accessKeySecret = opts?.accessKeySecret || process.env.ALIYUN_AK_SECRET || '';
-    this.endpoint = 'http://nls-meta.cn-shanghai.aliyuncs.com';
-    this.apiVersion = '2019-02-28';
+import { WSServer, WSSOptions } from '@kevisual/video-tools/asr/ws.ts';
+
+type Options = {
+  model?: string;
+  token?: string;
+  enableServerVad?: boolean;
+} & Partial<WSSOptions>;
+
+/**
+ * Aliyun real-time speech recognition (ASR) service.
+ * const asr = new AsrRelatime({
+ *   token: 'your_token',
+ *   model: 'general_16k',
+ *   enableServerVad: true,
+ *   onConnect: async () => {
+ *     await asr.sendSessionUpdate();
+ *   }
+ * });
+ */
+export class AsrRelatime extends WSServer {
+  static baseURL = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime';
+  /**
+   * Whether server-side VAD is enabled: true for VAD mode, false for manual mode.
+   */
+  enableServerVad: boolean = true;
+  constructor(options: Options) {
+    const { url: _, wsOptions: _wsOptions, ...rest } = options;
+    const wsOptions: WSSOptions['wsOptions'] = {
+      ..._wsOptions,
+      headers: {
+        Authorization: `Bearer ${options.token}`,
+        'OpenAI-Beta': 'realtime=v1',
+        ..._wsOptions?.headers,
+      },
+    };
+    const model = options.model || 'qwen3-asr-flash-realtime';
+    const url = AsrRelatime.baseURL + `?model=${model}`;
+    super({ ...rest, url, wsOptions, onConnect: options.onConnect });
+    this.enableServerVad = options.enableServerVad ?? true;
+  }
-  async getToken() {
-    if (this.token && this.expireTime > Date.now()) {
-      return this.token;
+  async sendSessionUpdate() {
+    const { ws, enableServerVad } = this;
+    const connected = await this.checkConnected();
+    if (!connected) {
+      this.reconnect({ timeout: 60 * 1000 });
+      return;
+    }
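+    // session.update configures the stream: text-only output, 16 kHz PCM16
+    // input, Chinese transcription, and optional server-side VAD.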
+    const event = {
+      event_id: 'event_123',
+      type: 'session.update',
+      session: {
+        modalities: ['text'],
+        input_audio_format: 'pcm',
+        sample_rate: 16000,
+        input_audio_transcription: {
+          language: 'zh',
+        },
+        // Typed as a nullable object so the VAD branch below can assign to it.
+        turn_detection: null as Record<string, unknown> | null,
+      },
+    };
+    if (enableServerVad) {
+      event.session.turn_detection = {
+        type: 'server_vad',
+        threshold: 0.2,
+        silence_duration_ms: 800,
+      };
+    }
+    ws.send(JSON.stringify(event));
+  }
+  async start() {
+    this.sendSessionUpdate();
+  }
+  async sendBuffer(buffer: Buffer) {
+    const { ws, enableServerVad } = this;
+    const connected = await this.checkConnected();
+    if (!connected) {
+      this.reconnect({ timeout: 60 * 1000 });
+      return;
+    }
-    const client = new RPCClient({
-      accessKeyId: this.accessKeyId,
-      accessKeySecret: this.accessKeySecret,
-      endpoint: this.endpoint,
-      apiVersion: this.apiVersion,
-    });
-    const result = await client.request<TokenResponse>('CreateToken', {});
-    this.token = result.Token.Id;
-    this.expireTime = result.Token.ExpireTime * 1000;
-    return result.Token.Id;
+    let offset = 0;
+    const bufferLength = Buffer.byteLength(buffer);
+    const chunkSize = 3200; // ≈0.1 s of PCM16 audio; max append length is 262144
+    while (offset < bufferLength) {
+      const chunkBuffer = buffer.subarray(offset, offset + chunkSize);
+      offset += chunkSize;
+      const encoded = chunkBuffer.toString('base64');
+      const appendEvent = {
+        event_id: `event_${Date.now()}`,
+        type: 'input_audio_buffer.append',
+        audio: encoded,
+      };
+      ws.send(JSON.stringify(appendEvent));
+    }
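+    // In manual mode (server VAD disabled) the appended audio must be
+    // committed explicitly to trigger recognition.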
+    if (!enableServerVad) {
+      const commitEvent = {
+        event_id: 'event_789',
+        type: 'input_audio_buffer.commit',
+      };
+      ws.send(JSON.stringify(commitEvent));
+    }
+  }
+  async onMessage(event: MessageEvent) {
+    super.onMessage(event);
+    const data = event.data;
+    try {
+      const result = JSON.parse(data.toString());
+      const isEnd = await this.isEnd(result.type);
+      if (isEnd && result?.transcript) {
+        const text = result.transcript;
+        this.emitter.emit('result', {
+          text: text,
+          raw: result,
+        });
+      }
+    } catch (error) {
+      console.log('error', error);
+    }
+  }
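+  // Only the '...input_audio_transcription.completed' event carries a final
+  // transcript; incremental '...transcription.text' events are ignored here.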
+  async isEnd(type: string) {
+    const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed'];
+    if (type === types[1]) {
+      return true;
+    }
+    return false;
+  }
+  async sendBlank(buffer?: Buffer): Promise<void> {
+    this.sendBuffer(buffer || this.generateSilence(2));
+  }
+}
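
generateSilence comes from this commit's silence-generation work and is not shown in this hunk. A hypothetical sketch, assuming it produces 16 kHz mono PCM16 to match the session config above (zero-valued samples are digital silence, i.e. 32,000 bytes per second):

// Hypothetical sketch only; the committed implementation may differ.
generateSilence(seconds: number): Buffer {
  // 16000 samples/s * 2 bytes per PCM16 sample, zero-filled by Buffer.alloc.
  return Buffer.alloc(seconds * 16000 * 2);
}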