generated from tailored/router-db-template
Refactor ASR module and remove deprecated AliAsrServer
- Introduced AsrRelatime class for real-time ASR using WebSocket.
- Removed AliAsrServer and related files from the aliyun provider.
- Updated base class for ASR to use WSServer for WebSocket connections.
- Added new test cases for the updated ASR functionality.
- Cleaned up unused imports and files across the project.
- Adjusted TypeScript configuration for better module resolution.
- Implemented silence generation for audio streaming.
This commit is contained in:
@@ -1,42 +1,128 @@
|
||||
import RPCClient from '@alicloud/pop-core';
|
||||
|
||||
// Shape of the Aliyun NLS meta CreateToken API response (legacy AliAsrServer flow).
interface TokenResponse {
  Token: {
    Id: string; // the access token value
    ExpireTime: number; // expiry as a Unix timestamp in seconds (converted to ms by the caller)
  };
}
|
||||
// Credentials for the Aliyun OpenAPI client; the constructor falls back to
// env vars (ALIYUN_AK_ID / ALIYUN_AK_SECRET) when a field is empty.
type AliCommonOptions = {
  accessKeyId: string;
  accessKeySecret: string;
};
|
||||
export class AliCommon {
|
||||
private accessKeyId: string;
|
||||
private accessKeySecret: string;
|
||||
private endpoint: string;
|
||||
private apiVersion: string;
|
||||
token = '';
|
||||
expireTime = 0;
|
||||
constructor(opts?: AliCommonOptions) {
|
||||
this.accessKeyId = opts?.accessKeyId || process.env.ALIYUN_AK_ID || '';
|
||||
this.accessKeySecret = opts?.accessKeySecret || process.env.ALIYUN_AK_SECRET || '';
|
||||
this.endpoint = 'http://nls-meta.cn-shanghai.aliyuncs.com';
|
||||
this.apiVersion = '2019-02-28';
|
||||
import { WSServer, WSSOptions } from '@kevisual/video-tools/asr/ws.ts';
|
||||
// Constructor options for AsrRelatime.
type Options = {
  model?: string; // ASR model name; defaults to 'qwen3-asr-flash-realtime'
  token?: string; // API token, sent as a Bearer Authorization header
} & Partial<WSSOptions>
|
||||
|
||||
/**
|
||||
* 阿里云实时语音识别服务
|
||||
* new AsrRelatime({
|
||||
* token: 'your_token',
|
||||
* model: 'general_16k',
|
||||
* enableServerVad: true,
|
||||
* onConnect: async () => {
|
||||
* await asr.sendSessionUpdate();
|
||||
* }
|
||||
* });
|
||||
*/
|
||||
export class AsrRelatime extends WSServer {
|
||||
static baseURL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
|
||||
/**
|
||||
* 是否启用服务端VAD功能,true为VAD模式,false为Manual模式
|
||||
*/
|
||||
enableServerVad: boolean = true;
|
||||
constructor(options: Options) {
|
||||
const { url: _, wsOptions: _wsOptions, ...rest } = options;
|
||||
const wsOptions: WSSOptions['wsOptions'] = {
|
||||
..._wsOptions,
|
||||
headers: {
|
||||
Authorization: `Bearer ${options.token}`,
|
||||
'OpenAi-Beta': 'realtime=v1',
|
||||
..._wsOptions?.headers
|
||||
}
|
||||
};
|
||||
const models = options.model || 'qwen3-asr-flash-realtime';
|
||||
const url = AsrRelatime.baseURL + `?model=${models}`;
|
||||
super({ ...rest, url, wsOptions, onConnect: options.onConnect });
|
||||
}
|
||||
async getToken() {
|
||||
if (this.token && this.expireTime > Date.now()) {
|
||||
return this.token;
|
||||
async sendSessionUpdate() {
|
||||
const { ws, enableServerVad } = this;
|
||||
const connected = await this.checkConnected()
|
||||
if (!connected) {
|
||||
this.reconnect({ timeout: 60 * 1000 });
|
||||
return;
|
||||
}
|
||||
const event = {
|
||||
event_id: 'event_123',
|
||||
type: 'session.update',
|
||||
session: {
|
||||
modalities: ['text'],
|
||||
input_audio_format: 'pcm',
|
||||
sample_rate: 16000,
|
||||
input_audio_transcription: {
|
||||
language: 'zh'
|
||||
},
|
||||
turn_detection: null
|
||||
}
|
||||
};
|
||||
if (enableServerVad) {
|
||||
event.session.turn_detection = {
|
||||
type: 'server_vad',
|
||||
threshold: 0.2,
|
||||
silence_duration_ms: 800
|
||||
}
|
||||
}
|
||||
ws.send(JSON.stringify(event));
|
||||
}
|
||||
async start() {
|
||||
this.sendSessionUpdate();
|
||||
}
|
||||
async sendBuffer(buffer: Buffer) {
|
||||
const { ws, enableServerVad } = this;;
|
||||
const connected = await this.checkConnected()
|
||||
if (!connected) {
|
||||
this.reconnect({ timeout: 60 * 1000 });
|
||||
return;
|
||||
}
|
||||
const client = new RPCClient({
|
||||
accessKeyId: this.accessKeyId,
|
||||
accessKeySecret: this.accessKeySecret,
|
||||
endpoint: this.endpoint,
|
||||
apiVersion: this.apiVersion,
|
||||
});
|
||||
|
||||
const result = await client.request<TokenResponse>('CreateToken', {});
|
||||
this.token = result.Token.Id;
|
||||
this.expireTime = result.Token.ExpireTime * 1000;
|
||||
return result.Token.Id;
|
||||
let offset = 0;
|
||||
const bufferLength = Buffer.byteLength(buffer);
|
||||
const chunkSize = 3200; // 约0.1s的PCM16音频 // max lenghth 262144
|
||||
while (offset < bufferLength) {
|
||||
const chunkBuffer = buffer.subarray(offset, offset + chunkSize);
|
||||
offset += chunkSize;
|
||||
const encoded = chunkBuffer.toString('base64');
|
||||
const appendEvent = {
|
||||
event_id: `event_${Date.now()}`,
|
||||
type: 'input_audio_buffer.append',
|
||||
audio: encoded
|
||||
};
|
||||
ws.send(JSON.stringify(appendEvent));
|
||||
}
|
||||
if (!enableServerVad) {
|
||||
const commitEvent = {
|
||||
event_id: 'event_789',
|
||||
type: 'input_audio_buffer.commit'
|
||||
};
|
||||
ws.send(JSON.stringify(commitEvent));
|
||||
}
|
||||
}
|
||||
async onMessage(event: MessageEvent) {
|
||||
super.onMessage(event);
|
||||
const data = event.data;
|
||||
try {
|
||||
const result = JSON.parse(data.toString());
|
||||
const isEnd = await this.isEnd(result.type);
|
||||
if (isEnd && result?.transcript) {
|
||||
const text = result.transcript;
|
||||
this.emitter.emit('result', {
|
||||
text: text,
|
||||
raw: result
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('error', error);
|
||||
}
|
||||
}
|
||||
async isEnd(type: string) {
|
||||
const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed'];
|
||||
if (type === types[1]) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
async sendBlank(buffer?: Buffer): Promise<void> {
|
||||
this.sendBuffer(buffer || this.generateSilence(2));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user