generated from tailored/router-db-template
更新 package.json 和 pnpm-lock.yaml,修改版本号并添加依赖;重构 ASR 模块,优化音频处理逻辑,新增 AliyunAucChat 类及测试用例
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@kevisual/video-tools",
|
||||
"version": "0.0.8",
|
||||
"version": "0.0.12",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"basename": "/root/video-tools",
|
||||
@@ -9,7 +9,7 @@
|
||||
"type": "system-app"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "bun bun.config.ts",
|
||||
"build": "npm publish",
|
||||
"dev:bun": "bun run src/dev.ts --watch",
|
||||
"test": "tsx test/**/*.ts",
|
||||
"clean": "rm -rf dist",
|
||||
@@ -30,6 +30,7 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@gradio/client": "^2.0.1",
|
||||
"@kevisual/ai": "^0.0.19",
|
||||
"@kevisual/router": "0.0.48",
|
||||
"@kevisual/use-config": "^1.0.21",
|
||||
"@kevisual/video": "^0.0.2",
|
||||
|
||||
22
pnpm-lock.yaml
generated
22
pnpm-lock.yaml
generated
@@ -11,6 +11,9 @@ importers:
|
||||
'@gradio/client':
|
||||
specifier: ^2.0.1
|
||||
version: 2.0.1
|
||||
'@kevisual/ai':
|
||||
specifier: ^0.0.19
|
||||
version: 0.0.19
|
||||
'@kevisual/router':
|
||||
specifier: 0.0.48
|
||||
version: 0.0.48
|
||||
@@ -68,12 +71,21 @@ packages:
|
||||
resolution: {integrity: sha512-NLaQNj5fn+Klgtf9ESL2NhlfBo9GHYjxBCbLMXamRev36nQ/fVmhKV2V2DLV91IVTbL/gAMzeTsCmZ1Cl2CLlQ==}
|
||||
engines: {node: '>=18.0.0'}
|
||||
|
||||
'@kevisual/ai@0.0.19':
|
||||
resolution: {integrity: sha512-AFc8m6OcHZNxCb88bvzhvwWTZ4EVYyPupBzPUsLKLpdNBvsqm9TRboKCM2brJj2cqHnm+H+RbAk9AcGJkYhRCA==}
|
||||
|
||||
'@kevisual/load@0.0.6':
|
||||
resolution: {integrity: sha512-+3YTFehRcZ1haGel5DKYMUwmi5i6f2psyaPZlfkKU/cOXgkpwoG9/BEqPCnPjicKqqnksEpixVRkyHJ+5bjLVA==}
|
||||
|
||||
'@kevisual/logger@0.0.4':
|
||||
resolution: {integrity: sha512-+fpr92eokSxoGOW1SIRl/27lPuO+zyY+feR5o2Q4YCNlAdt2x64NwC/w8r/3NEC5QenLgd4K0azyKTI2mHbARw==}
|
||||
|
||||
'@kevisual/permission@0.0.3':
|
||||
resolution: {integrity: sha512-8JsA/5O5Ax/z+M+MYpFYdlioHE6jNmWMuFSokBWYs9CCAHNiSKMR01YLkoVDoPvncfH/Y8F5K/IEXRCbptuMNA==}
|
||||
|
||||
'@kevisual/query@0.0.31':
|
||||
resolution: {integrity: sha512-bBdepjmMICLpcj/a9fnn82/0CGGYUZiCV+usWsJZKAwVlZcnj+WtKmbgKT09KpP6g3jjYzYOaXHiNFB8N0bQAQ==}
|
||||
|
||||
'@kevisual/router@0.0.48':
|
||||
resolution: {integrity: sha512-WsSvT+NpfC/bZbaAzE3WSKD2DRZP0JuPQJGr4YucSdO/lOLB4cEpOZRbPlV3l7G064ow8QJRAN2DUW+bRjrp1A==}
|
||||
|
||||
@@ -336,12 +348,22 @@ snapshots:
|
||||
dependencies:
|
||||
fetch-event-stream: 0.1.5
|
||||
|
||||
'@kevisual/ai@0.0.19':
|
||||
dependencies:
|
||||
'@kevisual/logger': 0.0.4
|
||||
'@kevisual/permission': 0.0.3
|
||||
'@kevisual/query': 0.0.31
|
||||
|
||||
'@kevisual/load@0.0.6':
|
||||
dependencies:
|
||||
eventemitter3: 5.0.1
|
||||
|
||||
'@kevisual/logger@0.0.4': {}
|
||||
|
||||
'@kevisual/permission@0.0.3': {}
|
||||
|
||||
'@kevisual/query@0.0.31': {}
|
||||
|
||||
'@kevisual/router@0.0.48':
|
||||
dependencies:
|
||||
path-to-regexp: 8.3.0
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { AsrRelatime as QwenAsrRelatime } from "./provider/aliyun/base.ts";
|
||||
|
||||
export { WSServer } from "./ws.ts";
|
||||
|
||||
export {
|
||||
QwenAsrRelatime
|
||||
|
||||
59
src/asr/provider/aliyun/auc.ts
Normal file
59
src/asr/provider/aliyun/auc.ts
Normal file
@@ -0,0 +1,59 @@
|
||||
import { BailianChat } from '@kevisual/ai'
|
||||
|
||||
/** Constructor options for {@link AliyunAucChat}. */
type Options = {
  /** API key, forwarded to the base chat client as `apiKey`. */
  token?: string
}

/**
 * Chat client that sends audio messages to the DashScope
 * `multimodal-generation` endpoint and returns the recognised text.
 */
export class AliyunAucChat extends BailianChat {
  constructor(opts?: Options) {
    super({
      apiKey: opts?.token,
      baseURL: 'https://dashscope.aliyuncs.com/api/v1',
    })
  }

  /**
   * Sends `messages` in a single non-streaming request and returns the first
   * choice's message.
   *
   * Side effects: updates `prompt_tokens`, `total_tokens`, `completion_tokens`
   * (0 when the response carries no usage) and `responseText` (the
   * concatenated `text` parts of the returned message) on this instance.
   *
   * @param messages Conversation messages; audio is passed as `{ audio }` content parts.
   * @param options.model Model name; defaults to `qwen3-asr-flash`.
   * @param options.parameters Extra request parameters. Top-level keys override
   *   the defaults; `asr_options` is merged key by key so that supplying one
   *   ASR option does not discard the default `enable_itn: false`.
   * @returns The first choice's message, or `{}` when the response has no choices.
   * @throws Error when the HTTP response is not OK; the message includes the
   *   status and the response body.
   */
  async getText(messages: TextMessages[], options?: { model?: string, parameters?: any }): Promise<any> {
    const model = options?.model || 'qwen3-asr-flash'
    const { asr_options: asrOverrides, ...otherParameters } = options?.parameters ?? {}
    const data = {
      model: model,
      input: {
        messages: messages,
      },
      parameters: {
        "incremental_output": true,
        ...otherParameters,
        // Merge instead of replace: a partial override keeps the remaining defaults.
        "asr_options": {
          "enable_itn": false,
          ...asrOverrides,
        },
      },
      stream: false,
    }
    const response = await this.post(`${this.baseURL}/services/aigc/multimodal-generation/generation`, { data: data });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Chat API request failed: ${response.status} ${response.statusText} - ${errorText}`);
    }

    const res = await response.json() as any;
    const choices = res.output?.choices || [];
    const choice = choices[0] || {};
    const message = choice.message || {};
    this.prompt_tokens = res.usage?.prompt_tokens ?? 0;
    this.total_tokens = res.usage?.total_tokens ?? 0;
    this.completion_tokens = res.usage?.completion_tokens ?? 0;

    // `content` is expected to be an array of `{ text }` parts; anything else yields ''.
    const text = message.content?.map?.((item: any) => item.text).join('') || '';
    this.responseText = text;

    return message as ResponseMessage;
  }
}

/** A request message accepted by {@link AliyunAucChat.getText}. */
type TextMessages = {
  role?: 'system' | 'user' | 'assistant',
  /** Plain text, or a list of audio parts (the test script passes base64 data URLs). */
  content?: string | Array<{ audio: string }>
}

/** Shape of the message returned by {@link AliyunAucChat.getText}. */
type ResponseMessage = {
  role?: string,
  content?: Array<{ text: string }>,
  annotations?: { emotion: string, language: string, type: string }[],
}
|
||||
@@ -103,18 +103,43 @@ export class AsrRelatime extends WSServer {
|
||||
const data = event.data;
|
||||
try {
|
||||
const result = JSON.parse(data.toString());
|
||||
const isEnd = await this.isEnd(result.type);
|
||||
const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed'];
|
||||
const isEnd = this.isComplated(result.type, types[1]);
|
||||
const isText = this.isComplated(result.type, types[0]);
|
||||
if (isEnd && result?.transcript) {
|
||||
const text = result.transcript;
|
||||
this.emitter.emit('result', {
|
||||
text: text,
|
||||
raw: result
|
||||
});
|
||||
} else if (isText && result?.stash) {
|
||||
this.emitter.emit('partial', {
|
||||
text: result.stash,
|
||||
raw: result
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('error', error);
|
||||
}
|
||||
}
|
||||
/**
 * Runs in Node: converts Float32 audio sent by the browser into 16-bit PCM.
 *
 * Samples are read as little-endian 32-bit floats directly from the decoded
 * buffer, so the conversion does not depend on the buffer's byte offset being
 * 4-byte aligned. Trailing bytes that do not form a whole sample are ignored.
 *
 * @param base64 Base64-encoded Float32 sample data.
 * @returns A buffer of little-endian signed 16-bit PCM samples, two bytes per input sample.
 */
async fixBrowerBuffer(base64: string): Promise<Buffer> {
  const voice = Buffer.from(base64, 'base64');
  const sampleCount = Math.floor(voice.length / 4);
  const pcm16 = Buffer.alloc(sampleCount * 2);
  for (let i = 0; i < sampleCount; i++) {
    // Clamp to [-1.0, 1.0], then scale to the Int16 range [-32768, 32767].
    const sample = Math.max(-1, Math.min(1, voice.readFloatLE(i * 4)));
    const scaled = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
    // writeInt16LE is only defined for integers; NaN samples are written as 0.
    pcm16.writeInt16LE(Math.trunc(scaled) || 0, i * 2);
  }
  return pcm16;
}
|
||||
async onClose(event: CloseEvent) {
|
||||
let { code } = event;
|
||||
if (code === 1007) {
|
||||
@@ -123,14 +148,7 @@ export class AsrRelatime extends WSServer {
|
||||
}
|
||||
super.onClose({ ...event, code });
|
||||
}
|
||||
async isEnd(type: string) {
|
||||
const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed'];
|
||||
if (type === types[1]) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
async sendBlank(buffer?: Buffer): Promise<void> {
|
||||
this.sendBuffer(buffer || this.generateSilence(2));
|
||||
this.sendBuffer(buffer || this.generateSilence(2) as Buffer);
|
||||
}
|
||||
}
|
||||
|
||||
35
src/asr/provider/aliyun/test/get-text2.ts
Normal file
35
src/asr/provider/aliyun/test/get-text2.ts
Normal file
@@ -0,0 +1,35 @@
|
||||
import { AliyunAucChat } from '../auc.ts';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import net from 'net';
|
||||
import dotenv from 'dotenv';
|
||||
dotenv.config();
|
||||
// Sample recording sent to the recognizer. Other local samples that have been
// used with this script: videos/asr_example.wav, videos/tts_mix.mp3,
// videos/my_speech_text.wav.
const audioSamplePath = path.join(process.cwd(), 'videos/asr_example2.wav');

/**
 * Manual smoke test: reads the sample recording, sends it to AliyunAucChat as
 * a base64 data URL and prints the returned message.
 *
 * The file is read inside `main` so that a missing sample is reported through
 * the `main().catch` handler rather than failing at module load.
 */
async function main() {
  const audio = await fs.readFile(audioSamplePath);
  const messages = [
    {
      role: 'user' as const,
      content: [
        {
          audio: `data:audio/wav;base64,${audio.toString('base64')}`,
        },
      ],
    },
  ];
  const chat = new AliyunAucChat({
    token: process.env.BAILIAN_API_KEY,
  });
  // getText always sends a non-streaming request; it has no `stream` option.
  const response = await chat.getText(messages, { model: 'qwen3-asr-flash' });
  console.log('Final response:', response);
}

main().catch(console.error);
|
||||
@@ -9,7 +9,7 @@ export type WSSOptions = {
|
||||
emitter?: EventEmitter;
|
||||
};
|
||||
interface WSServerInterface {
|
||||
isComplated(type: string, endType?: string): Promise<boolean>;
|
||||
isComplated(type: string, endType?: string): boolean;
|
||||
start(): Promise<void>;
|
||||
}
|
||||
export class WSServer implements WSServerInterface {
|
||||
@@ -151,7 +151,7 @@ export class WSServer implements WSServerInterface {
|
||||
}
|
||||
}
|
||||
|
||||
async isComplated(type: string, endType = '') {
|
||||
isComplated(type: string, endType = '') {
|
||||
if (type === endType) {
|
||||
return true;
|
||||
}
|
||||
@@ -163,7 +163,7 @@ export class WSServer implements WSServerInterface {
|
||||
* @param durationSeconds 静音时长(秒)
|
||||
* @returns WAV 音频缓冲区
|
||||
*/
|
||||
generateSilence(durationSeconds: number): Buffer {
|
||||
generateSilence(durationSeconds: number, { encoding = 'buffer' }: { encoding?: 'buffer' | 'base64' } = {}): Buffer | string {
|
||||
const sampleRate = 16000; // 采样率 16kHz
|
||||
const bitDepth = 16; // 位深 16bit
|
||||
const channels = 1; // 单声道
|
||||
@@ -203,15 +203,27 @@ export class WSServer implements WSServerInterface {
|
||||
const silenceData = Buffer.alloc(dataSize);
|
||||
|
||||
// 合并头部和数据
|
||||
return Buffer.concat([header, silenceData]);
|
||||
const buffer = Buffer.concat([header, silenceData]);
|
||||
if (encoding === 'base64') {
|
||||
return buffer.toString('base64');
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
async sendBlank(buffer?: Buffer) {
|
||||
async sendBlank(buffer?: Buffer | ((buffer: Buffer) => any)) {
|
||||
const isConnected = await this.checkConnected();
|
||||
if (!isConnected) {
|
||||
this.reconnect({ timeout: 1000 });
|
||||
return;
|
||||
}
|
||||
if (buffer) {
|
||||
if (buffer && typeof buffer === 'function') {
|
||||
const blankBuffer = this.generateSilence(2);
|
||||
const value = await buffer(Buffer.from(blankBuffer));
|
||||
if (typeof value === 'string') {
|
||||
this.ws.send(value);
|
||||
} else {
|
||||
this.ws.send(JSON.stringify(value));
|
||||
}
|
||||
} else if (buffer && Buffer.isBuffer(buffer)) {
|
||||
this.ws.send(buffer);
|
||||
return;
|
||||
}
|
||||
@@ -219,4 +231,7 @@ export class WSServer implements WSServerInterface {
|
||||
const blankBuffer = this.generateSilence(2);
|
||||
this.ws.send(blankBuffer);
|
||||
}
|
||||
/**
 * Sends the JSON text frame `{"type":"blankVoice"}` over the socket.
 */
async sendBlankJson() {
  const payload = JSON.stringify({ type: 'blankVoice' });
  this.ws.send(payload);
}
|
||||
}
|
||||
|
||||
47
src/test/asr.ts
Normal file
47
src/test/asr.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import { WSServer } from "../asr/ws.ts";
|
||||
import net from "net";
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import dotenv from 'dotenv';
|
||||
dotenv.config();
|
||||
// Sample recordings streamed to the local ASR endpoint. Other local samples
// that have been used with this script: videos/asr_example.wav, videos/tts_mix.mp3.
const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav');
const videoTestPath2 = path.join(process.cwd(), 'videos/asr_example2.wav');

/** Wraps a generated silence buffer in the JSON message sent between recordings. */
const toBlankVoiceMessage = (buffer: Buffer) => ({ type: 'blankVoice', voice: buffer.toString('base64') });

/**
 * Manual smoke test: connects to the local ASR websocket, sends `init` once the
 * server reports `connected`, and after the first successful `asr` reply sends
 * two recordings, each followed by a blank-voice message.
 */
const ws = new WSServer({
  url: "ws://localhost:51015/ws/asr?id=test",
  onConnect: async () => {
    console.log("WebSocket connected");
    ws.emitter.on("message", (data) => {
      let json: { type?: string; code?: number } | null;
      try {
        json = JSON.parse(data.data);
      } catch (error) {
        // Not every frame is guaranteed to be JSON; skip the ones that are not.
        console.error('Ignoring non-JSON message', error);
        return;
      }
      if (json && json.type === 'connected') {
        ws.ws.send(JSON.stringify({ type: 'init' }));
      }
      if (json && json.type === 'asr' && json.code === 200) {
        ws.emitter.emit('asr');
      }
    });
    ws.emitter.once('asr', async () => {
      try {
        const [audioContent, audioContent2] = await Promise.all([
          fs.readFile(videoTestPath),
          fs.readFile(videoTestPath2),
        ]);
        const base64Audio = audioContent.toString('base64');
        ws.ws.send(JSON.stringify({ voice: base64Audio }));
        console.log('slice 40', base64Audio.slice(0, 40));
        // sendBlank is asynchronous; await it so each blank frame goes out
        // before the next recording rather than after it.
        await ws.sendBlank(toBlankVoiceMessage);
        ws.ws.send(JSON.stringify({ voice: audioContent2.toString('base64') }));
        await ws.sendBlank(toBlankVoiceMessage);
      } catch (error) {
        console.error('Failed to send test audio', error);
      }
    });
  }
});

// NOTE(review): presumably only here to keep the process alive while waiting
// for websocket replies — confirm.
net.createServer().listen(60000);
|
||||
@@ -1,5 +1,5 @@
|
||||
const isBrowser = (typeof process === 'undefined') ||
|
||||
(typeof window !== 'undefined' && typeof window.document !== 'undefined') ||
|
||||
const isBrowser = (typeof process === 'undefined') ||
|
||||
(typeof window !== 'undefined' && typeof window.document !== 'undefined') ||
|
||||
(typeof process !== 'undefined' && process?.env?.BROWSER === 'true');
|
||||
const chantHttpToWs = (url: string) => {
|
||||
if (url.startsWith('http://')) {
|
||||
@@ -24,7 +24,8 @@ export const initWs = async (url: string, options?: WebSocketOptions) => {
|
||||
if (isBrowser) {
|
||||
ws = new WebSocket(url);
|
||||
} else {
|
||||
const WebSocket = await import('ws').then((module) => module.default);
|
||||
const wsPakcages = 'ws' // 避免vite 自动会默认的在浏览器引入ws然后报错
|
||||
const WebSocket = await import(wsPakcages).then((module) => module.default);
|
||||
const { rejectUnauthorized, headers, ...rest } = options || {};
|
||||
ws = new WebSocket(url, {
|
||||
rejectUnauthorized: rejectUnauthorized ?? true,
|
||||
|
||||
@@ -14,9 +14,6 @@
|
||||
],
|
||||
"@agent/*": [
|
||||
"agent/*"
|
||||
],
|
||||
"@kevisual/video-tools/*": [
|
||||
"src/*"
|
||||
]
|
||||
},
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user