更新 package.json 和 pnpm-lock.yaml,修改版本号并添加依赖;重构 ASR 模块,优化音频处理逻辑,新增 AliyunAucChat 类及测试用例

This commit is contained in:
2025-12-22 01:49:48 +08:00
parent 00e91e8b28
commit 5db0418cb8
10 changed files with 219 additions and 24 deletions

View File

@@ -1,6 +1,6 @@
{
"name": "@kevisual/video-tools",
"version": "0.0.8",
"version": "0.0.12",
"description": "",
"main": "index.js",
"basename": "/root/video-tools",
@@ -9,7 +9,7 @@
"type": "system-app"
},
"scripts": {
"build": "bun bun.config.ts",
"build": "npm publish",
"dev:bun": "bun run src/dev.ts --watch",
"test": "tsx test/**/*.ts",
"clean": "rm -rf dist",
@@ -30,6 +30,7 @@
},
"dependencies": {
"@gradio/client": "^2.0.1",
"@kevisual/ai": "^0.0.19",
"@kevisual/router": "0.0.48",
"@kevisual/use-config": "^1.0.21",
"@kevisual/video": "^0.0.2",

22
pnpm-lock.yaml generated
View File

@@ -11,6 +11,9 @@ importers:
'@gradio/client':
specifier: ^2.0.1
version: 2.0.1
'@kevisual/ai':
specifier: ^0.0.19
version: 0.0.19
'@kevisual/router':
specifier: 0.0.48
version: 0.0.48
@@ -68,12 +71,21 @@ packages:
resolution: {integrity: sha512-NLaQNj5fn+Klgtf9ESL2NhlfBo9GHYjxBCbLMXamRev36nQ/fVmhKV2V2DLV91IVTbL/gAMzeTsCmZ1Cl2CLlQ==}
engines: {node: '>=18.0.0'}
'@kevisual/ai@0.0.19':
resolution: {integrity: sha512-AFc8m6OcHZNxCb88bvzhvwWTZ4EVYyPupBzPUsLKLpdNBvsqm9TRboKCM2brJj2cqHnm+H+RbAk9AcGJkYhRCA==}
'@kevisual/load@0.0.6':
resolution: {integrity: sha512-+3YTFehRcZ1haGel5DKYMUwmi5i6f2psyaPZlfkKU/cOXgkpwoG9/BEqPCnPjicKqqnksEpixVRkyHJ+5bjLVA==}
'@kevisual/logger@0.0.4':
resolution: {integrity: sha512-+fpr92eokSxoGOW1SIRl/27lPuO+zyY+feR5o2Q4YCNlAdt2x64NwC/w8r/3NEC5QenLgd4K0azyKTI2mHbARw==}
'@kevisual/permission@0.0.3':
resolution: {integrity: sha512-8JsA/5O5Ax/z+M+MYpFYdlioHE6jNmWMuFSokBWYs9CCAHNiSKMR01YLkoVDoPvncfH/Y8F5K/IEXRCbptuMNA==}
'@kevisual/query@0.0.31':
resolution: {integrity: sha512-bBdepjmMICLpcj/a9fnn82/0CGGYUZiCV+usWsJZKAwVlZcnj+WtKmbgKT09KpP6g3jjYzYOaXHiNFB8N0bQAQ==}
'@kevisual/router@0.0.48':
resolution: {integrity: sha512-WsSvT+NpfC/bZbaAzE3WSKD2DRZP0JuPQJGr4YucSdO/lOLB4cEpOZRbPlV3l7G064ow8QJRAN2DUW+bRjrp1A==}
@@ -336,12 +348,22 @@ snapshots:
dependencies:
fetch-event-stream: 0.1.5
'@kevisual/ai@0.0.19':
dependencies:
'@kevisual/logger': 0.0.4
'@kevisual/permission': 0.0.3
'@kevisual/query': 0.0.31
'@kevisual/load@0.0.6':
dependencies:
eventemitter3: 5.0.1
'@kevisual/logger@0.0.4': {}
'@kevisual/permission@0.0.3': {}
'@kevisual/query@0.0.31': {}
'@kevisual/router@0.0.48':
dependencies:
path-to-regexp: 8.3.0

View File

@@ -1,5 +1,5 @@
import { AsrRelatime as QwenAsrRelatime } from "./provider/aliyun/base.ts";
export { WSServer } from "./ws.ts";
export {
QwenAsrRelatime

View File

@@ -0,0 +1,59 @@
import { BailianChat } from '@kevisual/ai'

type Options = {
  token?: string
}

/**
 * Message shape accepted by the DashScope multimodal generation endpoint.
 * `content` is either plain text or a list of audio payloads
 * (base64 data URLs in the tests — TODO confirm whether remote URLs are also accepted).
 */
type TextMessages = {
  role?: 'system' | 'user' | 'assistant',
  content?: string | Array<{ audio: string }>
}

/** Message returned in `output.choices[0].message` by the API. */
type ResponseMessage = {
  role?: string,
  content?: Array<{ text: string }>,
  annotations?: { emotion: string, language: string, type: string }[],
}

/**
 * Aliyun Bailian (DashScope) audio-understanding chat client.
 * Wraps the `multimodal-generation` endpoint for ASR-style requests,
 * defaulting to the `qwen3-asr-flash` model.
 */
export class AliyunAucChat extends BailianChat {
  constructor(opts?: Options) {
    super({
      apiKey: opts?.token,
      baseURL: 'https://dashscope.aliyuncs.com/api/v1',
    })
  }
  /**
   * Send `messages` (typically carrying base64 audio) to the ASR model and
   * return the first choice's message. Also records the token usage counters
   * on the instance (presumably fields of BailianChat — confirm in base class).
   *
   * @param messages conversation turns; audio goes in `content[].audio`
   * @param options optional model override and extra request `parameters`
   *   (merged over the defaults, so callers can override `incremental_output`
   *   or `asr_options`)
   * @returns the response message (role, text content, annotations)
   * @throws Error when the HTTP response is not ok, including status and body
   */
  async getText(messages: TextMessages[], options?: { model?: string, parameters?: any }): Promise<ResponseMessage> {
    const model = options?.model || 'qwen3-asr-flash'
    const data = {
      model: model,
      input: {
        messages: messages,
      },
      parameters: {
        "incremental_output": true,
        "asr_options": {
          "enable_itn": false
        },
        // Caller-supplied parameters win over the defaults above.
        ...options?.parameters
      },
      stream: false,
    }
    const response = await this.post(`${this.baseURL}/services/aigc/multimodal-generation/generation`, { data: data });
    if (!response.ok) {
      const errorText = await response.text();
      throw new Error(`Chat API request failed: ${response.status} ${response.statusText} - ${errorText}`);
    }
    const res = await response.json() as any;
    const choices = res.output?.choices || [];
    const choice = choices[0] || {};
    const message = choice.message || {};
    // Usage counters default to 0 when the API omits the usage section.
    this.prompt_tokens = res.usage?.prompt_tokens ?? 0;
    this.total_tokens = res.usage?.total_tokens ?? 0;
    this.completion_tokens = res.usage?.completion_tokens ?? 0;
    // Concatenate all text parts of the message content (if any).
    const text = message.content?.map?.((item: any) => item.text).join('') || '';
    this.responseText = text;
    return message as ResponseMessage;
  }
}

View File

@@ -103,18 +103,43 @@ export class AsrRelatime extends WSServer {
const data = event.data;
try {
const result = JSON.parse(data.toString());
const isEnd = await this.isEnd(result.type);
const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed'];
const isEnd = this.isComplated(result.type, types[1]);
const isText = this.isComplated(result.type, types[0]);
if (isEnd && result?.transcript) {
const text = result.transcript;
this.emitter.emit('result', {
text: text,
raw: result
});
} else if (isText && result?.stash) {
this.emitter.emit('partial', {
text: result.stash,
raw: result
});
}
} catch (error) {
console.log('error', error);
}
}
/**
 * Node-side helper: convert Float32 PCM audio sent from the browser
 * (a base64-encoded Float32Array with samples in [-1, 1]) into little-endian
 * signed 16-bit PCM, the format the ASR service expects.
 * @param base64 base64 string whose decoded bytes are little-endian float32 samples
 * @returns Buffer of little-endian Int16 PCM samples
 */
async fixBrowerBuffer(base64: string): Promise<Buffer> {
  const raw = Buffer.from(base64, 'base64');
  // Reinterpret the bytes as float32 samples (4 bytes each). Floor the count
  // so a payload whose length is not a multiple of 4 doesn't make the
  // Float32Array constructor throw a RangeError; trailing bytes are dropped.
  const sampleCount = Math.floor(raw.byteLength / 4);
  const floatArray = new Float32Array(raw.buffer, raw.byteOffset, sampleCount);
  const pcm16 = Buffer.alloc(sampleCount * 2);
  for (let i = 0; i < sampleCount; i++) {
    // Clamp to [-1.0, 1.0], then scale to the Int16 range [-32768, 32767].
    const sample = Math.max(-1, Math.min(1, floatArray[i]));
    // Round explicitly: previously the non-integer product was truncated
    // toward zero by the implicit coercion inside writeInt16LE, which biases
    // small samples toward 0. Rounding is the conventional float->PCM16 step.
    const scaled = Math.round(sample < 0 ? sample * 0x8000 : sample * 0x7FFF);
    pcm16.writeInt16LE(scaled, i * 2);
  }
  return pcm16;
}
async onClose(event: CloseEvent) {
let { code } = event;
if (code === 1007) {
@@ -123,14 +148,7 @@ export class AsrRelatime extends WSServer {
}
super.onClose({ ...event, code });
}
async isEnd(type: string) {
const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed'];
if (type === types[1]) {
return true;
}
return false;
}
async sendBlank(buffer?: Buffer): Promise<void> {
this.sendBuffer(buffer || this.generateSilence(2));
this.sendBuffer(buffer || this.generateSilence(2) as Buffer);
}
}

View File

@@ -0,0 +1,35 @@
import { AliyunAucChat } from '../auc.ts';
import fs from 'fs/promises';
import path from 'path';
import net from 'net';
import dotenv from 'dotenv';

// Load BAILIAN_API_KEY (and friends) from .env.
dotenv.config();

// Audio fixtures; swap the active path to exercise other clips.
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav');
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example2.wav');
// const videoTestPath = path.join(process.cwd(), 'videos/tts_mix.mp3');
const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav');
const videoTestPath2 = path.join(process.cwd(), 'videos/asr_example2.wav');
const videoBlankPath = path.join(process.cwd(), 'videos/blank.wav');
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

// One user turn carrying the audio clip as a base64 data URL.
const messages = [
  {
    role: 'user' as const,
    content: [
      {
        audio: `data:audio/wav;base64,${(await fs.readFile(videoTestPath2)).toString('base64')}`,
      },
    ],
  },
];

async function main() {
  const chat = new AliyunAucChat({
    token: process.env.BAILIAN_API_KEY,
  });
  // NOTE(review): `stream: false` was removed from the options literal —
  // it is not part of getText's options type (an excess-property error under
  // strict TS), and getText already hard-codes `stream: false` in the request.
  const response = await chat.getText(messages, { model: 'qwen3-asr-flash' });
  console.log('Final response:', response);
}
main().catch(console.error);

View File

@@ -9,7 +9,7 @@ export type WSSOptions = {
emitter?: EventEmitter;
};
interface WSServerInterface {
isComplated(type: string, endType?: string): Promise<boolean>;
isComplated(type: string, endType?: string): boolean;
start(): Promise<void>;
}
export class WSServer implements WSServerInterface {
@@ -151,7 +151,7 @@ export class WSServer implements WSServerInterface {
}
}
async isComplated(type: string, endType = '') {
isComplated(type: string, endType = '') {
if (type === endType) {
return true;
}
@@ -163,7 +163,7 @@ export class WSServer implements WSServerInterface {
* @param durationSeconds 静音时长(秒)
* @returns WAV 音频缓冲区
*/
generateSilence(durationSeconds: number): Buffer {
generateSilence(durationSeconds: number, { encoding = 'buffer' }: { encoding?: 'buffer' | 'base64' } = {}): Buffer | string {
const sampleRate = 16000; // 采样率 16kHz
const bitDepth = 16; // 位深 16bit
const channels = 1; // 单声道
@@ -203,15 +203,27 @@ export class WSServer implements WSServerInterface {
const silenceData = Buffer.alloc(dataSize);
// 合并头部和数据
return Buffer.concat([header, silenceData]);
const buffer = Buffer.concat([header, silenceData]);
if (encoding === 'base64') {
return buffer.toString('base64');
}
return buffer;
}
async sendBlank(buffer?: Buffer) {
async sendBlank(buffer?: Buffer | ((buffer: Buffer) => any)) {
const isConnected = await this.checkConnected();
if (!isConnected) {
this.reconnect({ timeout: 1000 });
return;
}
if (buffer) {
if (buffer && typeof buffer === 'function') {
const blankBuffer = this.generateSilence(2);
const value = await buffer(Buffer.from(blankBuffer));
if (typeof value === 'string') {
this.ws.send(value);
} else {
this.ws.send(JSON.stringify(value));
}
} else if (buffer && Buffer.isBuffer(buffer)) {
this.ws.send(buffer);
return;
}
@@ -219,4 +231,7 @@ export class WSServer implements WSServerInterface {
const blankBuffer = this.generateSilence(2);
this.ws.send(blankBuffer);
}
// Notify the peer with a JSON "blankVoice" marker instead of sending an
// actual silent audio frame (compare sendBlank, which sends real silence).
async sendBlankJson() {
  this.ws.send(JSON.stringify({ type: 'blankVoice' }));
}
}

47
src/test/asr.ts Normal file
View File

@@ -0,0 +1,47 @@
import { WSServer } from "../asr/ws.ts";
import net from "net";
import fs from 'fs/promises';
import path from 'path';
import dotenv from 'dotenv';

dotenv.config();

// Audio fixtures; swap the active path to exercise other clips.
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav');
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example2.wav');
// const videoTestPath = path.join(process.cwd(), 'videos/tts_mix.mp3');
const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav');
const videoTestPath2 = path.join(process.cwd(), 'videos/asr_example2.wav');
const videoBlankPath = path.join(process.cwd(), 'videos/blank.wav');
const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

const ws = new WSServer({
  url: "ws://localhost:51015/ws/asr?id=test",
  onConnect: async () => {
    console.log("WebSocket connected");
    ws.emitter.on("message", (data) => {
      // console.log("Received message:", data.data);
      const json = JSON.parse(data.data);
      // console.log('json', json);
      // Handshake: once the server confirms the connection, request ASR init.
      if (json && json.type === 'connected') {
        ws.ws.send(JSON.stringify({ type: 'init' }));
      }
      // Server acknowledged the ASR session; start streaming audio.
      if (json && json.type === 'asr' && json.code === 200) {
        ws.emitter.emit('asr');
      }
    });
    ws.emitter.once('asr', async () => {
      const audioContent = await fs.readFile(videoTestPath);
      const audioContent2 = await fs.readFile(videoTestPath2);
      const base64Audio = audioContent.toString('base64');
      const value = { voice: base64Audio };
      ws.ws.send(JSON.stringify(value));
      console.log('slice 40', base64Audio.slice(0, 40));
      // Await each sendBlank: it is async (it checks the connection before
      // sending), so without the await the silence frame could be queued
      // AFTER the next clip, breaking the clip / silence / clip ordering.
      await ws.sendBlank((buffer) => ({ type: 'blankVoice', voice: buffer.toString('base64') }));
      ws.ws.send(JSON.stringify({ voice: audioContent2.toString('base64') }));
      await ws.sendBlank((buffer) => ({ type: 'blankVoice', voice: buffer.toString('base64') }));
    });
  }
});

// Dummy TCP listener: keeps the Node process alive while the websocket runs.
net.createServer().listen(60000);

View File

@@ -1,5 +1,5 @@
const isBrowser = (typeof process === 'undefined') ||
(typeof window !== 'undefined' && typeof window.document !== 'undefined') ||
const isBrowser = (typeof process === 'undefined') ||
(typeof window !== 'undefined' && typeof window.document !== 'undefined') ||
(typeof process !== 'undefined' && process?.env?.BROWSER === 'true');
const chantHttpToWs = (url: string) => {
if (url.startsWith('http://')) {
@@ -24,7 +24,8 @@ export const initWs = async (url: string, options?: WebSocketOptions) => {
if (isBrowser) {
ws = new WebSocket(url);
} else {
const WebSocket = await import('ws').then((module) => module.default);
const wsPakcages = 'ws' // 避免vite 自动会默认的在浏览器引入ws然后报错
const WebSocket = await import(wsPakcages).then((module) => module.default);
const { rejectUnauthorized, headers, ...rest } = options || {};
ws = new WebSocket(url, {
rejectUnauthorized: rejectUnauthorized ?? true,

View File

@@ -14,9 +14,6 @@
],
"@agent/*": [
"agent/*"
],
"@kevisual/video-tools/*": [
"src/*"
]
},
},