diff --git a/package.json b/package.json index c9ef365..de29521 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@kevisual/video-tools", - "version": "0.0.8", + "version": "0.0.12", "description": "", "main": "index.js", "basename": "/root/video-tools", @@ -9,7 +9,7 @@ "type": "system-app" }, "scripts": { - "build": "bun bun.config.ts", + "build": "npm publish", "dev:bun": "bun run src/dev.ts --watch", "test": "tsx test/**/*.ts", "clean": "rm -rf dist", @@ -30,6 +30,7 @@ }, "dependencies": { "@gradio/client": "^2.0.1", + "@kevisual/ai": "^0.0.19", "@kevisual/router": "0.0.48", "@kevisual/use-config": "^1.0.21", "@kevisual/video": "^0.0.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 498a408..853d2b5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -11,6 +11,9 @@ importers: '@gradio/client': specifier: ^2.0.1 version: 2.0.1 + '@kevisual/ai': + specifier: ^0.0.19 + version: 0.0.19 '@kevisual/router': specifier: 0.0.48 version: 0.0.48 @@ -68,12 +71,21 @@ packages: resolution: {integrity: sha512-NLaQNj5fn+Klgtf9ESL2NhlfBo9GHYjxBCbLMXamRev36nQ/fVmhKV2V2DLV91IVTbL/gAMzeTsCmZ1Cl2CLlQ==} engines: {node: '>=18.0.0'} + '@kevisual/ai@0.0.19': + resolution: {integrity: sha512-AFc8m6OcHZNxCb88bvzhvwWTZ4EVYyPupBzPUsLKLpdNBvsqm9TRboKCM2brJj2cqHnm+H+RbAk9AcGJkYhRCA==} + '@kevisual/load@0.0.6': resolution: {integrity: sha512-+3YTFehRcZ1haGel5DKYMUwmi5i6f2psyaPZlfkKU/cOXgkpwoG9/BEqPCnPjicKqqnksEpixVRkyHJ+5bjLVA==} '@kevisual/logger@0.0.4': resolution: {integrity: sha512-+fpr92eokSxoGOW1SIRl/27lPuO+zyY+feR5o2Q4YCNlAdt2x64NwC/w8r/3NEC5QenLgd4K0azyKTI2mHbARw==} + '@kevisual/permission@0.0.3': + resolution: {integrity: sha512-8JsA/5O5Ax/z+M+MYpFYdlioHE6jNmWMuFSokBWYs9CCAHNiSKMR01YLkoVDoPvncfH/Y8F5K/IEXRCbptuMNA==} + + '@kevisual/query@0.0.31': + resolution: {integrity: sha512-bBdepjmMICLpcj/a9fnn82/0CGGYUZiCV+usWsJZKAwVlZcnj+WtKmbgKT09KpP6g3jjYzYOaXHiNFB8N0bQAQ==} + '@kevisual/router@0.0.48': resolution: {integrity: sha512-WsSvT+NpfC/bZbaAzE3WSKD2DRZP0JuPQJGr4YucSdO/lOLB4cEpOZRbPlV3l7G064ow8QJRAN2DUW+bRjrp1A==} @@ -336,12 +348,22 @@ snapshots: dependencies: fetch-event-stream: 0.1.5 + '@kevisual/ai@0.0.19': + dependencies: + '@kevisual/logger': 0.0.4 + '@kevisual/permission': 0.0.3 + '@kevisual/query': 0.0.31 + '@kevisual/load@0.0.6': dependencies: eventemitter3: 5.0.1 '@kevisual/logger@0.0.4': {} + '@kevisual/permission@0.0.3': {} + + '@kevisual/query@0.0.31': {} + '@kevisual/router@0.0.48': dependencies: path-to-regexp: 8.3.0 diff --git a/src/asr/index.ts b/src/asr/index.ts index c882806..4532b18 100644 --- a/src/asr/index.ts +++ b/src/asr/index.ts @@ -1,5 +1,5 @@ import { AsrRelatime as QwenAsrRelatime } from "./provider/aliyun/base.ts"; - +export { WSServer } from "./ws.ts"; export { QwenAsrRelatime diff --git a/src/asr/provider/aliyun/auc.ts b/src/asr/provider/aliyun/auc.ts new file mode 100644 index 0000000..466a5f4 --- /dev/null +++ b/src/asr/provider/aliyun/auc.ts @@ -0,0 +1,59 @@ +import { BailianChat } from '@kevisual/ai' + +type Options = { + token?: string +} +export class AliyunAucChat extends BailianChat { + constructor(opts?: Options) { + super({ + apiKey: opts?.token, + baseURL: 'https://dashscope.aliyuncs.com/api/v1', + }) + } + async getText(messages: TextMessages[], options?: { model?: string, parameters?: any }): Promise { + const model = options?.model || 'qwen3-asr-flash' + const data = { + model: model, + input: { + messages: messages, + }, + parameters: { + "incremental_output": true, + "asr_options": { + "enable_itn": false + }, + ...options?.parameters + }, + stream: false, + } + const response = await this.post(`${this.baseURL}/services/aigc/multimodal-generation/generation`, { data: data }); + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Chat API request failed: ${response.status} ${response.statusText} - ${errorText}`); + } + + const res = await response.json() as any; + const choices = res.output?.choices || []; + const choice = choices[0] || {}; + const message = choice.message || {}; + this.prompt_tokens = res.usage?.prompt_tokens ?? 0; + this.total_tokens = res.usage?.total_tokens ?? 0; + this.completion_tokens = res.usage?.completion_tokens ?? 0; + + const text = message.content?.map?.((item: any) => item.text).join('') || ''; + this.responseText = text; + + return message as ResponseMessage; + } +} + +type TextMessages = { + role?: 'system' | 'user' | 'assistant', + content?: string | Array<{ audio: string }> +} + +type ResponseMessage = { + role?: string, + content?: Array<{ text: string }>, + annotations?: { emotion: string, language: string, type: string }[], +} \ No newline at end of file diff --git a/src/asr/provider/aliyun/base.ts b/src/asr/provider/aliyun/base.ts index bea108e..cafaf22 100644 --- a/src/asr/provider/aliyun/base.ts +++ b/src/asr/provider/aliyun/base.ts @@ -103,18 +103,43 @@ export class AsrRelatime extends WSServer { const data = event.data; try { const result = JSON.parse(data.toString()); - const isEnd = await this.isEnd(result.type); + const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed']; + const isEnd = this.isComplated(result.type, types[1]); + const isText = this.isComplated(result.type, types[0]); if (isEnd && result?.transcript) { const text = result.transcript; this.emitter.emit('result', { text: text, raw: result }); + } else if (isText && result?.stash) { + this.emitter.emit('partial', { + text: result.stash, + raw: result + }); } } catch (error) { console.log('error', error); } } + /** + * 运行在node环境,将浏览器发送的Float32格式音频数据转换为PCM16格式 + * @param base64 + * @returns + */ + async fixBrowerBuffer(base64: string): Promise { + let voice = Buffer.from(base64, 'base64'); + // 浏览器发送的Float32格式音频数据,需要转换为PCM16 + const floatArray = new Float32Array(voice.buffer, voice.byteOffset, voice.length / 4); + const pcm16 = Buffer.alloc(floatArray.length * 2); + for (let i = 0; i < floatArray.length; i++) { + // 将浮点数 [-1.0, 1.0] 转换为 Int16 [-32768, 32767] + const sample = Math.max(-1, Math.min(1, floatArray[i])); + pcm16.writeInt16LE(sample < 0 ? sample * 0x8000 : sample * 0x7FFF, i * 2); + } + voice = pcm16; + return voice; + } async onClose(event: CloseEvent) { let { code } = event; if (code === 1007) { @@ -123,14 +148,7 @@ export class AsrRelatime extends WSServer { } super.onClose({ ...event, code }); } - async isEnd(type: string) { - const types = ['conversation.item.input_audio_transcription.text', 'conversation.item.input_audio_transcription.completed']; - if (type === types[1]) { - return true; - } - return false; - } async sendBlank(buffer?: Buffer): Promise { - this.sendBuffer(buffer || this.generateSilence(2)); + this.sendBuffer(buffer || this.generateSilence(2) as Buffer); } } diff --git a/src/asr/provider/aliyun/test/get-text2.ts b/src/asr/provider/aliyun/test/get-text2.ts new file mode 100644 index 0000000..a1c6d35 --- /dev/null +++ b/src/asr/provider/aliyun/test/get-text2.ts @@ -0,0 +1,35 @@ +import { AliyunAucChat } from '../auc.ts'; +import fs from 'fs/promises'; +import path from 'path'; +import net from 'net'; +import dotenv from 'dotenv'; +dotenv.config(); +// const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav'); +// const videoTestPath = path.join(process.cwd(), 'videos/asr_example2.wav'); +// const videoTestPath = path.join(process.cwd(), 'videos/tts_mix.mp3'); +const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav'); +const videoTestPath2 = path.join(process.cwd(), 'videos/asr_example2.wav'); +const videoBlankPath = path.join(process.cwd(), 'videos/blank.wav'); +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); + + +const messages = [ + { + role: 'user' as const, + content: [ + { + audio: `data:audio/wav;base64,${(await fs.readFile(videoTestPath2)).toString('base64')}`, + }, + ], + }, +]; + +async function main() { + const chat = new AliyunAucChat({ + token: process.env.BAILIAN_API_KEY, + }); + const response = await chat.getText(messages, { stream: false, model: 'qwen3-asr-flash' }); + console.log('Final response:', response); +} + +main().catch(console.error); diff --git a/src/asr/ws.ts b/src/asr/ws.ts index bbef408..fc34517 100644 --- a/src/asr/ws.ts +++ b/src/asr/ws.ts @@ -9,7 +9,7 @@ export type WSSOptions = { emitter?: EventEmitter; }; interface WSServerInterface { - isComplated(type: string, endType?: string): Promise; + isComplated(type: string, endType?: string): boolean; start(): Promise; } export class WSServer implements WSServerInterface { @@ -151,7 +151,7 @@ export class WSServer implements WSServerInterface { } } - async isComplated(type: string, endType = '') { + isComplated(type: string, endType = '') { if (type === endType) { return true; } @@ -163,7 +163,7 @@ export class WSServer implements WSServerInterface { * @param durationSeconds 静音时长(秒) * @returns WAV 音频缓冲区 */ - generateSilence(durationSeconds: number): Buffer { + generateSilence(durationSeconds: number, { encoding = 'buffer' }: { encoding?: 'buffer' | 'base64' } = {}): Buffer | string { const sampleRate = 16000; // 采样率 16kHz const bitDepth = 16; // 位深 16bit const channels = 1; // 单声道 @@ -203,15 +203,27 @@ export class WSServer implements WSServerInterface { const silenceData = Buffer.alloc(dataSize); // 合并头部和数据 - return Buffer.concat([header, silenceData]); + const buffer = Buffer.concat([header, silenceData]); + if (encoding === 'base64') { + return buffer.toString('base64'); + } + return buffer; } - async sendBlank(buffer?: Buffer) { + async sendBlank(buffer?: Buffer | ((buffer: Buffer) => any)) { const isConnected = await this.checkConnected(); if (!isConnected) { this.reconnect({ timeout: 1000 }); return; } - if (buffer) { + if (buffer && typeof buffer === 'function') { + const blankBuffer = this.generateSilence(2); + const value = await buffer(Buffer.from(blankBuffer)); + if (typeof value === 'string') { + this.ws.send(value); + } else { + this.ws.send(JSON.stringify(value)); + } + } else if (buffer && Buffer.isBuffer(buffer)) { this.ws.send(buffer); return; } @@ -219,4 +231,7 @@ export class WSServer implements WSServerInterface { const blankBuffer = this.generateSilence(2); this.ws.send(blankBuffer); } + async sendBlankJson() { + this.ws.send(JSON.stringify({ type: 'blankVoice' })); + } } diff --git a/src/test/asr.ts b/src/test/asr.ts new file mode 100644 index 0000000..78f09ac --- /dev/null +++ b/src/test/asr.ts @@ -0,0 +1,47 @@ +import { WSServer } from "../asr/ws.ts"; +import net from "net"; +import fs from 'fs/promises'; +import path from 'path'; +import dotenv from 'dotenv'; +dotenv.config(); +// const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav'); +// const videoTestPath = path.join(process.cwd(), 'videos/asr_example2.wav'); +// const videoTestPath = path.join(process.cwd(), 'videos/tts_mix.mp3'); +const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav'); +const videoTestPath2 = path.join(process.cwd(), 'videos/asr_example2.wav'); +const videoBlankPath = path.join(process.cwd(), 'videos/blank.wav'); +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); + + +const ws = new WSServer({ + url: "ws://localhost:51015/ws/asr?id=test", + onConnect: async () => { + console.log("WebSocket connected"); + ws.emitter.on("message", (data) => { + // console.log("Received message:", data.data); + const json = JSON.parse(data.data); + // console.log('json', json); + if (json && json.type === 'connected') { + ws.ws.send(JSON.stringify({ type: 'init' })); + } + if (json && json.type === 'asr' && json.code === 200) { + ws.emitter.emit('asr'); + } + }); + ws.emitter.once('asr', async () => { + const audioContent = await fs.readFile(videoTestPath); + const audioContent2 = await fs.readFile(videoTestPath2); + const base64Audio = audioContent.toString('base64'); + const value = { voice: base64Audio }; + ws.ws.send(JSON.stringify(value)); + console.log('slice 40', base64Audio.slice(0, 40)); + ws.sendBlank((buffer) => ({ type: 'blankVoice', voice: buffer.toString('base64') })); + ws.ws.send(JSON.stringify({ voice: audioContent2.toString('base64') })); + ws.sendBlank((buffer) => ({ type: 'blankVoice', voice: buffer.toString('base64') })); + }); + + } +}); + + +net.createServer().listen(60000); \ No newline at end of file diff --git a/src/ws/index.ts b/src/ws/index.ts index 68e7109..025f45e 100644 --- a/src/ws/index.ts +++ b/src/ws/index.ts @@ -1,5 +1,5 @@ -const isBrowser = (typeof process === 'undefined') || - (typeof window !== 'undefined' && typeof window.document !== 'undefined') || +const isBrowser = (typeof process === 'undefined') || + (typeof window !== 'undefined' && typeof window.document !== 'undefined') || (typeof process !== 'undefined' && process?.env?.BROWSER === 'true'); const chantHttpToWs = (url: string) => { if (url.startsWith('http://')) { @@ -24,7 +24,8 @@ export const initWs = async (url: string, options?: WebSocketOptions) => { if (isBrowser) { ws = new WebSocket(url); } else { - const WebSocket = await import('ws').then((module) => module.default); + const wsPakcages = 'ws' // 避免vite 自动会默认的在浏览器引入ws然后报错 + const WebSocket = await import(wsPakcages).then((module) => module.default); const { rejectUnauthorized, headers, ...rest } = options || {}; ws = new WebSocket(url, { rejectUnauthorized: rejectUnauthorized ?? true, diff --git a/tsconfig.json b/tsconfig.json index 12eef50..010ba70 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -14,9 +14,6 @@ ], "@agent/*": [ "agent/*" - ], - "@kevisual/video-tools/*": [ - "src/*" ] }, },