Compare commits

...

2 Commits

Author SHA1 Message Date
54da76bf9d Aliyun one-sentence speech recognition (阿里云一句话识别) 2025-05-19 01:44:24 +08:00
a1df51f56b fix funasr 2025-05-19 01:01:38 +08:00
11 changed files with 386 additions and 129 deletions

.gitignore vendored
View File

@@ -1,21 +1,69 @@
node_modules
dist
app.config.json5
apps.config.json
deploy.tar.gz
cache-file
/apps
logs
# mac
.DS_Store
.env*
!.env.example
!.env*example
dist
build
logs
.turbo
pack-dist
# astro
.astro
# next
.next
# nuxt
.nuxt
# vercel
.vercel
# vuepress
.vuepress/dist
# coverage
coverage/
# typescript
*.tsbuildinfo
# debug logs
*.log
*.tmp
# vscode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
# idea
.idea
# system
Thumbs.db
ehthumbs.db
Desktop.ini
# temp files
*.tmp
*.temp
# local development
*.local
public/r
.pnpm-store
models
videos/tts_mix.wav
videos/tts_mix.mp3

View File

@@ -49,6 +49,8 @@
"vosk": "^0.3.39"
},
"devDependencies": {
"@alicloud/pop-core": "^1.8.0",
"@kevisual/logger": "^0.0.3",
"@kevisual/types": "^0.0.6",
"@kevisual/use-config": "^1.0.10",
"@rollup/plugin-alias": "^5.1.1",

View File

@@ -0,0 +1,131 @@
type AliAsrServerOptions = {
baseUrl?: string;
appkey: string;
token: string;
format?: string;
sampleRate?: string;
enablePunctuationPrediction?: boolean;
enableInverseTextNormalization?: boolean;
enableVoiceDetection?: boolean;
};
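// Minimal client for the Aliyun NLS one-sentence recognition RESTful API: builds the request URL
// and POSTs raw audio bytes with the token in the X-NLS-Token header.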
export class AliAsrServer {
private baseUrl: string;
private appkey: string;
private token: string;
private format: string;
private sampleRate: string;
private enablePunctuationPrediction: boolean;
private enableInverseTextNormalization: boolean;
private enableVoiceDetection: boolean;
constructor(opts?: AliAsrServerOptions) {
const {
baseUrl = 'https://nls-gateway-cn-shanghai.aliyuncs.com/stream/v1/asr',
appkey = '',
token = '',
format = '',
sampleRate = '',
enablePunctuationPrediction = true,
enableInverseTextNormalization = true,
enableVoiceDetection = false,
} = opts || {};
this.baseUrl = baseUrl;
this.appkey = appkey;
this.token = token;
this.format = format;
this.sampleRate = sampleRate;
this.enablePunctuationPrediction = enablePunctuationPrediction;
this.enableInverseTextNormalization = enableInverseTextNormalization;
this.enableVoiceDetection = enableVoiceDetection;
}
buildRequestUrl(): string {
const params = new URLSearchParams();
params.append('appkey', this.appkey);
this.format && params.append('format', this.format);
this.sampleRate && params.append('sample_rate', this.sampleRate);
if (this.enablePunctuationPrediction) {
params.append('enable_punctuation_prediction', 'true');
}
if (this.enableInverseTextNormalization) {
params.append('enable_inverse_text_normalization', 'true');
}
if (this.enableVoiceDetection) {
params.append('enable_voice_detection', 'true');
}
return `${this.baseUrl}?${params.toString()}`;
}
async processAudio(audioContent: Buffer): Promise<any> {
try {
// Set request headers
const headers = {
'X-NLS-Token': this.token,
'Content-Type': 'application/octet-stream',
};
// Build the request URL
const requestUrl = this.buildRequestUrl();
// Send the request
const response = await fetch(requestUrl, {
method: 'POST',
headers,
body: audioContent,
});
// Handle the response
if (!response.ok) {
console.log(`Audio recognition failed, HTTP status code: ${response.status}`);
const v = await response.text();
console.log('Recognition failure response body:', v);
return null;
}
// Parse the response body
// console.log('The audio file recognized response:', v);
const body = await response.json();
if (body.status === 20000000) {
console.log('Audio recognition result:');
console.log(body);
console.log('result: ' + body.result);
console.log('Audio recognition succeeded!');
return body;
} else {
console.log('Audio recognition failed!');
console.log(body);
return null;
}
} catch (error) {
if (error.code === 'ENOENT') {
console.log('The audio file does not exist!');
} else {
console.log('Error during audio processing:', error);
}
return null;
}
}
}
// // Usage example
// async function main() {
// const asrServer = new AliAsrServer({
// appkey: '<fill in your appkey>',
// token: '<fill in your NLS service token>',
// format: 'pcm',
// sampleRate: '16000',
// enablePunctuationPrediction: true,
// enableInverseTextNormalization: true,
// enableVoiceDetection: false,
// });
// // processAudio expects a Buffer, so read the file first (e.g. with fs/promises).
// const audioFile = '/path/to/nls-sample-16k.wav';
// const audioContent = await fs.readFile(audioFile);
// await asrServer.processAudio(audioContent);
// }
// // Run the main function
// main().catch(console.error);
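// For reference, a successful response body (status 20000000) has roughly this shape
// (values are placeholders, not real output):
// { "task_id": "<task id>", "result": "<recognized text>", "status": 20000000, "message": "SUCCESS" }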

View File

@@ -0,0 +1,42 @@
import RPCClient from '@alicloud/pop-core';
interface TokenResponse {
Token: {
Id: string;
ExpireTime: number;
};
}
type AliCommonOptions = {
accessKeyId: string;
accessKeySecret: string;
};
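// Exchanges an Aliyun AccessKey pair for a temporary NLS token via the CreateToken API and caches it until it expires.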
export class AliCommon {
private accessKeyId: string;
private accessKeySecret: string;
private endpoint: string;
private apiVersion: string;
token = '';
expireTime = 0;
constructor(opts?: AliCommonOptions) {
this.accessKeyId = opts?.accessKeyId || process.env.ALIYUN_AK_ID || '';
this.accessKeySecret = opts?.accessKeySecret || process.env.ALIYUN_AK_SECRET || '';
this.endpoint = 'http://nls-meta.cn-shanghai.aliyuncs.com';
this.apiVersion = '2019-02-28';
}
async getToken() {
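// Return the cached token while it is still valid; CreateToken's ExpireTime is a Unix timestamp in seconds.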
if (this.token && this.expireTime > Date.now()) {
return this.token;
}
const client = new RPCClient({
accessKeyId: this.accessKeyId,
accessKeySecret: this.accessKeySecret,
endpoint: this.endpoint,
apiVersion: this.apiVersion,
});
const result = await client.request<TokenResponse>('CreateToken', {});
this.token = result.Token.Id;
this.expireTime = result.Token.ExpireTime * 1000;
return result.Token.Id;
}
}
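For orientation, a minimal sketch of how the two new classes can be combined. The environment variable names and the sample file path come from this diff; the import paths and the wiring itself are assumptions, not code from the commit:

import fs from 'fs/promises';
import { AliCommon } from './base.ts';
import { AliAsrServer } from './aliyun-asr-server.ts';

async function recognize(file: string) {
  // AliCommon falls back to ALIYUN_AK_ID / ALIYUN_AK_SECRET from the environment and caches the token.
  const common = new AliCommon();
  const token = await common.getToken();
  // Hand the short-lived token and the appkey to the one-sentence recognition client.
  const asr = new AliAsrServer({
    appkey: process.env.ALI_ASR_APP_KEY || '',
    token,
    format: 'mp3',
  });
  const audio = await fs.readFile(file);
  return asr.processAudio(audio);
}

recognize('videos/tts_mix.mp3').catch(console.error);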

View File

@@ -0,0 +1,25 @@
import { AliAsrServer } from '../aliyun-asr-server.ts';
import fs from 'fs/promises';
import path from 'path';
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav');
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example2.wav');
const videoTestPath = path.join(process.cwd(), 'videos/tts_mix.mp3');
// const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav');
const name = 'output-1746007775571.mp3';
const videoTestPath2 = path.join(process.cwd(), 'build', name);
// Usage example
async function main() {
const asrServer = new AliAsrServer({
appkey: process.env.ALI_ASR_APP_KEY,
token: process.env.ALI_ASR_TOKEN,
format: 'mp3',
// format: 'wav',
});
const audioContent = await fs.readFile(videoTestPath);
await asrServer.processAudio(audioContent);
}
// Run the main function
main().catch(console.error);

View File

@@ -0,0 +1,10 @@
import dotenv from 'dotenv';
dotenv.config();
import { AliCommon } from '../base.ts';
const aliCommon = new AliCommon({
accessKeyId: process.env.ALIYUN_AK_ID,
accessKeySecret: process.env.ALIYUN_AK_SECRET,
});
aliCommon.getToken().then(console.log);

View File

@@ -3,40 +3,63 @@ import net from 'net';
import path from 'path';
import fs from 'fs';
const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav');
const ws = new VideoWS({
// url: 'wss://192.168.31.220:10095',
url: 'wss://funasr.xiongxiao.me',
isFile: true,
onConnect: async () => {
console.log('onConnect');
const data = fs.readFileSync(videoTestPath);
let sampleBuf = new Uint8Array(data);
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example.wav');
// const videoTestPath = path.join(process.cwd(), 'videos/asr_example2.wav');
const videoTestPath = path.join(process.cwd(), 'videos/tts_mix.mp3');
// const videoTestPath = path.join(process.cwd(), 'videos/my_speech_text.wav');
const name = 'output-1746007775571.mp3';
const videoTestPath2 = path.join(process.cwd(), 'build', name);
var chunk_size = 960; // for asr chunk_size [5, 10, 5]
let totalsend = 0;
let len = 0;
ws.start();
while (sampleBuf.length >= chunk_size) {
const sendBuf = sampleBuf.slice(0, chunk_size);
totalsend = totalsend + sampleBuf.length;
sampleBuf = sampleBuf.slice(chunk_size, sampleBuf.length);
if (len === 100) {
// ws.stop();
// ws.start();
await new Promise((resolve) => setTimeout(resolve, 1000));
}
ws.send(sendBuf);
len++;
}
ws.stop();
console.log('len', len);
const url = 'wss://funasr.xiongxiao.me';
// const ws = new VideoWS({
// // url: 'wss://192.168.31.220:10095',
// url: 'wss://funasr.xiongxiao.me',
// isFile: true,
// // mode: 'offline',
// wav_format: 'mp3',
// onConnect: async () => {
// console.log('onConnect');
// const data = fs.readFileSync(videoTestPath);
// let sampleBuf = new Uint8Array(data);
// var chunk_size = 960; // for asr chunk_size [5, 10, 5]
// let totalsend = 0;
// let len = 0;
// ws.start();
// while (sampleBuf.length >= chunk_size) {
// const sendBuf = sampleBuf.slice(0, chunk_size);
// totalsend = totalsend + sampleBuf.length;
// sampleBuf = sampleBuf.slice(chunk_size, sampleBuf.length);
// if (len === 100) {
// // ws.stop();
// // ws.start();
// // await new Promise((resolve) => setTimeout(resolve, 1000));
// }
// await new Promise((resolve) => setTimeout(resolve, 10));
// ws.send(sendBuf);
// len++;
// }
// await new Promise((resolve) => setTimeout(resolve, 1000));
// ws.stop();
// console.log('len', len);
// },
// });
// const server = net.createServer((socket) => {
// socket.on('data', (data) => {
// console.log('data', data);
// });
// });
// server.listen(10096);
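// Push two audio files through sendBuffer over the FunASR websocket, declaring mp3 as the wav_format.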
const ws2 = new VideoWS({
url: url,
onConnect: async () => {
const data = fs.readFileSync(videoTestPath);
await ws2.sendBuffer(data, { wav_format: 'mp3' });
await new Promise((resolve) => setTimeout(resolve, 1000));
const data2 = fs.readFileSync(videoTestPath2);
await ws2.sendBuffer(data2, { wav_format: 'mp3' });
},
});
const server = net.createServer((socket) => {
socket.on('data', (data) => {
console.log('data', data);
});
});
server.listen(10096);

View File

@@ -1,5 +1,7 @@
// import WebSocket from 'ws';
import { initWs } from '../../../ws-adapter/index.ts';
import { logger } from '@/logger/index.ts';
import { WSServer } from '../../provider/ws-server.ts';
export type VideoWSOptions = {
url?: string;
@@ -8,10 +10,22 @@ export type VideoWSOptions = {
mode?: VideoWsMode;
isFile?: boolean;
onConnect?: () => void;
wav_format?: string;
};
export const videoWsMode = ['2pass', 'online', 'offline'] as const;
type VideoWsMode = (typeof videoWsMode)[number];
type OpenRequest = {
chunk_size: number[];
wav_name: string;
is_speaking: boolean;
chunk_interval: number;
// Inverse text normalization (ITN):
itn: boolean;
mode: VideoWsMode;
wav_format?: string;
audio_fs?: number;
hotwords?: string;
};
export type VideoWsResult = {
isFinal: boolean;
mode: VideoWsMode;
@@ -21,48 +35,21 @@ export type VideoWsResult = {
wav_name: string;
};
export class VideoWS {
ws: WebSocket;
export class VideoWS extends WSServer {
itn?: boolean;
mode?: VideoWsMode;
isFile?: boolean;
onConnect?: () => void;
wav_format?: string;
constructor(options?: VideoWSOptions) {
super({ url: options?.url, ws: options?.ws, onConnect: options?.onConnect });
this.itn = options?.itn || false;
this.itn = options?.itn || false;
this.mode = options?.mode || 'online';
this.isFile = options?.isFile || false;
this.initWs(options);
}
async initWs(options: VideoWSOptions) {
if (options?.ws) {
this.ws = options.ws;
} else {
this.ws = await initWs(options.url);
}
this.onConnect = options?.onConnect || (() => {});
this.ws.onopen = this.onOpen.bind(this);
this.ws.onmessage = this.onMessage.bind(this);
this.ws.onerror = this.onError.bind(this);
this.ws.onclose = this.onClose.bind(this);
this.wav_format = options?.wav_format;
}
async onOpen() {
this.onConnect();
}
async start() {
let isFileMode = this.isFile;
async start(opts?: Partial<OpenRequest>) {
const chunk_size = new Array(5, 10, 5);
type OpenRequest = {
chunk_size: number[];
wav_name: string;
is_speaking: boolean;
chunk_interval: number;
itn: boolean;
mode: VideoWsMode;
wav_format?: string;
audio_fs?: number;
hotwords?: string;
};
const request: OpenRequest = {
chunk_size: chunk_size,
wav_name: 'h5', //
@@ -70,17 +57,16 @@ export class VideoWS {
chunk_interval: 10,
itn: this.itn,
mode: this.mode || 'online',
...opts,
};
console.log('request', request);
if (isFileMode) {
const file_ext = 'wav';
const file_sample_rate = 16000;
request.wav_format = file_ext;
if (file_ext == 'wav') {
request.wav_format = 'PCM';
request.audio_fs = file_sample_rate;
}
const file_sample_rate = 16000;
request.wav_format = request.wav_format || this.wav_format || 'wav';
if ('wav' == request.wav_format) {
request.wav_format = 'PCM';
request.audio_fs = file_sample_rate;
}
console.log('request', request);
this.ws.send(JSON.stringify(request));
}
async stop() {
@@ -99,7 +85,28 @@ export class VideoWS {
this.ws.send(data);
}
}
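// sendBuffer: convenience helper that runs a full start -> chunked send -> stop cycle for one audio buffer.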
async sendBuffer(data: Buffer, opts?: { isFile?: boolean; wav_format?: string }) {
const { wav_format = 'wav' } = opts || {};
if (this.ws && this.ws.readyState === WebSocket.OPEN) {
let sampleBuf = new Uint8Array(data);
const ws = this;
const chunk_size = 960; // for asr chunk_size [5, 10, 5]
let totalsend = 0;
let len = 0;
ws.start({ wav_format });
// Stream the buffer in fixed-size chunks, pausing 10 ms between sends.
while (sampleBuf.length >= chunk_size) {
const sendBuf = sampleBuf.slice(0, chunk_size);
totalsend = totalsend + sendBuf.length;
sampleBuf = sampleBuf.slice(chunk_size, sampleBuf.length);
await new Promise((resolve) => setTimeout(resolve, 10));
ws.send(sendBuf);
len++;
}
// Flush the trailing bytes that are smaller than one chunk before sending the stop frame.
if (sampleBuf.length > 0) {
ws.send(sampleBuf);
}
ws.stop();
}
}
async onMessage(event: MessageEvent) {
super.onMessage(event);
const data = event.data;
try {
const result = JSON.parse(data.toString());

View File

@@ -1,37 +1,6 @@
import { pino } from 'pino';
import { useConfig } from '@kevisual/use-config/env';
import { Logger } from '@kevisual/logger/node';
const config = useConfig();
export const logger = pino({
level: config.LOG_LEVEL || 'info',
transport: {
target: 'pino-pretty',
options: {
colorize: true,
translateTime: 'SYS:standard',
ignore: 'pid,hostname',
},
},
serializers: {
error: pino.stdSerializers.err,
req: pino.stdSerializers.req,
res: pino.stdSerializers.res,
},
// base: {
// app: 'ai-videos',
// env: process.env.NODE_ENV || 'development',
// },
const level = process.env.LOG_LEVEL || 'info';
export const logger = new Logger({
level: level as any,
});
export const logError = (message: string, data?: any) => logger.error({ data }, message);
export const logWarning = (message: string, data?: any) => logger.warn({ data }, message);
export const logInfo = (message: string, data?: any) => logger.info({ data }, message);
export const logDebug = (message: string, data?: any) => logger.debug({ data }, message);
export const log = {
error: logError,
warn: logWarning,
info: logInfo,
debug: logDebug,
};

View File

@@ -1,9 +1,9 @@
import assert from 'assert';
import { logDebug, logInfo } from '../logger/index.ts';
import { logger } from '../logger/index.ts';
import { ChildProcessWithoutNullStreams, spawn } from 'child_process';
import recorders from '../recorder/recorders/index.ts';
import Stream from 'stream';
const logDebug = logger.debug;
export type RecordingOptions = {
/* Sample rate, defaults to 16000 */
sampleRate?: number;