// code-center/src/aura/libs/auc.ts

// https://git.xiongxiao.me/kevisual/video-tools/raw/branch/main/src/asr/provider/volcengine/auc.ts
import { nanoid } from "nanoid"

export const FlashURL = "https://openspeech.bytedance.com/api/v3/auc/bigmodel/recognize/flash"
export const AsrBaseURL = 'https://openspeech.bytedance.com/api/v3/auc/bigmodel/submit'
export const AsrBase = 'volc.bigasr.auc'
export const AsrTurbo = 'volc.bigasr.auc_turbo'

const uuid = () => nanoid()

type AsrOptions = {
  url?: string
  appid?: string
  token?: string
  type?: AsrType
}

type AsrType = 'flash' | 'standard' | 'turbo'
export class Asr {
  url: string = FlashURL
  appid: string = ""
  token: string = ""
  type: AsrType = 'flash'
  constructor(options: AsrOptions = {}) {
    this.appid = options.appid || ""
    this.token = options.token || ""
    this.type = options.type || 'flash'
    if (this.type !== 'flash') {
      this.url = AsrBaseURL
    }
    if (!this.appid || !this.token) {
      console.error("VOLCENGINE_ASR_APPID or VOLCENGINE_ASR_TOKEN is not set")
    }
  }

  header() {
    const model = this.type === 'flash' ? AsrTurbo : AsrBase
    return {
      "X-Api-App-Key": this.appid,
      "X-Api-Access-Key": this.token,
      "X-Api-Resource-Id": model,
      "X-Api-Request-Id": uuid(),
      "X-Api-Sequence": "-1",
    }
  }
  submit(body: AsrRequest) {
    if (!body.audio || (!body.audio.url && !body.audio.data)) {
      throw new Error("audio.url or audio.data is required")
    }
    const data: AsrRequest = {
      ...body,
    }
    return fetch(this.url, { method: "POST", headers: this.header(), body: JSON.stringify(data) })
  }
  async getText(body: AsrRequest) {
    const res = await this.submit(body)
    return res.json()
  }
}

export type AsrResponse = {
  audio_info: {
    /**
     * 音频时长，单位为 ms
     */
    duration: number;
  };
  result: {
    additions: {
      duration: string;
    };
    text: string;
    utterances: Array<{
      end_time: number;
      start_time: number;
      text: string;
      words: Array<{
        confidence: number;
        end_time: number;
        start_time: number;
        text: string;
      }>;
    }>;
  };
}
export interface AsrRequest {
  user?: {
    uid: string;
  };
  audio: {
    url?: string;
    data?: string;
    format?: 'wav' | 'pcm' | 'mp3' | 'ogg';
    codec?: 'raw' | 'opus'; 	// raw / opus，默认为 raw(pcm) 。
    rate?: 8000 | 16000; // 采样率，支持 8000 或 16000，默认为 16000 。
    channel?: 1 | 2; // 声道数，支持 1 或 2，默认为 1。
  };


  request?: {
    model_name?: string; // 识别模型名称，如 "bigmodel"
    enable_words?: boolean; // 是否开启词级别时间戳，默认为 false。
    enable_sentence_info?: boolean; // 是否开启句子级别时间戳，默认为 false。
    enable_utterance_info?: boolean; // 是否开启语句级别时间戳，默认为 true。
    enable_punctuation_prediction?: boolean; // 是否开启标点符号预测，默认为 true。
    enable_inverse_text_normalization?: boolean; // 是否开启文本规范化，默认为 true。
    enable_separate_recognition_per_channel?: boolean; // 是否开启声道分离识别，默认为 false。
    audio_channel_count?: 1 | 2; // 音频声道数，仅在 enable_separate_recognition_per_channel 开启时有效，支持 1 或 2，默认为 1。
    max_sentence_silence?: number; // 句子最大静音时间，仅在 enable_sentence_info 开启时有效，单位为 ms，默认为 800。
    custom_words?: string[];
    enable_channel_split?: boolean; // 是否开启声道分离
    enable_ddc?: boolean; // 是否开启 DDC（双通道降噪）
    enable_speaker_info?: boolean; // 是否开启说话人分离
    enable_punc?: boolean; // 是否开启标点符号预测（简写）
    enable_itn?: boolean; // 是否开启文本规范化（简写）
    vad_segment?: boolean; // 是否开启 VAD 断句
    show_utterances?: boolean; // 是否返回语句级别结果
    corpus?: {
      boosting_table_name?: string;
      correct_table_name?: string;
      context?: string;
    };
  };
}

// const main = async () => {
//   const base64Audio = wavToBase64(audioPath);
//   const auc = new Asr({
//     appid: config.VOLCENGINE_AUC_APPID,
//     token: config.VOLCENGINE_AUC_TOKEN,
//   });
//   const result = await auc.getText({ audio: { data: base64Audio } });
//   console.log(util.inspect(result, { showHidden: false, depth: null, colors: true }))
// }

// main();