import * as fs from 'fs/promises';
import * as path from 'path';
import * as zlib from 'zlib';
import { promisify } from 'util';
import { nanoid } from 'nanoid';
import { VolcEngineBase, uuid } from './base.ts';

// Promisify zlib methods
const gzipPromise = promisify(zlib.gzip);
const gunzipPromise = promisify(zlib.gunzip);

// Protocol constants
const PROTOCOL_VERSION = 0b0001;
const DEFAULT_HEADER_SIZE = 0b0001;

// Message Type
const FULL_CLIENT_REQUEST = 0b0001;
const AUDIO_ONLY_REQUEST = 0b0010;
const FULL_SERVER_RESPONSE = 0b1001;
const SERVER_ACK = 0b1011;
const SERVER_ERROR_RESPONSE = 0b1111;

// Message Type Specific Flags
const NO_SEQUENCE = 0b0000; // no check sequence
const POS_SEQUENCE = 0b0001;
const NEG_SEQUENCE = 0b0010;
const NEG_WITH_SEQUENCE = 0b0011;
const NEG_SEQUENCE_1 = 0b0011;

// Message Serialization
const NO_SERIALIZATION = 0b0000;
const JSON_SERIALIZATION = 0b0001;

// Message Compression
const NO_COMPRESSION = 0b0000;
const GZIP_COMPRESSION = 0b0001;

/**
 * Generate the 4-byte header for a WebSocket request frame
 */
function generateHeader(
  messageType = FULL_CLIENT_REQUEST,
  messageTypeSpecificFlags = NO_SEQUENCE,
  serialMethod = JSON_SERIALIZATION,
  compressionType = GZIP_COMPRESSION,
  reservedData = 0x00,
): Buffer {
  const header = Buffer.alloc(4);
  header[0] = (PROTOCOL_VERSION << 4) | DEFAULT_HEADER_SIZE;
  header[1] = (messageType << 4) | messageTypeSpecificFlags;
  header[2] = (serialMethod << 4) | compressionType;
  header[3] = reservedData;
  return header;
}

/**
 * Generate the 4-byte sequence field that precedes the payload
 */
function generateBeforePayload(sequence: number): Buffer {
  const beforePayload = Buffer.alloc(4);
  beforePayload.writeInt32BE(sequence);
  return beforePayload;
}

/**
 * Parse a response frame from the WebSocket server
 */
function parseResponse(res: Buffer): any {
  const protocolVersion = res[0] >> 4;
  const headerSize = res[0] & 0x0f;
  const messageType = res[1] >> 4;
  const messageTypeSpecificFlags = res[1] & 0x0f;
  const serializationMethod = res[2] >> 4;
  const messageCompression = res[2] & 0x0f;
  const reserved = res[3];
  const headerExtensions = res.slice(4, headerSize * 4);
  const payload = res.slice(headerSize * 4);

  const result: any = {
    isLastPackage: false,
  };
  let payloadMsg = null;
  let payloadSize = 0;
  let offset = 0;

  if (messageTypeSpecificFlags & 0x01) {
    // Frame carries a sequence number
    const seq = payload.readInt32BE(0);
    result.payloadSequence = seq;
    offset += 4;
  }
  if (messageTypeSpecificFlags & 0x02) {
    // Frame is the last package
    result.isLastPackage = true;
  }

  const remainingPayload = payload.slice(offset);

  if (messageType === FULL_SERVER_RESPONSE) {
    payloadSize = remainingPayload.readInt32BE(0);
    payloadMsg = remainingPayload.slice(4);
  } else if (messageType === SERVER_ACK) {
    const seq = remainingPayload.readInt32BE(0);
    result.seq = seq;
    if (remainingPayload.length >= 8) {
      payloadSize = remainingPayload.readUInt32BE(4);
      payloadMsg = remainingPayload.slice(8);
    }
  } else if (messageType === SERVER_ERROR_RESPONSE) {
    const code = remainingPayload.readUInt32BE(0);
    result.code = code;
    payloadSize = remainingPayload.readUInt32BE(4);
    payloadMsg = remainingPayload.slice(8);
  }

  if (!payloadMsg) {
    return result;
  }

  if (messageCompression === GZIP_COMPRESSION) {
    try {
      payloadMsg = zlib.gunzipSync(payloadMsg);
    } catch (error) {
      console.error('Error decompressing payload:', error);
    }
  }

  if (serializationMethod === JSON_SERIALIZATION) {
    try {
      payloadMsg = JSON.parse(payloadMsg.toString('utf-8'));
    } catch (error) {
      console.error('Error parsing JSON payload:', error);
    }
  } else if (serializationMethod !== NO_SERIALIZATION) {
    payloadMsg = payloadMsg.toString('utf-8');
  }

  result.payloadMsg = payloadMsg;
  result.payloadSize = payloadSize;
  return result;
}
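/*
 * Frame layout as implied by generateHeader() and parseResponse() above; this is
 * a reading aid for the code in this file, not an authoritative protocol spec:
 *
 *   byte 0: protocol version (high 4 bits) | header size in 4-byte words (low 4 bits)
 *   byte 1: message type (high 4 bits)     | message-type-specific flags (low 4 bits)
 *   byte 2: serialization method (high 4)  | compression method (low 4 bits)
 *   byte 3: reserved
 *
 * Client frames built further down then append a 4-byte big-endian sequence
 * number, a 4-byte big-endian payload size, and the gzip-compressed payload.
 */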
/**
 * Read WAV file information
 */
async function readWavInfo(data: Buffer): Promise<{
  channels: number;
  sampleWidth: number;
  sampleRate: number;
  frames: number;
  audioData: Buffer;
}> {
  // This is a simplified WAV parser - in production you should use a proper library
  if (data.length < 44) {
    throw new Error('Invalid WAV file: too short');
  }

  // Check WAV header
  if (data.slice(0, 4).toString() !== 'RIFF' || data.slice(8, 12).toString() !== 'WAVE') {
    throw new Error('Invalid WAV file: not a WAV format');
  }

  // Parse header information
  const channels = data.readUInt16LE(22);
  const sampleRate = data.readUInt32LE(24);
  const bitsPerSample = data.readUInt16LE(34);
  const sampleWidth = bitsPerSample / 8;

  // Find data chunk
  let offset = 12; // Start after "WAVE"
  let dataSize = 0;
  let audioData: Buffer = Buffer.alloc(0);

  while (offset < data.length) {
    const chunkType = data.slice(offset, offset + 4).toString();
    const chunkSize = data.readUInt32LE(offset + 4);

    if (chunkType === 'data') {
      dataSize = chunkSize;
      audioData = data.slice(offset + 8, offset + 8 + chunkSize);
      break;
    }

    offset += 8 + chunkSize;
  }

  const frames = dataSize / (channels * sampleWidth);

  return {
    channels,
    sampleWidth,
    sampleRate,
    frames,
    audioData,
  };
}

/**
 * Check if data is a valid WAV file
 */
function judgeWav(data: Buffer): boolean {
  if (data.length < 44) {
    return false;
  }
  return data.slice(0, 4).toString() === 'RIFF' && data.slice(8, 12).toString() === 'WAVE';
}

/**
 * Slice data into chunks; yields [chunk, isLast] pairs
 */
function* sliceData(data: Buffer, chunkSize: number): Generator<[Buffer, boolean]> {
  const dataLen = data.length;
  let offset = 0;
  while (offset + chunkSize < dataLen) {
    yield [data.slice(offset, offset + chunkSize), false];
    offset += chunkSize;
  }
  yield [data.slice(offset, dataLen), true];
}
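// Example (illustrative only, matching the AsrWsClient defaults below): a mono,
// 16-bit, 16 kHz WAV carries 1 * 2 * 16000 = 32000 bytes per second, so a 100 ms
// segment is 3200 bytes and would be streamed as:
//
//   for (const [chunk, isLast] of sliceData(audioData, 3200)) {
//     // gzip the chunk and send it as an AUDIO_ONLY_REQUEST frame (see below)
//   }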
interface AsrClientOptions {
  segDuration?: number;
  wsUrl?: string;
  uid?: string;
  format?: string;
  rate?: number;
  bits?: number;
  channel?: number;
  codec?: string;
  authMethod?: string;
  hotWords?: string[];
  streaming?: boolean;
  mp3SegSize?: number;
  resourceId?: string;
  token?: string;
  appid?: string;
}

interface AudioItem {
  id: string | number;
  path: string;
}

/**
 * ASR WebSocket Client
 */
export class AsrWsClient extends VolcEngineBase {
  private audioPath: string;
  private successCode: number = 1000;
  private segDuration: number;
  private format: string;
  private rate: number;
  private bits: number;
  private channel: number;
  private codec: string;
  private authMethod: string;
  private hotWords: string[] | null;
  private streaming: boolean;
  private mp3SegSize: number;
  private reqEvent: number = 1;
  private uid: string;
  private seq: number = 1;
  private hasSendFullClientRequest: boolean = false;

  constructor(audioPath: string, options: AsrClientOptions = {}) {
    super({
      url: options.wsUrl || 'wss://openspeech.bytedance.com/api/v3/sauc/bigmodel',
      onConnect: () => this.onWsConnect(),
      wsOptions: {
        headers: {
          'X-Api-Resource-Id': options.resourceId || 'volc.bigasr.sauc.duration',
          'X-Api-Access-Key': options.token || '',
          'X-Api-App-Key': options.appid || '',
          'X-Api-Request-Id': uuid(),
        },
      },
    });

    this.audioPath = audioPath;
    this.segDuration = options.segDuration || 100;
    this.uid = options.uid || 'test';
    this.format = options.format || 'wav';
    this.rate = options.rate || 16000;
    this.bits = options.bits || 16;
    this.channel = options.channel || 1;
    this.codec = options.codec || 'raw';
    this.authMethod = options.authMethod || 'none';
    this.hotWords = options.hotWords || null;
    this.streaming = options.streaming !== undefined ? options.streaming : true;
    this.mp3SegSize = options.mp3SegSize || 1000;
  }

  private onWsConnect() {
    console.log('ASR WebSocket connected');
  }

  /**
   * Construct request parameters
   */
  private constructRequest(reqId: string, data?: any): any {
    return {
      user: {
        uid: this.uid,
      },
      audio: {
        format: this.format,
        sample_rate: this.rate,
        bits: this.bits,
        channel: this.channel,
        codec: this.codec,
      },
      request: {
        model_name: 'bigmodel',
        enable_punc: true,
      },
    };
  }

  private async sendFullClientRequest() {
    if (this.hasSendFullClientRequest) {
      return;
    }

    this.seq = 1;
    const seq = this.seq;
    const reqId = nanoid();
    const requestParams = this.constructRequest(reqId);

    // Prepare and send the initial full client request
    const payloadStr = JSON.stringify(requestParams);
    const compressedPayload = await gzipPromise(Buffer.from(payloadStr));

    const fullClientRequest = Buffer.concat([
      generateHeader(FULL_CLIENT_REQUEST, POS_SEQUENCE),
      generateBeforePayload(seq),
      Buffer.alloc(4),
      compressedPayload,
    ]);

    // Set payload size (the 4 bytes after the header and sequence fields)
    fullClientRequest.writeUInt32BE(compressedPayload.length, 8);

    // Send initial request
    (this as any).ws.send(fullClientRequest);
    this.hasSendFullClientRequest = true;
  }
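  /*
   * Streaming flow implemented by segmentDataProcessor() below (a summary, not a
   * contract): the full client request is sent once, then each audio chunk is
   * gzip-compressed and sent as an AUDIO_ONLY_REQUEST frame, and the client waits
   * for one server frame per chunk. The last chunk uses NEG_WITH_SEQUENCE and a
   * negated sequence number; its response is returned as the final result.
   */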
  /**
   * Process audio data in segments
   */
  private async segmentDataProcessor(audioData: Buffer, segmentSize: number): Promise<any> {
    await this.sendFullClientRequest();

    // Wait for the initial response, then stream the audio chunks
    const result = await new Promise((resolve, reject) => {
      const onMessage = async (event: MessageEvent) => {
        try {
          const response = parseResponse(Buffer.from(event.data as ArrayBuffer));
          console.log('Initial response:', response);

          // Process audio chunks
          for (const [chunk, last] of sliceData(audioData, segmentSize)) {
            this.seq += 1;
            if (last) {
              // A negative sequence number marks the final audio frame
              this.seq = -this.seq;
            }
            const seq = this.seq;
            const start = Date.now();

            const compressedChunk = await gzipPromise(chunk);
            const messageType = AUDIO_ONLY_REQUEST;
            const flags = last ? NEG_WITH_SEQUENCE : POS_SEQUENCE;

            const audioRequest = Buffer.concat([
              generateHeader(messageType, flags),
              generateBeforePayload(seq),
              Buffer.alloc(4),
              compressedChunk,
            ]);

            // Set payload size
            audioRequest.writeUInt32BE(compressedChunk.length, 8);

            // Send audio chunk
            (this as any).ws.send(audioRequest);

            // Wait for the response to this chunk
            const chunkResponse = await new Promise((resolveChunk) => {
              const onChunkMessage = (chunkEvent: MessageEvent) => {
                (this as any).ws.removeEventListener('message', onChunkMessage);
                const parsed = parseResponse(Buffer.from(chunkEvent.data as ArrayBuffer));
                console.log(`Seq ${seq} response:`, parsed);
                resolveChunk(parsed);
              };
              (this as any).ws.addEventListener('message', onChunkMessage, { once: true });
            });

            // If streaming, add a delay to simulate real-time input
            if (this.streaming) {
              const elapsed = Date.now() - start;
              const sleepTime = Math.max(0, this.segDuration - elapsed);
              await new Promise((r) => setTimeout(r, sleepTime));
            }

            // If this is the last chunk, resolve with the final result
            if (last) {
              resolve(chunkResponse);
              break;
            }
          }

          (this as any).ws.removeEventListener('message', onMessage);
        } catch (error) {
          console.error('Error processing response:', error);
          reject(error);
        }
      };

      (this as any).ws.addEventListener('message', onMessage, { once: true });
      (this as any).ws.addEventListener(
        'error',
        (error) => {
          console.error('WebSocket error:', error);
          reject(error);
        },
        { once: true },
      );
    });

    return result;
  }

  /**
   * Execute ASR on the audio file
   */
  public async execute(): Promise<any> {
    try {
      const data = await fs.readFile(this.audioPath);

      if (this.format === 'mp3') {
        const segmentSize = this.mp3SegSize;
        return await this.segmentDataProcessor(data, segmentSize);
      }

      if (this.format === 'wav') {
        const wavInfo = await readWavInfo(data);
        const sizePerSec = wavInfo.channels * wavInfo.sampleWidth * wavInfo.sampleRate;
        const segmentSize = Math.floor((sizePerSec * this.segDuration) / 1000); // 3200 for the 16 kHz / 16-bit / mono defaults
        return await this.segmentDataProcessor(data, segmentSize);
      }

      if (this.format === 'pcm') {
        const segmentSize = Math.floor((this.rate * 2 * this.channel * this.segDuration) / 500);
        return await this.segmentDataProcessor(data, segmentSize);
      }

      throw new Error('Unsupported format');
    } catch (error) {
      console.error('Error executing ASR:', error);
      throw error;
    }
  }

  /**
   * Send OPUS data for processing
   */
  public async sendOpusData(audioData: Buffer): Promise<any> {
    const segmentSize = Math.floor((this.rate * 2 * this.channel * this.segDuration) / 500);
    return await this.segmentDataProcessor(audioData, segmentSize);
  }
}

/**
 * Execute ASR on a single audio file
 */
export async function executeOne(audioItem: AudioItem, options: AsrClientOptions = {}): Promise<any> {
  if (!audioItem.id || !audioItem.path) {
    throw new Error('Audio item must have id and path properties');
  }

  const audioId = audioItem.id;
  const audioPath = path.resolve(process.cwd(), audioItem.path);
  const asrClient = new AsrWsClient(audioPath, options);

  // Give the WebSocket connection time to establish before sending audio
  await new Promise((resolve) => setTimeout(resolve, 2000));

  const result = await asrClient.execute();
  return {
    id: audioId,
    path: audioPath,
    result: result,
  };
}

/**
 * Test stream processing
 */
export const testStream = async () => {
  console.log('Testing streaming ASR');
  const audioPath = 'videos/asr_example.wav';
  return executeOne({
    id: 1,
    path: audioPath,
  })
    .then((result) => {
      console.log('====end test=====');
      console.log(result);
      return result;
    })
    .catch((error) => {
      console.error('Test error:', error);
      return '';
    });
};
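// Example (sketch only): run ASR on a local file with explicit credentials.
// VOLC_TOKEN and VOLC_APPID are placeholder environment variable names, not
// something this module defines.
//
//   const { result } = await executeOne(
//     { id: 'demo', path: 'videos/asr_example.wav' },
//     { token: process.env.VOLC_TOKEN, appid: process.env.VOLC_APPID },
//   );
//   console.log(result);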
/**
 * Handle audio data directly
 */
export async function handleAudioData(audioData: Buffer, options: AsrClientOptions = {}): Promise<any> {
  const asrClient = new AsrWsClient('', options);
  return await asrClient.sendOpusData(audioData);
}
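// Example (sketch only): transcribe an in-memory PCM buffer. The file name and
// credential values are placeholders; the audio parameters must match the data
// actually being sent.
//
//   const pcm = await fs.readFile('capture.pcm');
//   const transcript = await handleAudioData(pcm, {
//     format: 'pcm',
//     rate: 16000,
//     bits: 16,
//     channel: 1,
//     token: process.env.VOLC_TOKEN,
//     appid: process.env.VOLC_APPID,
//   });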