// https://www.volcengine.com/docs/6561/1329505#%E7%A4%BA%E4%BE%8Bsamples import { WebSocket } from 'ws'; import fs from 'fs/promises'; import { nanoid } from 'nanoid'; const uuidv4 = nanoid; const PROTOCOL_VERSION = 0b0001; const DEFAULT_HEADER_SIZE = 0b0001; // Message Type: const FULL_CLIENT_REQUEST = 0b0001; const AUDIO_ONLY_RESPONSE = 0b1011; const FULL_SERVER_RESPONSE = 0b1001; const ERROR_INFORMATION = 0b1111; // Message Type Specific Flags const MsgTypeFlagNoSeq = 0b0000; // Non-terminal packet with no sequence const MsgTypeFlagPositiveSeq = 0b1; // Non-terminal packet with sequence > 0 const MsgTypeFlagLastNoSeq = 0b10; // last packet with no sequence const MsgTypeFlagNegativeSeq = 0b11; // Payload contains event number (int32) const MsgTypeFlagWithEvent = 0b100; // Message Serialization const NO_SERIALIZATION = 0b0000; const JSON_TYPE = 0b0001; // Message Compression const COMPRESSION_NO = 0b0000; const COMPRESSION_GZIP = 0b0001; const EVENT_NONE = 0; const EVENT_Start_Connection = 1; const EVENT_FinishConnection = 2; const EVENT_ConnectionStarted = 50; // 成功建连 const EVENT_ConnectionFailed = 51; // 建连失败(可能是无法通过权限认证) const EVENT_ConnectionFinished = 52; // 连接结束 // 上行Session事件 const EVENT_StartSession = 100; const EVENT_FinishSession = 102; // 下行Session事件 const EVENT_SessionStarted = 150; const EVENT_SessionFinished = 152; const EVENT_SessionFailed = 153; // 上行通用事件 const EVENT_TaskRequest = 200; // 下行TTS事件 const EVENT_TTSSentenceStart = 350; const EVENT_TTSSentenceEnd = 351; const EVENT_TTSResponse = 352; class Header { headerSize: number; protocolVersion: number; messageType: number; messageTypeSpecificFlags: number; serialMethod: number; compressionType: number; reservedData: number; constructor( protocolVersion: number = PROTOCOL_VERSION, headerSize: number = DEFAULT_HEADER_SIZE, messageType: number = 0, messageTypeSpecificFlags: number = 0, serialMethod: number = NO_SERIALIZATION, compressionType: number = COMPRESSION_NO, reservedData: number = 0, ) { this.headerSize = headerSize; this.protocolVersion = protocolVersion; this.messageType = messageType; this.messageTypeSpecificFlags = messageTypeSpecificFlags; this.serialMethod = serialMethod; this.compressionType = compressionType; this.reservedData = reservedData; } asBytes(): Buffer { return Buffer.from([ (this.protocolVersion << 4) | this.headerSize, (this.messageType << 4) | this.messageTypeSpecificFlags, (this.serialMethod << 4) | this.compressionType, this.reservedData, ]); } } class Optional { event: number; sessionId: string | null; errorCode: number; connectionId: string | null; responseMetaJson: string | null; sequence: number | null; constructor(event: number = EVENT_NONE, sessionId: string | null = null, sequence: number | null = null) { this.event = event; this.sessionId = sessionId; this.errorCode = 0; this.connectionId = null; this.responseMetaJson = null; this.sequence = sequence; } // 转成 byte 序列 asBytes(): Buffer { const optionBytes = Buffer.alloc(0); let result = optionBytes; if (this.event !== EVENT_NONE) { const eventBuffer = Buffer.alloc(4); eventBuffer.writeInt32BE(this.event); result = Buffer.concat([result, eventBuffer]); } if (this.sessionId !== null) { const sessionIdBytes = Buffer.from(this.sessionId); const sizeBuffer = Buffer.alloc(4); sizeBuffer.writeInt32BE(sessionIdBytes.length); result = Buffer.concat([result, sizeBuffer, sessionIdBytes]); } if (this.sequence !== null) { const sequenceBuffer = Buffer.alloc(4); sequenceBuffer.writeInt32BE(this.sequence); result = Buffer.concat([result, sequenceBuffer]); } return result; } } class Response { optional: Optional; header: Header; payload: Buffer | null; constructor(header: Header, optional: Optional) { this.optional = optional; this.header = header; this.payload = null; } toString(): string { return this.payload?.toString() || ''; } } // 发送事件 async function sendEvent(ws: WebSocket, header: Buffer, optional: Buffer | null = null, payload: Buffer | null = null): Promise { return new Promise((resolve, reject) => { const fullClientRequest: Buffer[] = [header]; if (optional !== null) { fullClientRequest.push(optional); } if (payload !== null) { const payloadSizeBuffer = Buffer.alloc(4); payloadSizeBuffer.writeInt32BE(payload.length); fullClientRequest.push(payloadSizeBuffer, payload); } ws.send(Buffer.concat(fullClientRequest), (err) => { if (err) reject(err); else resolve(); }); }); } // 读取 res 数组某段 字符串内容 function readResContent(res: Buffer, offset: number): [string, number] { const contentSize = res.readInt32BE(offset); offset += 4; const content = res.slice(offset, offset + contentSize).toString(); offset += contentSize; return [content, offset]; } // 读取 payload function readResPayload(res: Buffer, offset: number): [Buffer, number] { const payloadSize = res.readInt32BE(offset); offset += 4; const payload = res.slice(offset, offset + payloadSize); offset += payloadSize; return [payload, offset]; } // 解析响应结果 function parserResponse(res: Buffer | string): Response { if (typeof res === 'string') { throw new Error(res); } const response = new Response(new Header(), new Optional()); // 解析结果 // header const header = response.header; const num = 0b00001111; header.protocolVersion = (res[0] >> 4) & num; header.headerSize = res[0] & 0x0f; header.messageType = (res[1] >> 4) & num; header.messageTypeSpecificFlags = res[1] & 0x0f; header.serialMethod = (res[2] >> 4) & num; header.compressionType = res[2] & 0x0f; header.reservedData = res[3]; let offset = 4; const optional = response.optional; if (header.messageType === FULL_SERVER_RESPONSE || header.messageType === AUDIO_ONLY_RESPONSE) { // read event if (header.messageTypeSpecificFlags === MsgTypeFlagWithEvent) { optional.event = res.readInt32BE(offset); offset += 4; if (optional.event === EVENT_NONE) { return response; } // read connectionId else if (optional.event === EVENT_ConnectionStarted) { [optional.connectionId, offset] = readResContent(res, offset); } else if (optional.event === EVENT_ConnectionFailed) { [optional.responseMetaJson, offset] = readResContent(res, offset); } else if (optional.event === EVENT_SessionStarted || optional.event === EVENT_SessionFailed || optional.event === EVENT_SessionFinished) { [optional.sessionId, offset] = readResContent(res, offset); [optional.responseMetaJson, offset] = readResContent(res, offset); } else { [optional.sessionId, offset] = readResContent(res, offset); [response.payload, offset] = readResPayload(res, offset); } } } else if (header.messageType === ERROR_INFORMATION) { optional.errorCode = res.readInt32BE(offset); offset += 4; [response.payload, offset] = readResPayload(res, offset); } return response; } function printResponse(res: Response, tag: string): void { // console.log(`===>${tag} header:`, res.header, res.optional.event); // console.log(`===>${tag} optional:`, res.optional); } function getPayloadBytes( uid: string = '1234', event: number = EVENT_NONE, text: string = '', speaker: string = '', audioFormat: string = 'mp3', audioSampleRate: number = 24000, ): Buffer { return Buffer.from( JSON.stringify({ user: { uid }, event, namespace: 'BidirectionalTTS', req_params: { text, speaker, audio_params: { format: audioFormat, sample_rate: audioSampleRate, }, }, }), ); } async function startConnection(websocket: WebSocket): Promise { const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent).asBytes(); const optional = new Optional(EVENT_Start_Connection).asBytes(); const payload = Buffer.from('{}'); return await sendEvent(websocket, header, optional, payload); } async function startSession(websocket: WebSocket, speaker: string, sessionId: string): Promise { const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); const optional = new Optional(EVENT_StartSession, sessionId).asBytes(); const payload = getPayloadBytes('1234', EVENT_StartSession, '', speaker); return await sendEvent(websocket, header, optional, payload); } async function sendText(ws: WebSocket, speaker: string, text: string, sessionId: string): Promise { const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); const optional = new Optional(EVENT_TaskRequest, sessionId).asBytes(); const payload = getPayloadBytes('1234', EVENT_TaskRequest, text, speaker); return await sendEvent(ws, header, optional, payload); } async function finishSession(ws: WebSocket, sessionId: string): Promise { const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); const optional = new Optional(EVENT_FinishSession, sessionId).asBytes(); const payload = Buffer.from('{}'); return await sendEvent(ws, header, optional, payload); } async function finishConnection(ws: WebSocket): Promise { const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); const optional = new Optional(EVENT_FinishConnection).asBytes(); const payload = Buffer.from('{}'); return await sendEvent(ws, header, optional, payload); } export async function runDemo(appId: string, token: string, speaker: string, text: string, outputPath: string): Promise { return new Promise((resolve, reject) => { const wsHeader = { 'X-Api-App-Key': appId, 'X-Api-Access-Key': token, 'X-Api-Resource-Id': 'volc.service_type.10029', 'X-Api-Connect-Id': uuidv4(), }; const url = 'wss://openspeech.bytedance.com/api/v3/tts/bidirection'; const ws = new WebSocket(url, { headers: wsHeader }); ws.on('open', async () => { try { await startConnection(ws); let isFirstResponse = true; let fileHandle: fs.FileHandle | null = null; let sessionId: string = ''; ws.on('message', async (data) => { try { const res = parserResponse(data as Buffer); printResponse(res, 'message res:'); if (res.optional.event === EVENT_ConnectionStarted) { sessionId = uuidv4().replace(/-/g, ''); await startSession(ws, speaker, sessionId); return; } else if (res.optional.event === EVENT_ConnectionFinished) { ws.close(); resolve(); return; } if (res.optional.event === EVENT_SessionStarted && isFirstResponse) { isFirstResponse = false; await sendText(ws, speaker, text, sessionId); await finishSession(ws, sessionId); fileHandle = await fs.open(outputPath, 'w'); } else if (!isFirstResponse) { if (res.optional.event === EVENT_TTSResponse && res.header.messageType === AUDIO_ONLY_RESPONSE && res.payload && fileHandle) { await fileHandle.write(res.payload); } else if (res.optional.event === EVENT_TTSSentenceStart || res.optional.event === EVENT_TTSSentenceEnd) { // continue } else { if (fileHandle) { await fileHandle.close(); fileHandle = null; } await finishConnection(ws); } } } catch (err) { ws.close(); reject(err); } }); } catch (err) { ws.close(); reject(err); } }); ws.on('error', (err) => { reject(err); }); }); } export class TtsMix { appId: string; token: string; constructor(appId: string, token: string) { this.appId = appId; this.token = token; } /** * 获取语音 * @param speaker 说话人 * @param text 文本 * @param outputPath 输出路径 * @returns */ async getVoiceDemo(speaker: string, text: string, outputPath: string): Promise { return runDemo(this.appId, this.token, speaker, text, outputPath); } }