From 5e781499e960251597d51cba8594add42cc924ea Mon Sep 17 00:00:00 2001 From: abearxiong Date: Fri, 18 Apr 2025 23:55:40 +0800 Subject: [PATCH] feat: add tss module demo --- .gitignore | 4 +- package.json | 8 +- pnpm-lock.yaml | 10 +- src/asr/provider/funasr/ws.ts | 2 +- src/asr/provider/volcengine/base.ts | 2 +- src/asr/provider/ws-server.ts | 2 +- src/tts/provider/volcengine/test/common.ts | 12 + src/tts/provider/volcengine/test/mix.ts | 17 + src/tts/provider/volcengine/tts-mix.ts | 375 +++++++++++++++++++++ src/{asr => }/ws-adapter/browser.ts | 0 src/{asr => }/ws-adapter/index.ts | 0 src/{asr => }/ws-adapter/node.ts | 0 12 files changed, 422 insertions(+), 10 deletions(-) create mode 100644 src/tts/provider/volcengine/test/common.ts create mode 100644 src/tts/provider/volcengine/test/mix.ts create mode 100644 src/tts/provider/volcengine/tts-mix.ts rename src/{asr => }/ws-adapter/browser.ts (100%) rename src/{asr => }/ws-adapter/index.ts (100%) rename src/{asr => }/ws-adapter/node.ts (100%) diff --git a/.gitignore b/.gitignore index be692a8..5b7164b 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,6 @@ logs .env* !.env.example -models \ No newline at end of file +models + +videos/tts_mix.wav \ No newline at end of file diff --git a/package.json b/package.json index 8a78ce0..13ab413 100644 --- a/package.json +++ b/package.json @@ -1,11 +1,11 @@ { - "name": "demo-app", + "name": "@kevisual/video-tools", "version": "0.0.1", "description": "", "main": "index.js", - "basename": "/root/demo-app", + "basename": "/root/video-tools", "app": { - "key": "demo-app", + "key": "video-tools", "entry": "dist/app.mjs", "type": "system-app", "files": [ @@ -84,7 +84,7 @@ "tsup": "^8.4.0", "tsx": "^4.19.3", "typescript": "^5.8.3", - "ws": "^8.18.1" + "ws": "npm:@kevisual/ws" }, "packageManager": "pnpm@10.8.1" } \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7f3a699..076d4c3 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -145,8 +145,8 @@ importers: specifier: ^5.8.3 version: 5.8.3 ws: - specifier: ^8.18.1 - version: 8.18.1 + specifier: npm:@kevisual/ws + version: '@kevisual/ws@8.0.0' packages: @@ -350,6 +350,10 @@ packages: '@kevisual/video@0.0.1': resolution: {integrity: sha512-HZMv6JsXZwfIEom+ZTyoCr6VrmeyHhPFfFwNAe9rUYKQupfNY1jXSEhk1eBAM9Cek6uR4A8KjJmX2b9QG7k5AQ==} + '@kevisual/ws@8.0.0': + resolution: {integrity: sha512-jlFxSlXUEz93cFW+UYT5BXv/rFVgiMQnIfqRYZ0gj1hSP8PMGRqMqUoHSLfKvfRRS4jseLSvTTeEKSQpZJtURg==} + engines: {node: '>=10.0.0'} + '@ljharb/resumer@0.1.3': resolution: {integrity: sha512-d+tsDgfkj9X5QTriqM4lKesCkMMJC3IrbPKHvayP00ELx2axdXvDfWkqjxrLXIzGcQzmj7VAUT1wopqARTvafw==} engines: {node: '>= 0.4'} @@ -2557,6 +2561,8 @@ snapshots: '@kevisual/video@0.0.1': {} + '@kevisual/ws@8.0.0': {} + '@ljharb/resumer@0.1.3': dependencies: '@ljharb/through': 2.3.14 diff --git a/src/asr/provider/funasr/ws.ts b/src/asr/provider/funasr/ws.ts index 1323249..8349068 100644 --- a/src/asr/provider/funasr/ws.ts +++ b/src/asr/provider/funasr/ws.ts @@ -1,5 +1,5 @@ // import WebSocket from 'ws'; -import { initWs } from '../../ws-adapter/index.ts'; +import { initWs } from '../../../ws-adapter/index.ts'; export type VideoWSOptions = { url?: string; diff --git a/src/asr/provider/volcengine/base.ts b/src/asr/provider/volcengine/base.ts index ef73bb0..62fb12a 100644 --- a/src/asr/provider/volcengine/base.ts +++ b/src/asr/provider/volcengine/base.ts @@ -1,4 +1,4 @@ -import { initWs } from '../../ws-adapter/index.ts'; +import { initWs } from '../../../ws-adapter/index.ts'; import { WSServer } from '../../provider/ws-server.ts'; import { nanoid } from 'nanoid'; diff --git a/src/asr/provider/ws-server.ts b/src/asr/provider/ws-server.ts index 38cb848..292c040 100644 --- a/src/asr/provider/ws-server.ts +++ b/src/asr/provider/ws-server.ts @@ -1,5 +1,5 @@ import { EventEmitter } from 'eventemitter3'; -import { initWs } from '../ws-adapter/index.ts'; +import { initWs } from '../../ws-adapter/index.ts'; import type { ClientOptions } from 'ws'; type WSSOptions = { url: string; diff --git a/src/tts/provider/volcengine/test/common.ts b/src/tts/provider/volcengine/test/common.ts new file mode 100644 index 0000000..9e04322 --- /dev/null +++ b/src/tts/provider/volcengine/test/common.ts @@ -0,0 +1,12 @@ +import path from 'path'; +import dotenv from 'dotenv'; + +export const config = dotenv.config({ + path: path.join(process.cwd(), '.env'), +}).parsed; + +export const audioPath = path.join(process.cwd(), 'videos/asr_example.wav'); +export const audioPath2 = path.join(process.cwd(), 'videos/asr_example2.wav'); +export const blankAudioPath = path.join(process.cwd(), 'videos/blank.wav'); + +export const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); diff --git a/src/tts/provider/volcengine/test/mix.ts b/src/tts/provider/volcengine/test/mix.ts new file mode 100644 index 0000000..18cdf79 --- /dev/null +++ b/src/tts/provider/volcengine/test/mix.ts @@ -0,0 +1,17 @@ +import { config } from './common.ts'; +import { runDemo } from '../tts-mix.ts'; +const appId = config.APP_ID; +const token = config.TOKEN; + +// const speaker = 'zh_female_shuangkuaisisi_moon_bigtts'; +const speaker = 'zh_female_roumeinvyou_emo_v2_mars_bigtts'; +const text = '明朝开国皇帝朱元璋也称这本书为,万物之根'; +const outputPath = 'videos/tts_mix.wav'; + +// tsx src/tts/provider/volcengine/test/mix.ts +const main = async () => { + await runDemo(appId, token, speaker, text, outputPath); + console.log('完成'); +}; + +main(); diff --git a/src/tts/provider/volcengine/tts-mix.ts b/src/tts/provider/volcengine/tts-mix.ts new file mode 100644 index 0000000..5b83fc1 --- /dev/null +++ b/src/tts/provider/volcengine/tts-mix.ts @@ -0,0 +1,375 @@ +// https://www.volcengine.com/docs/6561/1329505#%E7%A4%BA%E4%BE%8Bsamples + +import { WebSocket } from 'ws'; + +import fs from 'fs/promises'; +import { nanoid } from 'nanoid'; +const uuidv4 = nanoid; +const PROTOCOL_VERSION = 0b0001; +const DEFAULT_HEADER_SIZE = 0b0001; + +// Message Type: +const FULL_CLIENT_REQUEST = 0b0001; +const AUDIO_ONLY_RESPONSE = 0b1011; +const FULL_SERVER_RESPONSE = 0b1001; +const ERROR_INFORMATION = 0b1111; + +// Message Type Specific Flags +const MsgTypeFlagNoSeq = 0b0000; // Non-terminal packet with no sequence +const MsgTypeFlagPositiveSeq = 0b1; // Non-terminal packet with sequence > 0 +const MsgTypeFlagLastNoSeq = 0b10; // last packet with no sequence +const MsgTypeFlagNegativeSeq = 0b11; // Payload contains event number (int32) +const MsgTypeFlagWithEvent = 0b100; +// Message Serialization +const NO_SERIALIZATION = 0b0000; +const JSON_TYPE = 0b0001; +// Message Compression +const COMPRESSION_NO = 0b0000; +const COMPRESSION_GZIP = 0b0001; + +const EVENT_NONE = 0; +const EVENT_Start_Connection = 1; +const EVENT_FinishConnection = 2; +const EVENT_ConnectionStarted = 50; // 成功建连 +const EVENT_ConnectionFailed = 51; // 建连失败(可能是无法通过权限认证) +const EVENT_ConnectionFinished = 52; // 连接结束 +// 上行Session事件 +const EVENT_StartSession = 100; +const EVENT_FinishSession = 102; +// 下行Session事件 +const EVENT_SessionStarted = 150; +const EVENT_SessionFinished = 152; +const EVENT_SessionFailed = 153; +// 上行通用事件 +const EVENT_TaskRequest = 200; +// 下行TTS事件 +const EVENT_TTSSentenceStart = 350; +const EVENT_TTSSentenceEnd = 351; +const EVENT_TTSResponse = 352; + +class Header { + headerSize: number; + protocolVersion: number; + messageType: number; + messageTypeSpecificFlags: number; + serialMethod: number; + compressionType: number; + reservedData: number; + + constructor( + protocolVersion: number = PROTOCOL_VERSION, + headerSize: number = DEFAULT_HEADER_SIZE, + messageType: number = 0, + messageTypeSpecificFlags: number = 0, + serialMethod: number = NO_SERIALIZATION, + compressionType: number = COMPRESSION_NO, + reservedData: number = 0, + ) { + this.headerSize = headerSize; + this.protocolVersion = protocolVersion; + this.messageType = messageType; + this.messageTypeSpecificFlags = messageTypeSpecificFlags; + this.serialMethod = serialMethod; + this.compressionType = compressionType; + this.reservedData = reservedData; + } + + asBytes(): Buffer { + return Buffer.from([ + (this.protocolVersion << 4) | this.headerSize, + (this.messageType << 4) | this.messageTypeSpecificFlags, + (this.serialMethod << 4) | this.compressionType, + this.reservedData, + ]); + } +} + +class Optional { + event: number; + sessionId: string | null; + errorCode: number; + connectionId: string | null; + responseMetaJson: string | null; + sequence: number | null; + + constructor(event: number = EVENT_NONE, sessionId: string | null = null, sequence: number | null = null) { + this.event = event; + this.sessionId = sessionId; + this.errorCode = 0; + this.connectionId = null; + this.responseMetaJson = null; + this.sequence = sequence; + } + + // 转成 byte 序列 + asBytes(): Buffer { + const optionBytes = Buffer.alloc(0); + let result = optionBytes; + + if (this.event !== EVENT_NONE) { + const eventBuffer = Buffer.alloc(4); + eventBuffer.writeInt32BE(this.event); + result = Buffer.concat([result, eventBuffer]); + } + + if (this.sessionId !== null) { + const sessionIdBytes = Buffer.from(this.sessionId); + const sizeBuffer = Buffer.alloc(4); + sizeBuffer.writeInt32BE(sessionIdBytes.length); + result = Buffer.concat([result, sizeBuffer, sessionIdBytes]); + } + + if (this.sequence !== null) { + const sequenceBuffer = Buffer.alloc(4); + sequenceBuffer.writeInt32BE(this.sequence); + result = Buffer.concat([result, sequenceBuffer]); + } + + return result; + } +} + +class Response { + optional: Optional; + header: Header; + payload: Buffer | null; + + constructor(header: Header, optional: Optional) { + this.optional = optional; + this.header = header; + this.payload = null; + } + + toString(): string { + return this.payload?.toString() || ''; + } +} + +// 发送事件 +async function sendEvent(ws: WebSocket, header: Buffer, optional: Buffer | null = null, payload: Buffer | null = null): Promise { + return new Promise((resolve, reject) => { + const fullClientRequest: Buffer[] = [header]; + + if (optional !== null) { + fullClientRequest.push(optional); + } + + if (payload !== null) { + const payloadSizeBuffer = Buffer.alloc(4); + payloadSizeBuffer.writeInt32BE(payload.length); + fullClientRequest.push(payloadSizeBuffer, payload); + } + + ws.send(Buffer.concat(fullClientRequest), (err) => { + if (err) reject(err); + else resolve(); + }); + }); +} + +// 读取 res 数组某段 字符串内容 +function readResContent(res: Buffer, offset: number): [string, number] { + const contentSize = res.readInt32BE(offset); + offset += 4; + const content = res.slice(offset, offset + contentSize).toString(); + offset += contentSize; + return [content, offset]; +} + +// 读取 payload +function readResPayload(res: Buffer, offset: number): [Buffer, number] { + const payloadSize = res.readInt32BE(offset); + offset += 4; + const payload = res.slice(offset, offset + payloadSize); + offset += payloadSize; + return [payload, offset]; +} + +// 解析响应结果 +function parserResponse(res: Buffer | string): Response { + if (typeof res === 'string') { + throw new Error(res); + } + + const response = new Response(new Header(), new Optional()); + // 解析结果 + // header + const header = response.header; + const num = 0b00001111; + header.protocolVersion = (res[0] >> 4) & num; + header.headerSize = res[0] & 0x0f; + header.messageType = (res[1] >> 4) & num; + header.messageTypeSpecificFlags = res[1] & 0x0f; + header.serialMethod = (res[2] >> 4) & num; + header.compressionType = res[2] & 0x0f; + header.reservedData = res[3]; + + let offset = 4; + const optional = response.optional; + + if (header.messageType === FULL_SERVER_RESPONSE || header.messageType === AUDIO_ONLY_RESPONSE) { + // read event + if (header.messageTypeSpecificFlags === MsgTypeFlagWithEvent) { + optional.event = res.readInt32BE(offset); + offset += 4; + + if (optional.event === EVENT_NONE) { + return response; + } + // read connectionId + else if (optional.event === EVENT_ConnectionStarted) { + [optional.connectionId, offset] = readResContent(res, offset); + } else if (optional.event === EVENT_ConnectionFailed) { + [optional.responseMetaJson, offset] = readResContent(res, offset); + } else if (optional.event === EVENT_SessionStarted || optional.event === EVENT_SessionFailed || optional.event === EVENT_SessionFinished) { + [optional.sessionId, offset] = readResContent(res, offset); + [optional.responseMetaJson, offset] = readResContent(res, offset); + } else { + [optional.sessionId, offset] = readResContent(res, offset); + [response.payload, offset] = readResPayload(res, offset); + } + } + } else if (header.messageType === ERROR_INFORMATION) { + optional.errorCode = res.readInt32BE(offset); + offset += 4; + [response.payload, offset] = readResPayload(res, offset); + } + + return response; +} + +function printResponse(res: Response, tag: string): void { + console.log(`===>${tag} header:`, res.header, res.optional.event); + console.log(`===>${tag} optional:`, res.optional); +} + +function getPayloadBytes( + uid: string = '1234', + event: number = EVENT_NONE, + text: string = '', + speaker: string = '', + audioFormat: string = 'mp3', + audioSampleRate: number = 24000, +): Buffer { + return Buffer.from( + JSON.stringify({ + user: { uid }, + event, + namespace: 'BidirectionalTTS', + req_params: { + text, + speaker, + audio_params: { + format: audioFormat, + sample_rate: audioSampleRate, + }, + }, + }), + ); +} + +async function startConnection(websocket: WebSocket): Promise { + const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent).asBytes(); + + const optional = new Optional(EVENT_Start_Connection).asBytes(); + const payload = Buffer.from('{}'); + return await sendEvent(websocket, header, optional, payload); +} + +async function startSession(websocket: WebSocket, speaker: string, sessionId: string): Promise { + const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); + + const optional = new Optional(EVENT_StartSession, sessionId).asBytes(); + const payload = getPayloadBytes('1234', EVENT_StartSession, '', speaker); + return await sendEvent(websocket, header, optional, payload); +} + +async function sendText(ws: WebSocket, speaker: string, text: string, sessionId: string): Promise { + const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); + + const optional = new Optional(EVENT_TaskRequest, sessionId).asBytes(); + const payload = getPayloadBytes('1234', EVENT_TaskRequest, text, speaker); + return await sendEvent(ws, header, optional, payload); +} + +async function finishSession(ws: WebSocket, sessionId: string): Promise { + const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); + + const optional = new Optional(EVENT_FinishSession, sessionId).asBytes(); + const payload = Buffer.from('{}'); + return await sendEvent(ws, header, optional, payload); +} + +async function finishConnection(ws: WebSocket): Promise { + const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes(); + + const optional = new Optional(EVENT_FinishConnection).asBytes(); + const payload = Buffer.from('{}'); + return await sendEvent(ws, header, optional, payload); +} + +export async function runDemo(appId: string, token: string, speaker: string, text: string, outputPath: string): Promise { + return new Promise((resolve, reject) => { + const wsHeader = { + 'X-Api-App-Key': appId, + 'X-Api-Access-Key': token, + 'X-Api-Resource-Id': 'volc.service_type.10029', + 'X-Api-Connect-Id': uuidv4(), + }; + + const url = 'wss://openspeech.bytedance.com/api/v3/tts/bidirection'; + const ws = new WebSocket(url, { headers: wsHeader }); + + ws.on('open', async () => { + try { + await startConnection(ws); + let isFirstResponse = true; + let fileHandle: fs.FileHandle | null = null; + let sessionId: string = ''; + ws.on('message', async (data) => { + try { + const res = parserResponse(data as Buffer); + printResponse(res, 'message res:'); + if (res.optional.event === EVENT_ConnectionStarted) { + sessionId = uuidv4().replace(/-/g, ''); + await startSession(ws, speaker, sessionId); + return; + } else if (res.optional.event === EVENT_ConnectionFinished) { + ws.close(); + resolve(); + return; + } + if (res.optional.event === EVENT_SessionStarted && isFirstResponse) { + isFirstResponse = false; + await sendText(ws, speaker, text, sessionId); + await finishSession(ws, sessionId); + fileHandle = await fs.open(outputPath, 'w'); + } else if (!isFirstResponse) { + if (res.optional.event === EVENT_TTSResponse && res.header.messageType === AUDIO_ONLY_RESPONSE && res.payload && fileHandle) { + await fileHandle.write(res.payload); + } else if (res.optional.event === EVENT_TTSSentenceStart || res.optional.event === EVENT_TTSSentenceEnd) { + // continue + } else { + if (fileHandle) { + await fileHandle.close(); + fileHandle = null; + } + await finishConnection(ws); + } + } + } catch (err) { + ws.close(); + reject(err); + } + }); + } catch (err) { + ws.close(); + reject(err); + } + }); + + ws.on('error', (err) => { + reject(err); + }); + }); +} diff --git a/src/asr/ws-adapter/browser.ts b/src/ws-adapter/browser.ts similarity index 100% rename from src/asr/ws-adapter/browser.ts rename to src/ws-adapter/browser.ts diff --git a/src/asr/ws-adapter/index.ts b/src/ws-adapter/index.ts similarity index 100% rename from src/asr/ws-adapter/index.ts rename to src/ws-adapter/index.ts diff --git a/src/asr/ws-adapter/node.ts b/src/ws-adapter/node.ts similarity index 100% rename from src/asr/ws-adapter/node.ts rename to src/ws-adapter/node.ts