feat: add tss module demo

This commit is contained in:
熊潇 2025-04-18 23:55:40 +08:00
parent fdc3985b93
commit 5e781499e9
12 changed files with 422 additions and 10 deletions

2
.gitignore vendored
View File

@ -17,3 +17,5 @@ logs
!.env.example !.env.example
models models
videos/tts_mix.wav

View File

@ -1,11 +1,11 @@
{ {
"name": "demo-app", "name": "@kevisual/video-tools",
"version": "0.0.1", "version": "0.0.1",
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"basename": "/root/demo-app", "basename": "/root/video-tools",
"app": { "app": {
"key": "demo-app", "key": "video-tools",
"entry": "dist/app.mjs", "entry": "dist/app.mjs",
"type": "system-app", "type": "system-app",
"files": [ "files": [
@ -84,7 +84,7 @@
"tsup": "^8.4.0", "tsup": "^8.4.0",
"tsx": "^4.19.3", "tsx": "^4.19.3",
"typescript": "^5.8.3", "typescript": "^5.8.3",
"ws": "^8.18.1" "ws": "npm:@kevisual/ws"
}, },
"packageManager": "pnpm@10.8.1" "packageManager": "pnpm@10.8.1"
} }

10
pnpm-lock.yaml generated
View File

@ -145,8 +145,8 @@ importers:
specifier: ^5.8.3 specifier: ^5.8.3
version: 5.8.3 version: 5.8.3
ws: ws:
specifier: ^8.18.1 specifier: npm:@kevisual/ws
version: 8.18.1 version: '@kevisual/ws@8.0.0'
packages: packages:
@ -350,6 +350,10 @@ packages:
'@kevisual/video@0.0.1': '@kevisual/video@0.0.1':
resolution: {integrity: sha512-HZMv6JsXZwfIEom+ZTyoCr6VrmeyHhPFfFwNAe9rUYKQupfNY1jXSEhk1eBAM9Cek6uR4A8KjJmX2b9QG7k5AQ==} resolution: {integrity: sha512-HZMv6JsXZwfIEom+ZTyoCr6VrmeyHhPFfFwNAe9rUYKQupfNY1jXSEhk1eBAM9Cek6uR4A8KjJmX2b9QG7k5AQ==}
'@kevisual/ws@8.0.0':
resolution: {integrity: sha512-jlFxSlXUEz93cFW+UYT5BXv/rFVgiMQnIfqRYZ0gj1hSP8PMGRqMqUoHSLfKvfRRS4jseLSvTTeEKSQpZJtURg==}
engines: {node: '>=10.0.0'}
'@ljharb/resumer@0.1.3': '@ljharb/resumer@0.1.3':
resolution: {integrity: sha512-d+tsDgfkj9X5QTriqM4lKesCkMMJC3IrbPKHvayP00ELx2axdXvDfWkqjxrLXIzGcQzmj7VAUT1wopqARTvafw==} resolution: {integrity: sha512-d+tsDgfkj9X5QTriqM4lKesCkMMJC3IrbPKHvayP00ELx2axdXvDfWkqjxrLXIzGcQzmj7VAUT1wopqARTvafw==}
engines: {node: '>= 0.4'} engines: {node: '>= 0.4'}
@ -2557,6 +2561,8 @@ snapshots:
'@kevisual/video@0.0.1': {} '@kevisual/video@0.0.1': {}
'@kevisual/ws@8.0.0': {}
'@ljharb/resumer@0.1.3': '@ljharb/resumer@0.1.3':
dependencies: dependencies:
'@ljharb/through': 2.3.14 '@ljharb/through': 2.3.14

View File

@ -1,5 +1,5 @@
// import WebSocket from 'ws'; // import WebSocket from 'ws';
import { initWs } from '../../ws-adapter/index.ts'; import { initWs } from '../../../ws-adapter/index.ts';
export type VideoWSOptions = { export type VideoWSOptions = {
url?: string; url?: string;

View File

@ -1,4 +1,4 @@
import { initWs } from '../../ws-adapter/index.ts'; import { initWs } from '../../../ws-adapter/index.ts';
import { WSServer } from '../../provider/ws-server.ts'; import { WSServer } from '../../provider/ws-server.ts';
import { nanoid } from 'nanoid'; import { nanoid } from 'nanoid';

View File

@ -1,5 +1,5 @@
import { EventEmitter } from 'eventemitter3'; import { EventEmitter } from 'eventemitter3';
import { initWs } from '../ws-adapter/index.ts'; import { initWs } from '../../ws-adapter/index.ts';
import type { ClientOptions } from 'ws'; import type { ClientOptions } from 'ws';
type WSSOptions = { type WSSOptions = {
url: string; url: string;

View File

@ -0,0 +1,12 @@
import path from 'path';
import dotenv from 'dotenv';
export const config = dotenv.config({
path: path.join(process.cwd(), '.env'),
}).parsed;
export const audioPath = path.join(process.cwd(), 'videos/asr_example.wav');
export const audioPath2 = path.join(process.cwd(), 'videos/asr_example2.wav');
export const blankAudioPath = path.join(process.cwd(), 'videos/blank.wav');
export const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));

View File

@ -0,0 +1,17 @@
import { config } from './common.ts';
import { runDemo } from '../tts-mix.ts';
const appId = config.APP_ID;
const token = config.TOKEN;
// const speaker = 'zh_female_shuangkuaisisi_moon_bigtts';
const speaker = 'zh_female_roumeinvyou_emo_v2_mars_bigtts';
const text = '明朝开国皇帝朱元璋也称这本书为,万物之根';
const outputPath = 'videos/tts_mix.wav';
// tsx src/tts/provider/volcengine/test/mix.ts
const main = async () => {
await runDemo(appId, token, speaker, text, outputPath);
console.log('完成');
};
main();

View File

@ -0,0 +1,375 @@
// https://www.volcengine.com/docs/6561/1329505#%E7%A4%BA%E4%BE%8Bsamples
import { WebSocket } from 'ws';
import fs from 'fs/promises';
import { nanoid } from 'nanoid';
const uuidv4 = nanoid;
const PROTOCOL_VERSION = 0b0001;
const DEFAULT_HEADER_SIZE = 0b0001;
// Message Type:
const FULL_CLIENT_REQUEST = 0b0001;
const AUDIO_ONLY_RESPONSE = 0b1011;
const FULL_SERVER_RESPONSE = 0b1001;
const ERROR_INFORMATION = 0b1111;
// Message Type Specific Flags
const MsgTypeFlagNoSeq = 0b0000; // Non-terminal packet with no sequence
const MsgTypeFlagPositiveSeq = 0b1; // Non-terminal packet with sequence > 0
const MsgTypeFlagLastNoSeq = 0b10; // last packet with no sequence
const MsgTypeFlagNegativeSeq = 0b11; // Payload contains event number (int32)
const MsgTypeFlagWithEvent = 0b100;
// Message Serialization
const NO_SERIALIZATION = 0b0000;
const JSON_TYPE = 0b0001;
// Message Compression
const COMPRESSION_NO = 0b0000;
const COMPRESSION_GZIP = 0b0001;
const EVENT_NONE = 0;
const EVENT_Start_Connection = 1;
const EVENT_FinishConnection = 2;
const EVENT_ConnectionStarted = 50; // 成功建连
const EVENT_ConnectionFailed = 51; // 建连失败(可能是无法通过权限认证)
const EVENT_ConnectionFinished = 52; // 连接结束
// 上行Session事件
const EVENT_StartSession = 100;
const EVENT_FinishSession = 102;
// 下行Session事件
const EVENT_SessionStarted = 150;
const EVENT_SessionFinished = 152;
const EVENT_SessionFailed = 153;
// 上行通用事件
const EVENT_TaskRequest = 200;
// 下行TTS事件
const EVENT_TTSSentenceStart = 350;
const EVENT_TTSSentenceEnd = 351;
const EVENT_TTSResponse = 352;
class Header {
headerSize: number;
protocolVersion: number;
messageType: number;
messageTypeSpecificFlags: number;
serialMethod: number;
compressionType: number;
reservedData: number;
constructor(
protocolVersion: number = PROTOCOL_VERSION,
headerSize: number = DEFAULT_HEADER_SIZE,
messageType: number = 0,
messageTypeSpecificFlags: number = 0,
serialMethod: number = NO_SERIALIZATION,
compressionType: number = COMPRESSION_NO,
reservedData: number = 0,
) {
this.headerSize = headerSize;
this.protocolVersion = protocolVersion;
this.messageType = messageType;
this.messageTypeSpecificFlags = messageTypeSpecificFlags;
this.serialMethod = serialMethod;
this.compressionType = compressionType;
this.reservedData = reservedData;
}
asBytes(): Buffer {
return Buffer.from([
(this.protocolVersion << 4) | this.headerSize,
(this.messageType << 4) | this.messageTypeSpecificFlags,
(this.serialMethod << 4) | this.compressionType,
this.reservedData,
]);
}
}
class Optional {
event: number;
sessionId: string | null;
errorCode: number;
connectionId: string | null;
responseMetaJson: string | null;
sequence: number | null;
constructor(event: number = EVENT_NONE, sessionId: string | null = null, sequence: number | null = null) {
this.event = event;
this.sessionId = sessionId;
this.errorCode = 0;
this.connectionId = null;
this.responseMetaJson = null;
this.sequence = sequence;
}
// 转成 byte 序列
asBytes(): Buffer {
const optionBytes = Buffer.alloc(0);
let result = optionBytes;
if (this.event !== EVENT_NONE) {
const eventBuffer = Buffer.alloc(4);
eventBuffer.writeInt32BE(this.event);
result = Buffer.concat([result, eventBuffer]);
}
if (this.sessionId !== null) {
const sessionIdBytes = Buffer.from(this.sessionId);
const sizeBuffer = Buffer.alloc(4);
sizeBuffer.writeInt32BE(sessionIdBytes.length);
result = Buffer.concat([result, sizeBuffer, sessionIdBytes]);
}
if (this.sequence !== null) {
const sequenceBuffer = Buffer.alloc(4);
sequenceBuffer.writeInt32BE(this.sequence);
result = Buffer.concat([result, sequenceBuffer]);
}
return result;
}
}
class Response {
optional: Optional;
header: Header;
payload: Buffer | null;
constructor(header: Header, optional: Optional) {
this.optional = optional;
this.header = header;
this.payload = null;
}
toString(): string {
return this.payload?.toString() || '';
}
}
// 发送事件
async function sendEvent(ws: WebSocket, header: Buffer, optional: Buffer | null = null, payload: Buffer | null = null): Promise<void> {
return new Promise((resolve, reject) => {
const fullClientRequest: Buffer[] = [header];
if (optional !== null) {
fullClientRequest.push(optional);
}
if (payload !== null) {
const payloadSizeBuffer = Buffer.alloc(4);
payloadSizeBuffer.writeInt32BE(payload.length);
fullClientRequest.push(payloadSizeBuffer, payload);
}
ws.send(Buffer.concat(fullClientRequest), (err) => {
if (err) reject(err);
else resolve();
});
});
}
// 读取 res 数组某段 字符串内容
function readResContent(res: Buffer, offset: number): [string, number] {
const contentSize = res.readInt32BE(offset);
offset += 4;
const content = res.slice(offset, offset + contentSize).toString();
offset += contentSize;
return [content, offset];
}
// 读取 payload
function readResPayload(res: Buffer, offset: number): [Buffer, number] {
const payloadSize = res.readInt32BE(offset);
offset += 4;
const payload = res.slice(offset, offset + payloadSize);
offset += payloadSize;
return [payload, offset];
}
// 解析响应结果
function parserResponse(res: Buffer | string): Response {
if (typeof res === 'string') {
throw new Error(res);
}
const response = new Response(new Header(), new Optional());
// 解析结果
// header
const header = response.header;
const num = 0b00001111;
header.protocolVersion = (res[0] >> 4) & num;
header.headerSize = res[0] & 0x0f;
header.messageType = (res[1] >> 4) & num;
header.messageTypeSpecificFlags = res[1] & 0x0f;
header.serialMethod = (res[2] >> 4) & num;
header.compressionType = res[2] & 0x0f;
header.reservedData = res[3];
let offset = 4;
const optional = response.optional;
if (header.messageType === FULL_SERVER_RESPONSE || header.messageType === AUDIO_ONLY_RESPONSE) {
// read event
if (header.messageTypeSpecificFlags === MsgTypeFlagWithEvent) {
optional.event = res.readInt32BE(offset);
offset += 4;
if (optional.event === EVENT_NONE) {
return response;
}
// read connectionId
else if (optional.event === EVENT_ConnectionStarted) {
[optional.connectionId, offset] = readResContent(res, offset);
} else if (optional.event === EVENT_ConnectionFailed) {
[optional.responseMetaJson, offset] = readResContent(res, offset);
} else if (optional.event === EVENT_SessionStarted || optional.event === EVENT_SessionFailed || optional.event === EVENT_SessionFinished) {
[optional.sessionId, offset] = readResContent(res, offset);
[optional.responseMetaJson, offset] = readResContent(res, offset);
} else {
[optional.sessionId, offset] = readResContent(res, offset);
[response.payload, offset] = readResPayload(res, offset);
}
}
} else if (header.messageType === ERROR_INFORMATION) {
optional.errorCode = res.readInt32BE(offset);
offset += 4;
[response.payload, offset] = readResPayload(res, offset);
}
return response;
}
function printResponse(res: Response, tag: string): void {
console.log(`===>${tag} header:`, res.header, res.optional.event);
console.log(`===>${tag} optional:`, res.optional);
}
function getPayloadBytes(
uid: string = '1234',
event: number = EVENT_NONE,
text: string = '',
speaker: string = '',
audioFormat: string = 'mp3',
audioSampleRate: number = 24000,
): Buffer {
return Buffer.from(
JSON.stringify({
user: { uid },
event,
namespace: 'BidirectionalTTS',
req_params: {
text,
speaker,
audio_params: {
format: audioFormat,
sample_rate: audioSampleRate,
},
},
}),
);
}
async function startConnection(websocket: WebSocket): Promise<void> {
const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent).asBytes();
const optional = new Optional(EVENT_Start_Connection).asBytes();
const payload = Buffer.from('{}');
return await sendEvent(websocket, header, optional, payload);
}
async function startSession(websocket: WebSocket, speaker: string, sessionId: string): Promise<void> {
const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes();
const optional = new Optional(EVENT_StartSession, sessionId).asBytes();
const payload = getPayloadBytes('1234', EVENT_StartSession, '', speaker);
return await sendEvent(websocket, header, optional, payload);
}
async function sendText(ws: WebSocket, speaker: string, text: string, sessionId: string): Promise<void> {
const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes();
const optional = new Optional(EVENT_TaskRequest, sessionId).asBytes();
const payload = getPayloadBytes('1234', EVENT_TaskRequest, text, speaker);
return await sendEvent(ws, header, optional, payload);
}
async function finishSession(ws: WebSocket, sessionId: string): Promise<void> {
const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes();
const optional = new Optional(EVENT_FinishSession, sessionId).asBytes();
const payload = Buffer.from('{}');
return await sendEvent(ws, header, optional, payload);
}
async function finishConnection(ws: WebSocket): Promise<void> {
const header = new Header(PROTOCOL_VERSION, DEFAULT_HEADER_SIZE, FULL_CLIENT_REQUEST, MsgTypeFlagWithEvent, JSON_TYPE).asBytes();
const optional = new Optional(EVENT_FinishConnection).asBytes();
const payload = Buffer.from('{}');
return await sendEvent(ws, header, optional, payload);
}
export async function runDemo(appId: string, token: string, speaker: string, text: string, outputPath: string): Promise<void> {
return new Promise((resolve, reject) => {
const wsHeader = {
'X-Api-App-Key': appId,
'X-Api-Access-Key': token,
'X-Api-Resource-Id': 'volc.service_type.10029',
'X-Api-Connect-Id': uuidv4(),
};
const url = 'wss://openspeech.bytedance.com/api/v3/tts/bidirection';
const ws = new WebSocket(url, { headers: wsHeader });
ws.on('open', async () => {
try {
await startConnection(ws);
let isFirstResponse = true;
let fileHandle: fs.FileHandle | null = null;
let sessionId: string = '';
ws.on('message', async (data) => {
try {
const res = parserResponse(data as Buffer);
printResponse(res, 'message res:');
if (res.optional.event === EVENT_ConnectionStarted) {
sessionId = uuidv4().replace(/-/g, '');
await startSession(ws, speaker, sessionId);
return;
} else if (res.optional.event === EVENT_ConnectionFinished) {
ws.close();
resolve();
return;
}
if (res.optional.event === EVENT_SessionStarted && isFirstResponse) {
isFirstResponse = false;
await sendText(ws, speaker, text, sessionId);
await finishSession(ws, sessionId);
fileHandle = await fs.open(outputPath, 'w');
} else if (!isFirstResponse) {
if (res.optional.event === EVENT_TTSResponse && res.header.messageType === AUDIO_ONLY_RESPONSE && res.payload && fileHandle) {
await fileHandle.write(res.payload);
} else if (res.optional.event === EVENT_TTSSentenceStart || res.optional.event === EVENT_TTSSentenceEnd) {
// continue
} else {
if (fileHandle) {
await fileHandle.close();
fileHandle = null;
}
await finishConnection(ws);
}
}
} catch (err) {
ws.close();
reject(err);
}
});
} catch (err) {
ws.close();
reject(err);
}
});
ws.on('error', (err) => {
reject(err);
});
});
}