import { numTokensFromString } from './token.ts';

/**
 * Maximum number of tokens allowed in a single chunk. The same number is
 * reused as the width, in UTF-16 code units, of the scanning window in
 * `ensureChunkSize`.
 */
const CHUNK_SIZE = 512;

/**
 * Characters after which a chunk is allowed to end: ASCII punctuation, the
 * newline, and the full-width CJK punctuation marks. Every entry must be a
 * single UTF-16 code unit, because the boundary search compares one code unit
 * at a time.
 */
const DELIMITER = [',', '.', '!', '?', '\n', ',', '。', '!', '?'];

/** Lookup set built once so the boundary search does not rescan the array. */
const DELIMITER_SET: ReadonlySet<string> = new Set(DELIMITER);

/** Separator between paragraphs in the input text. */
const PARAGRAPH_DELIMITER = '\n\n';

/** One piece of the input text together with its position and token count. */
export interface Chunk {
  /** Zero-based position of the chunk in the returned array. */
  chunkId: number;
  /** Text of the chunk. */
  text: string;
  /** Value reported by `numTokensFromString` for `text`. */
  tokens: number;
}

/** Tells whether the UTF-16 code unit at `index` is a high (leading) surrogate. */
function isHighSurrogate(text: string, index: number): boolean {
  // charCodeAt returns NaN for an out-of-range index, which fails both comparisons.
  const code = text.charCodeAt(index);
  return code >= 0xd800 && code <= 0xdbff;
}

/**
 * Returns the index just past the last delimiter in `segment`, or 0 when the
 * segment contains no delimiter.
 */
function lastDelimiterEnd(segment: string): number {
  for (let i = segment.length - 1; i >= 0; i--) {
    if (DELIMITER_SET.has(segment.charAt(i))) {
      return i + 1;
    }
  }
  return 0;
}

/**
 * Halves `piece` recursively until every part is within CHUNK_SIZE tokens.
 * A piece that cannot be divided any further is returned as is.
 */
function splitToTokenLimit(piece: string): Array<[string, number]> {
  const tokens = numTokensFromString(piece);
  if (tokens <= CHUNK_SIZE) {
    return [[piece, tokens]];
  }

  let middle = Math.ceil(piece.length / 2);
  // Never cut between the two halves of a surrogate pair.
  if (isHighSurrogate(piece, middle - 1)) {
    middle += 1;
  }
  if (middle >= piece.length) {
    // Single character / single surrogate pair: nothing left to split.
    return [[piece, tokens]];
  }

  const parts: Array<[string, number]> = [];
  for (const half of [piece.slice(0, middle), piece.slice(middle)]) {
    if (half.trim() !== '') {
      parts.push(...splitToTokenLimit(half));
    }
  }
  return parts;
}

/**
 * Splits a text block so that no resulting piece exceeds CHUNK_SIZE tokens.
 *
 * A block that already fits is returned unchanged. Otherwise the block is
 * scanned in windows of CHUNK_SIZE UTF-16 code units; each emitted piece ends
 * at the last delimiter found in its window, and the text after that
 * delimiter is carried over to the front of the next piece. Any piece that
 * is still over the token limit is halved until it fits.
 *
 * @param chunk Text block to split.
 * @returns Array of `[text, tokenCount]` pairs in source order.
 */
function ensureChunkSize(chunk: string): Array<[string, number]> {
  const tokens = numTokensFromString(chunk);
  if (tokens <= CHUNK_SIZE) {
    return [[chunk, tokens]];
  }

  const chunks: Array<[string, number]> = [];
  const emit = (piece: string): void => {
    // Whitespace-only pieces carry no content and are dropped.
    if (piece.trim() !== '') {
      chunks.push(...splitToTokenLimit(piece));
    }
  };

  let carry = '';
  let start = 0;
  while (start < chunk.length) {
    let end = Math.min(start + CHUNK_SIZE, chunk.length);
    // Extend the window by one code unit rather than cut a surrogate pair.
    if (end < chunk.length && isHighSurrogate(chunk, end - 1)) {
      end += 1;
    }

    const segment = chunk.slice(start, end);
    const cut = lastDelimiterEnd(segment);
    const head = segment.slice(0, cut);

    // Without a delimiter in this window the carried text is emitted alone,
    // so its trailing whitespace can go. Otherwise it is kept, because it
    // separates the carried words from the ones that follow.
    emit(head === '' ? carry.trimEnd() : carry + head);

    // Strip only leading whitespace: trailing whitespace may be the gap
    // between two words that straddle the window boundary.
    carry = segment.slice(cut).trimStart();
    start = end;
  }

  // Whatever follows the last delimiter of the last window.
  emit(carry.trim());

  return chunks;
}

/**
 * Splits text into chunks of at most CHUNK_SIZE tokens.
 *
 * The text is divided into paragraphs at blank lines; each paragraph is
 * trimmed, empty paragraphs are dropped, and the rest are passed through
 * `ensureChunkSize`. Chunk ids are assigned sequentially from 0 across all
 * paragraphs.
 *
 * @param text Input text.
 * @returns Promise resolving to the chunks in source order. The function
 *   awaits nothing; it is `async` only to keep the existing promise-based
 *   signature.
 */
export async function getChunks(text: string): Promise<Chunk[]> {
  const paragraphs = text
    .split(PARAGRAPH_DELIMITER)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph !== '');

  const chunks: Chunk[] = [];
  for (const paragraph of paragraphs) {
    for (const [pieceText, tokens] of ensureChunkSize(paragraph)) {
      // The next id always equals the number of chunks produced so far.
      chunks.push({ chunkId: chunks.length, text: pieceText, tokens });
    }
  }
  return chunks;
}