This commit is contained in:
2025-05-25 14:01:37 +08:00
commit 8f52a10ae0
42 changed files with 1946 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
import { numTokensFromString } from './token.ts';
// Constants
const CHUNK_SIZE = 512; // maximum number of tokens per chunk
const MAGIC_SEPARATOR = '🦛'; // sentinel assumed never to occur in input text
// Sentence delimiters: ASCII plus full-width CJK punctuation.
// NOTE(review): the original array contained empty strings (mis-encoded
// characters). An empty-string delimiter makes `new RegExp('', 'g')` match at
// every position, inserting MAGIC_SEPARATOR between every single character.
// Restored to the most plausible full-width punctuation ('，', '！', '？'),
// consistent with the surviving '。' — confirm against upstream history.
const DELIMITER = [',', '.', '!', '?', '\n', '，', '。', '！', '？'];
const PARAGRAPH_DELIMITER = '\n\n';
/** One piece of the split input text together with its token count. */
export interface Chunk {
  // Sequential 0-based index of the chunk across the whole input.
  chunkId: number;
  // The chunk's text content.
  text: string;
  // Token count of `text` as reported by numTokensFromString.
  tokens: number;
}
/**
* 确保每个chunk的大小不超过最大token数
* @param chunk 输入的文本块
* @returns 分割后的文本块及其token数的数组
*/
/**
 * Split a text block so that no returned piece exceeds CHUNK_SIZE tokens.
 *
 * The block is segmented into sentences by inserting MAGIC_SEPARATOR after
 * every delimiter, then sentences are greedily packed into chunks while the
 * token budget allows. A single sentence that alone exceeds the budget is
 * hard-split by shrinking character windows.
 *
 * Fixes over the previous implementation: it sliced by UTF-16 code units,
 * which could cut the surrogate-pair separator '🦛' in half (losing sentence
 * boundaries), and it used CHUNK_SIZE as a *character* count, so CJK text
 * (fewer characters per token) could produce chunks above the token budget.
 *
 * @param chunk input text block
 * @returns array of [text, tokenCount] pairs, each within CHUNK_SIZE tokens
 */
function ensureChunkSize(chunk: string): Array<[string, number]> {
  const tokens = numTokensFromString(chunk);
  if (tokens <= CHUNK_SIZE) {
    return [[chunk, tokens]];
  }
  // Mark sentence boundaries: append the magic separator after every
  // delimiter occurrence (delimiters are regex-escaped for literal matching).
  let processedChunk = chunk;
  for (const delimiter of DELIMITER) {
    const escapedDelimiter = delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    processedChunk = processedChunk.replace(new RegExp(escapedDelimiter, 'g'), delimiter + MAGIC_SEPARATOR);
  }
  const chunks: Array<[string, number]> = [];
  let current = '';
  // Running sum of per-sentence token counts; an approximation of the
  // concatenated count (BPE merges at boundaries), so the stored count is
  // recomputed on the full chunk text when pushing.
  let currentTokens = 0;
  for (const sentence of processedChunk.split(MAGIC_SEPARATOR)) {
    if (!sentence) continue;
    const sentenceTokens = numTokensFromString(sentence);
    if (sentenceTokens > CHUNK_SIZE) {
      // A single sentence over budget: flush the accumulator, then hard-split.
      if (current.trim()) {
        chunks.push([current, numTokensFromString(current)]);
      }
      current = '';
      currentTokens = 0;
      chunks.push(...hardSplitByTokens(sentence));
      continue;
    }
    if (current && currentTokens + sentenceTokens > CHUNK_SIZE) {
      if (current.trim()) {
        chunks.push([current, numTokensFromString(current)]);
      }
      current = '';
      currentTokens = 0;
    }
    current += sentence;
    currentTokens += sentenceTokens;
  }
  // Flush whatever remains in the accumulator.
  if (current.trim()) {
    chunks.push([current, numTokensFromString(current)]);
  }
  return chunks;
}

/**
 * Hard-split a single over-budget sentence into character windows, halving
 * each window until its token count fits within CHUNK_SIZE.
 * Note: windows are cut at UTF-16 code-unit positions, so a window edge may
 * in rare cases fall inside a surrogate pair of the content itself.
 */
function hardSplitByTokens(text: string): Array<[string, number]> {
  const pieces: Array<[string, number]> = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_SIZE, text.length);
    let piece = text.slice(start, end);
    let pieceTokens = numTokensFromString(piece);
    // Shrink the window until it fits the token budget; always keep >= 1 char
    // so the loop makes progress.
    while (pieceTokens > CHUNK_SIZE && end - start > 1) {
      end = start + Math.ceil((end - start) / 2);
      piece = text.slice(start, end);
      pieceTokens = numTokensFromString(piece);
    }
    pieces.push([piece, pieceTokens]);
    start = end;
  }
  return pieces;
}
/**
* 将文本分割成chunks
* @param text 输入文本
* @returns 分割后的chunks数组
*/
/**
 * Split the input text into token-bounded chunks.
 *
 * The text is first divided into non-empty, trimmed paragraphs; each
 * paragraph is then split further by ensureChunkSize so no chunk exceeds
 * the token budget. Chunk ids are assigned sequentially across the whole
 * input.
 *
 * @param text input text
 * @returns array of chunks with sequential ids and token counts
 */
export async function getChunks(text: string): Promise<Chunk[]> {
  const pieces = text
    .split(PARAGRAPH_DELIMITER)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0)
    .flatMap((paragraph) => ensureChunkSize(paragraph));
  return pieces.map(([chunkText, tokenCount], index) => ({
    chunkId: index,
    text: chunkText,
    tokens: tokenCount,
  }));
}