// Listing metadata: 87 lines, 2.3 KiB, TypeScript.
import { numTokensFromString } from './token.ts';

// Constants

/** Maximum number of tokens allowed in a single chunk. */
const CHUNK_SIZE = 512;

/** Sentinel string for marking sentence boundaries inside a text before it is split. */
const MAGIC_SEPARATOR = '🦛';

/**
 * Characters after which a text may be split: ASCII punctuation and newline,
 * followed by their CJK counterparts. The CJK entries are written as escapes
 * so that text normalization cannot silently fold the fullwidth forms back
 * into the ASCII ones (which would leave duplicate entries and no CJK support).
 */
const DELIMITER = [
  ',',
  '.',
  '!',
  '?',
  '\n',
  '\uFF0C', // fullwidth comma
  '\u3002', // ideographic full stop
  '\uFF01', // fullwidth exclamation mark
  '\uFF1F', // fullwidth question mark
];

/** Separator between two paragraphs of the input text. */
const PARAGRAPH_DELIMITER = '\n\n';

/** One piece of the input text together with its size in tokens. */
export interface Chunk {
  /** Zero-based position of the chunk in the array returned by `getChunks`. */
  chunkId: number;
  /** Text content of the chunk. */
  text: string;
  /** Token count of `text`. */
  tokens: number;
}

/**
 * Cuts `text` into pieces that each end right after a delimiter from
 * `DELIMITER`. Concatenating the returned pieces yields `text` unchanged.
 *
 * Only delimiters that are a single UTF-16 code unit are recognised, which
 * holds for every entry currently listed in `DELIMITER`.
 */
function splitAfterDelimiters(text: string): string[] {
  const boundaries = new Set(DELIMITER);
  const pieces: string[] = [];
  let start = 0;
  for (let i = 0; i < text.length; i++) {
    if (boundaries.has(text.charAt(i))) {
      pieces.push(text.slice(start, i + 1));
      start = i + 1;
    }
  }
  if (start < text.length) {
    pieces.push(text.slice(start));
  }
  return pieces;
}

/**
 * Halves `text` recursively until every part fits into `CHUNK_SIZE` tokens.
 * Cuts are made between code points, so a surrogate pair is never torn apart.
 * A part consisting of a single code point is returned as is.
 */
function splitToTokenLimit(text: string): Array<[string, number]> {
  const tokens = numTokensFromString(text);
  if (tokens <= CHUNK_SIZE) {
    return [[text, tokens]];
  }
  const codePoints = Array.from(text);
  if (codePoints.length <= 1) {
    return [[text, tokens]];
  }
  const middle = Math.ceil(codePoints.length / 2);
  return [
    ...splitToTokenLimit(codePoints.slice(0, middle).join('')),
    ...splitToTokenLimit(codePoints.slice(middle).join('')),
  ];
}

/**
 * Ensures that no chunk is larger than `CHUNK_SIZE` tokens.
 *
 * A chunk that already fits is returned unchanged. Otherwise the text is cut
 * after sentence delimiters and consecutive sentences are packed together as
 * long as the combined token count stays within the limit. A single sentence
 * that is too large on its own is halved until its parts fit.
 *
 * The text is never trimmed or rewritten: concatenating the returned strings
 * reproduces the input, except that whitespace-only chunks are dropped.
 *
 * @param chunk Input text block.
 * @returns Array of `[text, tokenCount]` pairs, in input order.
 */
function ensureChunkSize(chunk: string): Array<[string, number]> {
  const tokens = numTokensFromString(chunk);
  if (tokens <= CHUNK_SIZE) {
    return [[chunk, tokens]];
  }

  const chunks: Array<[string, number]> = [];
  let current = '';
  let currentTokens = 0;

  // Emits the accumulated text (unless it is only whitespace) and resets it.
  const flush = (): void => {
    if (current.trim()) {
      chunks.push([current, currentTokens]);
    }
    current = '';
    currentTokens = 0;
  };

  for (const sentence of splitAfterDelimiters(chunk)) {
    for (const [piece, pieceTokens] of splitToTokenLimit(sentence)) {
      const merged = current + piece;
      // Token counts are not additive, so the merged text is measured itself.
      const mergedTokens = current ? numTokensFromString(merged) : pieceTokens;
      if (mergedTokens <= CHUNK_SIZE) {
        current = merged;
        currentTokens = mergedTokens;
      } else {
        flush();
        current = piece;
        currentTokens = pieceTokens;
      }
    }
  }
  flush();

  return chunks;
}

/**
 * Splits a text into chunks.
 *
 * The text is divided into paragraphs at `PARAGRAPH_DELIMITER`; each paragraph
 * is trimmed, empty paragraphs are skipped, and every remaining paragraph is
 * passed through `ensureChunkSize`. Chunk ids are assigned sequentially,
 * starting at 0, in output order.
 *
 * @param text Input text.
 * @returns The resulting chunks.
 */
export async function getChunks(text: string): Promise<Chunk[]> {
  const paragraphs = text
    .split(PARAGRAPH_DELIMITER)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);

  const chunks: Chunk[] = [];
  for (const paragraph of paragraphs) {
    for (const [chunkText, tokens] of ensureChunkSize(paragraph)) {
      // The next id always equals the number of chunks emitted so far.
      chunks.push({ chunkId: chunks.length, text: chunkText, tokens });
    }
  }

  return chunks;
}