Files
ai/src/provider/utils/chunk.ts
2025-05-25 14:01:37 +08:00

87 lines
2.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { numTokensFromString } from './token.ts';
// Constants
const CHUNK_SIZE = 512; // maximum number of tokens per chunk
const MAGIC_SEPARATOR = '🦛'; // sentinel unlikely to occur in real input text
// Sentence delimiters: ASCII punctuation, newline, and their full-width CJK
// counterparts. NOTE: the CJK characters ('，', '！', '？') were previously
// lost (rendered as empty strings), which made the sentence-splitting regex
// match at every position; they are restored here — TODO confirm against the
// original intent of the author.
const DELIMITER = [',', '.', '!', '?', '\n', '，', '。', '！', '？'];
const PARAGRAPH_DELIMITER = '\n\n'; // paragraphs are separated by a blank line
/** A single text chunk produced by getChunks. */
export interface Chunk {
// sequential, zero-based id assigned in document order
chunkId: number;
// the chunk's text content
text: string;
// token count of `text` as measured by numTokensFromString
tokens: number;
}
/**
* 确保每个chunk的大小不超过最大token数
* @param chunk 输入的文本块
* @returns 分割后的文本块及其token数的数组
*/
/**
 * Split a text block so that no piece exceeds CHUNK_SIZE tokens.
 *
 * Strategy: insert MAGIC_SEPARATOR after every sentence delimiter, then walk
 * the text in CHUNK_SIZE-character windows and cut only at separator
 * positions, carrying the unfinished trailing sentence into the next window.
 *
 * NOTE(review): the window is CHUNK_SIZE *characters* while the limit is
 * CHUNK_SIZE *tokens* — a heuristic that assumes ≥1 char per token, so a
 * produced chunk may still slightly exceed the token budget; confirm this is
 * acceptable for the downstream consumer.
 *
 * @param chunk input text block
 * @returns array of [text, tokenCount] pairs
 */
function ensureChunkSize(chunk: string): Array<[string, number]> {
  const tokens = numTokensFromString(chunk);
  // Fast path: already small enough, return as a single piece.
  if (tokens <= CHUNK_SIZE) {
    return [[chunk, tokens]];
  }
  // Mark sentence boundaries by appending the magic separator after each
  // delimiter occurrence.
  let processedChunk = chunk;
  for (const delimiter of DELIMITER) {
    // Guard: an empty delimiter would compile to an empty regex that matches
    // at every position and would inject the separator between every single
    // character, exploding the string.
    if (!delimiter) continue;
    // Escape regex metacharacters so punctuation like '.' and '?' is literal.
    const escapedDelimiter = delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    processedChunk = processedChunk.replace(new RegExp(escapedDelimiter, 'g'), delimiter + MAGIC_SEPARATOR);
  }
  const chunks: Array<[string, number]> = [];
  let tail = '';
  // Walk the text window by window, cutting only at sentence boundaries.
  for (let i = 0; i < processedChunk.length; i += CHUNK_SIZE) {
    // The trailing ' ' guarantees split() yields a final element to treat as
    // the (possibly incomplete) carried-over tail.
    const sentences = (processedChunk.slice(i, i + CHUNK_SIZE) + ' ').split(MAGIC_SEPARATOR);
    // All complete sentences in this window, prefixed with the previous tail.
    const currentChunk = tail + sentences.slice(0, -1).join('');
    if (currentChunk.trim()) {
      const tokenCount = numTokensFromString(currentChunk);
      chunks.push([currentChunk, tokenCount]);
    }
    // Last fragment has no trailing separator — carry it into the next window.
    tail = sentences[sentences.length - 1].trim();
  }
  // Flush whatever incomplete sentence remains after the final window.
  if (tail) {
    const tokenCount = numTokensFromString(tail);
    chunks.push([tail, tokenCount]);
  }
  return chunks;
}
/**
* 将文本分割成chunks
* @param text 输入文本
* @returns 分割后的chunks数组
*/
/**
 * Split raw text into token-bounded chunks.
 *
 * The input is first broken on blank-line paragraph boundaries; each
 * paragraph is then subdivided by ensureChunkSize. Chunk ids are assigned
 * sequentially across the whole document.
 *
 * @param text raw input text
 * @returns ordered array of chunks with their token counts
 */
export async function getChunks(text: string): Promise<Chunk[]> {
  // Paragraphs are separated by a blank line; drop any that trim to nothing.
  const paragraphs = text
    .split(PARAGRAPH_DELIMITER)
    .map((paragraph) => paragraph.trim())
    .filter(Boolean);

  const result: Chunk[] = [];
  for (const paragraph of paragraphs) {
    for (const [chunkText, tokenCount] of ensureChunkSize(paragraph)) {
      // result.length doubles as the running sequential chunk id.
      result.push({ chunkId: result.length, text: chunkText, tokens: tokenCount });
    }
  }
  return result;
}