init
This commit is contained in:
86
src/provider/utils/chunk.ts
Normal file
86
src/provider/utils/chunk.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
import { numTokensFromString } from './token.ts';
|
||||
|
||||
// Constants

/** Maximum number of tokens a single chunk may contain. */
const CHUNK_SIZE = 512;

/** Sentinel string for marking split points that follow a delimiter. */
const MAGIC_SEPARATOR = '🦛';

/**
 * Punctuation after which text may be split into sentences: the ASCII marks,
 * the line feed, and their CJK counterparts. The CJK entries are written as
 * escapes so that Unicode compatibility normalisation of this file cannot
 * silently fold them into duplicates of the ASCII entries.
 */
const DELIMITER = [
  ',',
  '.',
  '!',
  '?',
  '\n',
  '\uFF0C', // fullwidth comma
  '\u3002', // ideographic full stop
  '\uFF01', // fullwidth exclamation mark
  '\uFF1F', // fullwidth question mark
];

/** Blank line separating two paragraphs. */
const PARAGRAPH_DELIMITER = '\n\n';
|
||||
|
||||
/** One piece of a document produced by the chunker. */
export interface Chunk {
  /** Position of the chunk in the output sequence, starting at 0. */
  chunkId: number;
  /** Text content of the chunk. */
  text: string;
  /** Token count reported for `text`. */
  tokens: number;
}
|
||||
|
||||
/**
 * Cuts `text` immediately after every occurrence of a DELIMITER entry, keeping
 * the delimiter attached to the sentence it ends. Concatenating the returned
 * pieces yields `text` unchanged.
 */
function splitAfterDelimiters(text: string): string[] {
  // De-duplicate and escape the delimiters so they can be combined into one
  // alternation; empty entries are dropped because they would match forever.
  const alternatives = Array.from(new Set(DELIMITER))
    .filter((delimiter) => delimiter.length > 0)
    .map((delimiter) => delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|');
  if (!alternatives) {
    return [text];
  }

  const boundary = new RegExp(alternatives, 'g');
  const sentences: string[] = [];
  let start = 0;
  let match: RegExpExecArray | null;
  while ((match = boundary.exec(text)) !== null) {
    const end = match.index + match[0].length;
    sentences.push(text.slice(start, end));
    start = end;
  }
  if (start < text.length) {
    sentences.push(text.slice(start));
  }
  return sentences;
}

/**
 * Fallback for a single sentence that is larger than CHUNK_SIZE on its own:
 * halves it repeatedly until every piece fits. Cuts are made between code
 * points, so a surrogate pair is never torn apart. Whitespace-only pieces are
 * dropped.
 */
function splitOversizedText(text: string): Array<[string, number]> {
  const tokens = numTokensFromString(text);
  const codePoints = Array.from(text);
  // A single code point cannot be divided further; emit it even if oversized.
  if (tokens <= CHUNK_SIZE || codePoints.length <= 1) {
    return text.trim() ? [[text, tokens]] : [];
  }

  const middle = Math.ceil(codePoints.length / 2);
  return [
    ...splitOversizedText(codePoints.slice(0, middle).join('')),
    ...splitOversizedText(codePoints.slice(middle).join('')),
  ];
}

/**
 * Ensures that no returned chunk exceeds CHUNK_SIZE tokens.
 *
 * Text that already fits is returned as a single entry. Otherwise the text is
 * split into sentences after each DELIMITER and the sentences are packed
 * greedily, in order, into chunks whose measured token count stays within the
 * limit. Whitespace inside and between sentences is preserved; chunks that
 * consist only of whitespace are omitted.
 *
 * @param chunk Input text block.
 * @returns Array of `[text, tokenCount]` pairs in document order.
 */
function ensureChunkSize(chunk: string): Array<[string, number]> {
  const tokens = numTokensFromString(chunk);
  if (tokens <= CHUNK_SIZE) {
    return [[chunk, tokens]];
  }

  const chunks: Array<[string, number]> = [];
  let pending = '';
  let pendingTokens = 0;

  const flushPending = (): void => {
    if (pending.trim()) {
      chunks.push([pending, pendingTokens]);
    }
    pending = '';
    pendingTokens = 0;
  };

  for (const sentence of splitAfterDelimiters(chunk)) {
    // Measure the real concatenation: token counts are not guaranteed to be
    // additive across a join.
    const candidate = pending + sentence;
    const candidateTokens = numTokensFromString(candidate);
    if (candidateTokens <= CHUNK_SIZE) {
      pending = candidate;
      pendingTokens = candidateTokens;
      continue;
    }

    // The sentence does not fit behind what is pending: emit the pending text
    // and start a new chunk with this sentence.
    const hadPending = pending !== '';
    flushPending();
    const sentenceTokens = hadPending ? numTokensFromString(sentence) : candidateTokens;
    if (sentenceTokens <= CHUNK_SIZE) {
      pending = sentence;
      pendingTokens = sentenceTokens;
    } else {
      chunks.push(...splitOversizedText(sentence));
    }
  }

  flushPending();
  return chunks;
}
|
||||
|
||||
/**
 * Splits text into sequentially numbered chunks.
 *
 * Line endings are normalised to `\n` first, so documents using CRLF or bare
 * CR are divided into paragraphs the same way as LF documents. The text is then
 * cut at PARAGRAPH_DELIMITER, each paragraph is trimmed, empty paragraphs are
 * dropped, and every remaining paragraph is passed through `ensureChunkSize`.
 *
 * @param text Input text.
 * @returns Chunks in document order; `chunkId` starts at 0 and increases by one
 *          per chunk.
 */
// Kept `async` although nothing is awaited: callers receive a Promise.
export async function getChunks(text: string): Promise<Chunk[]> {
  const paragraphs = text
    .replace(/\r\n?/g, '\n')
    .split(PARAGRAPH_DELIMITER)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);

  const chunks: Chunk[] = [];
  for (const paragraph of paragraphs) {
    for (const [chunkText, tokens] of ensureChunkSize(paragraph)) {
      // The next id is always the number of chunks emitted so far.
      chunks.push({ chunkId: chunks.length, text: chunkText, tokens });
    }
  }

  return chunks;
}
|
||||
Reference in New Issue
Block a user