This commit is contained in:
2025-05-25 14:01:37 +08:00
commit 8f52a10ae0
42 changed files with 1946 additions and 0 deletions

View File

@@ -0,0 +1,86 @@
import { numTokensFromString } from './token.ts';
// Constants
const CHUNK_SIZE = 512; // maximum number of tokens per chunk
const MAGIC_SEPARATOR = '🦛'; // sentinel assumed never to occur in input text
// Sentence delimiters: ASCII plus full-width CJK punctuation.
// NOTE(review): the original array contained empty strings (mis-encoded
// characters). An empty-string delimiter makes `new RegExp('', 'g')` match at
// every position, inserting MAGIC_SEPARATOR between every single character.
// Restored to the most plausible full-width punctuation ('，', '！', '？'),
// consistent with the surviving '。' — confirm against upstream history.
const DELIMITER = [',', '.', '!', '?', '\n', '，', '。', '！', '？'];
const PARAGRAPH_DELIMITER = '\n\n';
/** One piece of the split input text together with its token count. */
export interface Chunk {
  // Sequential 0-based index of the chunk across the whole input.
  chunkId: number;
  // The chunk's text content.
  text: string;
  // Token count of `text` as reported by numTokensFromString.
  tokens: number;
}
/**
* 确保每个chunk的大小不超过最大token数
* @param chunk 输入的文本块
* @returns 分割后的文本块及其token数的数组
*/
/**
 * Split a text block so that no returned piece exceeds CHUNK_SIZE tokens.
 *
 * The block is segmented into sentences by inserting MAGIC_SEPARATOR after
 * every delimiter, then sentences are greedily packed into chunks while the
 * token budget allows. A single sentence that alone exceeds the budget is
 * hard-split by shrinking character windows.
 *
 * Fixes over the previous implementation: it sliced by UTF-16 code units,
 * which could cut the surrogate-pair separator '🦛' in half (losing sentence
 * boundaries), and it used CHUNK_SIZE as a *character* count, so CJK text
 * (fewer characters per token) could produce chunks above the token budget.
 *
 * @param chunk input text block
 * @returns array of [text, tokenCount] pairs, each within CHUNK_SIZE tokens
 */
function ensureChunkSize(chunk: string): Array<[string, number]> {
  const tokens = numTokensFromString(chunk);
  if (tokens <= CHUNK_SIZE) {
    return [[chunk, tokens]];
  }
  // Mark sentence boundaries: append the magic separator after every
  // delimiter occurrence (delimiters are regex-escaped for literal matching).
  let processedChunk = chunk;
  for (const delimiter of DELIMITER) {
    const escapedDelimiter = delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    processedChunk = processedChunk.replace(new RegExp(escapedDelimiter, 'g'), delimiter + MAGIC_SEPARATOR);
  }
  const chunks: Array<[string, number]> = [];
  let current = '';
  // Running sum of per-sentence token counts; an approximation of the
  // concatenated count (BPE merges at boundaries), so the stored count is
  // recomputed on the full chunk text when pushing.
  let currentTokens = 0;
  for (const sentence of processedChunk.split(MAGIC_SEPARATOR)) {
    if (!sentence) continue;
    const sentenceTokens = numTokensFromString(sentence);
    if (sentenceTokens > CHUNK_SIZE) {
      // A single sentence over budget: flush the accumulator, then hard-split.
      if (current.trim()) {
        chunks.push([current, numTokensFromString(current)]);
      }
      current = '';
      currentTokens = 0;
      chunks.push(...hardSplitByTokens(sentence));
      continue;
    }
    if (current && currentTokens + sentenceTokens > CHUNK_SIZE) {
      if (current.trim()) {
        chunks.push([current, numTokensFromString(current)]);
      }
      current = '';
      currentTokens = 0;
    }
    current += sentence;
    currentTokens += sentenceTokens;
  }
  // Flush whatever remains in the accumulator.
  if (current.trim()) {
    chunks.push([current, numTokensFromString(current)]);
  }
  return chunks;
}

/**
 * Hard-split a single over-budget sentence into character windows, halving
 * each window until its token count fits within CHUNK_SIZE.
 * Note: windows are cut at UTF-16 code-unit positions, so a window edge may
 * in rare cases fall inside a surrogate pair of the content itself.
 */
function hardSplitByTokens(text: string): Array<[string, number]> {
  const pieces: Array<[string, number]> = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_SIZE, text.length);
    let piece = text.slice(start, end);
    let pieceTokens = numTokensFromString(piece);
    // Shrink the window until it fits the token budget; always keep >= 1 char
    // so the loop makes progress.
    while (pieceTokens > CHUNK_SIZE && end - start > 1) {
      end = start + Math.ceil((end - start) / 2);
      piece = text.slice(start, end);
      pieceTokens = numTokensFromString(piece);
    }
    pieces.push([piece, pieceTokens]);
    start = end;
  }
  return pieces;
}
/**
* 将文本分割成chunks
* @param text 输入文本
* @returns 分割后的chunks数组
*/
/**
 * Split the input text into token-bounded chunks.
 *
 * The text is first divided into non-empty, trimmed paragraphs; each
 * paragraph is then split further by ensureChunkSize so no chunk exceeds
 * the token budget. Chunk ids are assigned sequentially across the whole
 * input.
 *
 * @param text input text
 * @returns array of chunks with sequential ids and token counts
 */
export async function getChunks(text: string): Promise<Chunk[]> {
  const pieces = text
    .split(PARAGRAPH_DELIMITER)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0)
    .flatMap((paragraph) => ensureChunkSize(paragraph));
  return pieces.map(([chunkText, tokenCount], index) => ({
    chunkId: index,
    text: chunkText,
    tokens: tokenCount,
  }));
}