fix: Normalize and deduplicate tokens for improved search relevance

Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
2026-02-13 11:46:25 -07:00
parent dc2267030a
commit 0c5b956e53
1 changed file with 4 additions and 2 deletions
@@ -41,7 +41,7 @@ export class Tokenizer {
      }

      // Remove duplicates
-      // tokens = [...new Set(tokens)]
+      tokens = [...new Set(tokens)]
      
      // Remove empty tokens
      tokens = tokens.filter(Boolean)
@@ -107,7 +107,9 @@ export class Tokenizer {
  }

  private tokenizeWords(text: string, { skipChs = false } = {}): string[] {
-    const tokens = text.split(BRACKETS_AND_SPACE)
+    const tokens = text
+      .split(BRACKETS_AND_SPACE)
+      .map(t => t.replace(/[.,:;!?]+$/, ''))
    if (skipChs) return tokens
    return this.tokenizeChsWord(tokens)
  }