feat: early vector search testing

This commit is contained in:
SethBurkart123
2025-04-01 23:14:45 +11:00
parent 13f830ee16
commit 07aa9524aa
10 changed files with 833 additions and 34 deletions
+1
View File
@@ -75,6 +75,7 @@
"@uiw/codemirror-extensions-color": "^4.23.10",
"@uiw/codemirror-theme-github": "^4.23.10",
"autoprefixer": "^10.4.21",
"client-vector-search": "^0.2.0",
"codemirror": "^6.0.1",
"color": "^5.0.0",
"dompurify": "^3.2.4",
@@ -6,7 +6,7 @@
import { type StaticCommandItem } from './commands';
import type { CombinedResult } from './types';
import { createSearchIndexes, performSearch as doSearch } from './searchUtils';
import { highlightMatch, highlightSnippet } from './highlightUtils';
import { highlightMatch, highlightSnippet, stripHtmlButKeepHighlights } from './highlightUtils';
import Fuse from 'fuse.js';
import Calculator from './Calculator.svelte';
import { actionMap } from './indexing/actions';
@@ -105,14 +105,14 @@
};
});
const performSearch = () => {
const performSearch = async () => {
isLoading = true;
selectedIndex = 0;
const term = searchTerm.trim().toLowerCase();
if (commandsFuse && dynamicContentFuse) {
combinedResults = doSearch(
combinedResults = await doSearch(
term,
commandsFuse,
dynamicContentFuse,
@@ -288,8 +288,9 @@
onclick={() => executeItemAction(dynamicItem)}
>
<div class="flex items-center w-full">
<div class="flex-none w-8 h-8 text-xl font-IconFamily flex items-center justify-center {isSelected ? 'text-zinc-900 dark:text-white' : 'text-zinc-600 dark:text-zinc-400'}">{dynamicItem.metadata?.icon || '\ue924'}</div>
<span class="ml-4 text-lg truncate">
{@html highlightMatch(dynamicItem.text, searchTerm, result.matches)}
{@html stripHtmlButKeepHighlights(highlightMatch(dynamicItem.text, searchTerm, result.matches))}
</span>
<span class="flex-none ml-auto text-xs text-zinc-500 dark:text-zinc-400">
{dynamicItem.category}
@@ -297,7 +298,7 @@
</div>
{#if dynamicItem.content}
<div class="mt-1 ml-12 text-sm text-zinc-600 dark:text-zinc-400 line-clamp-2 text-start">
{@html highlightSnippet(dynamicItem.content, searchTerm, result.matches)}
{@html stripHtmlButKeepHighlights(highlightSnippet(dynamicItem.content, searchTerm, result.matches))}
</div>
{/if}
</button>
@@ -0,0 +1,597 @@
# client-vector-search
A client side vector search library that can embed, search, and cache. Works on the browser and server side.
It outperforms OpenAI's text-embedding-ada-002 and is way faster than Pinecone and other VectorDBs.
I'm the founder of [searchbase.app](https://searchbase.app) and we needed this for our product and customers. We'll be using this library in production. You can be sure it'll be maintained and improved.
- Embed documents using transformers by default: gte-small (~30mb).
- Calculate cosine similarity between embeddings.
- Create an index and search on the client side
- Cache vectors with browser caching support.
Lots of improvements are coming!
## Roadmap
Our goal is to build a super simple, fast vector search that works with a couple hundred to a few thousand vectors. ~1k vectors per user covers 99% of the use cases.
We'll initially keep things super simple and sub-100ms.
### TODOs
- [ ] add HNSW index that works on node and browser env, don't rely on hnsw binder libs
- [ ] add a proper testing suite and ci/cd for the lib
- [ ] simple health tests
- [ ] mock the @xenova/transformers for jest, it's not happy with it
- [ ] performance tests, recall, memory usage, cpu usage etc.
## Installation
```bash
npm i client-vector-search
```
## Quickstart
This library provides a plug-and-play solution for embedding and vector search. It's designed to be easy to use, efficient, and versatile. Here's a quick start guide:
```ts
import { getEmbedding, EmbeddingIndex } from 'client-vector-search';
// getEmbedding is an async function, so you need to use 'await' or '.then()' to get the result
const embedding = await getEmbedding("Apple"); // Returns embedding as number[]
// Each object should have an 'embedding' property of type number[]
const initialObjects = [
{ id: 1, name: "Apple", embedding: embedding },
{ id: 2, name: "Banana", embedding: await getEmbedding("Banana") },
{ id: 3, name: "Cheddar", embedding: await getEmbedding("Cheddar")},
{ id: 4, name: "Space", embedding: await getEmbedding("Space")},
{ id: 5, name: "database", embedding: await getEmbedding("database")},
];
const index = new EmbeddingIndex(initialObjects); // Creates an index
// The query should be an embedding of type number[]
const queryEmbedding = await getEmbedding('Fruit'); // Query embedding
const results = await index.search(queryEmbedding, { topK: 5 }); // Returns top similar objects
// specify the storage type
await index.saveIndex('indexedDB');
const results = await index.search([1, 2, 3], {
topK: 5,
useStorage: 'indexedDB',
// storageOptions: { // use only if you overrode the defaults
// indexedDBName: 'clientVectorDB',
// indexedDBObjectStoreName: 'ClientEmbeddingStore',
// },
});
console.log(results);
await index.deleteIndexedDB(); // if you overrode default, specify db name
```
## Trouble-shooting
### NextJS
To use it inside NextJS projects you'll need to update the `next.config.js` file to include the following:
```js
module.exports = {
// Override the default webpack configuration
webpack: (config) => {
// See https://webpack.js.org/configuration/resolve/#resolvealias
config.resolve.alias = {
...config.resolve.alias,
sharp$: false,
"onnxruntime-node$": false,
};
return config;
},
};
```
#### Model load after page is loaded
You can initialize the model before using it to generate embeddings. This will ensure that the model is loaded before you use it and provide a better UX.
```js
import { initializeModel } from "client-vector-search"
...
useEffect(() => {
try {
initializeModel();
} catch (e) {
console.log(e);
}
}, []);
```
## Usage Guide
This guide provides a step-by-step walkthrough of the library's main features. It covers everything from generating embeddings for a string to performing operations on the index such as adding, updating, and removing objects. It also includes instructions on how to save the index to a database and perform search operations within it.
Until we have reference documentation, you can find all the methods and their usage in this guide. Each step is accompanied by a code snippet to illustrate the usage of the method in question. Make sure to follow along and try out the examples in your own environment to get a better understanding of how everything works.
Let's get started!
### Step 1: Generate Embeddings for String
Generate embeddings for a given string using the `getEmbedding` method.
```ts
const embedding = await getEmbedding("Apple"); // Returns embedding as number[]
```
> **Note**: `getEmbedding` is asynchronous; make sure to use `await`.
---
### Step 2: Calculate Cosine Similarity
Calculate the cosine similarity between two embeddings.
```ts
const similarity = cosineSimilarity(embedding1, embedding2, 6);
```
> **Note**: Both embeddings should be of the same length.
---
### Step 3: Create an Index
Create an index with an initial array of objects. Each object must have an 'embedding' property.
```ts
const initialObjects = [...];
const index = new EmbeddingIndex(initialObjects);
```
---
### Step 4: Add to Index
Add an object to the index.
```ts
const objectToAdd = { id: 6, name: 'Cat', embedding: await getEmbedding('Cat') };
index.add(objectToAdd);
```
---
### Step 5: Update Index
Update an existing object in the index.
```ts
const vectorToUpdate = { id: 6, name: 'Dog', embedding: await getEmbedding('Dog') };
index.update({ id: 6 }, vectorToUpdate);
```
---
### Step 6: Remove from Index
Remove an object from the index.
```ts
index.remove({ id: 6 });
```
---
### Step 7: Retrieve from Index
Retrieve an object from the index.
```ts
const vector = index.get({ id: 1 });
```
---
### Step 8: Search the Index
Search the index with a query embedding.
```ts
const queryEmbedding = await getEmbedding('Fruit');
const results = await index.search(queryEmbedding, { topK: 5 });
```
---
### Step 9: Print the Index
Print the entire index to the console.
```ts
index.printIndex();
```
---
### Step 10: Save Index to IndexedDB (for browser)
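Save the index to a persistent IndexedDB database. Note that this requires an environment where `indexedDB` is defined (i.e. a browser); otherwise `saveIndex` throws.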
```ts
await index.saveIndex("indexedDB", { DBName: "clientVectorDB", objectStoreName:"ClientEmbeddingStore"})
```
---
### Important: Search in indexedDB
Perform a search operation in the IndexedDB.
```ts
const results = await index.search(queryEmbedding, {
topK: 5,
useStorage: "indexedDB",
storageOptions: { // only if you want to override the default options, defaults are below
indexedDBName: 'clientVectorDB',
indexedDBObjectStoreName: 'ClientEmbeddingStore'
}
});
```
---
### Delete Database
To delete an entire database.
```ts
await IndexedDbManager.deleteIndexedDB("clientVectorDB");
```
---
### Delete Object Store
To delete an object store from a database.
```ts
await IndexedDbManager.deleteIndexedDBObjectStore("clientVectorDB", "ClientEmbeddingStore");
```
---
### Retrieve All Objects
To retrieve all objects from a specific object store.
```ts
const allObjects = await IndexedDbManager.getAllObjectsFromIndexedDB("clientVectorDB", "ClientEmbeddingStore");
```
# THE MAIN INDEX.TS FILE THAT YOU ARE IMPORTING FROM
```index.ts
// Number of results `EmbeddingIndex.search` returns when `options.topK` is missing or falsy.
const DEFAULT_TOP_K = 3;
// Loose key/value bag. Used both as the shape of a lookup filter (every listed key must
// strictly equal the stored object's value) and as the shape of the stored objects themselves.
interface Filter {
[key: string]: any;
}
import Cache from './cache';
import { IndexedDbManager } from './indexedDB';
import { cosineSimilarity } from './utils';
export { ExperimentalHNSWIndex } from './hnsw';
// uncomment if you want to test indexedDB implementation in node env for faster dev cycle
// import { IDBFactory } from 'fake-indexeddb';
// const indexedDB = new IDBFactory();
/** One search hit: the stored object together with its cosine similarity to the query. */
export interface SearchResult {
similarity: number;
object: any;
}
// Storage back-ends named by the API. Within this file only 'indexedDB' is acted upon:
// `search` branches on it and `saveIndex` rejects every other value.
type StorageOptions = 'indexedDB' | 'localStorage' | 'none';
/**
 * Interface for search options in the EmbeddingIndex class.
 * topK: The number of top similar items to return (falls back to DEFAULT_TOP_K when falsy).
 * filter: An optional filter to apply to the objects before searching.
 * useStorage: Which storage to search; 'indexedDB' reads from the database instead of memory.
 * storageOptions: Database and object-store names used when useStorage is 'indexedDB'.
 */
interface SearchOptions {
topK?: number;
filter?: Filter;
useStorage?: StorageOptions;
storageOptions?: { indexedDBName: string; indexedDBObjectStoreName: string }; // TODO: generalize it to localStorage as well
}
// Shared embedding cache used by `getEmbedding`.
const cacheInstance = Cache.getInstance();
// Feature-extraction pipeline created by `initializeModel`; undefined until the first load.
let pipe: any;
// Name of the model `pipe` was built for; undefined until the first load completes.
let currentModel: string;
/**
 * Loads the embedding model and stores the resulting pipeline in the module-level `pipe`.
 *
 * Does nothing when `model` equals the model that is already loaded.
 *
 * @param model - Model identifier passed to the transformers `pipeline` factory.
 */
export const initializeModel = async (
model: string = 'Xenova/gte-small',
): Promise<void> => {
// NOTE(review): `currentModel` is only set after the awaits below, so calls that overlap
// with the first load each pass this check and each build their own pipeline.
if (model !== currentModel) {
// Imported lazily so the transformers module is only loaded when a model is first needed.
const transformersModule = await import('@xenova/transformers');
const pipeline = transformersModule.pipeline;
pipe = await pipeline('feature-extraction', model);
currentModel = model;
}
};
/**
 * Returns the embedding for `text`, computing and caching it on a cache miss.
 *
 * @param text - Text to embed; also the cache key.
 * @param precision - Number of decimal places each component is rounded to.
 * @param options - Options forwarded to the pipeline call.
 * @param model - Model to use; loaded via `initializeModel` if it is not the current one.
 * @returns The embedding as a plain `number[]`.
 */
export const getEmbedding = async (
text: string,
precision: number = 7,
options = { pooling: 'mean', normalize: false },
model = 'Xenova/gte-small',
): Promise<number[]> => {
// NOTE(review): the cache is keyed by `text` alone, so a hit is returned even if it was
// computed with a different `precision`, `options` or `model`. A hit also returns the
// cached array itself rather than a copy.
const cachedEmbedding = cacheInstance.get(text);
if (cachedEmbedding) {
return Promise.resolve(cachedEmbedding);
}
if (model !== currentModel) {
await initializeModel(model);
}
const output = await pipe(text, options);
// Round every component to `precision` decimals before caching.
const roundedOutput = Array.from(output.data as number[]).map(
(value: number) => parseFloat(value.toFixed(precision)),
);
cacheInstance.set(text, roundedOutput);
// On a miss the caller receives a copy, leaving the cached array untouched.
return Array.from(roundedOutput);
};
/**
 * In-memory list of objects that each carry an `embedding`, searchable by cosine
 * similarity, with helpers to persist to and search from IndexedDB.
 */
export class EmbeddingIndex {
// Stored objects, in insertion order.
private objects: Filter[];
// Property names every added object is required to have.
private keys: string[];
/**
 * @param initialObjects - Optional objects to validate and add immediately.
 */
constructor(initialObjects?: Filter[]) {
// TODO: add support for options while creating index such as {... indexedDB: true, ...}
this.objects = [];
this.keys = [];
if (initialObjects && initialObjects.length > 0) {
initialObjects.forEach((obj) => this.validateAndAdd(obj));
if (initialObjects[0]) {
this.keys = Object.keys(initialObjects[0]);
}
}
}
// Position of the first stored object whose values strictly equal every key in `filter`,
// or -1 when none matches. An empty filter matches the first object.
private findVectorIndex(filter: Filter): number {
return this.objects.findIndex((object) =>
Object.keys(filter).every((key) => object[key] === filter[key]),
);
}
// Validates `obj` and appends it. Throws when `embedding` is not an array or contains a
// value the global `isNaN` rejects, or when `obj` lacks one of the required keys.
// The first object ever added defines the required keys; extra keys are allowed.
private validateAndAdd(obj: Filter) {
if (!Array.isArray(obj.embedding) || obj.embedding.some(isNaN)) {
throw new Error(
'Object must have an embedding property of type number[]',
);
}
if (this.keys.length === 0) {
this.keys = Object.keys(obj);
} else if (!this.keys.every((key) => key in obj)) {
throw new Error(
'Object must have the same properties as the initial objects',
);
}
this.objects.push(obj);
}
/** Validates and appends `obj`; no check for an existing entry is made. */
add(obj: Filter) {
this.validateAndAdd(obj);
}
// Method to update an existing vector in the index.
// Throws 'Vector not found' when no stored object matches `filter`.
update(filter: Filter, vector: Filter) {
const index = this.findVectorIndex(filter);
if (index === -1) {
throw new Error('Vector not found');
}
if (vector.hasOwnProperty('embedding')) {
// Validate and add the new vector
// NOTE(review): validateAndAdd also pushes `vector` onto the list, so after this call
// the index holds the merged original AND `vector` as a separate extra entry.
this.validateAndAdd(vector);
}
// Merge the new properties into the existing object in place
this.objects[index] = Object.assign(this.objects[index] as Filter, vector);
}
// Method to remove a vector from the index.
// Removes only the first match; throws 'Vector not found' when nothing matches.
remove(filter: Filter) {
const index = this.findVectorIndex(filter);
if (index === -1) {
throw new Error('Vector not found');
}
// Remove the vector from the index
this.objects.splice(index, 1);
}
// Method to remove multiple vectors from the index.
// Unlike `remove`, filters that match nothing are skipped silently.
removeBatch(filters: Filter[]) {
filters.forEach((filter) => {
const index = this.findVectorIndex(filter);
if (index !== -1) {
// Remove the vector from the index
this.objects.splice(index, 1);
}
});
}
// Method to retrieve a vector from the index.
// Returns the first matching object, or null when none matches.
get(filter: Filter) {
const vector = this.objects[this.findVectorIndex(filter)];
return vector || null;
}
size(): number {
// Returns the size of the index
return this.objects.length;
}
// Drops all stored objects. The required-key list is left as it was.
clear() {
this.objects = [];
}
/**
 * Returns the `topK` most similar objects to `queryEmbedding`, best match first.
 *
 * With `useStorage: 'indexedDB'` the records are read from the database and the
 * in-memory objects are not consulted; otherwise only the in-memory objects are searched.
 * Throws 'IndexedDB is not supported' when that storage is requested but unavailable.
 */
async search(
queryEmbedding: number[],
options: SearchOptions = {
topK: 3,
useStorage: 'none',
storageOptions: {
indexedDBName: 'clientVectorDB',
indexedDBObjectStoreName: 'ClientEmbeddingStore',
},
},
): Promise<SearchResult[]> {
// `||` means a topK of 0 also falls back to the default.
const topK = options.topK || DEFAULT_TOP_K;
const filter = options.filter || {};
const useStorage = options.useStorage || 'none';
if (useStorage === 'indexedDB') {
const DBname = options.storageOptions?.indexedDBName || 'clientVectorDB';
const objectStoreName =
options.storageOptions?.indexedDBObjectStoreName ||
'ClientEmbeddingStore';
if (typeof indexedDB === 'undefined') {
console.error('IndexedDB is not supported');
throw new Error('IndexedDB is not supported');
}
const results = await this.loadAndSearchFromIndexedDB(
DBname,
objectStoreName,
queryEmbedding,
topK,
filter,
);
return results;
} else {
// Compute similarities
const similarities = this.objects
.filter((object) =>
Object.keys(filter).every((key) => object[key] === filter[key]),
)
.map((obj) => ({
similarity: cosineSimilarity(queryEmbedding, obj.embedding),
object: obj,
}));
// Sort by similarity and return topK results
return similarities
.sort((a, b) => b.similarity - a.similarity)
.slice(0, topK);
}
}
/** Logs every stored object to the console, numbered from 1. */
printIndex() {
console.log('Index Content:');
this.objects.forEach((obj, idx) => {
console.log(`Item ${idx + 1}:`, obj);
});
}
/**
 * Persists the index. Only `storageType === 'indexedDB'` is supported; any other
 * value throws.
 */
async saveIndex(
storageType: string,
options: { DBName: string; objectStoreName: string } = {
DBName: 'clientVectorDB',
objectStoreName: 'ClientEmbeddingStore',
},
) {
if (storageType === 'indexedDB') {
await this.saveToIndexedDB(options.DBName, options.objectStoreName);
} else {
throw new Error(
`Unsupported storage type: ${storageType} \n Supported storage types: "indexedDB"`,
);
}
}
/**
 * Hands all in-memory objects to `addToIndexedDB` on the given database/object store.
 * Throws when IndexedDB is unavailable, when the index is empty, or when the write fails
 * (the original error is logged and replaced by a generic one).
 */
async saveToIndexedDB(
DBname: string = 'clientVectorDB',
objectStoreName: string = 'ClientEmbeddingStore',
): Promise<void> {
if (typeof indexedDB === 'undefined') {
console.error('IndexedDB is not defined');
throw new Error('IndexedDB is not supported');
}
if (!this.objects || this.objects.length === 0) {
throw new Error('Index is empty. Nothing to save');
}
try {
const db = await IndexedDbManager.create(DBname, objectStoreName);
// NOTE(review): the whole object list is passed on every save; whether records already
// in the store are replaced or duplicated depends on addToIndexedDB — confirm.
await db.addToIndexedDB(this.objects);
console.log(
`Index saved to database '${DBname}' object store '${objectStoreName}'`,
);
} catch (error) {
console.error('Error saving index to database:', error);
throw new Error('Error saving index to database');
}
}
/**
 * Iterates every record yielded by the store's generator, scores the ones matching
 * `filter`, and returns the `topK` best, highest similarity first.
 */
async loadAndSearchFromIndexedDB(
DBname: string = 'clientVectorDB',
objectStoreName: string = 'ClientEmbeddingStore',
queryEmbedding: number[],
topK: number,
filter: { [key: string]: any },
): Promise<SearchResult[]> {
const db = await IndexedDbManager.create(DBname, objectStoreName);
const generator = db.dbGenerator();
const results: { similarity: number; object: any }[] = [];
for await (const record of generator) {
if (Object.keys(filter).every((key) => record[key] === filter[key])) {
const similarity = cosineSimilarity(queryEmbedding, record.embedding);
results.push({ similarity, object: record });
}
}
results.sort((a, b) => b.similarity - a.similarity);
return results.slice(0, topK);
}
/** Deletes the whole database named `DBname`; rejects if the delete request errors. */
async deleteIndexedDB(DBname: string = 'clientVectorDB'): Promise<void> {
if (typeof indexedDB === 'undefined') {
console.error('IndexedDB is not defined');
throw new Error('IndexedDB is not supported');
}
// Wrap the callback-style IndexedDB request in a promise.
return new Promise((resolve, reject) => {
const request = indexedDB.deleteDatabase(DBname);
request.onsuccess = () => {
console.log(`Database '${DBname}' deleted`);
resolve();
};
request.onerror = (event) => {
console.error('Failed to delete database', event);
reject(new Error('Failed to delete database'));
};
});
}
/**
 * Deletes one object store from the database. Failures are logged and rethrown as a
 * generic 'Error deleting object store'.
 */
async deleteIndexedDBObjectStore(
DBname: string = 'clientVectorDB',
objectStoreName: string = 'ClientEmbeddingStore',
): Promise<void> {
const db = await IndexedDbManager.create(DBname, objectStoreName);
try {
await db.deleteIndexedDBObjectStoreFromDB(DBname, objectStoreName);
console.log(
`Object store '${objectStoreName}' deleted from database '${DBname}'`,
);
} catch (error) {
console.error('Error deleting object store:', error);
throw new Error('Error deleting object store');
}
}
/** Collects every record yielded by the store's generator into an array. */
async getAllObjectsFromIndexedDB(
DBname: string = 'clientVectorDB',
objectStoreName: string = 'ClientEmbeddingStore',
): Promise<any[]> {
const db = await IndexedDbManager.create(DBname, objectStoreName);
const objects: any[] = [];
for await (const record of db.dbGenerator()) {
objects.push(record);
}
return objects;
}
}
```
@@ -28,7 +28,7 @@ const staticCommands: StaticCommandItem[] = [
window.location.hash = '?page=/home';
loadHomePage();
},
priority: 10
priority: 4
},
{
id: 'messages',
@@ -40,7 +40,7 @@ const staticCommands: StaticCommandItem[] = [
action: () => {
window.location.hash = '?page=/messages';
},
priority: 10
priority: 4
},
{
id: 'timetable',
@@ -52,7 +52,7 @@ const staticCommands: StaticCommandItem[] = [
action: () => {
window.location.hash = '?page=/timetable';
},
priority: 10
priority: 4
},
{
id: 'assessments',
@@ -64,7 +64,7 @@ const staticCommands: StaticCommandItem[] = [
action: () => {
window.location.hash = '?page=/assessments';
},
priority: 10
priority: 4
},
{
id: 'toggle-dark-mode',
@@ -72,7 +72,7 @@ const staticCommands: StaticCommandItem[] = [
category: 'action',
text: 'Toggle Dark Mode',
action: () => settingsState.DarkMode = !settingsState.DarkMode,
priority: 5,
priority: 2,
keywords: ['theme', 'appearance']
}
];
@@ -1,12 +1,13 @@
<script lang="ts">
import { highlightMatch, highlightSnippet } from '../highlightUtils';
import { highlightMatch, highlightSnippet, stripHtmlButKeepHighlights } from '../highlightUtils';
import type { DynamicContentItem } from '../dynamicSearch';
import type { FuseResultMatch } from '../types';
const { item, isSelected, searchTerm, result } = $props<{
const { item, isSelected, searchTerm, matches } = $props<{
item: DynamicContentItem;
isSelected: boolean;
searchTerm: string;
result: { matches: string[] };
matches?: readonly FuseResultMatch[];
}>();
/* const dueDate = $derived(item.metadata?.dueDate
@@ -28,12 +29,11 @@
<button
class="w-full flex flex-col px-2 py-1.5 rounded-lg select-none cursor-pointer group
{isSelected ? 'bg-zinc-900/5 dark:bg-white/10 text-zinc-900 dark:text-white' : 'hover:bg-zinc-500/5 dark:hover:bg-white/5 text-zinc-800 dark:text-zinc-200'}"
onclick={() => { item.action(); }}
>
<div class="flex items-center w-full">
<div class="flex-none w-8 h-8 text-xl font-IconFamily flex items-center justify-center {isSelected ? 'text-zinc-900 dark:text-white' : 'text-zinc-600 dark:text-zinc-400'}">{item.icon}</div>
<div class="flex-none w-8 h-8 text-xl font-IconFamily flex items-center justify-center {isSelected ? 'text-zinc-900 dark:text-white' : 'text-zinc-600 dark:text-zinc-400'}">{item.metadata?.icon || '\ue924'}</div>
<span class="ml-4 text-lg truncate">
{@html highlightMatch(item.text, searchTerm, result.matches)}
{@html stripHtmlButKeepHighlights(highlightMatch(item.text, searchTerm, matches))}
</span>
<span class="flex-none ml-auto text-xs text-zinc-500 dark:text-zinc-400">
{item.category}
@@ -41,16 +41,22 @@
</div>
{#if item.content}
<div class="mt-1 ml-12 text-sm text-zinc-600 dark:text-zinc-400 line-clamp-2 text-start">
{@html highlightSnippet(item.content, searchTerm, result.matches)}
{@html stripHtmlButKeepHighlights(highlightSnippet(item.content, searchTerm, matches))}
</div>
{/if}
</button>
<style>
.highlight {
:global(.highlight) {
background-color: rgba(255, 213, 0, 0.3);
font-weight: 500;
border-radius: 2px;
padding: 0 1px;
margin: 0 -1px;
}
.dark :global(.highlight) {
background-color: rgba(255, 230, 100, 0.4);
}
.due-badge {
@@ -1,5 +1,48 @@
import type { FuseResultMatch, MatchIndices } from './types';
/**
 * Removes every HTML tag from a string and flattens newlines to spaces.
 *
 * @param html - Markup (or plain text) to clean; falsy input yields `''`.
 * @returns The input with anything matching `<...>` removed and each newline
 *   replaced by a single space.
 */
export function stripHtmlTags(html: string): string {
  if (!html) return '';
  // A string pattern passed to `replace` only touches the first occurrence,
  // so use a global regex to flatten every newline rather than just one.
  return html.replace(/<[^>]*>/g, '').replace(/\n/g, ' ');
}
/**
 * Removes HTML tags from a string, but preserves `<span class="highlight">`
 * wrappers (and their matching `</span>`).
 *
 * The string is tokenised in a single pass and span nesting is tracked with a
 * stack, so a `</span>` is only kept when it really closes a highlight span.
 * This keeps the output balanced when highlights are nested inside other
 * spans, or other spans are nested inside a highlight.
 *
 * Because the result is rendered as raw HTML, two extra safeguards apply:
 * - a stray `<` that is not part of a complete tag is escaped to `&lt;`, so
 *   it cannot combine with a preserved highlight tag into a new element;
 * - highlight spans left open by the input are closed at the end.
 *
 * @param html - Markup to sanitise; falsy input yields `''`.
 * @returns Text containing no tags other than balanced highlight spans.
 */
export function stripHtmlButKeepHighlights(html: string): string {
  if (!html) return '';

  const HIGHLIGHT_OPEN = '<span class="highlight">';
  const SPAN_CLOSE = '</span>';
  const escapeStrayAngle = (text: string): string => text.replace(/</g, '&lt;');

  // One entry per currently open <span>: true when it is a highlight span.
  const openSpans: boolean[] = [];
  let output = '';
  let cursor = 0;

  for (const tagMatch of html.matchAll(/<[^>]*>/g)) {
    const tag = tagMatch[0];
    const start = tagMatch.index ?? 0;
    output += escapeStrayAngle(html.slice(cursor, start));
    cursor = start + tag.length;

    if (tag === HIGHLIGHT_OPEN) {
      openSpans.push(true);
      output += HIGHLIGHT_OPEN;
    } else if (/^<span\b/i.test(tag)) {
      // Some other span: drop the tag but remember it so its closing tag is
      // not mistaken for the end of a highlight.
      openSpans.push(false);
    } else if (/^<\/span\s*>$/i.test(tag)) {
      // Keep the closing tag only when it closes a highlight span.
      if (openSpans.pop() === true) {
        output += SPAN_CLOSE;
      }
    }
    // Every other tag is dropped.
  }

  output += escapeStrayAngle(html.slice(cursor));

  // Close any highlight the input left open so the markup stays balanced.
  for (const isHighlight of openSpans) {
    if (isHighlight) output += SPAN_CLOSE;
  }

  return output;
}
export function highlightMatch(
text: string,
term: string,
@@ -2,6 +2,7 @@ import { getAll, put, clear, remove } from './db';
import { jobs } from './jobs';
import { renderComponentMap } from './renderComponents';
import type { IndexItem, HydratedIndexItem, Job, JobContext } from './types';
import { processItems } from '../vectorSearch';
const META_STORE = 'meta';
const LOCK_KEY = 'bsq-indexer-lock';
@@ -94,6 +95,8 @@ export async function runIndexing(): Promise<void> {
let completedJobs = 0;
dispatchProgress(completedJobs, jobIds.length, true);
const allNewItems: HydratedIndexItem[] = [];
for (const jobId of jobIds) {
const job = jobs[jobId];
const lastRun = await getLastRunMeta(jobId);
@@ -136,6 +139,13 @@ export async function runIndexing(): Promise<void> {
await setStoredItems(merged);
await updateLastRunMeta(jobId);
// Add to our collection of new items for vector processing
const hydratedItems = merged.map(item => ({
...item,
renderComponent: renderComponentMap[job.renderComponentId]
}));
allNewItems.push(...hydratedItems);
console.debug(`%c[Indexer] ✅ ${job.label}: ${newItems.length} items indexed`, 'color: #00c46f');
} catch (err) {
console.debug(`%c[Indexer] ❌ ${job.label} failed:`, 'color: red');
@@ -146,6 +156,12 @@ export async function runIndexing(): Promise<void> {
dispatchProgress(completedJobs, jobIds.length, true);
}
// Process all new items through vector search
if (allNewItems.length > 0) {
console.debug(`%c[Indexer] Processing ${allNewItems.length} items for vector search...`, 'color: #4ea1ff');
await processItems(allNewItems);
}
stopHeartbeat();
dispatchProgress(completedJobs, jobIds.length, false);
}
@@ -228,14 +228,6 @@ export const jobs: Record<string, Job> = {
}
offset += limit;
// If we've processed 500 messages and haven't found any existing ones,
// assume these are all new (first run) and stop here to avoid overwhelming
if (offset >= 500 && consecutiveExisting === 0) {
console.debug('[Messages Job] Processed 500 new messages, stopping for now');
hasMore = false;
break;
}
} catch (error) {
console.error('Error fetching messages:', error);
break;
@@ -1,8 +1,9 @@
import Fuse, { type FuseResult } from 'fuse.js';
import { getStaticCommands, type StaticCommandItem } from './commands';
import { type DynamicContentItem, getDynamicItems } from './dynamicSearch';
import { getDynamicItems } from './dynamicSearch';
import type { CombinedResult } from './types';
import type { HydratedIndexItem } from './indexing/types';
import { searchVectors, type VectorSearchResult } from './vectorSearch';
// This function is likely no longer needed as items are pre-processed by the indexer
/* export function prepareDynamicItems(items: DynamicContentItem[]): DynamicContentItem[] {
@@ -47,9 +48,9 @@ export function createSearchIndexes() {
includeScore: true,
includeMatches: true,
threshold: 0.6,
minMatchCharLength: 1,
ignoreLocation: true,
useExtendedSearch: false
minMatchCharLength: 3,
distance: 50,
useExtendedSearch: false,
};
return {
@@ -141,18 +142,74 @@ export function searchDynamicItems(
});
}
export function performSearch(
export async function performSearch(
query: string,
commandsFuse: Fuse<StaticCommandItem>,
dynamicContentFuse: Fuse<HydratedIndexItem>,
commandIdToItemMap: Map<string, StaticCommandItem>,
dynamicIdToItemMap: Map<string, HydratedIndexItem>,
showRecentFirst: boolean // Pass sorting preference
): CombinedResult[] {
const commandResults = searchCommands(commandsFuse, query, commandIdToItemMap);
const dynamicResults = searchDynamicItems(dynamicContentFuse, query, dynamicIdToItemMap, 10, showRecentFirst);
showRecentFirst: boolean
): Promise<CombinedResult[]> {
const startTime = performance.now();
const results = [...commandResults, ...dynamicResults];
// Get all results first
const commandResults = searchCommands(commandsFuse, query, commandIdToItemMap);
const commandEndTime = performance.now();
const dynamicResults = searchDynamicItems(dynamicContentFuse, query, dynamicIdToItemMap, 10, showRecentFirst);
const fuseEndTime = performance.now();
// Get vector results in parallel
const vectorResults = await searchVectors(query, 10);
const vectorEndTime = performance.now();
console.log('Vector results:', vectorResults);
// Log timings
console.log(`Command search took ${commandEndTime - startTime} milliseconds`);
console.log(`Dynamic search took ${fuseEndTime - commandEndTime} milliseconds`);
console.log(`Vector search took ${vectorEndTime - fuseEndTime} milliseconds`);
// Create a map to store our final results, using ID as key to avoid duplicates
const resultMap = new Map<string, CombinedResult>();
// Add command results first (they keep their original scores)
commandResults.forEach(r => resultMap.set(r.id, r));
// Process dynamic results and vector results together
const seenIds = new Set<string>();
// Add dynamic results first
dynamicResults.forEach(r => {
seenIds.add(r.id);
const vectorMatch = vectorResults.find(v => v.object.id === r.id);
if (vectorMatch) {
// If we found it in both searches, combine the scores
resultMap.set(r.id, {
...r,
score: r.score + (vectorMatch.similarity * 0.6) // Boost exact matches
});
} else {
// If only in Fuse results, keep as is
resultMap.set(r.id, r);
}
});
// Now add any vector results we haven't seen yet
vectorResults.forEach(v => {
const id = v.object.id;
if (!seenIds.has(id)) {
// This is a semantic match that Fuse missed - add it with the vector similarity as score
resultMap.set(id, {
id,
type: 'dynamic' as const,
score: v.similarity * 0.9, // High base score for semantic matches
item: v.object
});
}
});
// Convert to array and sort by score
const results = Array.from(resultMap.values());
results.sort((a, b) => b.score - a.score);
return results;
@@ -0,0 +1,86 @@
import { getEmbedding, EmbeddingIndex, initializeModel } from 'client-vector-search';
import type { HydratedIndexItem } from './indexing/types';
import type { SearchResult } from 'client-vector-search';
let vectorIndex: EmbeddingIndex | null = null;
/** Initialisation currently in progress, shared by concurrent callers; `null` when idle. */
let vectorInitInFlight: Promise<void> | null = null;

/**
 * Loads the model, builds an index from whatever is stored in IndexedDB and
 * publishes it as `vectorIndex`. On any failure an empty index is published
 * instead, and the error is logged rather than discarded.
 */
async function loadVectorIndex(): Promise<void> {
  try {
    await initializeModel();

    // Build on a local instance and publish it only once fully loaded, so
    // other code never observes a half-populated index.
    const index = new EmbeddingIndex([]);
    const stored = await index.getAllObjectsFromIndexedDB();
    stored.forEach(item => index.add(item));
    vectorIndex = index;

    if (stored.length > 0) {
      console.debug('Vector index loaded from IndexedDB');
    }
  } catch (e) {
    console.debug('Creating new vector index', e);
    vectorIndex = new EmbeddingIndex([]);
  }
}

/**
 * Initialises the embedding model and the in-memory vector index.
 *
 * Calls that overlap share a single initialisation. Without this, every caller
 * arriving while the model is still loading would start its own load and
 * replace (or double-fill) the index. Once an initialisation has finished, a
 * later call rebuilds the index from IndexedDB again.
 */
export async function initVectorSearch() {
  if (!vectorInitInFlight) {
    vectorInitInFlight = loadVectorIndex().finally(() => {
      vectorInitInFlight = null;
    });
  }
  await vectorInitInFlight;
}
/**
 * Produces a copy of `item` with an `embedding` computed from its searchable text.
 *
 * The embedded text is the item's text, content, category, author and subject,
 * space-separated, with empty or missing parts left out.
 *
 * @param item - The index item to embed.
 * @returns A shallow copy of `item` plus its `embedding`.
 */
export async function vectorizeItem(item: HydratedIndexItem): Promise<HydratedIndexItem & { embedding: number[] }> {
  const { text, content, category, metadata } = item;
  const parts = [text, content, category, metadata?.author, metadata?.subject];
  const textToEmbed = parts.filter(part => Boolean(part)).join(' ');

  return { ...item, embedding: await getEmbedding(textToEmbed) };
}
/**
 * Embeds every item that is not yet in the vector index, adds it to the index
 * and persists the index to IndexedDB after each batch.
 *
 * Items are de-duplicated by `id` within the call (first occurrence wins).
 * The "already indexed?" check runs before anything is added, so without this
 * two entries sharing an id would both pass it and both be indexed.
 *
 * @param items - Candidate items; ones already indexed are skipped.
 * @throws If the vector index is still unavailable after initialisation.
 */
export async function processItems(items: HydratedIndexItem[]) {
  if (!vectorIndex) await initVectorSearch();
  const index = vectorIndex;
  if (!index) {
    throw new Error('Vector index is not initialised');
  }

  const seenIds = new Set<string>();
  const unprocessedItems = items.filter(item => {
    if (seenIds.has(item.id)) return false;
    seenIds.add(item.id);
    try {
      return !index.get({ id: item.id });
    } catch {
      // Treat a failed lookup as "not indexed yet".
      return true;
    }
  });

  if (unprocessedItems.length === 0) {
    console.debug('No new items to vectorize');
    return;
  }

  console.debug(`Vectorizing ${unprocessedItems.length} new items...`);

  // Process in batches to avoid UI freeze
  const BATCH_SIZE = 5;
  for (let i = 0; i < unprocessedItems.length; i += BATCH_SIZE) {
    const batch = unprocessedItems.slice(i, i + BATCH_SIZE);
    const vectorized = await Promise.all(batch.map(vectorizeItem));

    for (const item of vectorized) {
      index.add(item);
    }

    // Save periodically to avoid losing progress
    await index.saveIndex('indexedDB');

    // Log progress
    console.debug(`Vectorized ${Math.min(i + BATCH_SIZE, unprocessedItems.length)}/${unprocessedItems.length} items`);
  }
}
/**
 * A vector search hit whose `object` is narrowed from the library's `any` to
 * an index item carrying its embedding.
 */
export interface VectorSearchResult extends SearchResult {
object: HydratedIndexItem & { embedding: number[] };
}
/**
 * Runs a semantic search for `query` against the vectors stored in IndexedDB.
 *
 * @param query - Free-text query. A blank query returns no results instead of
 *   being embedded, because the nearest neighbours of an empty string are not
 *   meaningful matches.
 * @param topK - Maximum number of results to return.
 * @returns Matches ordered by the index's similarity ranking.
 */
export async function searchVectors(query: string, topK: number = 10): Promise<VectorSearchResult[]> {
  if (!query.trim()) return [];

  if (!vectorIndex) await initVectorSearch();

  const queryEmbedding = await getEmbedding(query);
  const results = await vectorIndex!.search(queryEmbedding, {
    topK,
    useStorage: 'indexedDB'
  });

  return results as VectorSearchResult[];
}