diff --git a/package.json b/package.json index 8ac05db3..c011f0ba 100644 --- a/package.json +++ b/package.json @@ -75,6 +75,7 @@ "@uiw/codemirror-extensions-color": "^4.23.10", "@uiw/codemirror-theme-github": "^4.23.10", "autoprefixer": "^10.4.21", + "client-vector-search": "^0.2.0", "codemirror": "^6.0.1", "color": "^5.0.0", "dompurify": "^3.2.4", diff --git a/src/plugins/built-in/globalSearch/SearchBar.svelte b/src/plugins/built-in/globalSearch/SearchBar.svelte index c4330e9d..a9838714 100644 --- a/src/plugins/built-in/globalSearch/SearchBar.svelte +++ b/src/plugins/built-in/globalSearch/SearchBar.svelte @@ -6,7 +6,7 @@ import { type StaticCommandItem } from './commands'; import type { CombinedResult } from './types'; import { createSearchIndexes, performSearch as doSearch } from './searchUtils'; - import { highlightMatch, highlightSnippet } from './highlightUtils'; + import { highlightMatch, highlightSnippet, stripHtmlButKeepHighlights } from './highlightUtils'; import Fuse from 'fuse.js'; import Calculator from './Calculator.svelte'; import { actionMap } from './indexing/actions'; @@ -105,14 +105,14 @@ }; }); - const performSearch = () => { + const performSearch = async () => { isLoading = true; selectedIndex = 0; const term = searchTerm.trim().toLowerCase(); if (commandsFuse && dynamicContentFuse) { - combinedResults = doSearch( + combinedResults = await doSearch( term, commandsFuse, dynamicContentFuse, @@ -288,8 +288,9 @@ onclick={() => executeItemAction(dynamicItem)} >
+
{dynamicItem.metadata?.icon || '\ue924'}
- {@html highlightMatch(dynamicItem.text, searchTerm, result.matches)} + {@html stripHtmlButKeepHighlights(highlightMatch(dynamicItem.text, searchTerm, result.matches))} {dynamicItem.category} @@ -297,7 +298,7 @@
{#if dynamicItem.content}
- {@html highlightSnippet(dynamicItem.content, searchTerm, result.matches)} + {@html stripHtmlButKeepHighlights(highlightSnippet(dynamicItem.content, searchTerm, result.matches))}
{/if} diff --git a/src/plugins/built-in/globalSearch/client-vector-search-docs.md b/src/plugins/built-in/globalSearch/client-vector-search-docs.md new file mode 100644 index 00000000..97fab159 --- /dev/null +++ b/src/plugins/built-in/globalSearch/client-vector-search-docs.md @@ -0,0 +1,597 @@ +# client-vector-search + +A client side vector search library that can embed, search, and cache. Works on the browser and server side. + +It outperforms OpenAI's text-embedding-ada-002 and is way faster than Pinecone and other VectorDBs. + +I'm the founder of [searchbase.app](https://searchbase.app) and we needed this for our product and customers. We'll be using this library in production. You can be sure it'll be maintained and improved. + +- Embed documents using transformers by default: gte-small (~30mb). +- Calculate cosine similarity between embeddings. +- Create an index and search on the client side +- Cache vectors with browser caching support. + +Lots of improvements are coming! + +## Roadmap + +Our goal is to build a super simple, fast vector search that works with a couple hundred to a few thousand vectors. ~1k vectors per user covers 99% of the use cases. + +We'll initially keep things super simple and sub-100ms. + +### TODOs +- [ ] add HNSW index that works on node and browser env, don't rely on hnsw binder libs +- [ ] add a proper testing suite and ci/cd for the lib + - [ ] simple health tests + - [ ] mock the @xenova/transformers for jest, it's not happy with it + - [ ] performance tests, recall, memory usage, cpu usage etc. + + +## Installation + +```bash +npm i client-vector-search +``` + + +## Quickstart + +This library provides a plug-and-play solution for embedding and vector search. It's designed to be easy to use, efficient, and versatile. 
Here's a quick start guide: + + +```ts + import { getEmbedding, EmbeddingIndex } from 'client-vector-search'; + + // getEmbedding is an async function, so you need to use 'await' or '.then()' to get the result + const embedding = await getEmbedding("Apple"); // Returns embedding as number[] + + // Each object should have an 'embedding' property of type number[] + const initialObjects = [ + { id: 1, name: "Apple", embedding: embedding }, + { id: 2, name: "Banana", embedding: await getEmbedding("Banana") }, + { id: 3, name: "Cheddar", embedding: await getEmbedding("Cheddar")}, + { id: 4, name: "Space", embedding: await getEmbedding("Space")}, + { id: 5, name: "database", embedding: await getEmbedding("database")}, + ]; + const index = new EmbeddingIndex(initialObjects); // Creates an index + + // The query should be an embedding of type number[] + const queryEmbedding = await getEmbedding('Fruit'); // Query embedding + const results = await index.search(queryEmbedding, { topK: 5 }); // Returns top similar objects + + // specify the storage type + await index.saveIndex('indexedDB'); + const results = await index.search([1, 2, 3], { + topK: 5, + useStorage: 'indexedDB', + // storageOptions: { // use only if you overrode the defaults + // indexedDBName: 'clientVectorDB', + // indexedDBObjectStoreName: 'ClientEmbeddingStore', + // }, + }); + + console.log(results); + + await index.deleteIndexedDB(); // if you overrode default, specify db name +``` + +## Trouble-shooting + +### NextJS +To use it inside NextJS projects you'll need to update the `next.config.js` file to include the following: + +```js +module.exports = { + // Override the default webpack configuration + webpack: (config) => { + // See https://webpack.js.org/configuration/resolve/#resolvealias + config.resolve.alias = { + ...config.resolve.alias, + sharp$: false, + "onnxruntime-node$": false, + }; + return config; + }, +}; +``` + +#### Model load after page is loaded + +You can initialize the model before 
using it to generate embeddings. This will ensure that the model is loaded before you use it and provide a better UX. + +```js +import { initializeModel } from "client-vector-search" +... + useEffect(() => { + try { + initializeModel(); + } catch (e) { + console.log(e); + } + }, []); +``` + +## Usage Guide + +This guide provides a step-by-step walkthrough of the library's main features. It covers everything from generating embeddings for a string to performing operations on the index such as adding, updating, and removing objects. It also includes instructions on how to save the index to a database and perform search operations within it. + +Until we have a reference documentation, you can find all the methods and their usage in this guide. Each step is accompanied by a code snippet to illustrate the usage of the method in question. Make sure to follow along and try out the examples in your own environment to get a better understanding of how everything works. + +Let's get started! + +### Step 1: Generate Embeddings for String +Generate embeddings for a given string using the `getEmbedding` method. + +```ts +const embedding = await getEmbedding("Apple"); // Returns embedding as number[] +``` +> **Note**: `getEmbedding` is asynchronous; make sure to use `await`. + +--- + +### Step 2: Calculate Cosine Similarity +Calculate the cosine similarity between two embeddings. + +```ts +const similarity = cosineSimilarity(embedding1, embedding2, 6); +``` +> **Note**: Both embeddings should be of the same length. + +--- + +### Step 3: Create an Index +Create an index with an initial array of objects. Each object must have an 'embedding' property. + +```ts +const initialObjects = [...]; +const index = new EmbeddingIndex(initialObjects); +``` + +--- + +### Step 4: Add to Index +Add an object to the index. 
+ +```ts +const objectToAdd = { id: 6, name: 'Cat', embedding: await getEmbedding('Cat') }; +index.add(objectToAdd); +``` + +--- + +### Step 5: Update Index +Update an existing object in the index. + +```ts +const vectorToUpdate = { id: 6, name: 'Dog', embedding: await getEmbedding('Dog') }; +index.update({ id: 6 }, vectorToUpdate); +``` + +--- + +### Step 6: Remove from Index +Remove an object from the index. + +```ts +index.remove({ id: 6 }); +``` + +--- + +### Step 7: Retrieve from Index +Retrieve an object from the index. + +```ts +const vector = index.get({ id: 1 }); +``` + +--- + +### Step 8: Search the Index +Search the index with a query embedding. + +```ts +const queryEmbedding = await getEmbedding('Fruit'); +const results = await index.search(queryEmbedding, { topK: 5 }); +``` + +--- + +### Step 9: Print the Index +Print the entire index to the console. + +```ts +index.printIndex(); +``` + +--- + +### Step 10: Save Index to IndexedDB (for browser) +Save the index to a persistent IndexedDB database. + +```ts +await index.saveIndex("indexedDB", { DBName: "clientVectorDB", objectStoreName:"ClientEmbeddingStore"}) +``` + +--- + +### Important: Search in indexedDB +Perform a search operation in the IndexedDB. + +```ts +const results = await index.search(queryEmbedding, { + topK: 5, + useStorage: "indexedDB", + storageOptions: { // only if you want to override the default options, defaults are below + indexedDBName: 'clientVectorDB', + indexedDBObjectStoreName: 'ClientEmbeddingStore' + } +}); +``` +--- + +### Delete Database +To delete an entire database. + +```ts +await IndexedDbManager.deleteIndexedDB("clientVectorDB"); +``` + +--- + +### Delete Object Store +To delete an object store from a database. + +```ts +await IndexedDbManager.deleteIndexedDBObjectStore("clientVectorDB", "ClientEmbeddingStore"); +``` + +--- + +### Retrieve All Objects +To retrieve all objects from a specific object store. 
+ +```ts +const allObjects = await IndexedDbManager.getAllObjectsFromIndexedDB("clientVectorDB", "ClientEmbeddingStore"); +``` + + + + +# THE MAIN INDEX.TS FILE THAT YOU ARE IMPORTING FROM +```index.ts +const DEFAULT_TOP_K = 3; + +interface Filter { + [key: string]: any; +} + +import Cache from './cache'; +import { IndexedDbManager } from './indexedDB'; +import { cosineSimilarity } from './utils'; +export { ExperimentalHNSWIndex } from './hnsw'; + +// uncomment if you want to test indexedDB implementation in node env for faster dev cycle +// import { IDBFactory } from 'fake-indexeddb'; +// const indexedDB = new IDBFactory(); + +export interface SearchResult { + similarity: number; + object: any; +} + +type StorageOptions = 'indexedDB' | 'localStorage' | 'none'; + +/** + * Interface for search options in the EmbeddingIndex class. + * topK: The number of top similar items to return. + * filter: An optional filter to apply to the objects before searching. + * useStorage: A flag to indicate whether to use storage options like indexedDB or localStorage. 
+ */ +interface SearchOptions { + topK?: number; + filter?: Filter; + useStorage?: StorageOptions; + storageOptions?: { indexedDBName: string; indexedDBObjectStoreName: string }; // TODO: generalize it to localStorage as well +} + +const cacheInstance = Cache.getInstance(); + +let pipe: any; +let currentModel: string; + +export const initializeModel = async ( + model: string = 'Xenova/gte-small', +): Promise => { + if (model !== currentModel) { + const transformersModule = await import('@xenova/transformers'); + const pipeline = transformersModule.pipeline; + pipe = await pipeline('feature-extraction', model); + currentModel = model; + } +}; + +export const getEmbedding = async ( + text: string, + precision: number = 7, + options = { pooling: 'mean', normalize: false }, + model = 'Xenova/gte-small', +): Promise => { + const cachedEmbedding = cacheInstance.get(text); + if (cachedEmbedding) { + return Promise.resolve(cachedEmbedding); + } + + if (model !== currentModel) { + await initializeModel(model); + } + + const output = await pipe(text, options); + const roundedOutput = Array.from(output.data as number[]).map( + (value: number) => parseFloat(value.toFixed(precision)), + ); + cacheInstance.set(text, roundedOutput); + return Array.from(roundedOutput); +}; + +export class EmbeddingIndex { + private objects: Filter[]; + private keys: string[]; + + constructor(initialObjects?: Filter[]) { + // TODO: add support for options while creating index such as {... 
indexedDB: true, ...} + this.objects = []; + this.keys = []; + if (initialObjects && initialObjects.length > 0) { + initialObjects.forEach((obj) => this.validateAndAdd(obj)); + if (initialObjects[0]) { + this.keys = Object.keys(initialObjects[0]); + } + } + } + + private findVectorIndex(filter: Filter): number { + return this.objects.findIndex((object) => + Object.keys(filter).every((key) => object[key] === filter[key]), + ); + } + + private validateAndAdd(obj: Filter) { + if (!Array.isArray(obj.embedding) || obj.embedding.some(isNaN)) { + throw new Error( + 'Object must have an embedding property of type number[]', + ); + } + if (this.keys.length === 0) { + this.keys = Object.keys(obj); + } else if (!this.keys.every((key) => key in obj)) { + throw new Error( + 'Object must have the same properties as the initial objects', + ); + } + this.objects.push(obj); + } + + add(obj: Filter) { + this.validateAndAdd(obj); + } + + // Method to update an existing vector in the index + update(filter: Filter, vector: Filter) { + const index = this.findVectorIndex(filter); + if (index === -1) { + throw new Error('Vector not found'); + } + if (vector.hasOwnProperty('embedding')) { + // Validate and add the new vector + this.validateAndAdd(vector); + } + // Replace the old vector with the new one + this.objects[index] = Object.assign(this.objects[index] as Filter, vector); + } + + // Method to remove a vector from the index + remove(filter: Filter) { + const index = this.findVectorIndex(filter); + if (index === -1) { + throw new Error('Vector not found'); + } + // Remove the vector from the index + this.objects.splice(index, 1); + } + + // Method to remove multiple vectors from the index + removeBatch(filters: Filter[]) { + filters.forEach((filter) => { + const index = this.findVectorIndex(filter); + if (index !== -1) { + // Remove the vector from the index + this.objects.splice(index, 1); + } + }); + } + + // Method to retrieve a vector from the index + get(filter: Filter) { + 
const vector = this.objects[this.findVectorIndex(filter)]; + return vector || null; + } + + size(): number { + // Returns the size of the index + return this.objects.length; + } + + clear() { + this.objects = []; + } + + async search( + queryEmbedding: number[], + options: SearchOptions = { + topK: 3, + useStorage: 'none', + storageOptions: { + indexedDBName: 'clientVectorDB', + indexedDBObjectStoreName: 'ClientEmbeddingStore', + }, + }, + ): Promise { + const topK = options.topK || DEFAULT_TOP_K; + const filter = options.filter || {}; + const useStorage = options.useStorage || 'none'; + + if (useStorage === 'indexedDB') { + const DBname = options.storageOptions?.indexedDBName || 'clientVectorDB'; + const objectStoreName = + options.storageOptions?.indexedDBObjectStoreName || + 'ClientEmbeddingStore'; + + if (typeof indexedDB === 'undefined') { + console.error('IndexedDB is not supported'); + throw new Error('IndexedDB is not supported'); + } + const results = await this.loadAndSearchFromIndexedDB( + DBname, + objectStoreName, + queryEmbedding, + topK, + filter, + ); + return results; + } else { + // Compute similarities + const similarities = this.objects + .filter((object) => + Object.keys(filter).every((key) => object[key] === filter[key]), + ) + .map((obj) => ({ + similarity: cosineSimilarity(queryEmbedding, obj.embedding), + object: obj, + })); + + // Sort by similarity and return topK results + return similarities + .sort((a, b) => b.similarity - a.similarity) + .slice(0, topK); + } + } + + printIndex() { + console.log('Index Content:'); + this.objects.forEach((obj, idx) => { + console.log(`Item ${idx + 1}:`, obj); + }); + } + + async saveIndex( + storageType: string, + options: { DBName: string; objectStoreName: string } = { + DBName: 'clientVectorDB', + objectStoreName: 'ClientEmbeddingStore', + }, + ) { + if (storageType === 'indexedDB') { + await this.saveToIndexedDB(options.DBName, options.objectStoreName); + } else { + throw new Error( + `Unsupported 
storage type: ${storageType} \n Supported storage types: "indexedDB"`, + ); + } + } + + async saveToIndexedDB( + DBname: string = 'clientVectorDB', + objectStoreName: string = 'ClientEmbeddingStore', + ): Promise { + if (typeof indexedDB === 'undefined') { + console.error('IndexedDB is not defined'); + throw new Error('IndexedDB is not supported'); + } + + if (!this.objects || this.objects.length === 0) { + throw new Error('Index is empty. Nothing to save'); + } + + try { + const db = await IndexedDbManager.create(DBname, objectStoreName); + await db.addToIndexedDB(this.objects); + console.log( + `Index saved to database '${DBname}' object store '${objectStoreName}'`, + ); + } catch (error) { + console.error('Error saving index to database:', error); + throw new Error('Error saving index to database'); + } + } + + async loadAndSearchFromIndexedDB( + DBname: string = 'clientVectorDB', + objectStoreName: string = 'ClientEmbeddingStore', + queryEmbedding: number[], + topK: number, + filter: { [key: string]: any }, + ): Promise { + const db = await IndexedDbManager.create(DBname, objectStoreName); + const generator = db.dbGenerator(); + const results: { similarity: number; object: any }[] = []; + + for await (const record of generator) { + if (Object.keys(filter).every((key) => record[key] === filter[key])) { + const similarity = cosineSimilarity(queryEmbedding, record.embedding); + results.push({ similarity, object: record }); + } + } + results.sort((a, b) => b.similarity - a.similarity); + return results.slice(0, topK); + } + + async deleteIndexedDB(DBname: string = 'clientVectorDB'): Promise { + if (typeof indexedDB === 'undefined') { + console.error('IndexedDB is not defined'); + throw new Error('IndexedDB is not supported'); + } + return new Promise((resolve, reject) => { + const request = indexedDB.deleteDatabase(DBname); + + request.onsuccess = () => { + console.log(`Database '${DBname}' deleted`); + resolve(); + }; + request.onerror = (event) => { + 
console.error('Failed to delete database', event); + reject(new Error('Failed to delete database')); + }; + }); + } + + async deleteIndexedDBObjectStore( + DBname: string = 'clientVectorDB', + objectStoreName: string = 'ClientEmbeddingStore', + ): Promise { + const db = await IndexedDbManager.create(DBname, objectStoreName); + + try { + await db.deleteIndexedDBObjectStoreFromDB(DBname, objectStoreName); + console.log( + `Object store '${objectStoreName}' deleted from database '${DBname}'`, + ); + } catch (error) { + console.error('Error deleting object store:', error); + throw new Error('Error deleting object store'); + } + } + + async getAllObjectsFromIndexedDB( + DBname: string = 'clientVectorDB', + objectStoreName: string = 'ClientEmbeddingStore', + ): Promise { + const db = await IndexedDbManager.create(DBname, objectStoreName); + const objects: any[] = []; + for await (const record of db.dbGenerator()) { + objects.push(record); + } + return objects; + } +} +``` \ No newline at end of file diff --git a/src/plugins/built-in/globalSearch/commands.ts b/src/plugins/built-in/globalSearch/commands.ts index 618dc6a9..f5b6ae75 100644 --- a/src/plugins/built-in/globalSearch/commands.ts +++ b/src/plugins/built-in/globalSearch/commands.ts @@ -28,7 +28,7 @@ const staticCommands: StaticCommandItem[] = [ window.location.hash = '?page=/home'; loadHomePage(); }, - priority: 10 + priority: 4 }, { id: 'messages', @@ -40,7 +40,7 @@ const staticCommands: StaticCommandItem[] = [ action: () => { window.location.hash = '?page=/messages'; }, - priority: 10 + priority: 4 }, { id: 'timetable', @@ -52,7 +52,7 @@ const staticCommands: StaticCommandItem[] = [ action: () => { window.location.hash = '?page=/timetable'; }, - priority: 10 + priority: 4 }, { id: 'assessments', @@ -64,7 +64,7 @@ const staticCommands: StaticCommandItem[] = [ action: () => { window.location.hash = '?page=/assessments'; }, - priority: 10 + priority: 4 }, { id: 'toggle-dark-mode', @@ -72,7 +72,7 @@ const 
staticCommands: StaticCommandItem[] = [ category: 'action', text: 'Toggle Dark Mode', action: () => settingsState.DarkMode = !settingsState.DarkMode, - priority: 5, + priority: 2, keywords: ['theme', 'appearance'] } ]; diff --git a/src/plugins/built-in/globalSearch/components/AssessmentItem.svelte b/src/plugins/built-in/globalSearch/components/AssessmentItem.svelte index 15643118..1d5dde71 100644 --- a/src/plugins/built-in/globalSearch/components/AssessmentItem.svelte +++ b/src/plugins/built-in/globalSearch/components/AssessmentItem.svelte @@ -1,12 +1,13 @@