diff --git a/src/plugins/built-in/globalSearch/lazy.ts b/src/plugins/built-in/globalSearch/lazy.ts index a8ae19eb..10e5222f 100644 --- a/src/plugins/built-in/globalSearch/lazy.ts +++ b/src/plugins/built-in/globalSearch/lazy.ts @@ -6,6 +6,7 @@ import { hotkeySetting, } from "../../core/settingsHelpers"; import styles from "./src/core/styles.css?inline"; +import { resetSearchIndexes } from "./src/indexing/resetIndexes"; // Platform-aware default hotkey const getDefaultHotkey = () => { @@ -34,78 +35,33 @@ const settings = defineSettings({ title: "Index on Page Load", description: "Run content indexing when SEQTA loads", }), + passiveIndexing: booleanSetting({ + default: true, + title: "Index Browsed Content", + description: + "Capture safe text from SEQTA pages you visit so they're searchable. Sensitive routes (settings, files, login) are always excluded.", + }), resetIndex: buttonSetting({ title: "Reset Index", description: "Reset the search index and storage", trigger: async () => { - const confirmed = confirm("Are you sure you want to reset the search index and storage?"); + const confirmed = confirm( + "Are you sure you want to reset the search index and storage?", + ); + if (!confirmed) return; - if (confirmed) { - try { - // Dynamically import modules to avoid loading heavy dependencies - const { VectorWorkerManager } = await import("./src/indexing/worker/vectorWorkerManager"); - const { resetDatabase } = await import("./src/indexing/db"); - - // Reset vector worker first - try { - const workerManager = VectorWorkerManager.getInstance(); - await workerManager.resetWorker(); - console.log("Vector worker reset successfully"); - } catch (e) { - console.warn("Failed to reset vector worker:", e); - } - - // Close all database connections properly before deletion - try { - await resetDatabase(); - console.log("betterseqta-index database closed and reset"); - } catch (e) { - console.warn("Failed to reset betterseqta-index database:", e); - } - - // Wait a bit for 
connections to fully close - await new Promise(resolve => setTimeout(resolve, 100)); - - // Delete embeddiaDB (vector search database) - const deleteDb = (dbName: string) => { - return new Promise((resolve, reject) => { - const req = indexedDB.deleteDatabase(dbName); - req.onsuccess = () => { - console.log(`Successfully deleted database: ${dbName}`); - resolve(); - }; - req.onerror = () => { - console.error(`Error deleting database ${dbName}:`, req.error); - reject(req.error); - }; - req.onblocked = () => { - console.warn(`Database ${dbName} deletion blocked - connections still open`); - // Wait and retry once - setTimeout(() => { - const retryReq = indexedDB.deleteDatabase(dbName); - retryReq.onsuccess = () => { - console.log(`Successfully deleted database on retry: ${dbName}`); - resolve(); - }; - retryReq.onerror = () => reject(retryReq.error); - retryReq.onblocked = () => { - reject(new Error(`One database is open, failed to remove: ${dbName}. Please close other tabs and try again.`)); - }; - }, 500); - }; - }); - }; - - try { - await deleteDb("embeddiaDB"); - await deleteDb("betterseqta-index"); - alert("Search index and storage have been reset successfully."); - } catch (e) { - alert("Failed to reset one or more databases: " + String(e) + "\n\nTry closing other browser tabs and try again."); - } - } catch (e) { - alert("Failed to reset index: " + String(e)); - } + try { + // `resetSearchIndexes` is a tiny statically-imported helper: no + // dynamic chunks to chase, so the button keeps working even when + // the settings page has been open across an extension update. 
+ await resetSearchIndexes(); + alert("Search index and storage have been reset successfully."); + } catch (e) { + alert( + "Failed to reset index: " + + String(e) + + "\n\nTry closing other browser tabs and try again.", + ); } }, }), diff --git a/src/plugins/built-in/globalSearch/src/components/SearchBar.svelte b/src/plugins/built-in/globalSearch/src/components/SearchBar.svelte index 912622f0..67a75f59 100644 --- a/src/plugins/built-in/globalSearch/src/components/SearchBar.svelte +++ b/src/plugins/built-in/globalSearch/src/components/SearchBar.svelte @@ -48,6 +48,13 @@ let calculatorResult = $state(null); let resultsList = $state(); + // Monotonic counter so a slow async search (vector reranking) cannot + // overwrite results from a newer keystroke. Without this guard, the user + // observes results "flickering" — e.g. typing `world w` finds the assessment + // but `world wa` triggers a new search whose vector pass returns later than + // the `world w` pass and clobbers the more relevant matches. + let searchRequestId = 0; + const updateCalculatorState = (hasResult: string | null) => { calculatorResult = hasResult; }; @@ -166,20 +173,30 @@ }); const term = searchTerm.trim().toLowerCase(); - + const requestId = ++searchRequestId; + if (commandsFuse && dynamicContentFuse) { - combinedResults = await doSearch( - term, - commandsFuse, + const results = await doSearch( + term, + commandsFuse, commandIdToItemMap, dynamicContentFuse, dynamicIdToItemMap, true, // sortByRecent ); + + // Drop the result if the user has typed since this search started, or + // if the current term no longer matches what we searched for. This + // keeps the visible list anchored to the latest query. 
+ if (requestId !== searchRequestId) return; + if (searchTerm.trim().toLowerCase() !== term) return; + + combinedResults = results; } else { + if (requestId !== searchRequestId) return; combinedResults = []; } - + isLoading = false; }; diff --git a/src/plugins/built-in/globalSearch/src/components/items/GenericItem.svelte b/src/plugins/built-in/globalSearch/src/components/items/GenericItem.svelte new file mode 100644 index 00000000..7948ff7c --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/components/items/GenericItem.svelte @@ -0,0 +1,89 @@ + + + diff --git a/src/plugins/built-in/globalSearch/src/core/commands.ts b/src/plugins/built-in/globalSearch/src/core/commands.ts index c5382764..ce443141 100644 --- a/src/plugins/built-in/globalSearch/src/core/commands.ts +++ b/src/plugins/built-in/globalSearch/src/core/commands.ts @@ -1,6 +1,7 @@ import { settingsState } from "@/seqta/utils/listeners/SettingsState"; import { loadHomePage } from "@/seqta/utils/Loaders/LoadHomePage"; import { waitForElm } from "@/seqta/utils/waitForElm"; +import { getCurrentStudentId } from "../indexing/api"; export interface BaseCommandItem { id: string; @@ -23,13 +24,19 @@ async function getCurrentLesson() { const todayFormatted = formatDate(date); try { + const student = await getCurrentStudentId(); + if (typeof student !== "number") { + alert("Could not determine the active SEQTA student."); + return null; + } const response = await fetch(`${location.origin}/seqta/student/load/timetable?`, { method: "POST", + credentials: "include", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ from: todayFormatted, until: todayFormatted, - student: 69, + student, }), }); diff --git a/src/plugins/built-in/globalSearch/src/core/index.ts b/src/plugins/built-in/globalSearch/src/core/index.ts index 65f9f628..23b9801a 100644 --- a/src/plugins/built-in/globalSearch/src/core/index.ts +++ b/src/plugins/built-in/globalSearch/src/core/index.ts @@ -15,6 +15,10 @@ import { 
cleanupSearchBar, mountSearchBar } from "./mountSearchBar"; import { IndexedDbManager } from "embeddia"; import { VectorWorkerManager } from "../indexing/worker/vectorWorkerManager"; import { checkAndHandleUpdate } from "../utils/versionCheck"; +import { + getStoredPassiveItems, + installPassiveObserver, +} from "../indexing/passiveObserver"; // Platform-aware default hotkey const getDefaultHotkey = () => { @@ -43,6 +47,12 @@ const settings = defineSettings({ title: "Index on Page Load", description: "Run content indexing when SEQTA loads", }), + passiveIndexing: booleanSetting({ + default: true, + title: "Index Browsed Content", + description: + "Capture safe text from SEQTA pages you visit so they're searchable. Sensitive routes (settings, files, login) are always excluded.", + }), resetIndex: buttonSetting({ title: "Reset Index", description: "Reset the search index and storage", @@ -131,6 +141,9 @@ class GlobalSearchPlugin extends BasePlugin { @Setting(settings.runIndexingOnLoad) runIndexingOnLoad!: boolean; + @Setting(settings.passiveIndexing) + passiveIndexing!: boolean; + @Setting(settings.resetIndex) resetIndex!: () => void; } @@ -150,26 +163,35 @@ const globalSearchPlugin: Plugin = { run: async (api) => { const appRef = { current: null }; - // Check for extension updates and clear caches if needed - // Use a timeout to avoid blocking initialization - setTimeout(async () => { - try { - const wasUpdated = await checkAndHandleUpdate(); - if (wasUpdated) { - console.log("[Global Search] Extension updated - caches cleared"); - } - } catch (error: any) { - // Handle CSS preload errors and other failures gracefully - // These can happen in Firefox or when assets aren't available - if (error?.message?.includes("preload CSS") || - error?.message?.includes("MIME type") || - error?.message?.includes("NS_ERROR_CORRUPTED_CONTENT")) { - console.debug("[Global Search] Version check skipped due to asset loading restrictions:", error.message); - } else { - 
console.warn("[Global Search] Failed to check for updates:", error); - } + // Run the version check BEFORE we open any IndexedDB connections. + // On a normal load (no version change) this is just a string compare + // and a manifest read, so the cost is negligible. On a real update, + // we want the database wipe to complete before `IndexedDbManager` + // grabs a handle on `embeddiaDB`, otherwise the delete request comes + // back blocked. + try { + const wasUpdated = await checkAndHandleUpdate(); + if (wasUpdated) { + console.log( + "[Global Search] Extension updated — search index reset; the next indexing pass will repopulate.", + ); } - }, 100); + } catch (error: any) { + // Firefox sometimes refuses CSS preloads or asset reads; we never + // want this path to take the whole plugin down. + if ( + error?.message?.includes("preload CSS") || + error?.message?.includes("MIME type") || + error?.message?.includes("NS_ERROR_CORRUPTED_CONTENT") + ) { + console.debug( + "[Global Search] Version check skipped due to asset loading restrictions:", + error.message, + ); + } else { + console.warn("[Global Search] Failed to check for updates:", error); + } + } try { await IndexedDbManager.create("embeddiaDB", "embeddiaObjectStore", { @@ -210,6 +232,17 @@ const globalSearchPlugin: Plugin = { const workerManager = VectorWorkerManager.getInstance(); console.log("Streaming active:", workerManager.isStreamingActive()); }, + passiveItems: async () => { + const items = await getStoredPassiveItems(); + console.log(`Captured ${items.length} passive items`); + return items; + }, + runSelfTests: async () => { + const { runGlobalSearchSelfTests } = await import( + "../indexing/selfTests" + ); + return runGlobalSearchSelfTests(); + }, checkIndexedDBSize: async () => { try { const estimate = await navigator.storage.estimate(); @@ -232,6 +265,14 @@ const globalSearchPlugin: Plugin = { } }; + if (api.settings.passiveIndexing) { + try { + installPassiveObserver(); + } catch (error) { + 
console.warn("[Global Search] Passive observer install failed:", error); + } + } + if (api.settings.runIndexingOnLoad) { setTimeout(async () => { await runIndexing(); diff --git a/src/plugins/built-in/globalSearch/src/core/mountSearchBar.ts b/src/plugins/built-in/globalSearch/src/core/mountSearchBar.ts index 8a20a1a9..03c60751 100644 --- a/src/plugins/built-in/globalSearch/src/core/mountSearchBar.ts +++ b/src/plugins/built-in/globalSearch/src/core/mountSearchBar.ts @@ -18,62 +18,68 @@ export function mountSearchBar( let currentHotkey = isValidHotkey(api.settings.searchHotkey) ? api.settings.searchHotkey : "ctrl+k"; let hotkeyDisplay = formatHotkeyForDisplay(currentHotkey); + // Search trigger + progress UI live in one wrapper so the auto-margin + // pushes the whole group to the left edge of the topbar instead of + // stranding the progress text on the far right of the screen. + const searchWrapper = document.createElement("div"); + searchWrapper.className = "search-trigger-wrapper"; + + // Anchor lets us absolutely position the progress bar directly beneath + // the search button without disturbing the topbar's vertical rhythm. 
+ const searchAnchor = document.createElement("div"); + searchAnchor.className = "search-trigger-anchor"; + const searchButton = document.createElement("div"); searchButton.className = "search-trigger"; - - // Create progress indicator container - const progressContainer = document.createElement("div"); - progressContainer.className = "search-progress-container"; - progressContainer.style.cssText = "display: flex; align-items: center; gap: 8px; margin-left: 8px; min-width: 120px;"; - - // Create progress bar + const progressBarWrapper = document.createElement("div"); progressBarWrapper.className = "search-progress-bar-wrapper"; - progressBarWrapper.style.cssText = "flex: 1; height: 4px; background: rgba(0, 0, 0, 0.1); border-radius: 2px; overflow: hidden; display: none;"; - + const progressBar = document.createElement("div"); progressBar.className = "search-progress-bar"; - progressBar.style.cssText = "height: 100%; background: linear-gradient(90deg, #3b82f6, #2563eb, #3b82f6); transition: width 0.3s ease-out; width: 0%; position: relative;"; - - // Add shimmer effect - const shimmer = document.createElement("div"); - shimmer.style.cssText = "position: absolute; inset: 0; background: linear-gradient(90deg, transparent, rgba(255,255,255,0.3), transparent); animation: shimmer 2s infinite;"; - progressBar.appendChild(shimmer); progressBarWrapper.appendChild(progressBar); - - // Create progress text - const progressText = document.createElement("span"); + + // Use a block-level
<div> so the label reliably participates in flex + // layout. A <span> defaults to `display: inline`, which silently ignores + // `max-width`, `overflow`, and `text-overflow: ellipsis`, and was the + // reason the label appeared blank when the bar was visible. + const progressText = document.createElement("div"); progressText.className = "search-progress-text"; - progressText.style.cssText = "font-size: 11px; color: #666; white-space: nowrap; display: none;"; - - progressContainer.appendChild(progressBarWrapper); - progressContainer.appendChild(progressText); - + progressText.setAttribute("aria-live", "polite"); + + searchAnchor.appendChild(searchButton); + searchAnchor.appendChild(progressBarWrapper); + searchWrapper.appendChild(searchAnchor); + searchWrapper.appendChild(progressText); + // Indexing state let isIndexing = false; let completedJobs = 0; let totalJobs = 0; let indexingStatus: string | null = null; - + const updateProgressDisplay = () => { if (isIndexing && totalJobs > 0) { const percentage = Math.round((completedJobs / totalJobs) * 100); progressBar.style.width = `${Math.max(2, percentage)}%`; - progressBarWrapper.style.display = "block"; - + progressBarWrapper.classList.add("is-active"); + if (indexingStatus) { - progressText.textContent = indexingStatus.length > 20 ? indexingStatus.substring(0, 20) + "..." : indexingStatus; - progressText.style.display = "block"; + const statusText = + indexingStatus.length > 28 + ? 
indexingStatus.substring(0, 28) + "…" + : indexingStatus; + progressText.textContent = `${statusText} · ${percentage}%`; } else { - progressText.textContent = `${completedJobs}/${totalJobs} (${percentage}%)`; - progressText.style.display = "block"; + progressText.textContent = `Indexing ${completedJobs}/${totalJobs} (${percentage}%)`; } + progressText.classList.add("is-active"); } else { - progressBarWrapper.style.display = "none"; - progressText.style.display = "none"; + progressBarWrapper.classList.remove("is-active"); + progressText.classList.remove("is-active"); } }; - + // Listen for indexing progress events const progressHandler = (event: CustomEvent) => { const { completed, total, indexing, status } = event.detail; @@ -83,7 +89,7 @@ export function mountSearchBar( indexingStatus = status || null; updateProgressDisplay(); }; - + window.addEventListener('indexing-progress', progressHandler as EventListener); appRef.progressHandler = progressHandler; @@ -99,8 +105,7 @@ export function mountSearchBar( }; updateSearchButtonDisplay(); - titleElement.appendChild(searchButton); - titleElement.appendChild(progressContainer); + titleElement.appendChild(searchWrapper); // Listen for hotkey setting changes const handleStorageChange = (changes: any, area: string) => { @@ -155,18 +160,17 @@ export function cleanupSearchBar(appRef: { current: any; storageChangeHandler?: appRef.progressHandler = null; } - // Remove search trigger button - const searchTrigger = document.querySelector(".search-trigger"); - if (searchTrigger) { - searchTrigger.remove(); - } - - // Remove progress container - const progressContainer = document.querySelector(".search-progress-container"); - if (progressContainer) { - progressContainer.remove(); + // Remove search trigger wrapper (which contains the button and progress UI) + const searchWrapper = document.querySelector(".search-trigger-wrapper"); + if (searchWrapper) { + searchWrapper.remove(); } + // Defensive cleanup for older mounts that may 
have left the trigger or + // progress container as direct children of the topbar. + document.querySelector(".search-trigger")?.remove(); + document.querySelector(".search-progress-container")?.remove(); + // Remove search root const searchRoot = document.querySelector("div[data-search-root]"); if (searchRoot) { diff --git a/src/plugins/built-in/globalSearch/src/core/styles.css b/src/plugins/built-in/globalSearch/src/core/styles.css index 1c50394e..ea2eaade 100644 --- a/src/plugins/built-in/globalSearch/src/core/styles.css +++ b/src/plugins/built-in/globalSearch/src/core/styles.css @@ -1,13 +1,36 @@ +/* + * Wrapper that owns the auto-margin so the whole search-trigger-and-progress + * group sits at the left of the SEQTA topbar. Previously, only the + * `.search-trigger` had `margin-right: auto`, which pushed the progress text + * all the way to the far right of the screen. + */ +.search-trigger-wrapper { + display: flex !important; + align-items: center; + gap: 12px; + margin-left: 10px; + margin-right: auto !important; + /* Allow the bar's bottom portion to peek out below the wrapper without + getting clipped by the topbar's flex line. 
*/ + overflow: visible; +} + +.search-trigger-anchor { + position: relative; + display: inline-flex; + isolation: isolate; /* new stacking context so the bar's z-index is local */ +} + .search-trigger { + position: relative; + z-index: 2; /* sits above the progress bar so the bar tucks under */ display: flex; align-items: center; justify-content: center; height: 32px; - margin-left: 10px; border-radius: 8px; cursor: pointer; transition: all 0.2s ease; - margin-right: auto !important; padding: 3px 12px; box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2); backdrop-filter: blur(4px); @@ -83,29 +106,45 @@ animation: shimmer 2s infinite; } -/* Progress indicator next to search trigger */ -.search-progress-container { - display: flex; - align-items: center; - gap: 8px; - margin-left: 8px; - min-width: 120px; - max-width: 200px; - height: 32px; +/* + * Progress bar that hugs the bottom of the search button like the next + * card peeking from a small stack. The bar is intentionally inset on the + * sides and slightly shorter than the button so it reads as a stacked + * shadow card rather than a separate, floating element. + */ +.search-progress-bar-wrapper { + position: absolute; + left: 6px; + right: 6px; + /* + * `top: 100%; margin-top: -6px;` makes the bar slide UP into the button + * by 6px while still extending below it. Combined with z-index: 1 (vs + * the button's z-index: 2), the bar's top edge tucks under the button + * so only the bottom portion peeks out — the card-stack look. 
+ */ + top: 100%; + margin-top: -6px; + height: 10px; + z-index: 1; + background: rgba(0, 0, 0, 0.1); + border-radius: 0 0 7px 7px; + overflow: hidden; + opacity: 0; + transform: translateY(-3px) scaleX(0.94); + transform-origin: top center; + transition: opacity 0.2s ease, transform 0.25s cubic-bezier(0.2, 0.7, 0.3, 1); + pointer-events: none; + box-shadow: 0 3px 6px rgba(0, 0, 0, 0.12); } -.search-progress-bar-wrapper { - flex: 1; - height: 4px; - background: rgba(0, 0, 0, 0.1); - border-radius: 2px; - overflow: hidden; - display: none; - min-width: 60px; +.search-progress-bar-wrapper.is-active { + opacity: 1; + transform: translateY(0) scaleX(1); } .dark .search-progress-bar-wrapper { - background: rgba(255, 255, 255, 0.1); + background: rgba(255, 255, 255, 0.08); + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.35); } .search-progress-bar { @@ -114,26 +153,46 @@ transition: width 0.3s ease-out; width: 0%; position: relative; - border-radius: 2px; + border-radius: 0 0 6px 6px; } .search-progress-bar::after { content: ''; position: absolute; inset: 0; - background: linear-gradient(90deg, transparent, rgba(255,255,255,0.3), transparent); + background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent); animation: shimmer 2s infinite; - border-radius: 2px; + border-radius: 0 0 6px 6px; } +/* + * Progress label sits as a flex child immediately to the right of the + * search button (gap is provided by .search-trigger-wrapper). It's hidden + * by default and fades in once an indexing pass is active. 
+ */ .search-progress-text { - font-size: 11px; - color: #666; + display: block; + font-size: 12px; + color: #475569; white-space: nowrap; - display: none; font-weight: 500; + opacity: 0; + transform: translateX(-4px); + transition: opacity 0.2s ease, transform 0.2s ease; + pointer-events: none; + max-width: 240px; + overflow: hidden; + text-overflow: ellipsis; + line-height: 32px; + letter-spacing: 0.01em; + flex: 0 0 auto; +} + +.search-progress-text.is-active { + opacity: 1; + transform: translateX(0); } .dark .search-progress-text { - color: #999; + color: #cbd5e1; } \ No newline at end of file diff --git a/src/plugins/built-in/globalSearch/src/indexing/__fixtures__/seqtaResponses.ts b/src/plugins/built-in/globalSearch/src/indexing/__fixtures__/seqtaResponses.ts new file mode 100644 index 00000000..850a5a6e --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/__fixtures__/seqtaResponses.ts @@ -0,0 +1,161 @@ +/** + * Representative SEQTA response shapes captured from a real `/seqta/student/` + * session via the websiteskimmer recorder. These are static fixtures used + * by `selfTests.ts` to verify our extractors and the passive observer + * remain compatible with the upstream API as it evolves. + * + * NOTE: These fixtures are scrubbed of any secrets and reduced in size; the + * structure (keys, types, nesting) faithfully matches what SEQTA returns + * but the values are illustrative rather than real student data. 
+ */ + +export const subjectsListPayload = [ + { + code: "2026S1", + description: "Sample Semester 1 timetable", + active: 1, + id: 77, + subjects: [ + { + code: "ENGG1", + classunit: 29248, + description: "English GEN 1", + metaclass: 29611, + title: "English GEN 1", + programme: 3830, + marksbook_type: "numeric", + }, + { + code: "MASA1", + classunit: 29247, + description: "Mathematics Specialist 1", + metaclass: 29610, + title: "Mathematics Specialist 1", + programme: 3831, + marksbook_type: "numeric", + }, + ], + }, +]; + +export const coursesPayload = { + c: "ENGG1#1", + t: "English GEN 1", + i: 3830, + m: 29611, + document: + '{"document":{"modules":[{"uuid":"1641cf87-ae08-4bcb-832d-d5709d84d0c5"}]}}', + w: [ + [ + { t: "", h: "", i: 248293, l: "", n: 0, o: "" }, + { + t: "", + i: 248316, + l: '

http://ed.ted.com/on/r80lnJL0#watch

', + n: 1, + o: "", + }, + ], + [{ t: "Lesson 2", h: "

Module 2

", i: 248294, l: "", n: 0, o: "" }], + ], +}; + +export const messagesListPayload = { + hasMore: false, + messages: [ + { + date: "2026-04-29 04:26:25.075868+00", + attachments: false, + read: 1, + sender: "Jacob Johannesburg", + subject: "test", + sender_type: "student", + attachmentCount: 0, + id: 81469, + sender_id: 3111, + }, + ], + ts: "2026-04-30 03:25:02.27900", +}; + +export const documentsPayload = [ + { + docs: [ + { + file: 49555, + filename: "School Glossary.docx", + size: "14931", + context_uuid: "3162189c-2052-4f83-ad83-a66c57460ea2", + mimetype: + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + created_date: "2021-08-04 12:55:55.102653+00", + title: "School Glossary", + uuid: "3162189c-2052-4f83-ad83-a66c57460ea2", + created_by: "537", + }, + ], + id: 9, + category: "Document repository", + }, +]; + +export const noticesPayload = [ + { + id: 12345, + title: "Lunchtime sport tomorrow", + contents: "

Bring shoes.

", + staff: "Mr Coach", + staff_id: 246, + label: 9, + label_title: "All Students", + colour: "#ff5722", + }, +]; + +export const portalsPayload = [ + { + is_power_portal: false, + inherit_styles: true, + icon: "colour-cerulean", + id: 328, + label: "Mathletics", + priority: 20, + uuid: "9d20f40c-fdc9-4aa3-91f1-905d86e240c4", + url: "www.mathletics.com/", + }, +]; + +export const folioListPayload = { + me: "Jacob Johannesburg", + list: [ + { + student: "Jacob Johannesburg", + id: 203, + published: "2026-04-14 20:02:50", + title: "My folio", + }, + ], +}; + +export const folioEntryPayload = { + forum: 478, + contents: + '[[embed:raw|

Some reflection text.

]] Plain trailing text.', + created: "2026-04-14 10:32:34.264641+00", + allow_comments: true, + author: { year: "Year 10", name: "Jacob Johannesburg", id: 3111 }, + files: [], + id: 203, + published: "2026-04-14 20:02:50", + title: "My folio", + updated: "2026-04-14 10:32:50.696678+00", +}; + +/** + * Settings payload contains tenant-wide configuration including third-party + * URLs and API keys. The passive observer must NEVER index this route. + */ +export const settingsPayload = { + "global.dropbox.api.key": { value: "xxx-do-not-index" }, + "global.ai.api.baseurl": { value: "https://example.com" }, +}; diff --git a/src/plugins/built-in/globalSearch/src/indexing/actions.ts b/src/plugins/built-in/globalSearch/src/indexing/actions.ts index a862249c..954b6730 100644 --- a/src/plugins/built-in/globalSearch/src/indexing/actions.ts +++ b/src/plugins/built-in/globalSearch/src/indexing/actions.ts @@ -28,6 +28,40 @@ interface AssessmentMetadata { type ActionHandler = (item: IndexItem & { metadata: T }) => void; +/** + * Navigate to a SEQTA SPA hash route in the most reliable way available. + * + * Setting `location.hash` works when the destination module is already + * registered with SEQTA's hashchange router (as is the case for the + * existing `message`/`assessment` actions, which then poke at the live + * DOM). For navigations that switch to a module the SPA may not have + * loaded yet (courses, forums, folios, portals, documents, reports, + * goals, notices, ...) we instead assign through `location.href` against + * the canonical `${origin}/` base. The path stays `/`, so the browser + * still treats this as a hash-only change in practice — but if anything + * went sideways with the path, we get a clean reload that bootstraps the + * SPA fresh, which is far less surprising than a blank screen. 
+ */ +function navigateToHashRoute(routeWithLeadingSlash: string): void { + const target = `${location.origin}/#?page=${routeWithLeadingSlash}`; + window.location.href = target; +} + +function navigateInCurrentSeqtaApp(routeWithLeadingSlash: string): void { + window.location.hash = `#?page=${routeWithLeadingSlash}`; +} + +/** + * Final-fallback hub when an item has no usable deep-link metadata. + * + * `/dashboard` is the standard SEQTA Learn landing page and is the + * destination the websiteskimmer recording captured for unknown routes. + * `/home` is BetterSEQTA-Plus's custom replacement which only renders + * after our content script has hooked the SPA — using it as a fallback + * from a fresh nav can produce a blank frame. + */ +const FALLBACK_ROUTE = "/dashboard"; + export const actionMap: Record> = { message: (async (item: IndexItem & { metadata: MessageMetadata }) => { window.location.hash = `#?page=/messages`; @@ -81,32 +115,34 @@ export const actionMap: Record> = { } } - // Try to extract metadata values using multiple methods to handle XrayWrapper + // Try to extract metadata values using multiple methods to handle XrayWrapper. + // The metadata bag is intentionally typed loosely here because Firefox's + // XrayWrapper occasionally surfaces extra/casing-variant keys we still + // want to read defensively. 
const getMetadataValue = (key: string, altKey?: string): any => { + const bag = metadata as unknown as Record; try { - // Try direct access first - const value = metadata[key]; + const value = bag[key]; if (value !== undefined && value !== null) { return value; } if (altKey) { - const altValue = metadata[altKey]; + const altValue = bag[altKey]; if (altValue !== undefined && altValue !== null) { return altValue; } } - // Try accessing via Object.keys iteration (works around XrayWrapper) try { - const keys = Object.keys(metadata); + const keys = Object.keys(bag); for (const k of keys) { if (k === key || k === altKey) { - const val = metadata[k]; + const val = bag[k]; if (val !== undefined && val !== null) { return val; } } } - } catch (e) { + } catch { // Object.keys might fail on XrayWrapper, that's okay } return undefined; @@ -189,14 +225,218 @@ export const actionMap: Record> = { }) as ActionHandler, subjectassessment: ((item: IndexItem) => { - window.location.href = `/#?page=/assessments/${item.metadata.programme}:${item.metadata.subjectId}`; + navigateToHashRoute( + `/assessments/${item.metadata.programme}:${item.metadata.subjectId}`, + ); }) as ActionHandler, subjectcourse: ((item: IndexItem) => { - window.location.href = `/#?page=/courses/${item.metadata.programme}:${item.metadata.subjectId}`; + navigateToHashRoute( + `/courses/${item.metadata.programme}:${item.metadata.subjectId}`, + ); }) as ActionHandler, forum: ((item: IndexItem) => { - window.location.href = `/#?page=/forums/${item.metadata.forumId}`; + navigateToHashRoute(`/forums/${item.metadata.forumId}`); + }) as ActionHandler, + + course: ((item: IndexItem) => { + const programme = item.metadata?.programme; + const metaclass = item.metadata?.metaclass ?? 
item.metadata?.subjectId; + if (programme !== undefined && metaclass !== undefined) { + navigateToHashRoute(`/courses/${programme}:${metaclass}`); + return; + } + if (item.metadata?.route) { + navigateToHashRoute(String(item.metadata.route)); + return; + } + navigateToHashRoute(FALLBACK_ROUTE); + }) as ActionHandler, + + notice: ((_item: IndexItem) => { + // SEQTA's notices route doesn't honour `&date=` from the hash, so just + // open the listing. + navigateToHashRoute("/notices"); + }) as ActionHandler, + + document: ((_item: IndexItem) => { + // We don't trigger downloads automatically: opening the documents page + // gives users full SEQTA controls (preview, download, share) without + // needing the JWT-stamped streaming URL we deliberately avoid storing. + navigateToHashRoute("/documents"); + }) as ActionHandler, + + folio: ((_item: IndexItem) => { + // SEQTA's folio SPA does not expose a per-id route; the previous + // `?page=/folios/read?id=N` shape contained a literal `?` inside the + // `page` query value and was unmatchable, which sent users to the + // dashboard. Always land on the read view and let the user pick. + navigateToHashRoute("/folios/read"); + }) as ActionHandler, + + portal: ((item: IndexItem) => { + // SEQTA renders portals via the in-app viewer at `?page=/portals/` + // (verified via the websiteskimmer capture). Prefer that so SSO/headers + // are preserved; only pop the external URL as a fallback if we don't + // have a UUID; final fallback to the dashboard rather than blanking. 
+ const uuid = item.metadata?.portalUuid; + if (typeof uuid === "string" && uuid) { + navigateToHashRoute(`/portals/${uuid}`); + return; + } + const url = item.metadata?.url; + if (typeof url === "string" && url) { + window.open(url, "_blank", "noopener,noreferrer"); + return; + } + navigateToHashRoute(FALLBACK_ROUTE); + }) as ActionHandler, + + report: ((_item: IndexItem) => { + navigateToHashRoute("/reports"); + }) as ActionHandler, + + goal: ((item: IndexItem) => { + const year = item.metadata?.year; + if (year !== undefined) { + navigateToHashRoute(`/goals/${year}`); + } else { + navigateToHashRoute("/goals"); + } + }) as ActionHandler, + + /** + * Routes for passively-captured items. + * + * The passive observer captures whatever `/seqta/student/...` JSON the + * page is fetching, so we can't trust a single category to imply a + * single SEQTA SPA route. Instead, derive the destination from the API + * route the entity came from, augmented with entity-shaped hints + * (programme/metaclass/year/uuid/...) that the observer hoists into + * metadata. We never replay the original POST: actions are user-driven + * and must stay safe even though the observer's own denylist excludes + * `save/*` and friends. + */ + passive: ((item: IndexItem) => { + const md = (item.metadata ?? {}) as Record; + const route = typeof md.route === "string" ? (md.route as string) : ""; + const sourcePage = + typeof md.sourcePage === "string" ? (md.sourcePage as string) : ""; + const routeParts = route + .replace(/^\/seqta\/student\/?/, "") + .replace(/^load\//, "") + .split("/") + .filter(Boolean) + .map((part) => part.toLowerCase()); + const tail = routeParts[0] ?? ""; + const child = routeParts[1] ?? 
""; + + const num = (key: string): number | undefined => { + const value = md[key]; + if (typeof value === "number" && Number.isFinite(value)) return value; + if (typeof value === "string" && value && Number.isFinite(Number(value))) { + return Number(value); + } + return undefined; + }; + const str = (key: string): string | undefined => { + const value = md[key]; + return typeof value === "string" && value ? value : undefined; + }; + + const programme = num("programme") ?? num("programmeId") ?? num("programmeID"); + const metaclass = + num("metaclass") ?? num("metaclassId") ?? num("metaclassID"); + const portalUuid = str("portalUuid") ?? str("uuid"); + const forumId = num("forumId") ?? num("forum"); + const year = num("year"); + const assessmentId = + num("assessmentId") ?? num("assessmentID") ?? num("id"); + const messageId = num("messageId"); + + if (sourcePage === "/messages") { + navigateInCurrentSeqtaApp("/messages"); + return; + } + + switch (tail) { + case "courses": + if (programme !== undefined && metaclass !== undefined) { + navigateToHashRoute(`/courses/${programme}:${metaclass}`); + return; + } + break; + case "assessments": + if (programme !== undefined && metaclass !== undefined) { + const itemSuffix = + assessmentId !== undefined ? `&item=${assessmentId}` : ""; + navigateToHashRoute( + `/assessments/${programme}:${metaclass}${itemSuffix}`, + ); + return; + } + if (assessmentId !== undefined) { + navigateToHashRoute(`/assessments/upcoming&item=${assessmentId}`); + return; + } + navigateToHashRoute("/assessments/upcoming"); + return; + case "forums": + case "forum": + if (forumId !== undefined) { + navigateToHashRoute(`/forums/${forumId}`); + return; + } + break; + case "portals": + case "portal": + if (portalUuid) { + navigateToHashRoute(`/portals/${portalUuid}`); + return; + } + break; + case "goals": + case "goal": + navigateToHashRoute(year !== undefined ? 
`/goals/${year}` : "/goals"); + return; + case "folio": + case "folios": + navigateToHashRoute("/folios/read"); + return; + case "notices": + case "notice": + navigateToHashRoute("/notices"); + return; + case "documents": + case "document": + navigateToHashRoute("/documents"); + return; + case "reports": + case "report": + navigateToHashRoute("/reports"); + return; + case "messages": + case "message": + // `/seqta/student/load/message/people` and related endpoints are + // only meaningful while SEQTA's message module is mounted. Use the + // same live hash navigation as the real message action instead of + // forcing a fresh bootstrap, which can drop back to dashboard for + // context-only endpoints. + void messageId; // noqa — preserved for future deep-select work + navigateInCurrentSeqtaApp("/messages"); + return; + case "people": + if (route.includes("/load/message/people") || child === "people") { + navigateInCurrentSeqtaApp("/messages"); + return; + } + break; + case "timetable": + navigateToHashRoute("/timetable"); + return; + } + + navigateToHashRoute(FALLBACK_ROUTE); }) as ActionHandler, }; diff --git a/src/plugins/built-in/globalSearch/src/indexing/api.ts b/src/plugins/built-in/globalSearch/src/indexing/api.ts new file mode 100644 index 00000000..470f6901 --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/api.ts @@ -0,0 +1,386 @@ +import { delay } from "@/seqta/utils/delay"; + +/** + * Shared SEQTA HTTP layer used by every indexing job. + * + * - All requests are same-origin POSTs against `/seqta/student/...` with + * `credentials: "include"` so they inherit the user's existing session. + * - Responses are parsed as JSON and lightly validated (status === "200" and + * payload present, mirroring the SEQTA convention). + * - Failures are retried with exponential backoff up to a configurable limit. + * - A simple per-route concurrency / spacing limiter prevents heavy jobs (e.g. + * per-subject course crawls) from hammering SEQTA. 
+ */ + +export interface SeqtaResponse { + payload: T; + status: string; +} + +export interface SeqtaFetchOptions { + /** Defaults to "POST". */ + method?: "POST" | "GET"; + /** Maximum number of retries for transient failures (default 2). */ + retries?: number; + /** Initial backoff delay in ms (default 200). */ + baseDelayMs?: number; + /** Hard cap on total request time in ms (default 20s). */ + timeoutMs?: number; + /** AbortSignal for cancellation. */ + signal?: AbortSignal; + /** Skip the routing limiter (rare; only for already-throttled callers). */ + skipLimiter?: boolean; +} + +const DEFAULT_RETRIES = 2; +const DEFAULT_BASE_DELAY = 200; +const DEFAULT_TIMEOUT = 20_000; + +/* ------------------------------------------------------------------ */ +/* limiter */ +/* ------------------------------------------------------------------ */ + +/** + * Caps concurrent in-flight requests per normalized SEQTA route. Indexing + * jobs often fan out (e.g. one /load/courses per subject); we don't want them + * sending dozens of requests in parallel. + */ +class RouteLimiter { + private inFlight = new Map(); + private waiters = new Map void>>(); + private readonly maxConcurrent: number; + + constructor(maxConcurrent = 4) { + this.maxConcurrent = maxConcurrent; + } + + async acquire(route: string): Promise<() => void> { + const current = this.inFlight.get(route) ?? 0; + if (current < this.maxConcurrent) { + this.inFlight.set(route, current + 1); + return () => this.release(route); + } + + return new Promise((resolve) => { + const queue = this.waiters.get(route) ?? []; + queue.push(() => { + this.inFlight.set(route, (this.inFlight.get(route) ?? 0) + 1); + resolve(() => this.release(route)); + }); + this.waiters.set(route, queue); + }); + } + + private release(route: string) { + const next = (this.inFlight.get(route) ?? 
1) - 1; + if (next <= 0) { + this.inFlight.delete(route); + } else { + this.inFlight.set(route, next); + } + const queue = this.waiters.get(route); + if (queue && queue.length > 0) { + const wake = queue.shift()!; + if (queue.length === 0) this.waiters.delete(route); + wake(); + } + } +} + +const routeLimiter = new RouteLimiter(4); + +/* ------------------------------------------------------------------ */ +/* route normalization */ +/* ------------------------------------------------------------------ */ + +/** + * Strips the volatile anti-replay query token (e.g. `?mokx3qef`) so we can + * key caches and limiters off the canonical route. + */ +export function normalizeSeqtaPath(url: string): string { + try { + const parsed = new URL(url, location.origin); + // SEQTA appends a single random query token like `?mokx3qef`. Drop the + // entire query string so canonicalization is robust. + return parsed.pathname; + } catch { + // Fallback for already-relative URLs. + return url.split("?")[0]; + } +} + +/* ------------------------------------------------------------------ */ +/* sensitive routes */ +/* ------------------------------------------------------------------ */ + +/** + * Routes whose responses must never be indexed because they contain + * credentials, secrets, JWTs, or arbitrary configuration blobs. 
+ */ +const SENSITIVE_PATH_PATTERNS: RegExp[] = [ + /\/seqta\/student\/login(\b|\/)/i, + /\/seqta\/student\/save\//i, + /\/seqta\/student\/load\/settings(\b|\/)/i, + /\/seqta\/student\/load\/prefs(\b|\/)/i, + /\/seqta\/student\/heartbeat(\b|\/)/i, + /\/seqta\/student\/storage(\b|\/)/i, + /\/seqta\/student\/themes\//i, + /\/seqta\/student\/branding\//i, + /\/seqta\/student\/releasealert\//i, + /\/seqta\/student\/files\/stream(\b|\/)/i, + /\/seqta\/student\/load\/file(\b|\/)/i, + /\/seqta\/ta\/masquerade(\b|\/)/i, +]; + +export function isSensitiveSeqtaPath(path: string): boolean { + const normalized = normalizeSeqtaPath(path); + return SENSITIVE_PATH_PATTERNS.some((re) => re.test(normalized)); +} + +/* ------------------------------------------------------------------ */ +/* student / user identity */ +/* ------------------------------------------------------------------ */ + +interface SeqtaUserInfo { + id?: number; + personUUID?: string; + username?: string; + [key: string]: unknown; +} + +let cachedUserInfo: SeqtaUserInfo | null = null; +let inflightUserInfo: Promise | null = null; + +/** + * Resolves the current SEQTA user identity by re-using the same `login` + * handshake that the host page performs. This is the canonical way to + * discover the active student id and avoids the historical hard-coded + * `student: 69` placeholder that was incorrect on every real instance. + * + * Failures are intentionally NOT cached — a transient login glitch on the + * very first call must not poison the cache for the lifetime of the page, + * because every subsequent indexing pass that needs the student id (e.g. + * the assignments job) would skip silently. 
+ */ +export async function getCurrentUserInfo(): Promise { + if (cachedUserInfo) return cachedUserInfo; + if (inflightUserInfo) return inflightUserInfo; + + inflightUserInfo = (async () => { + try { + const res = await fetch(`${location.origin}/seqta/student/login`, { + method: "POST", + credentials: "include", + headers: { "Content-Type": "application/json; charset=utf-8" }, + body: JSON.stringify({ + mode: "normal", + query: null, + redirect_url: location.origin, + }), + }); + if (!res.ok) return null; + const json = (await res.json()) as { payload?: SeqtaUserInfo }; + const payload = json?.payload ?? null; + if (payload && typeof payload === "object") { + cachedUserInfo = payload; + return payload; + } + return null; + } catch (e) { + console.warn( + "[Global Search API] Failed to resolve current user info:", + e, + ); + return null; + } finally { + inflightUserInfo = null; + } + })(); + + return inflightUserInfo; +} + +/** + * Best-effort lookup of the active student id. Returns `undefined` when the + * value cannot be discovered (jobs should fall back gracefully rather than + * fabricating an id). 
+ */ +export async function getCurrentStudentId(): Promise { + const info = await getCurrentUserInfo(); + const id = info?.id; + if (typeof id === "number" && Number.isFinite(id)) return id; + return undefined; +} + +/* ------------------------------------------------------------------ */ +/* core fetch */ +/* ------------------------------------------------------------------ */ + +class SeqtaApiError extends Error { + status: number; + route: string; + constructor(message: string, status: number, route: string) { + super(message); + this.name = "SeqtaApiError"; + this.status = status; + this.route = route; + } +} + +function isTransientError(err: unknown): boolean { + if (err instanceof SeqtaApiError) { + if (err.status === 0 || err.status >= 500) return true; + if (err.status === 429) return true; + return false; + } + if (err instanceof TypeError) return true; + if ((err as any)?.name === "AbortError") return false; + return true; +} + +/** + * Sends a JSON POST against a SEQTA route and returns the parsed envelope. + * + * - Adds `credentials: "include"` so requests reuse the active session. + * - Sets `X-Requested-With: XMLHttpRequest` so SEQTA classifies the request + * the same way as the first-party SPA (some routes 4xx without it). + * - Retries transient network/server errors with exponential backoff. + * - Validates that the response is JSON and has `status === "200"` (matches + * the SEQTA convention; jobs that need raw payloads can pass `path` but + * call `seqtaFetch` directly via the underlying API if they need to). + */ +export async function seqtaFetchJson( + path: string, + body: Record | undefined = {}, + options: SeqtaFetchOptions = {}, +): Promise> { + const route = normalizeSeqtaPath(path); + const retries = Math.max(0, options.retries ?? DEFAULT_RETRIES); + const baseDelay = Math.max(50, options.baseDelayMs ?? DEFAULT_BASE_DELAY); + const timeoutMs = Math.max(1_000, options.timeoutMs ?? 
DEFAULT_TIMEOUT); + + let release: (() => void) | null = null; + if (!options.skipLimiter) { + release = await routeLimiter.acquire(route); + } + + try { + let attempt = 0; + let lastError: unknown = null; + + while (attempt <= retries) { + const controller = new AbortController(); + const timer = setTimeout(() => controller.abort(), timeoutMs); + + const onAbort = () => controller.abort(); + if (options.signal) { + if (options.signal.aborted) controller.abort(); + else options.signal.addEventListener("abort", onAbort, { once: true }); + } + + try { + const res = await fetch(`${location.origin}${route}`, { + method: options.method ?? "POST", + credentials: "include", + headers: { + "Content-Type": "application/json; charset=utf-8", + "X-Requested-With": "XMLHttpRequest", + Accept: "text/javascript, text/html, application/xml, text/xml, */*", + }, + body: body === undefined ? undefined : JSON.stringify(body), + signal: controller.signal, + }); + + if (!res.ok) { + throw new SeqtaApiError( + `HTTP ${res.status} ${res.statusText} for ${route}`, + res.status, + route, + ); + } + + const rawJson = (await res.json()) as unknown; + if (!rawJson || typeof rawJson !== "object") { + throw new SeqtaApiError( + `Invalid SEQTA response (not a JSON object) for ${route}`, + res.status, + route, + ); + } + + // SEQTA's "envelope" convention is `{ status, payload }`, but in + // practice some endpoints — notably `/seqta/student/load/subjects` + // and `/seqta/student/assessment/list/*` — occasionally return + // either a bare array or an envelope with a non-"200" status. + // Strict validation here was historically silently killing the + // assignments + courses indexing pipelines when those endpoints + // returned a quirky shape, so we normalize permissively and let + // callers handle missing/empty payloads. 
+ let json: SeqtaResponse; + if (Array.isArray(rawJson)) { + json = { payload: rawJson as unknown as T, status: "200" }; + } else { + const obj = rawJson as Record; + const hasEnvelopeKey = "payload" in obj || "status" in obj; + if (hasEnvelopeKey) { + json = { + payload: ("payload" in obj ? obj.payload : undefined) as T, + status: + typeof obj.status === "string" + ? obj.status + : typeof obj.status === "number" + ? String(obj.status) + : "200", + }; + } else { + json = { payload: rawJson as unknown as T, status: "200" }; + } + } + + if (json.status && json.status !== "200") { + console.warn( + `[Global Search API] Non-200 SEQTA status "${json.status}" for ${route} — returning payload anyway`, + ); + } + + return json; + } catch (err) { + lastError = err; + if (!isTransientError(err) || attempt === retries) { + throw err; + } + const wait = Math.min(5_000, baseDelay * Math.pow(2, attempt)); + await delay(wait); + attempt++; + } finally { + clearTimeout(timer); + if (options.signal) options.signal.removeEventListener("abort", onAbort); + } + } + + throw lastError ?? new Error(`seqtaFetchJson exhausted retries for ${route}`); + } finally { + if (release) release(); + } +} + +/** + * Convenience helper: fetch and unwrap `.payload` directly. Returns `null` + * on failure rather than throwing, so jobs can use the value optionally. + */ +export async function seqtaFetchPayload( + path: string, + body: Record | undefined = {}, + options: SeqtaFetchOptions = {}, +): Promise { + try { + const res = await seqtaFetchJson(path, body, options); + return res.payload ?? 
null; + } catch (e) { + console.warn( + `[Global Search API] Request to ${normalizeSeqtaPath(path)} failed:`, + e, + ); + return null; + } +} diff --git a/src/plugins/built-in/globalSearch/src/indexing/extract.ts b/src/plugins/built-in/globalSearch/src/indexing/extract.ts new file mode 100644 index 00000000..c91a9d07 --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/extract.ts @@ -0,0 +1,303 @@ +import { htmlToPlainText } from "./utils"; +import type { IndexItem } from "./types"; + +/** + * Safe extraction helpers used by both active SEQTA jobs and the passive + * network observer. + * + * The goal is to take arbitrary SEQTA JSON / embedded HTML fragments and + * derive concise, redacted, search-friendly text without ever indexing + * obvious credentials, tokens, JWTs, or large binary blobs. + */ + +/* ------------------------------------------------------------------ */ +/* sensitive keys */ +/* ------------------------------------------------------------------ */ + +/** + * Field names whose values should never be indexed regardless of context. + * Matches SEQTA's frequently-used credential / config keys plus generic + * security-related names. Comparison is case-insensitive and matches both + * the full key and any sub-string fragments (so `client_secret`, + * `apiKey`, `dropboxKey` all hit). 
+ */ +const SENSITIVE_KEY_FRAGMENTS: readonly string[] = [ + "password", + "passwd", + "pwd", + "secret", + "token", + "jwt", + "session", + "cookie", + "auth", + "apikey", + "api_key", + "clientid", + "client_id", + "clientsecret", + "client_secret", + "credential", + "private", + "salt", + "hash", + "csrf", + "x-api", + "bearer", + "dropbox", + "oauth", + "signature", +]; + +export function isSensitiveKey(key: string): boolean { + if (!key) return false; + const lower = key.toLowerCase(); + return SENSITIVE_KEY_FRAGMENTS.some((frag) => lower.includes(frag)); +} + +/** + * Returns true if the supplied scalar value looks credential-shaped: a long + * hex/base64-like blob that doesn't decode to readable text. This catches + * arbitrary tokens that don't have a clear field-name signal. + */ +export function looksLikeSecretValue(value: unknown): boolean { + if (typeof value !== "string") return false; + const trimmed = value.trim(); + if (trimmed.length < 32) return false; + + // Long contiguous base64 / hex with no whitespace and no humanish punctuation. + if (/\s/.test(trimmed)) return false; + if (/^[A-Za-z0-9+/=._-]{32,}$/.test(trimmed) && !/[.,!?]/.test(trimmed)) { + // Reject obvious URLs and UUIDs (they're useful and not secret). + if (/^https?:\/\//i.test(trimmed)) return false; + if ( + /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test( + trimmed, + ) + ) { + return false; + } + return true; + } + + // JWT detection: three base64url segments separated by dots. + if (/^[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$/.test(trimmed)) { + return true; + } + + return false; +} + +/* ------------------------------------------------------------------ */ +/* text extraction */ +/* ------------------------------------------------------------------ */ + +/** + * Recursively pulls human-readable text out of an arbitrary JSON value. + * + * - HTML strings are passed through `htmlToPlainText`. + * - Sensitive keys and secret-shaped values are skipped. 
+ * - Long blobs are truncated to keep the index lean. + * - Arrays and objects are walked; depth is bounded to avoid pathological + * structures. + */ +export interface ExtractTextOptions { + /** Hard cap on combined characters across the walk (default 4000). */ + maxChars?: number; + /** Maximum recursion depth (default 6). */ + maxDepth?: number; + /** Maximum array length to traverse (default 200). */ + maxArrayItems?: number; + /** Skip individual string values longer than this (default 8000). */ + maxStringLength?: number; +} + +const DEFAULT_EXTRACT_OPTIONS: Required = { + maxChars: 4000, + maxDepth: 6, + maxArrayItems: 200, + maxStringLength: 8000, +}; + +export function extractTextFromValue( + value: unknown, + options: ExtractTextOptions = {}, +): string { + const opts = { ...DEFAULT_EXTRACT_OPTIONS, ...options }; + const parts: string[] = []; + let remaining = opts.maxChars; + + const push = (text: string) => { + if (!text || remaining <= 0) return; + const trimmed = text.trim(); + if (!trimmed) return; + const slice = trimmed.length > remaining ? trimmed.slice(0, remaining) : trimmed; + parts.push(slice); + remaining -= slice.length + 1; + }; + + const walk = (node: unknown, depth: number, parentKey: string | null) => { + if (remaining <= 0) return; + if (node === null || node === undefined) return; + if (parentKey && isSensitiveKey(parentKey)) return; + + if (typeof node === "string") { + if (node.length > opts.maxStringLength) return; + if (looksLikeSecretValue(node)) return; + if (node.includes("<") && node.includes(">")) { + push(htmlToPlainText(node)); + } else { + push(node); + } + return; + } + + if (typeof node === "number" || typeof node === "boolean") { + // Numbers/booleans rarely contribute to search recall; skip to keep + // the index focused on text. 
+ return; + } + + if (depth >= opts.maxDepth) return; + + if (Array.isArray(node)) { + const limit = Math.min(node.length, opts.maxArrayItems); + for (let i = 0; i < limit; i++) { + walk(node[i], depth + 1, parentKey); + if (remaining <= 0) return; + } + return; + } + + if (typeof node === "object") { + for (const [key, child] of Object.entries(node as Record)) { + if (remaining <= 0) return; + if (isSensitiveKey(key)) continue; + walk(child, depth + 1, key); + } + } + }; + + walk(value, 0, null); + + return parts.join("\n").trim(); +} + +/* ------------------------------------------------------------------ */ +/* redacted clones */ +/* ------------------------------------------------------------------ */ + +/** + * Returns a deep clone of `value` with sensitive keys/values stripped. The + * passive observer uses this when persisting metadata so we never store + * raw tokens or settings blobs in IndexedDB. + */ +export function redactSensitive(value: T, depth = 0): T { + if (value === null || value === undefined) return value; + if (depth >= 8) return value; + + if (Array.isArray(value)) { + return value + .slice(0, 200) + .map((v) => redactSensitive(v, depth + 1)) as unknown as T; + } + if (typeof value === "object") { + const out: Record = {}; + for (const [key, child] of Object.entries(value as Record)) { + if (isSensitiveKey(key)) continue; + if (typeof child === "string" && looksLikeSecretValue(child)) continue; + out[key] = redactSensitive(child, depth + 1); + } + return out as T; + } + if (typeof value === "string" && looksLikeSecretValue(value)) { + return "" as unknown as T; + } + return value; +} + +/* ------------------------------------------------------------------ */ +/* title / id heuristics */ +/* ------------------------------------------------------------------ */ + +const TITLE_KEYS = [ + "title", + "subject", + "name", + "label", + "heading", + "displayName", + "filename", + "code", +]; + +const ID_KEYS = ["id", "uuid", "messageID", 
"assessmentID", "notificationID"]; + +/** + * Best-effort title extraction: returns the first sensible string-valued + * field commonly used by SEQTA payloads. Falls back to an empty string when + * none are present. + */ +export function pickTitle(node: unknown, fallback = ""): string { + if (!node || typeof node !== "object") return fallback; + const obj = node as Record; + for (const key of TITLE_KEYS) { + const v = obj[key]; + if (typeof v === "string" && v.trim()) return v.trim(); + } + return fallback; +} + +export function pickId(node: unknown, fallback = ""): string { + if (!node || typeof node !== "object") return fallback; + const obj = node as Record; + for (const key of ID_KEYS) { + const v = obj[key]; + if (typeof v === "string" && v.trim()) return v.trim(); + if (typeof v === "number" && Number.isFinite(v)) return String(v); + } + return fallback; +} + +/* ------------------------------------------------------------------ */ +/* IndexItem builders */ +/* ------------------------------------------------------------------ */ + +/** + * Constructs an `IndexItem` from a raw entity, applying our standard + * extraction rules. Callers fill in the things that need domain knowledge + * (`category`, `actionId`, `metadata`, deep-link route hints) and we handle + * the boring text + redaction work. + */ +export function buildIndexItem(input: { + id: string; + text: string; + category: string; + rawForContent?: unknown; + contentOverride?: string; + metadata?: Record; + actionId: string; + renderComponentId: string; + dateAdded?: number; + contentMaxChars?: number; +}): IndexItem { + const content = + input.contentOverride !== undefined + ? input.contentOverride + : extractTextFromValue(input.rawForContent, { + maxChars: input.contentMaxChars ?? 1500, + }); + + const metadata = input.metadata ? redactSensitive(input.metadata) : {}; + + return { + id: input.id, + text: input.text, + category: input.category, + content, + dateAdded: input.dateAdded ?? 
Date.now(), + metadata, + actionId: input.actionId, + renderComponentId: input.renderComponentId, + }; +} diff --git a/src/plugins/built-in/globalSearch/src/indexing/indexer.ts b/src/plugins/built-in/globalSearch/src/indexing/indexer.ts index c14ee9bc..70222ca8 100644 --- a/src/plugins/built-in/globalSearch/src/indexing/indexer.ts +++ b/src/plugins/built-in/globalSearch/src/indexing/indexer.ts @@ -1,10 +1,11 @@ -import { clear, get, getAll, put, remove } from "./db"; +import { clear, get, getAll, put, remove, resetDatabase } from "./db"; import { jobs } from "./jobs"; import { renderComponentMap } from "./renderComponents"; import type { IndexItem, Job, JobContext } from "./types"; import { VectorWorkerManager } from "./worker/vectorWorkerManager"; import { loadDynamicItems } from "../utils/dynamicItems"; import { getVectorizedItemIds } from "./utils"; +import { INDEX_SCHEMA_VERSION, SCHEMA_VERSION_KEY } from "./schemaVersion"; const META_STORE = "meta"; const LOCK_KEY = "bsq-indexer-lock"; @@ -12,6 +13,50 @@ const HEARTBEAT_INTERVAL = 10000; const LOCK_TIMEOUT = 20000; const LOCK_ACQUIRE_TIMEOUT = 5000; +let schemaCheckPromise: Promise | null = null; + +async function ensureSchemaCurrent(): Promise { + if (schemaCheckPromise) return schemaCheckPromise; + schemaCheckPromise = (async () => { + let storedRaw: string | null = null; + try { + storedRaw = localStorage.getItem(SCHEMA_VERSION_KEY); + } catch { + return; + } + const stored = storedRaw ? 
parseInt(storedRaw, 10) : 0; + if (stored === INDEX_SCHEMA_VERSION) return; + + console.warn( + `[Indexer] Schema version changed (${stored} -> ${INDEX_SCHEMA_VERSION}); resetting structured + vector indexes.`, + ); + + try { + await resetDatabase(); + } catch (e) { + console.warn("[Indexer] Failed to reset structured database:", e); + } + + try { + await new Promise((resolve) => { + const req = indexedDB.deleteDatabase("embeddiaDB"); + req.onsuccess = () => resolve(); + req.onerror = () => resolve(); + req.onblocked = () => resolve(); + }); + } catch (e) { + console.warn("[Indexer] Failed to reset embeddiaDB:", e); + } + + try { + localStorage.setItem(SCHEMA_VERSION_KEY, String(INDEX_SCHEMA_VERSION)); + } catch { + /* ignore */ + } + })(); + return schemaCheckPromise; +} + /* ─────────── Progress‑meta helpers ─────────── */ async function loadProgress(jobId: string): Promise { const rec = await get(META_STORE, `progress:${jobId}`); @@ -162,6 +207,8 @@ export async function loadAllStoredItems(): Promise { } export async function runIndexing(): Promise { + await ensureSchemaCurrent(); + if (!(await acquireLock())) { console.debug( "%c[Indexer] Could not acquire lock - another tab is indexing or this tab is already indexing", @@ -178,8 +225,6 @@ export async function runIndexing(): Promise { const totalSteps = jobIds.length + 1; dispatchProgress(completedJobs, totalSteps, true, "Starting jobs"); - let hasStreamingJobs = false; - for (const jobId of jobIds) { dispatchProgress( completedJobs, @@ -255,10 +300,6 @@ export async function runIndexing(): Promise { await setStoredItems(merged); await updateLastRunMeta(jobId); - if (jobId === 'messages' || jobId === 'notifications') { - hasStreamingJobs = true; - } - console.debug( `%c[Indexer] ${job.label}: ${newItemsRaw.length} new items reported by run, ${merged.length} total items now in '${jobId}' store.`, "color: #00c46f", diff --git a/src/plugins/built-in/globalSearch/src/indexing/jobs.ts 
b/src/plugins/built-in/globalSearch/src/indexing/jobs.ts index 659d2bc0..be18f290 100644 --- a/src/plugins/built-in/globalSearch/src/indexing/jobs.ts +++ b/src/plugins/built-in/globalSearch/src/indexing/jobs.ts @@ -4,6 +4,14 @@ import { notificationsJob } from "./jobs/notifications"; import { forumsJob } from "./jobs/forums"; import { subjectsJob } from "./jobs/subjects"; import { assignmentsJob } from "./jobs/assignments"; +import { coursesJob } from "./jobs/courses"; +import { noticesJob } from "./jobs/notices"; +import { documentsJob } from "./jobs/documents"; +import { folioJob } from "./jobs/folio"; +import { portalsJob } from "./jobs/portals"; +import { reportsJob } from "./jobs/reports"; +import { goalsJob } from "./jobs/goals"; +import { passiveJob } from "./jobs/passive"; export const jobs: Record = { messages: messagesJob, @@ -11,4 +19,12 @@ export const jobs: Record = { forums: forumsJob, subjects: subjectsJob, assignments: assignmentsJob, + courses: coursesJob, + notices: noticesJob, + documents: documentsJob, + folio: folioJob, + portals: portalsJob, + reports: reportsJob, + goals: goalsJob, + passive: passiveJob, }; diff --git a/src/plugins/built-in/globalSearch/src/indexing/jobs/assignments.ts b/src/plugins/built-in/globalSearch/src/indexing/jobs/assignments.ts index 596cbe42..ed1ba696 100644 --- a/src/plugins/built-in/globalSearch/src/indexing/jobs/assignments.ts +++ b/src/plugins/built-in/globalSearch/src/indexing/jobs/assignments.ts @@ -1,55 +1,85 @@ import type { IndexItem, Job } from "../types"; +import { getCurrentStudentId, seqtaFetchPayload } from "../api"; +import { getUserInfo } from "@/seqta/ui/AddBetterSEQTAElements"; -const fetchJSON = async (url: string, body: any) => { - const res = await fetch(`${location.origin}${url}`, { - method: "POST", - credentials: "include", - headers: { "Content-Type": "application/json; charset=utf-8" }, - body: JSON.stringify(body), - }); - return res.json(); -}; - -const fetchUpcomingAssessments = async 
(student: number = 69) => { +/** + * Resolves the active student id from whatever source is available. + * + * The shared `getCurrentStudentId()` calls `/seqta/student/login` with a + * specific body shape; on some SEQTA installs that endpoint can return a + * response that confuses the helper (no `id`, or a non-"200" envelope). + * To make sure we never silently skip the entire assignments pass, we + * also fall back to `getUserInfo()` from `AddBetterSEQTAElements.ts` — + * it's the same handshake the host page uses to render the avatar, so + * if the user is logged in at all this path resolves. + */ +async function resolveStudentId(): Promise { try { - const res = await fetchJSON("/seqta/student/assessment/list/upcoming?", { - student, - }); - // Match analytics.rs: payload is an array, return empty array if not found - return Array.isArray(res.payload) ? res.payload : []; + const direct = await getCurrentStudentId(); + if (typeof direct === "number" && Number.isFinite(direct)) return direct; } catch (e) { - console.error("[Assignments job] Failed to fetch upcoming assessments:", e); - return []; + console.warn( + "[Assignments job] getCurrentStudentId() threw, falling back to getUserInfo()", + e, + ); } + + try { + const info = (await getUserInfo()) as { id?: unknown } | null; + const id = info?.id; + if (typeof id === "number" && Number.isFinite(id)) return id; + if (typeof id === "string" && id && Number.isFinite(Number(id))) { + return Number(id); + } + } catch (e) { + console.warn("[Assignments job] getUserInfo() fallback failed:", e); + } + + return undefined; +} + +const fetchUpcomingAssessments = async (student: number) => { + const payload = await seqtaFetchPayload( + "/seqta/student/assessment/list/upcoming", + { student }, + ); + return Array.isArray(payload) ? 
payload : []; }; const fetchSubjects = async () => { - try { - const res = await fetchJSON("/seqta/student/load/subjects?", {}); - return res.payload - ?.filter((s: any) => s.active === 1) - ?.flatMap((s: any) => s.subjects) || []; - } catch (e) { - console.error("[Assignments job] Failed to fetch subjects:", e); - return []; - } + // SEQTA accepts both `{}` and `{ mode: "list" }` here; the latter is the + // shape every BetterSEQTA-Plus path uses elsewhere and is the more + // reliable response format on schools that customize the `student/load` + // endpoint. + const payload = await seqtaFetchPayload( + "/seqta/student/load/subjects", + { mode: "list" }, + ); + if (!Array.isArray(payload)) return []; + return payload + .filter((s: any) => s && s.active === 1) + .flatMap((s: any) => (Array.isArray(s.subjects) ? s.subjects : [])); }; -const fetchPastAssessments = async (student: number = 69, subjects: any[]) => { +const fetchPastAssessments = async (student: number, subjects: any[]) => { const map: Record = {}; - + // Fetch past assessments for all subjects in parallel (like assessmentsOverview does) // This is much faster than sequential fetching await Promise.all( subjects.map(async (subject) => { try { - // Match analytics.rs exactly: parameter order is programme, metaclass, student - const res = await fetchJSON("/seqta/student/assessment/list/past?", { - programme: subject.programme, - metaclass: subject.metaclass, - student, - }); - + const payload = await seqtaFetchPayload( + "/seqta/student/assessment/list/past", + { + programme: subject.programme, + metaclass: subject.metaclass, + student, + }, + ); + + if (!payload) return; + // Past assessments API can return data in payload.tasks OR payload.pending (or both) // Based on analytics.rs fetch_past_assessments, we need to check both arrays const processAssessment = (assessment: any) => { @@ -65,23 +95,23 @@ const fetchPastAssessments = async (student: number = 69, subjects: any[]) => { }; } }; - + // Match 
analytics.rs: Check both pending and tasks arrays // Check for pending array first (matching Rust code order) - if (res.payload?.pending && Array.isArray(res.payload.pending)) { - res.payload.pending.forEach(processAssessment); + if (payload?.pending && Array.isArray(payload.pending)) { + payload.pending.forEach(processAssessment); } - + // Check for tasks array - if (res.payload?.tasks && Array.isArray(res.payload.tasks)) { - res.payload.tasks.forEach(processAssessment); + if (payload?.tasks && Array.isArray(payload.tasks)) { + payload.tasks.forEach(processAssessment); } } catch (e) { console.warn(`[Assignments job] Failed to fetch past assessments for subject ${subject.code || subject.subject || 'unknown'}:`, e); } }) ); - + return Object.values(map); }; @@ -126,9 +156,27 @@ export const assignmentsJob: Job = { const existingItems = await ctx.getStoredItems("assignments"); const existingIds = new Set(existingItems.map((i) => i.id)); - const student = 69; // TODO: Get from context if available - - console.debug("[Assignments job] Starting indexing - fetching all assessments (upcoming and past)..."); + // Resolve the active student id from the live SEQTA session. Historically + // this was hard-coded to 69, which only happens to be correct on a few + // local dev instances; the shared helper now reuses the same `login` + // handshake that the host page performs so every install gets the right + // value without configuration. + // + // We *throw* instead of returning [] when resolution fails, so the + // indexer's "lastRun" meta is NOT updated. Otherwise the job would be + // marked complete (with zero items) and `shouldRun` would skip it for + // the entire 24h frequency window — meaning a single bad page load + // could leave the user without any assessment results until tomorrow. + const student = await resolveStudentId(); + if (typeof student !== "number") { + throw new Error( + "[Assignments job] Could not resolve current student id from /seqta/student/login. 
The job will retry on the next page load.", + ); + } + + console.debug( + `[Assignments job] Starting indexing for student=${student} - fetching all assessments (upcoming and past)...`, + ); // Fetch data in parallel const [upcoming, subjects] = await Promise.all([ diff --git a/src/plugins/built-in/globalSearch/src/indexing/jobs/courses.ts b/src/plugins/built-in/globalSearch/src/indexing/jobs/courses.ts new file mode 100644 index 00000000..c5fa959f --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/jobs/courses.ts @@ -0,0 +1,179 @@ +import type { IndexItem, Job } from "../types"; +import { seqtaFetchPayload } from "../api"; +import { buildIndexItem } from "../extract"; +import { htmlToPlainText } from "../utils"; + +/** + * Indexes per-subject course content from `/seqta/student/load/courses`. + * + * The course payload contains the lesson grid in `w[][]` where each cell's + * `l` field is a (possibly empty) HTML snippet authored by teachers. We + * concatenate these into searchable text per course, plus the course title + * and code from `t` / `c`. Embedded files referenced via TED/SEQTA URLs are + * preserved as plain-text links so users can find them by URL fragment. 
/** One semester entry returned by `/seqta/student/load/subjects`. */
interface SubjectsListPayload {
  code: string;
  description?: string;
  active: number;
  subjects: Array<{
    code: string;
    title?: string;
    description?: string;
    metaclass: number;
    programme: number;
  }>;
}

/**
 * One cell of the lesson grid. Only the fields read by `flattenLessonHtml`
 * are listed; they are `unknown` because each one is type-checked at runtime.
 * NOTE(review): the original element type was lost in extraction — confirm.
 */
interface CourseLessonCell {
  l?: unknown;
  h?: unknown;
  t?: unknown;
  o?: unknown;
}

/** Payload of `/seqta/student/load/courses` for a single subject. */
interface CoursePayload {
  c?: string;
  t?: string;
  i?: number;
  m?: number;
  w?: Array<Array<CourseLessonCell | null | undefined>>;
  document?: string;
}

/** Cell fields whose string content is indexed, in concatenation order. */
const LESSON_TEXT_FIELDS = ["l", "h", "t", "o"] as const;

/**
 * Loads the subject list and returns the subjects of every active semester
 * that carry a finite `programme` and `metaclass`.
 */
const fetchActiveSubjects = async (): Promise<
  SubjectsListPayload["subjects"]
> => {
  const payload = await seqtaFetchPayload<SubjectsListPayload[]>(
    "/seqta/student/load/subjects",
    {},
  );
  if (!Array.isArray(payload)) return [];

  const out: SubjectsListPayload["subjects"] = [];
  for (const semester of payload) {
    if (!semester || !Array.isArray(semester.subjects)) continue;
    if (semester.active !== 1) continue;
    for (const subject of semester.subjects) {
      if (
        subject &&
        Number.isFinite(subject.programme) &&
        Number.isFinite(subject.metaclass)
      ) {
        out.push(subject);
      }
    }
  }
  return out;
};

/**
 * Collects every non-blank string found in the `l`, `h`, `t` and `o` fields
 * of the lesson grid and converts the joined HTML to plain text.
 *
 * @returns The plain text, or `""` when the grid is missing or has no text.
 */
function flattenLessonHtml(payload: CoursePayload): string {
  if (!Array.isArray(payload.w)) return "";
  const fragments: string[] = [];
  for (const row of payload.w) {
    if (!Array.isArray(row)) continue;
    for (const cell of row) {
      if (!cell) continue;
      for (const field of LESSON_TEXT_FIELDS) {
        const value = cell[field];
        if (typeof value === "string" && value.trim()) fragments.push(value);
      }
    }
  }
  if (fragments.length === 0) return "";
  return htmlToPlainText(fragments.join("\n"));
}

/**
 * Indexes one search item per active subject, built from the course title,
 * course code and flattened lesson text.
 */
export const coursesJob: Job = {
  id: "courses",
  label: "Courses",
  renderComponentId: "course",
  // Refresh once the stored data is older than 24 hours.
  frequency: { type: "expiry", afterMs: 1000 * 60 * 60 * 24 },

  boostCriteria: (item, searchTerm) => {
    if (!searchTerm) return -50;
    let score = 0;
    if (item.metadata?.subjectCode) score += 0.05;
    if (item.metadata?.isActive) score += 0.02;
    return score;
  },

  run: async (_ctx) => {
    const subjects = await fetchActiveSubjects();
    if (subjects.length === 0) {
      console.debug("[Courses job] No active subjects discovered.");
      return [];
    }

    const items: IndexItem[] = [];
    const seenIds = new Set<string>();
    let failures = 0;
    let lastError: unknown;

    // Courses are fetched one at a time, never in parallel.
    for (const subject of subjects) {
      const id = `course-${subject.programme}-${subject.metaclass}`;
      if (seenIds.has(id)) continue;
      seenIds.add(id);

      // Isolate each request, as the folio/goals/notices jobs do, so one
      // failing course does not discard the courses already collected.
      let payload: CoursePayload | null | undefined;
      try {
        payload = await seqtaFetchPayload<CoursePayload>(
          "/seqta/student/load/courses",
          {
            programme: String(subject.programme),
            metaclass: String(subject.metaclass),
          },
        );
      } catch (e) {
        failures++;
        lastError = e;
        console.warn(`[Courses job] Failed to load course ${id}:`, e);
        continue;
      }

      if (!payload) continue;

      const title =
        (typeof payload.t === "string" && payload.t.trim()) ||
        subject.title ||
        subject.description ||
        subject.code ||
        "Course";

      const lessonText = flattenLessonHtml(payload);
      const courseCode =
        (typeof payload.c === "string" && payload.c.trim()) || subject.code;

      const summary = [courseCode, lessonText]
        .filter((s) => s && s.length > 0)
        .join("\n")
        .slice(0, 4000);

      items.push(
        buildIndexItem({
          id,
          text: title,
          category: "courses",
          contentOverride: summary || `Course content for ${title}`,
          metadata: {
            subjectCode: subject.code,
            subjectName: subject.title ?? title,
            programme: subject.programme,
            metaclass: subject.metaclass,
            courseCode,
            isActive: true,
            route: `/courses/${subject.programme}:${subject.metaclass}`,
            entityType: "course",
            icon: "\ueb4d",
          },
          actionId: "course",
          renderComponentId: "course",
        }),
      );
    }

    // If nothing could be indexed and at least one request threw, rethrow so
    // a total failure still rejects the run, exactly as an uncaught fetch
    // error did before per-course isolation was added.
    if (items.length === 0 && failures > 0) throw lastError;

    console.debug(
      `[Courses job] Indexed ${items.length} courses across ${subjects.length} subjects.`,
    );
    return items;
  },

  purge: (items) => items,
};
/** A single document inside a category of `/seqta/student/load/documents`. */
interface DocumentEntry {
  file?: number | string;
  filename?: string;
  size?: string | number;
  context_uuid?: string;
  mimetype?: string;
  created_date?: string;
  title?: string;
  uuid?: string;
  created_by?: string;
}

/** A top-level category entry holding its documents in `docs`. */
interface DocumentCategory {
  id: number | string;
  category: string;
  colour?: string;
  docs: DocumentEntry[];
}

/**
 * Parses a size given as a number or a base-10 string.
 *
 * @returns The finite byte count, or `undefined` when absent or unparseable.
 */
function parseSizeBytes(
  size: string | number | null | undefined,
): number | undefined {
  if (size === undefined || size === null) return undefined;
  const bytes = typeof size === "string" ? parseInt(size, 10) : size;
  return Number.isFinite(bytes) ? bytes : undefined;
}

/**
 * Formats a byte count as a short label such as `"1.5 KB"` or `"10 MB"`.
 * Units stop at GB; one decimal is shown only for values below 10 in a
 * unit larger than bytes.
 *
 * @returns The label, or `null` for missing, unparseable or non-positive sizes.
 */
function prettySize(size: string | number | undefined): string | null {
  const bytes = parseSizeBytes(size);
  if (bytes === undefined || bytes <= 0) return null;
  const units = ["B", "KB", "MB", "GB"];
  let value = bytes;
  let i = 0;
  while (value >= 1024 && i < units.length - 1) {
    value /= 1024;
    i++;
  }
  return `${value.toFixed(value < 10 && i > 0 ? 1 : 0)} ${units[i]}`;
}

/**
 * Maps a MIME type to a short human-readable label.
 *
 * @returns The label, or `null` when the type is missing or not recognised.
 */
function describeMime(mime: string | undefined): string | null {
  if (!mime) return null;
  if (mime.startsWith("application/pdf")) return "PDF";
  if (mime.includes("officedocument.wordprocessingml")) return "Word";
  if (mime.includes("officedocument.spreadsheetml")) return "Excel";
  if (mime.includes("officedocument.presentationml")) return "PowerPoint";
  if (mime.startsWith("image/")) return "Image";
  if (mime.startsWith("video/")) return "Video";
  if (mime.startsWith("audio/")) return "Audio";
  return null;
}

/**
 * Picks the value that identifies a document: `uuid`, then `context_uuid`,
 * then `file`. `||` is used deliberately so an empty-string uuid falls
 * through to the next candidate instead of producing the shared id
 * `document-`.
 *
 * @returns The first truthy candidate, or `undefined` when there is none.
 */
function documentKey(doc: DocumentEntry): string | number | undefined {
  return doc.uuid || doc.context_uuid || doc.file || undefined;
}

/**
 * Indexes one search item per document, using only the metadata fields of
 * the documents payload (title, filename, MIME type, size, category, date).
 */
export const documentsJob: Job = {
  id: "documents",
  label: "Documents",
  renderComponentId: "document",
  frequency: { type: "expiry", afterMs: 1000 * 60 * 60 * 12 }, // 12 hours

  boostCriteria: (_item, searchTerm) => {
    if (!searchTerm) return -20;
    return 0;
  },

  run: async (_ctx) => {
    const payload = await seqtaFetchPayload<DocumentCategory[]>(
      "/seqta/student/load/documents",
      {},
    );
    if (!Array.isArray(payload)) return [];

    const items: IndexItem[] = [];
    const seen = new Set<string>();

    for (const category of payload) {
      if (!category || !Array.isArray(category.docs)) continue;
      for (const doc of category.docs) {
        if (!doc) continue;
        const key = documentKey(doc);
        if (key === undefined) continue;
        const id = `document-${key}`;
        if (seen.has(id)) continue;
        seen.add(id);

        const uuid = doc.uuid || doc.context_uuid;
        const title =
          doc.title?.trim() ||
          doc.filename?.trim() ||
          `Document ${doc.file ?? key}`;

        const sizeText = prettySize(doc.size);
        const mimeLabel = describeMime(doc.mimetype);

        const contentParts: string[] = [];
        if (doc.filename && doc.filename !== title) {
          contentParts.push(doc.filename);
        }
        if (category.category) {
          contentParts.push(`Category: ${category.category}`);
        }
        if (mimeLabel) contentParts.push(mimeLabel);
        if (sizeText) contentParts.push(sizeText);
        if (doc.created_date) contentParts.push(`Added ${doc.created_date}`);

        // Fall back to "now" when the date is missing or unparseable.
        const dateAdded = doc.created_date
          ? new Date(doc.created_date).getTime() || Date.now()
          : Date.now();

        items.push({
          id,
          text: title,
          category: "documents",
          content: contentParts.join(" \u2022 "),
          dateAdded,
          metadata: {
            documentUuid: uuid,
            fileId: doc.file,
            filename: doc.filename,
            mimetype: doc.mimetype,
            // Unparseable sizes are stored as undefined rather than NaN.
            sizeBytes: parseSizeBytes(doc.size),
            categoryId: category.id,
            categoryName: category.category,
            createdDate: doc.created_date,
            entityType: "document",
            route: "/documents",
            icon: "\ueb6f",
          },
          actionId: "document",
          renderComponentId: "document",
        });
      }
    }

    console.debug(`[Documents job] Indexed ${items.length} document entries.`);
    return items;
  },

  purge: (items) => items,
};
/** Response of `/seqta/student/folio` in `mode: "list"`. */
interface FolioListPayload {
  me?: string;
  list?: Array<{
    id: number | string;
    title?: string;
    published?: string;
    student?: string;
  }>;
}

/** Response of `/seqta/student/folio` in `mode: "load"`. */
interface FolioEntryPayload {
  forum?: number;
  contents?: string;
  created?: string;
  allow_comments?: boolean;
  author?: { name?: string; year?: string; id?: number };
  files?: unknown[];
  id?: number | string;
  published?: string;
  title?: string;
  updated?: string;
}

/** Pause after each folio detail request. */
const PER_ITEM_DELAY_MS = 80;

/**
 * Replaces every `[[embed:raw|…]]` block with the plain text of its inner
 * markup. Text outside those blocks is returned unchanged.
 */
function stripEmbedRaw(text: string): string {
  if (!text) return "";
  return text.replace(/\[\[embed:raw\|([\s\S]*?)\]\]/g, (_match, inner) => {
    return htmlToPlainText(typeof inner === "string" ? inner : "");
  });
}

/**
 * Display title stored for a folio entry: the trimmed title, or
 * `Folio <id>` when the title is missing or blank.
 */
function folioTitle(entry: { id: number | string; title?: string }): string {
  return entry.title?.trim() || `Folio ${entry.id}`;
}

/**
 * Indexes the student's folio entries. The list request supplies id, title
 * and publish date; the body is loaded per entry and truncated to 4000
 * characters.
 */
export const folioJob: Job = {
  id: "folio",
  label: "Folio",
  renderComponentId: "folio",
  frequency: { type: "expiry", afterMs: 1000 * 60 * 60 * 24 },

  boostCriteria: (_item, searchTerm) => {
    if (!searchTerm) return -30;
    return 0;
  },

  run: async (ctx) => {
    const stored = await ctx.getStoredItems("folio");
    const existing = new Map(stored.map((i) => [i.id, i]));

    const list = await seqtaFetchPayload<FolioListPayload>(
      "/seqta/student/folio",
      { mode: "list", page: 0, filters: {} },
    );
    if (!list || !Array.isArray(list.list)) return [];

    const items: IndexItem[] = [];
    for (const entry of list.list) {
      if (!entry || entry.id === undefined) continue;
      const id = `folio-${entry.id}`;
      const title = folioTitle(entry);
      const dateAdded = entry.published
        ? new Date(entry.published).getTime() || Date.now()
        : Date.now();

      // Reuse the stored item when the title is unchanged. The comparison
      // uses the same normalised title that is written to `text`; comparing
      // against the raw title made blank or padded titles look changed on
      // every run and forced a detail request each time.
      const cached = existing.get(id);
      if (cached && cached.text === title) {
        items.push({ ...cached, dateAdded });
        continue;
      }

      try {
        const detail = await seqtaFetchPayload<FolioEntryPayload>(
          "/seqta/student/folio",
          { mode: "load", id: entry.id },
        );
        const content = stripEmbedRaw(detail?.contents ?? "").slice(0, 4000);
        const files = detail?.files;

        items.push({
          id,
          text: title,
          category: "folio",
          content,
          dateAdded,
          metadata: {
            folioId: entry.id,
            student: list.me ?? entry.student,
            publishedAt: entry.published,
            updatedAt: detail?.updated,
            createdAt: detail?.created,
            authorName: detail?.author?.name,
            authorId: detail?.author?.id,
            forumId: detail?.forum,
            allowComments: detail?.allow_comments,
            fileCount: Array.isArray(files) ? files.length : 0,
            entityType: "folio",
            route: "/folios/read",
            icon: "\ueb16",
          },
          actionId: "folio",
          renderComponentId: "folio",
        });
      } catch (e) {
        // A failed detail request skips this entry only.
        console.warn(`[Folio job] Failed to load folio ${entry.id}:`, e);
      }

      await delay(PER_ITEM_DELAY_MS);
    }

    console.debug(`[Folio job] Indexed ${items.length} folio entries.`);
    return items;
  },

  purge: (items) => items,
};
/** A single goal as returned by `/seqta/student/load/goals` (`mode: "list"`). */
interface GoalEntry {
  id?: number | string;
  uuid?: string;
  title?: string;
  description?: string;
  status?: string;
  year?: number | string;
  created?: string;
  updated?: string;
}

/** Pause after each per-year request. */
const PER_YEAR_DELAY_MS = 80;

/**
 * Resolves the timestamp stored as `dateAdded` for a goal: the first of
 * `updated`, `created` that parses to a usable time, otherwise `now`.
 *
 * An empty or unparseable `updated` no longer hides a valid `created`;
 * the previous `updated ?? created` kept the empty string.
 */
function goalTimestamp(
  goal: Pick<GoalEntry, "updated" | "created">,
  now: number = Date.now(),
): number {
  for (const candidate of [goal.updated, goal.created]) {
    if (!candidate) continue;
    const ts = new Date(candidate).getTime();
    if (ts) return ts; // NaN and 0 fall through, as the original `||` did
  }
  return now;
}

/**
 * Indexes student goals: fetches the available years, then the goals of
 * each year, one request per year.
 */
export const goalsJob: Job = {
  id: "goals",
  label: "Goals",
  renderComponentId: "goal",
  frequency: { type: "expiry", afterMs: 1000 * 60 * 60 * 24 * 3 }, // every 3 days

  boostCriteria: (_item, searchTerm) => {
    if (!searchTerm) return -40;
    return 0;
  },

  run: async (_ctx) => {
    // NOTE(review): the element type of the years payload was lost in
    // extraction; number | string is assumed — confirm.
    const years = await seqtaFetchPayload<Array<number | string> | null>(
      "/seqta/student/load/goals",
      { mode: "years" },
    );
    if (!Array.isArray(years) || years.length === 0) {
      console.debug("[Goals job] No goal years available; skipping.");
      return [];
    }

    const items: IndexItem[] = [];
    const seen = new Set<string>();

    for (const year of years) {
      try {
        const yearGoals = await seqtaFetchPayload<GoalEntry[]>(
          "/seqta/student/load/goals",
          { mode: "list", year },
        );
        if (!Array.isArray(yearGoals)) continue;

        for (const goal of yearGoals) {
          if (!goal) continue;
          // `||` so an empty uuid falls back to the numeric id instead of
          // giving every such goal the same id "goal-".
          const stableId = goal.uuid || goal.id;
          if (stableId === undefined || stableId === null) continue;
          const id = `goal-${stableId}`;
          if (seen.has(id)) continue;
          seen.add(id);

          const title =
            goal.title?.trim() ||
            goal.description?.slice(0, 80) ||
            `Goal ${stableId}`;

          items.push({
            id,
            text: title,
            category: "goals",
            content: extractTextFromValue(
              { description: goal.description, status: goal.status },
              { maxChars: 1000 },
            ),
            dateAdded: goalTimestamp(goal),
            metadata: {
              goalId: goal.id,
              goalUuid: goal.uuid,
              status: goal.status,
              year: goal.year ?? year,
              createdAt: goal.created,
              updatedAt: goal.updated,
              entityType: "goal",
              route: `/goals/${year}`,
              icon: "\uea15",
            },
            actionId: "goal",
            renderComponentId: "goal",
          });
        }
      } catch (e) {
        // A failing year is logged and skipped; other years still run.
        console.warn(`[Goals job] Failed to fetch goals for year ${year}:`, e);
      }
      await delay(PER_YEAR_DELAY_MS);
    }

    console.debug(`[Goals job] Indexed ${items.length} goal entries.`);
    return items;
  },

  purge: (items) => items,
};
/** A single notice as returned by `/seqta/student/load/notices`. */
interface NoticeRecord {
  id?: number | string;
  title?: string;
  contents?: string;
  staff?: string;
  staff_id?: number;
  date?: string;
  label?: number;
  label_title?: string;
  colour?: string;
}

/** Progress persisted between runs via `ctx.getProgress` / `ctx.setProgress`. */
interface NoticesProgress {
  /** Earliest `YYYY-MM-DD` for which a notice has been indexed. */
  earliestDate: string | null;
  /** Oldest `YYYY-MM-DD` the history backfill has reached so far. */
  lastSweepBackTo: string | null;
}

/** Number of most recent days that are re-fetched on every run. */
const SWEEP_DAYS = 14;
/** The backfill never goes further back than this many days. */
const MAX_HISTORY_DAYS = 365;
/** Number of additional history days walked per run. */
const BACKFILL_BATCH_DAYS = 30;
/** Pause after each per-date request. */
const FETCH_DELAY_MS = 60;

/** Formats a date as `YYYY-MM-DD` using its local calendar fields. */
function formatYmd(date: Date): string {
  const y = date.getFullYear();
  const m = (date.getMonth() + 1).toString().padStart(2, "0");
  const d = date.getDate().toString().padStart(2, "0");
  return `${y}-${m}-${d}`;
}

/**
 * Parses a strict `YYYY-MM-DD` string into a local-midnight date.
 *
 * @returns The date, or `null` for empty or differently formatted input.
 */
function parseYmd(value: string | null | undefined): Date | null {
  if (!value) return null;
  const match = /^(\d{4})-(\d{2})-(\d{2})$/.exec(value);
  if (!match) return null;
  const [, y, m, d] = match;
  return new Date(Number(y), Number(m) - 1, Number(d));
}

/**
 * Returns local midnight `delta` calendar days away from `date`. Calendar
 * arithmetic is used instead of adding multiples of 24h so that a day is
 * never skipped across a daylight-saving change.
 */
function addDays(date: Date, delta: number): Date {
  return new Date(date.getFullYear(), date.getMonth(), date.getDate() + delta);
}

/**
 * Plans which dates to fetch in one run.
 *
 * The most recent `SWEEP_DAYS` days are always included. While the backfill
 * has not reached `MAX_HISTORY_DAYS`, one further batch of up to
 * `BACKFILL_BATCH_DAYS` older days is appended. A resumed backfill continues
 * on the day directly before `lastSweepBackTo`; it previously restarted
 * `SWEEP_DAYS` earlier than that, leaving a 13-day hole in every batch.
 *
 * @param today Local midnight of the current day.
 * @param lastSweepBackTo Oldest date already reached, or `null` on first run.
 * @returns The dates, newest first, and the new backfill position.
 */
function planNoticeDates(
  today: Date,
  lastSweepBackTo: string | null,
): { dates: string[]; lastSweepBackTo: string | null } {
  const planned = new Set<string>();
  for (let offset = 0; offset < SWEEP_DAYS; offset++) {
    planned.add(formatYmd(addDays(today, -offset)));
  }

  const historyLimit = addDays(today, -MAX_HISTORY_DAYS);
  if (lastSweepBackTo && lastSweepBackTo <= formatYmd(historyLimit)) {
    // Backfill complete: only the recent window is refreshed.
    return { dates: [...planned], lastSweepBackTo };
  }

  // An unparseable stored value restarts the backfill from today.
  const resumeFrom = parseYmd(lastSweepBackTo);
  const target = addDays(resumeFrom ?? today, -BACKFILL_BATCH_DAYS);
  const stop = target.getTime() < historyLimit.getTime() ? historyLimit : target;

  let cursor = resumeFrom
    ? addDays(resumeFrom, -1)
    : addDays(today, -SWEEP_DAYS); // first run: continue after the recent window
  while (cursor.getTime() >= stop.getTime()) {
    planned.add(formatYmd(cursor));
    cursor = addDays(cursor, -1);
  }

  return { dates: [...planned], lastSweepBackTo: formatYmd(stop) };
}

/**
 * Fetches the notices of one date. Accepts either a bare array payload or
 * an object carrying the array in `notices`; anything else yields `[]`.
 */
const fetchNoticesForDate = async (date: string): Promise<NoticeRecord[]> => {
  const payload = await seqtaFetchPayload<
    NoticeRecord[] | { notices?: NoticeRecord[] }
  >("/seqta/student/load/notices", { date });
  if (!payload) return [];
  if (Array.isArray(payload)) return payload;
  if (Array.isArray(payload.notices)) return payload.notices;
  return [];
};

/** Fetches the notice labels and returns a lookup from label id to title. */
const fetchLabelLookup = async (): Promise<Map<number, string>> => {
  const payload = await seqtaFetchPayload<
    Array<{ id: number; title?: string }>
  >("/seqta/student/load/notices", { mode: "labels" });
  const map = new Map<number, string>();
  if (Array.isArray(payload)) {
    for (const entry of payload) {
      if (entry && typeof entry.id === "number" && entry.title) {
        map.set(entry.id, entry.title);
      }
    }
  }
  return map;
};

/**
 * Indexes daily notices, one request per date, over the dates chosen by
 * `planNoticeDates`.
 */
export const noticesJob: Job = {
  id: "notices",
  label: "Notices",
  renderComponentId: "notice",
  frequency: { type: "expiry", afterMs: 1000 * 60 * 60 * 6 }, // 6 hours

  boostCriteria: (item, searchTerm) => {
    if (!searchTerm) return -10;
    let score = 0;
    const ts = item.metadata?.timestamp;
    if (typeof ts === "string") {
      // Small boost for notices dated within the last seven days.
      const ageDays =
        (Date.now() - new Date(ts).getTime()) / (1000 * 60 * 60 * 24);
      if (ageDays >= 0 && ageDays <= 7) score += 0.05;
    }
    return score;
  },

  run: async (ctx) => {
    const stored = await ctx.getStoredItems("notices");
    const existingIds = new Set(stored.map((i) => i.id));
    const progress: NoticesProgress = (await ctx.getProgress()) ?? {
      earliestDate: null,
      lastSweepBackTo: null,
    };

    const labelLookup = await fetchLabelLookup();

    const today = new Date();
    today.setHours(0, 0, 0, 0);

    const plan = planNoticeDates(today, progress.lastSweepBackTo ?? null);
    const dates = plan.dates;
    progress.lastSweepBackTo = plan.lastSweepBackTo;

    const items: IndexItem[] = [];
    const seen = new Set<string>();

    for (const date of dates) {
      try {
        const notices = await fetchNoticesForDate(date);
        for (const notice of notices) {
          if (!notice || (notice.id === undefined && !notice.title)) continue;
          const id = `notice-${date}-${notice.id ?? notice.title}`;
          if (seen.has(id)) continue;
          seen.add(id);

          const labelTitle =
            notice.label_title ??
            (typeof notice.label === "number"
              ? labelLookup.get(notice.label) ?? null
              : null);

          const bodyText = notice.contents
            ? htmlToPlainText(notice.contents)
            : "";

          items.push({
            id,
            text: notice.title?.trim() || `Notice ${notice.id ?? date}`,
            category: "notices",
            content: bodyText.slice(0, 4000),
            dateAdded: new Date(date).getTime(),
            metadata: {
              noticeId: notice.id,
              date,
              author: notice.staff,
              authorId: notice.staff_id,
              label: labelTitle,
              labelId: notice.label,
              colour: notice.colour,
              timestamp: date,
              entityType: "notice",
              route: "/notices",
              icon: "\ueb24",
            },
            actionId: "notice",
            renderComponentId: "notice",
          });
        }
      } catch (e) {
        // A failing date is logged and skipped; other dates still run.
        console.warn(`[Notices job] Failed to fetch notices for ${date}:`, e);
      }
      await delay(FETCH_DELAY_MS);
    }

    // Track the earliest date that produced a notice. YYYY-MM-DD strings
    // order chronologically under plain string comparison.
    let earliest: string | null = null;
    for (const item of items) {
      const date = item.metadata?.date;
      if (typeof date !== "string" || !date) continue;
      if (earliest === null || date < earliest) earliest = date;
    }
    if (
      earliest !== null &&
      (!progress.earliestDate || earliest < progress.earliestDate)
    ) {
      progress.earliestDate = earliest;
    }

    await ctx.setProgress(progress);

    const newCount = items.filter((i) => !existingIds.has(i.id)).length;
    console.debug(
      `[Notices job] Indexed ${items.length} notices across ${dates.length} dates (${newCount} new).`,
    );
    return items;
  },

  purge: (items) => {
    const oneYearAgo = Date.now() - 365 * 24 * 60 * 60 * 1000;
    return items.filter((i) => i.dateAdded >= oneYearAgo);
  },
};
/** Passive entries older than this are dropped by `purge`. */
const PASSIVE_MAX_AGE_MS = 30 * 24 * 60 * 60 * 1000;
/** Upper bound on the number of passive entries kept by `purge`. */
const PASSIVE_MAX_ITEMS = 500;

/**
 * Job registration for the `passive` store. `run()` produces no items
 * itself; this object contributes the store's id, label, render component,
 * boost policy and purge policy.
 */
export const passiveJob: Job = {
  id: "passive",
  label: "Recently viewed",
  renderComponentId: "passive",
  // Five-minute interval between runs.
  frequency: { type: "interval", ms: 1000 * 60 * 5 },

  boostCriteria: (item, searchTerm) => {
    // Strong penalty without a query; only a small bonus on matches.
    if (!searchTerm) return -60;
    let score = 0;
    if (item.metadata?.entityType) score += 0.02;
    return score;
  },

  run: async () => {
    return [];
  },

  purge: (items) => {
    // Keep only entries newer than 30 days, and of those at most the 500
    // most recent. Both limits apply together: an entry older than the
    // cutoff is dropped even when fewer than 500 entries remain.
    const cutoff = Date.now() - PASSIVE_MAX_AGE_MS;
    return items
      .filter((i) => i.dateAdded >= cutoff)
      .sort((a, b) => b.dateAdded - a.dateAdded)
      .slice(0, PASSIVE_MAX_ITEMS);
  },
};
/**
 * Turns a portal's raw `url` field into an absolute link.
 *
 * - Missing, empty, or whitespace-only input yields `undefined`.
 * - A value that already starts with `http://` or `https://` (any letter
 *   case) is returned trimmed but otherwise untouched.
 * - Anything else has its leading slashes removed and `https://` prepended.
 *
 * @param raw - The URL string as delivered in the portal payload.
 * @returns The normalized URL, or `undefined` when there is nothing usable.
 */
function normalizePortalUrl(raw: string | undefined): string | undefined {
  const candidate = raw?.trim();
  if (!candidate) {
    return undefined;
  }

  const hasHttpScheme = /^https?:\/\//i.test(candidate);
  return hasHttpScheme
    ? candidate
    : `https://${candidate.replace(/^\/+/, "")}`;
}
undefined && !portal.uuid)) continue; + const id = `portal-${portal.uuid ?? portal.id}`; + if (seen.has(id)) continue; + seen.add(id); + + const url = normalizePortalUrl(portal.url); + const label = portal.label?.trim() || `Portal ${portal.id}`; + const contentParts: string[] = []; + if (url) contentParts.push(url); + if (portal.is_power_portal) contentParts.push("Power Portal"); + + items.push({ + id, + text: label, + category: "portals", + content: contentParts.join(" \u2022 "), + dateAdded: Date.now(), + metadata: { + portalId: portal.id, + portalUuid: portal.uuid, + url, + isPowerPortal: !!portal.is_power_portal, + entityType: "portal", + icon: "\ueb01", + }, + actionId: "portal", + renderComponentId: "portal", + }); + } + + console.debug(`[Portals job] Indexed ${items.length} portal entries.`); + return items; + }, + + purge: (items) => items, +}; diff --git a/src/plugins/built-in/globalSearch/src/indexing/jobs/reports.ts b/src/plugins/built-in/globalSearch/src/indexing/jobs/reports.ts new file mode 100644 index 00000000..9b3fc973 --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/jobs/reports.ts @@ -0,0 +1,97 @@ +import type { IndexItem, Job } from "../types"; +import { seqtaFetchPayload } from "../api"; + +/** + * Indexes report metadata from `/seqta/student/load/reports`. + * + * Reports are PDFs gated behind SEQTA's authenticated download endpoint, so + * we only index the human-readable metadata (year, term, title, file UUID) + * and a stable hash route so the search palette can deep-link straight + * into the reports page. 
/**
 * Fields this job reads from one entry of the
 * `/seqta/student/load/reports` payload. Everything is optional because
 * the job guards each read individually.
 */
interface ReportEntry {
  id?: number | string;
  uuid?: string;
  title?: string;
  description?: string;
  date_published?: string;
  date_created?: string;
  year?: number | string;
  term?: number | string;
  metaclass?: number;
  programme?: number;
  filename?: string;
}

/**
 * Indexing job for report metadata.
 *
 * `run` fetches the reports payload, skips entries that have neither a
 * `uuid` nor an `id`, de-duplicates by the derived `report-<id>` key, and
 * emits one index item per remaining report. `purge` keeps every item.
 */
export const reportsJob: Job = {
  id: "reports",
  label: "Reports",
  renderComponentId: "report",
  frequency: { type: "expiry", afterMs: 1000 * 60 * 60 * 24 }, // daily

  // Demote reports when the palette is opened without a query; stay neutral
  // once the user has typed something.
  boostCriteria: (_item, searchTerm) => (searchTerm ? 0 : -25),

  run: async (_ctx) => {
    const payload = await seqtaFetchPayload(
      "/seqta/student/load/reports",
      {},
    );
    if (!Array.isArray(payload)) return [];
    const reports: ReportEntry[] = payload;

    const items: IndexItem[] = [];
    const seen = new Set<string>();

    for (const report of reports) {
      if (!report) continue;

      // Prefer the uuid; fall back to the numeric/string id.
      const stableId = report.uuid ?? report.id;
      if (stableId === undefined || stableId === null) continue;

      const id = `report-${stableId}`;
      if (seen.has(id)) continue;
      seen.add(id);

      // An unparsable (NaN) or absent publish date falls back to "now".
      const publishedMs = report.date_published
        ? new Date(report.date_published).getTime()
        : NaN;
      const dateAdded = publishedMs || Date.now();

      const content = [
        report.description,
        report.year ? `Year ${report.year}` : undefined,
        report.term ? `Term ${report.term}` : undefined,
        report.date_published,
      ]
        .filter((part): part is string => Boolean(part))
        .join(" \u2022 ");

      items.push({
        id,
        text: report.title?.trim() || `Report ${stableId}`,
        category: "reports",
        content,
        dateAdded,
        metadata: {
          reportId: report.id,
          reportUuid: report.uuid,
          year: report.year,
          term: report.term,
          metaclass: report.metaclass,
          programme: report.programme,
          publishedAt: report.date_published,
          createdAt: report.date_created,
          filename: report.filename,
          entityType: "report",
          route: "/reports",
          icon: "\ueb70",
        },
        actionId: "report",
        renderComponentId: "report",
      });
    }

    console.debug(`[Reports job] Indexed ${items.length} reports.`);
    return items;
  },

  purge: (items) => items,
};
/**
 * Reports whether `url` targets the current origin under `/seqta/student/`.
 *
 * Relative URLs are resolved against `location.origin`. Any failure while
 * resolving (for example an unparsable URL) is treated as "not eligible".
 *
 * @param url - Absolute or relative request URL.
 * @returns `true` only for same-origin URLs whose path starts with
 *   `/seqta/student/`.
 */
function isSameOriginSeqtaUrl(url: string): boolean {
  try {
    const { origin, pathname } = new URL(url, location.origin);
    return (
      origin === location.origin && pathname.startsWith("/seqta/student/")
    );
  } catch {
    return false;
  }
}
/**
 * Derives an item category from a SEQTA route.
 *
 *   /seqta/student/load/courses -> "courses"
 *   /seqta/student/load/message -> "message"
 *
 * The `/seqta/student/` prefix is dropped and the last non-empty path
 * segment, lower-cased, becomes the category. A route with no segments
 * left maps to "passive". Routes containing `/load/message/people` map to
 * "messages-support" instead of "people".
 *
 * @param route - Request path to categorize.
 * @returns The category label for items captured from that route.
 */
function categoryFromRoute(route: string): string {
  const segments = route
    .replace(/^\/seqta\/student\//, "")
    .split("/")
    .filter((segment) => segment.length > 0);

  const lastSegment = segments.pop();
  if (lastSegment === undefined) {
    return "passive";
  }

  return route.includes("/load/message/people")
    ? "messages-support"
    : lastSegment.toLowerCase();
}
/**
 * Ordered `[route fragment, page path]` pairs consulted by
 * `sourcePageForRoute`. Order matters: the first fragment contained in the
 * route wins.
 *
 * `/load/message` also covers `/load/messages` and `/load/message/people`
 * because it is a substring of both, so those need no entries of their own.
 */
const SOURCE_PAGE_BY_ROUTE_FRAGMENT: ReadonlyArray<readonly [string, string]> = [
  ["/load/message", "/messages"],
  ["/load/courses", "/courses"],
  ["/load/assessments", "/assessments/upcoming"],
  ["/load/notices", "/notices"],
  ["/load/documents", "/documents"],
  ["/folio", "/folios/read"],
  ["/load/forums", "/forums"],
  ["/load/goals", "/goals"],
  ["/load/reports", "/reports"],
  ["/load/portals", "/dashboard"],
];

/**
 * Maps an observed API route to the page path recorded as an item's
 * `sourcePage`.
 *
 * @param route - Request path of the observed response.
 * @returns The page path for the first matching fragment, or `undefined`
 *   when the route matches none of the known fragments.
 */
function sourcePageForRoute(route: string): string | undefined {
  const match = SOURCE_PAGE_BY_ROUTE_FRAGMENT.find(([fragment]) =>
    route.includes(fragment),
  );
  return match?.[1];
}
/**
 * Returns the first non-blank string found under any of `keys`.
 *
 * Keys are tried in the order given. Values that are not strings, or that
 * are empty after trimming, are skipped.
 *
 * @param entity - Object to read from.
 * @param keys - Candidate property names, highest priority first.
 * @returns The trimmed string, or `undefined` when no key holds one.
 */
function stringField(
  entity: Record<string, unknown>,
  keys: readonly string[],
): string | undefined {
  for (const key of keys) {
    const value = entity[key];
    if (typeof value !== "string") continue;
    const trimmed = value.trim();
    if (trimmed) return trimmed;
  }
  return undefined;
}
/**
 * Converts one observed response payload into index items.
 *
 * Entities are taken from `entitiesFromPayload`; at most
 * `MAX_ITEMS_PER_RESPONSE` of them are examined (the cap applies to
 * entities inspected, not to items emitted). An entity is skipped when it
 * is falsy, is neither an object nor a string, fails the people-name check
 * on the `/load/message/people` route, or yields neither content nor a
 * real title.
 *
 * @param ctx - Route, parsed request body and observation time of the response.
 * @param payload - The `payload` member of the SEQTA response body.
 * @returns The synthesized items, possibly empty.
 */
function synthesizeItems(
  ctx: CapturedContext,
  payload: unknown,
): IndexItem[] {
  const entities = entitiesFromPayload(payload);
  if (entities.length === 0) return [];

  const { route, observedAt: now } = ctx;
  const category = categoryFromRoute(route);
  const isPeopleSupport = route.includes("/load/message/people");

  // These depend only on the route, so compute them once rather than once
  // per entity.
  const routeKey = route.replace(/\//g, "_");
  const sourcePage = sourcePageForRoute(route);

  const out: IndexItem[] = [];
  const limit = Math.min(entities.length, MAX_ITEMS_PER_RESPONSE);
  for (let i = 0; i < limit; i++) {
    const entity = entities[i];
    if (!entity || (typeof entity !== "object" && typeof entity !== "string")) {
      continue;
    }

    // For the messages compose-people endpoint, skip records that don't
    // carry a real human name, so raw entries like
    // `/seqta/student/load/message/people#20` never become titles.
    if (isPeopleSupport && !isPeopleEntityWorthIndexing(entity)) {
      continue;
    }

    // Positional fallback used when the entity exposes no id of its own.
    const fallbackId = `${route}#${i}`;
    const entityId = pickId(entity, fallbackId);
    const stableId = `passive-${routeKey}-${entityId}`;

    const content = extractTextFromValue(entity, {
      maxChars: MAX_PER_RESPONSE_TEXT_CHARS,
    });
    const title = titleFromEndpoint(route, entity, content, fallbackId);
    if (!content && (!title || title === fallbackId)) {
      // Skip records that produced neither title nor content; they are
      // structurally noise (e.g. tiny acknowledgement payloads).
      continue;
    }

    out.push(
      buildIndexItem({
        id: stableId,
        text: title,
        category,
        contentOverride: content,
        metadata: {
          route,
          source: "passive",
          observedAt: new Date(now).toISOString(),
          entityType: category,
          entityId,
          icon: "\ueb71",
          sourcePage,
          // Mark message/people as a low-priority support record so the
          // search ranker can deprioritize it relative to real messages,
          // assessments, courses, etc.
          ...(isPeopleSupport ? { supportRecord: true, priority: "low" } : {}),
          ...pickDeepLinkHints(entity),
        },
        actionId: "passive",
        renderComponentId: "passive",
        dateAdded: now,
      }),
    );
  }

  return out;
}
/**
 * Reloads every stored item, attaches a render component to each, pushes
 * the result into the dynamic-items cache and announces the change with a
 * `dynamic-items-updated` window event.
 *
 * Failures are logged as warnings and never thrown: a refresh that cannot
 * complete simply leaves the previous cache in place.
 */
async function flushDynamicItems(): Promise<void> {
  try {
    const stored = await loadAllStoredItems();

    const decorated = stored.map((item) => {
      try {
        // Resolve the owning job by category, then by job id, then by the
        // item's own render component id.
        const jobDef =
          jobs[item.category] ||
          Object.values(jobs).find((j) => j.id === item.category) ||
          jobs[item.renderComponentId];

        // A job's component id takes precedence over the item's; if the
        // map has no entry, keep whatever component the item already had.
        const componentKey = jobDef
          ? jobDef.renderComponentId
          : item.renderComponentId;
        const renderComponent =
          renderComponentMap[componentKey] || item.renderComponent;

        try {
          // The JSON round-trip yields a plain copy; the component is
          // re-attached afterwards.
          const cloned = JSON.parse(JSON.stringify(item));
          cloned.renderComponent = renderComponent;
          return cloned;
        } catch {
          // Not serializable: fall back to a shallow copy.
          return { ...item, renderComponent };
        }
      } catch {
        return item;
      }
    });

    loadDynamicItems(decorated);
    window.dispatchEvent(
      new CustomEvent("dynamic-items-updated", {
        detail: {
          incremental: true,
          jobId: STORE_ID,
          streaming: false,
        },
      }),
    );
  } catch (e) {
    console.warn("[Passive Observer] Failed to refresh dynamic items:", e);
  }
}
/**
 * Installs the passive observer once. Subsequent calls are no-ops.
 *
 * Wraps `window.fetch` and, when available, `XMLHttpRequest.prototype.open`
 * and `send`, so responses to same-origin `/seqta/student/` requests are
 * handed to the indexing pipeline. The host page always receives the
 * original response, and every observer failure is logged at debug level
 * instead of being propagated.
 *
 * Designed to be called from the global-search plugin bootstrap after
 * `mountSearchBar` succeeds so the observer is only active when the
 * plugin itself is enabled.
 */
export function installPassiveObserver(): void {
  if (installed) return;
  if (typeof window === "undefined" || typeof window.fetch !== "function") {
    return;
  }
  installed = true;

  const originalFetch = window.fetch.bind(window);
  window.fetch = async function patchedFetch(
    input: RequestInfo | URL,
    init?: RequestInit,
  ): Promise<Response> {
    const response = await originalFetch(input, init);

    try {
      const url =
        typeof input === "string"
          ? input
          : input instanceof URL
            ? input.toString()
            : input.url;
      if (isSameOriginSeqtaUrl(url)) {
        const body = init?.body;
        const parsed =
          body && typeof body === "string" ? tryParseJson(body) : undefined;
        // Fire-and-forget: never block the host page on indexing work.
        // `consumeResponse` is async, so its failures arrive as promise
        // rejections that the surrounding try/catch cannot see. Handle
        // them here so they never become unhandled rejections on the page.
        consumeResponse(response, url, parsed).catch((e: unknown) => {
          console.debug("[Passive Observer] fetch hook error:", e);
        });
      }
    } catch (e) {
      // Never let observer errors bubble up to the host page.
      console.debug("[Passive Observer] fetch hook error:", e);
    }

    return response;
  };

  // Best-effort XHR hook for the rare callers that bypass fetch.
  type ObservedXhr = XMLHttpRequest & {
    __bsplusUrl?: string;
    __bsplusMethod?: string;
    /** The request currently in flight, if it is eligible for capture. */
    __bsplusPending?: { url: string; requestBody: unknown };
    /** Set once our single `load` listener has been attached. */
    __bsplusHooked?: boolean;
  };

  const ProtoXhr = (window as any).XMLHttpRequest?.prototype;
  if (ProtoXhr) {
    const originalOpen = ProtoXhr.open;
    const originalSend = ProtoXhr.send;

    // Runs on every `load` of a hooked XHR. It reads the request recorded
    // by the most recent `send`, so a reused XHR object is never attributed
    // to the URL of an earlier request.
    const handleLoad = (xhr: ObservedXhr): void => {
      try {
        const pending = xhr.__bsplusPending;
        xhr.__bsplusPending = undefined;
        if (!pending) return;

        if (xhr.status < 200 || xhr.status >= 300) return;
        const ct = xhr.getResponseHeader("content-type");
        if (!looksLikeJsonContentType(ct)) return;
        const route = normalizeSeqtaPath(pending.url);
        if (isSensitiveSeqtaPath(route)) return;

        // `responseText` is only readable for the "" / "text" response
        // types; for "json" the browser has already parsed the body.
        let json: any;
        if (xhr.responseType === "json") {
          json = xhr.response;
        } else if (xhr.responseType === "" || xhr.responseType === "text") {
          try {
            json = JSON.parse(xhr.responseText);
          } catch {
            return;
          }
        } else {
          return;
        }

        if (!json || typeof json !== "object") return;
        if (json.status && json.status !== "200") return;
        const payload = json.payload;
        if (payload === undefined || payload === null) return;

        const items = synthesizeItems(
          {
            route,
            requestBody: pending.requestBody,
            observedAt: Date.now(),
          },
          payload,
        );
        if (items.length > 0) {
          persistItems(items).catch((e: unknown) => {
            console.debug("[Passive Observer] xhr load error:", e);
          });
        }
      } catch (e) {
        console.debug("[Passive Observer] xhr load error:", e);
      }
    };

    ProtoXhr.open = function patchedOpen(
      this: XMLHttpRequest,
      method: string,
      url: string | URL,
      ...rest: any[]
    ) {
      try {
        const xhr = this as ObservedXhr;
        // `open` accepts URL objects too; keep a plain string for later.
        xhr.__bsplusUrl = String(url);
        xhr.__bsplusMethod = method;
      } catch {
        /* ignore */
      }
      return originalOpen.call(this, method, url, ...rest);
    };

    ProtoXhr.send = function patchedSend(
      this: XMLHttpRequest,
      body?: any,
    ) {
      try {
        const xhr = this as ObservedXhr;
        const url = xhr.__bsplusUrl;
        if (url && isSameOriginSeqtaUrl(url)) {
          xhr.__bsplusPending = {
            url,
            requestBody:
              typeof body === "string" ? tryParseJson(body) : undefined,
          };
          // Attach the listener once per XHR object, not once per send.
          if (!xhr.__bsplusHooked) {
            xhr.__bsplusHooked = true;
            xhr.addEventListener("load", () => handleLoad(xhr));
          }
        } else {
          // Clear any stale record so a hooked XHR that is reused for an
          // ineligible URL is ignored on load.
          xhr.__bsplusPending = undefined;
        }
      } catch {
        /* ignore */
      }
      return originalSend.call(this, body);
    };
  }

  console.debug("[Passive Observer] Installed.");
}
/**
 * Returns currently-stored passive items. Mainly used for diagnostics from
 * `window.globalSearchDebug`.
 *
 * @returns Every record in the `passive` store, or an empty array when the
 *   store cannot be read (the failure is logged at debug level).
 */
export async function getStoredPassiveItems(): Promise<IndexItem[]> {
  try {
    return (await getAll(STORE_ID)) as IndexItem[];
  } catch (e) {
    console.debug("[Passive Observer] Failed to read stored items:", e);
    return [];
  }
}
+ course: GenericItem as unknown as typeof SvelteComponent, + notice: GenericItem as unknown as typeof SvelteComponent, + document: GenericItem as unknown as typeof SvelteComponent, + folio: GenericItem as unknown as typeof SvelteComponent, + portal: GenericItem as unknown as typeof SvelteComponent, + report: GenericItem as unknown as typeof SvelteComponent, + goal: GenericItem as unknown as typeof SvelteComponent, + passive: GenericItem as unknown as typeof SvelteComponent, +}; diff --git a/src/plugins/built-in/globalSearch/src/indexing/resetIndexes.ts b/src/plugins/built-in/globalSearch/src/indexing/resetIndexes.ts new file mode 100644 index 00000000..c4d30b75 --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/resetIndexes.ts @@ -0,0 +1,112 @@ +import { SCHEMA_VERSION_KEY } from "./schemaVersion"; + +/** + * Hard-reset of all global-search persistence. + * + * This module is intentionally dependency-free (no imports from `db.ts`, + * the worker manager, embeddia, or any heavy bundle) so it can be + * statically imported from: + * + * - The always-loaded plugin shell (`lazy.ts`) for the manual + * "Reset Index" settings button. Statically importing means the button + * keeps working across extension updates — there's no chunk hash to + * chase via dynamic import, which previously produced + * `Failed to fetch dynamically imported module: .../assets/.js` + * when an older settings page tried to load a chunk that the new build + * had already replaced. + * + * - The version-check path (`utils/versionCheck.ts`) for the auto-reset + * that fires whenever the extension's manifest version changes. + * + * The function: + * 1. Notifies in-process modules to drop in-memory caches and any open + * IndexedDB connections via custom DOM events (best effort). + * 2. Deletes the structured `betterseqta-index` and the vector + * `embeddiaDB` databases. + * 3. Clears version-tracking localStorage keys so the next indexing + * pass treats the world as fresh. 
/**
 * Deletes one IndexedDB database, resolving no matter how the attempt ends.
 *
 * The promise never rejects: success, error, a synchronous failure to
 * start the request, and a blocked delete all resolve. When the delete is
 * blocked, one retry is issued after 600 ms and the promise resolves on
 * whatever that retry reports.
 *
 * @param name - Name of the database to delete.
 * @returns A promise that resolves once the attempt has concluded.
 */
function deleteIndexedDb(name: string): Promise<void> {
  return new Promise<void>((resolve) => {
    // Resolving an already-resolved promise is a no-op, so `finish` can be
    // wired to several callbacks without extra bookkeeping.
    const finish = (): void => resolve();

    let req: IDBOpenDBRequest;
    try {
      req = indexedDB.deleteDatabase(name);
    } catch (e) {
      console.warn(`[Reset] Could not start delete of ${name}:`, e);
      finish();
      return;
    }

    req.onsuccess = () => finish();
    req.onerror = () => {
      console.warn(`[Reset] Error deleting ${name}:`, req.error);
      finish();
    };
    req.onblocked = () => {
      // Connections are still open elsewhere. Wait briefly, retry, then
      // resolve regardless so we never hang the caller forever.
      console.warn(
        `[Reset] Delete of ${name} blocked; will retry then resolve.`,
      );
      setTimeout(() => {
        try {
          const retry = indexedDB.deleteDatabase(name);
          retry.onsuccess = () => finish();
          retry.onerror = () => finish();
          retry.onblocked = () => finish();
        } catch {
          finish();
        }
      }, 600);
    };
  });
}
+ await new Promise((resolve) => setTimeout(resolve, 150)); + + await Promise.allSettled([ + deleteIndexedDb(STRUCTURED_DB), + deleteIndexedDb(VECTOR_DB), + ]); + + try { + localStorage.removeItem(STRUCTURED_VERSION_KEY); + localStorage.removeItem(SCHEMA_VERSION_KEY); + } catch { + /* ignore */ + } +} diff --git a/src/plugins/built-in/globalSearch/src/indexing/schemaVersion.ts b/src/plugins/built-in/globalSearch/src/indexing/schemaVersion.ts new file mode 100644 index 00000000..0c743f97 --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/schemaVersion.ts @@ -0,0 +1,16 @@ +/** + * Index schema version. Bump whenever the IndexItem shape, category set, + * or text construction changes in a way that should invalidate previously + * stored items (and their embeddings). + * + * On mismatch, both the structured IndexedDB store and the embeddiaDB are + * wiped before the next indexing pass so we don't serve stale results. + * + * Kept in its own file (with no imports) so very lightweight callers — the + * always-loaded plugin shell in `lazy.ts`, the version-check path — can + * pull it in without bringing the heavy indexer/worker bundle along. + */ +export const INDEX_SCHEMA_VERSION = 6; + +/** Key used to track the schema version a previous run wrote out. 
*/ +export const SCHEMA_VERSION_KEY = "bsq-index-schema-version"; diff --git a/src/plugins/built-in/globalSearch/src/indexing/selfTests.ts b/src/plugins/built-in/globalSearch/src/indexing/selfTests.ts new file mode 100644 index 00000000..36a7bae1 --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/indexing/selfTests.ts @@ -0,0 +1,328 @@ +import { + isSensitiveKey, + looksLikeSecretValue, + redactSensitive, + extractTextFromValue, + pickTitle, + pickId, + buildIndexItem, +} from "./extract"; +import { isSensitiveSeqtaPath, normalizeSeqtaPath } from "./api"; +import { + coursesPayload, + documentsPayload, + folioEntryPayload, + noticesPayload, + portalsPayload, + settingsPayload, + subjectsListPayload, +} from "./__fixtures__/seqtaResponses"; + +/** + * Lightweight in-process self-tests for the global-search overhaul. + * + * The repository does not (yet) ship with a test runner, so we instead + * expose a deterministic suite of assertions over the pure helpers that + * back active jobs and the passive observer. This is intentionally + * dependency-free so it can run inside the extension page (`window. + * globalSearchDebug.runSelfTests()`) and from any future Vitest harness + * without modification. 
/**
 * Fails the current self-test unless `actual` is strictly equal (`===`) to
 * `expected`.
 *
 * @param actual - Value produced by the code under test.
 * @param expected - Value it must be identical to.
 * @param label - Short description prefixed to the failure message.
 * @throws AssertionError when the two values differ; the message shows both
 *   values JSON-encoded.
 */
function assertEqual<T>(actual: T, expected: T, label: string): void {
  if (actual !== expected) {
    throw new AssertionError(
      `${label}: expected ${JSON.stringify(expected)} but got ${JSON.stringify(actual)}`,
    );
  }
}
!isSensitiveSeqtaPath("/seqta/student/load/messages"), + "messages is NOT sensitive", + ); + assert( + !isSensitiveSeqtaPath("/seqta/student/load/courses"), + "courses is NOT sensitive", + ); + }, + }, + { + name: "isSensitiveKey covers the credential vocabulary", + run: () => { + for (const key of [ + "password", + "Password", + "client_secret", + "apiKey", + "X-API-Token", + "jwtSession", + "oauth_signature", + ]) { + assert(isSensitiveKey(key), `expected ${key} to be sensitive`); + } + for (const key of ["title", "subject", "uuid", "metaclass"]) { + assert(!isSensitiveKey(key), `expected ${key} to be safe`); + } + }, + }, + { + name: "looksLikeSecretValue catches token-shaped strings", + run: () => { + assert( + looksLikeSecretValue( + "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjMifQ.abc123def456", + ), + "JWT looks secret", + ); + assert( + looksLikeSecretValue("a".repeat(40) + "b".repeat(40)), + "long base64-ish string looks secret", + ); + assert( + !looksLikeSecretValue("Hello world"), + "short readable text is safe", + ); + assert( + !looksLikeSecretValue("https://example.com/foo/bar"), + "URLs are not secrets", + ); + assert( + !looksLikeSecretValue("3162189c-2052-4f83-ad83-a66c57460ea2"), + "UUIDs are useful and not secret", + ); + }, + }, + { + name: "redactSensitive scrubs settings payloads", + run: () => { + const cleaned = redactSensitive(settingsPayload); + const json = JSON.stringify(cleaned); + assertNotContains(json, "global.dropbox.api.key", "dropbox key dropped"); + assertNotContains(json, "xxx-do-not-index", "secret value dropped"); + }, + }, + { + name: "extractTextFromValue distills HTML and skips secrets", + run: () => { + const text = extractTextFromValue({ + title: "Hello", + body: "

Some HTML body.

", + password: "should-not-appear", + nested: { token: "leak-me-please" }, + }); + assertContains(text, "Hello", "title preserved"); + assertContains(text, "HTML body", "html flattened"); + assertNotContains(text, "should-not-appear", "password redacted"); + assertNotContains(text, "leak-me-please", "nested token redacted"); + }, + }, + { + name: "pickTitle / pickId prefer common SEQTA fields", + run: () => { + assertEqual( + pickTitle({ title: "Hello", name: "Other" }), + "Hello", + "title wins over name", + ); + assertEqual( + pickTitle({ filename: "doc.pdf" }), + "doc.pdf", + "filename fallback", + ); + assertEqual(pickId({ id: 42 }), "42", "numeric id stringified"); + assertEqual(pickId({ uuid: "abc" }), "abc", "uuid id"); + }, + }, + { + name: "buildIndexItem produces redacted, well-formed records", + run: () => { + const item = buildIndexItem({ + id: "x-1", + text: "Test", + category: "passive", + rawForContent: { + title: "Test", + body: "

Hello

", + token: "should-be-stripped", + }, + metadata: { route: "/seqta/student/load/whatever", apiKey: "drop" }, + actionId: "passive", + renderComponentId: "passive", + }); + assertEqual(item.id, "x-1", "id propagated"); + assertContains(item.content, "Hello", "html distilled"); + assertNotContains(item.content, "should-be-stripped", "token stripped"); + assert( + !("apiKey" in (item.metadata as Record)), + "apiKey metadata stripped", + ); + assertEqual(item.category, "passive", "category passes through"); + }, + }, + { + name: "courses fixture flattens lesson HTML", + run: () => { + // Verify that the structural shape we depend on still matches. + assert(Array.isArray(coursesPayload.w), "lesson grid present"); + const lessonHtml = (coursesPayload.w[0]?.[1] as { l?: string })?.l ?? ""; + assertContains(lessonHtml, "ed.ted.com", "lesson html link present"); + }, + }, + { + name: "subjects fixture exposes programme/metaclass", + run: () => { + const subject = subjectsListPayload[0]?.subjects[0]; + assert(subject, "fixture has at least one subject"); + assert( + Number.isFinite(subject!.programme) && + Number.isFinite(subject!.metaclass), + "programme & metaclass numeric", + ); + }, + }, + { + name: "documents fixture exposes uuid + filename", + run: () => { + const doc = documentsPayload[0]?.docs[0]; + assert(doc?.uuid && doc?.filename, "uuid + filename present"); + }, + }, + { + name: "notices fixture is HTML-bearing", + run: () => { + assertContains( + noticesPayload[0]?.contents ?? "", + "

", + "notice html present", + ); + }, + }, + { + name: "portals fixture has external url", + run: () => { + assert(portalsPayload[0]?.url?.includes("mathletics"), "portal url"); + }, + }, + { + name: "folio entry contents passes html-flattening", + run: () => { + const distilled = extractTextFromValue(folioEntryPayload, { + maxChars: 4000, + }); + assertContains(distilled, "reflection", "folio body extracted"); + }, + }, +]; + +export interface SelfTestReport { + passed: number; + failed: number; + failures: Array<{ name: string; error: string }>; +} + +/** + * Runs every assertion case and resolves with a summary. Never throws. + * + * Designed to be invoked from `window.globalSearchDebug.runSelfTests()` + * by maintainers who want to validate the indexing pipeline against a + * real SEQTA tab. + */ +export async function runGlobalSearchSelfTests(): Promise { + const report: SelfTestReport = { passed: 0, failed: 0, failures: [] }; + for (const test of cases) { + try { + await test.run(); + report.passed++; + } catch (e) { + report.failed++; + const error = + e instanceof Error ? 
`${e.name}: ${e.message}` : String(e); + report.failures.push({ name: test.name, error }); + } + } + if (report.failed > 0) { + console.warn( + `[Global Search Self-Tests] ${report.failed} failed / ${report.passed} passed`, + report.failures, + ); + } else { + console.info( + `[Global Search Self-Tests] All ${report.passed} cases passed`, + ); + } + return report; +} diff --git a/src/plugins/built-in/globalSearch/src/search/hybridSearch.ts b/src/plugins/built-in/globalSearch/src/search/hybridSearch.ts index 42360e16..0168b2e4 100644 --- a/src/plugins/built-in/globalSearch/src/search/hybridSearch.ts +++ b/src/plugins/built-in/globalSearch/src/search/hybridSearch.ts @@ -2,6 +2,32 @@ import type { IndexItem } from "../indexing/types"; import type { CombinedResult } from "../core/types"; import { searchVectors, type VectorSearchResult } from "./vector/vectorSearch"; import { jobs } from "../indexing/jobs"; +import { + getLexicalMatchQuality, + isStrongLexicalMatch, + STRONG_LEXICAL_THRESHOLD, +} from "./lexicalMatch"; + +function isIndexItem(item: CombinedResult["item"]): item is IndexItem { + return (item as IndexItem).dateAdded !== undefined; +} + +/** + * Heuristic for "this query is still too short / too sparse for vector + * recall to be reliable". When true we should not promote vector-only + * results above lexical ones. + * + * Note: this is intentionally distinct from the absolute >2 character cut-off + * used for `hybridSearch`. Vector recall on 3-7 character single-token + * queries is noisy enough that we should keep lexical results dominant. 
+ */ +function isWeakSemanticQuery(trimmedQuery: string): boolean { + if (trimmedQuery.length < 8) return true; + const meaningfulTokens = trimmedQuery + .split(/\s+/) + .filter((t) => t.length >= 3); + return meaningfulTokens.length < 2; +} /** * Hybrid Search Implementation @@ -36,14 +62,6 @@ const DEFAULT_OPTIONS: Required = { recencyWeight: 0.1, }; -/** - * Normalizes a score to 0-1 range - */ -function normalizeScore(score: number, min: number, max: number): number { - if (max === min) return 0.5; - return Math.max(0, Math.min(1, (score - min) / (max - min))); -} - /** * Calculates recency boost based on item age */ @@ -55,28 +73,56 @@ function calculateRecencyBoost(item: IndexItem, now: number): number { } /** - * Calculates popularity boost (can be extended with click tracking, etc.) + * Category-aware popularity / structure boost. + * + * High-confidence curated content (assignments, courses, subjects, forums) + * sits above noisier sources (notices, documents) which sit above the + * passive store. This keeps the most actionable hits at the top while + * still surfacing wide-recall semantic matches when relevant. 
*/ function calculatePopularityBoost(item: IndexItem): number { - // For now, boost based on category and metadata let boost = 0; - - // Boost assignments/assessments - if (item.category === "assignments") { - boost += 0.1; + + switch (item.category) { + case "assignments": + boost += 0.12; + break; + case "subjects": + case "courses": + boost += 0.08; + break; + case "forums": + case "messages": + boost += 0.06; + break; + case "notices": + case "folio": + case "reports": + case "goals": + boost += 0.04; + break; + case "documents": + boost += 0.03; + break; + case "portals": + boost += 0.02; + break; + case "passive": + boost -= 0.1; + break; + case "messages-support": + boost -= 0.18; + break; } - - // Boost upcoming items - if (item.metadata?.isUpcoming) { - boost += 0.15; - } - - // Boost items with subject codes (more structured) - if (item.metadata?.subjectCode) { - boost += 0.05; - } - - return Math.min(boost, 0.3); // Cap at 0.3 + + if (item.metadata?.isUpcoming) boost += 0.12; + if (item.metadata?.subjectCode) boost += 0.04; + if (item.metadata?.entityType === "course") boost += 0.02; + if (item.metadata?.source === "passive") boost -= 0.08; + if (item.metadata?.supportRecord) boost -= 0.12; + if (item.metadata?.priority === "low") boost -= 0.05; + + return Math.max(-0.2, Math.min(boost, 0.3)); } /** @@ -97,11 +143,7 @@ export async function hybridSearch( // Limit BM25 results to top K const topBm25Results = bm25Results.slice(0, opts.bm25TopK); - - // Get vector search results for reranking - // We'll search the full index and then filter to our BM25 results - let vectorResults: VectorSearchResult[] = []; - + if (trimmedQuery.length > 2) { try { // Get more vector results than BM25 results to ensure coverage @@ -121,59 +163,57 @@ export async function hybridSearch( // Now rerank BM25 results with vector scores const now = Date.now(); - const rerankedResults = topBm25Results.map(result => { + const rerankedResults: CombinedResult[] = 
topBm25Results.map(result => { const item = result.item; - - // Normalize BM25 score to 0-1 - // Fuse.js scores: lower is better (0 = perfect match) - // We need to invert: higher score = better match - // Result.score is typically 0-100, where higher = better - // So we normalize it to 0-1 + + // Static command items don't have dateAdded/metadata/category to score + // against — pass them through untouched so palette commands still + // surface correctly. + if (!isIndexItem(item)) { + return result; + } + + // Normalize BM25 score to 0-1. + // Result.score is typically 0-100, where higher = better, so we + // clamp into the 0..1 range. const normalizedBm25Score = Math.max(0, Math.min(1, result.score / 100)); - - // Get vector similarity (0-1, already normalized) - // If item wasn't in vector results, use a default low score - const vectorSimilarity = vectorMap.get(item.id) || 0.3; // Default to 0.3 if not found - - // Calculate recency boost (0-1 range) - const recencyBoost = opts.recencyBoost + + // Get vector similarity (0-1, already normalized). If item wasn't in + // vector results, use a default mid-low score. + const vectorSimilarity = vectorMap.get(item.id) || 0.3; + + const recencyBoost = opts.recencyBoost ? 
calculateRecencyBoost(item, now) * opts.recencyWeight : 0; - - // Calculate popularity boost (0-1 range) + const popularityBoost = calculatePopularityBoost(item); - - // Apply job-specific boost if available + const job = jobs[item.category]; let jobBoost = 0; if (job && typeof job.boostCriteria === 'function') { const boost = job.boostCriteria(item, trimmedQuery); if (boost) { - jobBoost = boost / 100; // Normalize boost to 0-1 + jobBoost = boost / 100; } } - - // Combine scores using weighted average - // BM25 and vector are weighted, boosts are additive - const hybridScore = + + // Lexical guardrail: a strong title match is worth a meaningful + // bonus so vector reranking can't quietly drop an exact assessment + // title between adjacent keystrokes. Scale is 0..0.15 (quality tops out at 12). + const lexicalQuality = getLexicalMatchQuality(item, trimmedQuery); + const lexicalBonus = lexicalQuality > 0 ? lexicalQuality / 80 : 0; + + const hybridScore = (normalizedBm25Score * opts.bm25Weight) + (vectorSimilarity * opts.vectorWeight) + recencyBoost + popularityBoost + - jobBoost; - + jobBoost + + lexicalBonus; + return { ...result, - score: hybridScore * 100, // Scale back to 0-100 for consistency - // Store component scores for debugging (optional, can be removed in production) - _hybridScores: { - bm25: normalizedBm25Score, - vector: vectorSimilarity, - recency: recencyBoost, - popularity: popularityBoost, - jobBoost: jobBoost, - final: hybridScore, - }, + score: hybridScore * 100, }; }); @@ -200,20 +240,27 @@ export async function hybridSearch( export async function hybridSearchWithExpansion( bm25Results: CombinedResult[], query: string, - allItems: IndexItem[], + _allItems: IndexItem[], options: HybridSearchOptions = {}, ): Promise { const opts = { ...DEFAULT_OPTIONS, ...options }; const trimmedQuery = query.trim().toLowerCase(); - + // First, rerank BM25 results const rerankedBm25 = await hybridSearch(bm25Results, query, options); - + // If query is too short, skip vector 
expansion if (trimmedQuery.length <= 2) { return rerankedBm25; } - + + // For short / single-token queries vector expansion brings in too much + // noise (and is the main reason results "flicker" between adjacent + // keystrokes). Keep semantic recall for longer queries. + if (isWeakSemanticQuery(trimmedQuery)) { + return rerankedBm25.slice(0, opts.finalLimit); + } + // Get vector search results let vectorResults: VectorSearchResult[] = []; try { @@ -222,59 +269,88 @@ export async function hybridSearchWithExpansion( console.warn("[Hybrid Search] Vector search failed:", e); return rerankedBm25; } - + // Find vector results that weren't in BM25 results const bm25Ids = new Set(bm25Results.map(r => r.item.id)); const vectorOnlyResults: CombinedResult[] = []; - + const now = Date.now(); - - vectorResults.forEach(v => { - if (!bm25Ids.has(v.object.id)) { - // This is a semantic match that BM25 missed - const item = v.object; - - // Calculate boosts - const recencyBoost = opts.recencyBoost - ? calculateRecencyBoost(item, now) * opts.recencyWeight - : 0; - const popularityBoost = calculatePopularityBoost(item); - - // Vector-only results get lower base score but high vector similarity - const vectorScore = v.similarity * opts.vectorWeight + recencyBoost + popularityBoost; - - // Apply job-specific boost if available - const job = jobs[item.category]; - let jobBoost = 0; - if (job && typeof job.boostCriteria === 'function') { - const boost = job.boostCriteria(item, trimmedQuery); - if (boost) { - jobBoost = boost / 100; // Normalize boost - } + + // Compute the cap applied to vector-only results. Strong lexical matches in + // the BM25 list set it: the loop below keeps the highest strong-match score + // and vector-only results are clamped to one point under that score. + // NOTE(review): lower-scored strong matches can still be outranked — confirm. 
+ let strongLexicalFloor = -Infinity; + for (const r of rerankedBm25) { + if (isIndexItem(r.item) && isStrongLexicalMatch(r.item, trimmedQuery)) { + if (r.score > strongLexicalFloor) { + strongLexicalFloor = r.score; } - - vectorOnlyResults.push({ - id: item.id, - type: "dynamic" as const, - score: (vectorScore + jobBoost) * 100, - item, - _hybridScores: { - bm25: 0, - vector: v.similarity, - recency: recencyBoost, - popularity: popularityBoost, - final: vectorScore + jobBoost, - }, - }); } + } + // Vector-only results may sit at most at this score: + const vectorOnlyCeiling = strongLexicalFloor === -Infinity + ? Infinity + : strongLexicalFloor - 1; + + vectorResults.forEach(v => { + if (bm25Ids.has(v.object.id)) return; + + // This is a semantic match that BM25 missed + const item = v.object; + + // Calculate boosts + const recencyBoost = opts.recencyBoost + ? calculateRecencyBoost(item, now) * opts.recencyWeight + : 0; + const popularityBoost = calculatePopularityBoost(item); + + // Penalize vector-only matches that have no lexical content overlap. + // Vector recall on its own is fuzzy — without lexical confirmation we + // should rank these below curated keyword hits. + const lexicalQuality = getLexicalMatchQuality(item, trimmedQuery); + let vectorOnlyPenalty = 0; + if (lexicalQuality === 0) { + vectorOnlyPenalty -= 0.18; + } + + // Passive captures without lexical confirmation are demoted further — + // they're often raw API records that should never lead the result list. 
+ if (item.category === "passive" && lexicalQuality < STRONG_LEXICAL_THRESHOLD) { + vectorOnlyPenalty -= 0.12; + } + + // Vector-only results get lower base score but high vector similarity + const vectorScore = + v.similarity * opts.vectorWeight + recencyBoost + popularityBoost + vectorOnlyPenalty; + + // Apply job-specific boost if available + const job = jobs[item.category]; + let jobBoost = 0; + if (job && typeof job.boostCriteria === 'function') { + const boost = job.boostCriteria(item, trimmedQuery); + if (boost) { + jobBoost = boost / 100; // Normalize boost + } + } + + let finalScore = (vectorScore + jobBoost) * 100; + if (finalScore > vectorOnlyCeiling) finalScore = vectorOnlyCeiling; + + vectorOnlyResults.push({ + id: item.id, + type: "dynamic" as const, + score: finalScore, + item, + }); }); - + // Combine reranked BM25 results with vector-only results const allResults = [...rerankedBm25, ...vectorOnlyResults]; - + // Sort by score and return top results allResults.sort((a, b) => b.score - a.score); - + return allResults.slice(0, opts.finalLimit); } diff --git a/src/plugins/built-in/globalSearch/src/search/lexicalMatch.ts b/src/plugins/built-in/globalSearch/src/search/lexicalMatch.ts new file mode 100644 index 00000000..c29f31ed --- /dev/null +++ b/src/plugins/built-in/globalSearch/src/search/lexicalMatch.ts @@ -0,0 +1,118 @@ +import type { IndexItem } from "../indexing/types"; + +/** + * Maximum bonus a strong lexical title match can contribute on top of the + * underlying Fuse / hybrid score. Tuned to outweigh small vector reranking + * deltas so a true assessment-title match cannot be displaced by a vector + * neighbour as the user types one more character. + */ +export const LEXICAL_TITLE_BONUS = 12; + +/** + * Threshold at or above which a result counts as a "strong lexical match". + * Strong matches must always be surfaced and protected from vector reranking + * displacing them. 
+ */ +export const STRONG_LEXICAL_THRESHOLD = 6; + +const WORD_SPLIT_RE = /\s+/; +const NON_WORD_RE = /[^a-z0-9]+/gi; + +function normalize(value: string | undefined | null): string { + if (!value) return ""; + return String(value).toLowerCase().trim(); +} + +function tokens(value: string): string[] { + return normalize(value) + .split(WORD_SPLIT_RE) + .map((t) => t.replace(NON_WORD_RE, "")) + .filter(Boolean); +} + +/** + * Score how strongly the query lexically matches the title-like fields of an + * IndexItem. Return value is a non-negative number — 0 means no useful match. + * + * Tiers (roughly): + * ~12 exact title equality + * ~10 title starts with full query string + * ~8 title contains full query string, on a word boundary + * ~7-9 ordered token-prefix match (e.g. `war 2` vs `World War 2 Essay` scores 7; 9 when the run starts at the first title token) + * ~5 subject name / subject code equals or starts with query + * ~3 any token in title starts with query + * ~2 substring anywhere in title + * 0 no lexical signal + * + * The function is intentionally cheap (string ops only, no regex compilation + * per call beyond the constants above) because it is called for every item in + * the candidate pool. 
+ */ +export function getLexicalMatchQuality(item: IndexItem, query: string): number { + const q = normalize(query); + if (!q) return 0; + + const title = normalize(item.text); + if (!title) return 0; + + if (title === q) return 12; + if (title.startsWith(q + " ") || title.startsWith(q)) return 10; + + const queryTokens = tokens(q); + const titleTokens = tokens(title); + + if (queryTokens.length > 0 && titleTokens.length >= queryTokens.length) { + let bestStreakStart = -1; + for (let i = 0; i <= titleTokens.length - queryTokens.length; i++) { + let ok = true; + for (let j = 0; j < queryTokens.length; j++) { + const tt = titleTokens[i + j]; + const qt = queryTokens[j]; + const isLast = j === queryTokens.length - 1; + if (isLast) { + if (!tt.startsWith(qt)) { + ok = false; + break; + } + } else { + if (tt !== qt) { + ok = false; + break; + } + } + } + if (ok) { + bestStreakStart = i; + break; + } + } + if (bestStreakStart === 0) return 9; + if (bestStreakStart > 0) return 7; + } + + if (title.includes(" " + q) || title.includes(q + " ")) return 8; + + // Token starts-with anywhere + for (const t of titleTokens) { + if (t.startsWith(q)) return 3; + } + + // Subject / curated metadata title + const md = (item.metadata ?? {}) as Record; + const subjectName = normalize( + typeof md.subjectName === "string" ? md.subjectName : "", + ); + const subjectCode = normalize( + typeof md.subjectCode === "string" ? 
md.subjectCode : "", + ); + if (subjectName && (subjectName === q || subjectName.startsWith(q))) return 5; + if (subjectCode && (subjectCode === q || subjectCode.startsWith(q))) return 5; + + if (title.includes(q)) return 2; + + return 0; +} + +export function isStrongLexicalMatch(item: IndexItem, query: string): boolean { + return getLexicalMatchQuality(item, query) >= STRONG_LEXICAL_THRESHOLD; +} diff --git a/src/plugins/built-in/globalSearch/src/search/searchUtils.ts b/src/plugins/built-in/globalSearch/src/search/searchUtils.ts index 3343e839..758a16b8 100644 --- a/src/plugins/built-in/globalSearch/src/search/searchUtils.ts +++ b/src/plugins/built-in/globalSearch/src/search/searchUtils.ts @@ -3,10 +3,12 @@ import { getStaticCommands, type StaticCommandItem } from "../core/commands"; import { getDynamicItems } from "../utils/dynamicItems"; import type { CombinedResult } from "../core/types"; import type { IndexItem } from "../indexing/types"; -import { searchVectors } from "./vector/vectorSearch"; -import type { VectorSearchResult } from "./vector/vectorTypes"; -import { jobs } from "../indexing/jobs"; import { hybridSearchWithExpansion } from "./hybridSearch"; +import { + getLexicalMatchQuality, + isStrongLexicalMatch, + STRONG_LEXICAL_THRESHOLD, +} from "./lexicalMatch"; // Search result cache for better performance const searchCache = new Map(); @@ -25,7 +27,9 @@ function setCachedResults(query: string, results: CombinedResult[]) { // Limit cache size if (searchCache.size >= MAX_CACHE_SIZE) { const firstKey = searchCache.keys().next().value; - searchCache.delete(firstKey); + if (firstKey !== undefined) { + searchCache.delete(firstKey); + } } searchCache.set(query, { results, timestamp: Date.now() }); } @@ -61,23 +65,40 @@ export function createSearchIndexes() { findAllMatches: false, // Performance optimization }; - // Optimized dynamic content search options + // Optimized dynamic content search options. 
+ // The expanded corpus mixes structured entities (assessments, subjects) + // with free-form text (course content, notices, folio bodies, passive + // captures) so we list a broad set of metadata keys while keeping titles + // dominant in the ranking. + // NOTE: metadata.route is intentionally excluded. Raw API paths like + // `/seqta/student/load/message/people` should never influence ranking — they + // historically caused passive-capture support records to bubble up above + // real assessments when the user typed substrings that happened to appear in + // the path. const dynamicOptions = { keys: [ - { name: "text", weight: 3 }, // Increased weight for title matches + { name: "text", weight: 3 }, // Title is king { name: "content", weight: 1 }, - { name: "category", weight: 0.5 }, // Lower weight for category - { name: "metadata.subjectName", weight: 1.5 }, // Boost subject name matches - { name: "metadata.subjectCode", weight: 1.5 }, // Boost subject code matches + { name: "category", weight: 0.4 }, + { name: "metadata.subjectName", weight: 1.6 }, + { name: "metadata.subjectCode", weight: 1.6 }, + { name: "metadata.subject", weight: 1.4 }, + { name: "metadata.courseCode", weight: 1.2 }, + { name: "metadata.filename", weight: 1.2 }, + { name: "metadata.author", weight: 0.8 }, + { name: "metadata.authorName", weight: 0.8 }, + { name: "metadata.label", weight: 0.6 }, + { name: "metadata.categoryName", weight: 0.6 }, + { name: "metadata.entityType", weight: 0.4 }, ], includeScore: true, includeMatches: true, - threshold: 0.5, // More permissive for better partial word matching (increased from 0.4) - minMatchCharLength: 2, // Minimum 2 characters for Fuse.js matches (substring fallback handles shorter queries) - distance: 100, // Increased to allow matches across longer strings + threshold: 0.5, + minMatchCharLength: 2, + distance: 100, useExtendedSearch: true, - ignoreLocation: true, // Allow matches anywhere in the string for better partial word matching - 
findAllMatches: true, // Enable to find all matches for better partial word support + ignoreLocation: true, + findAllMatches: true, shouldSort: true, }; @@ -189,23 +210,32 @@ export function searchDynamicItems( const results = searchResults.map((result: FuseResult) => { const item = result.item; const fuseScore = 10 * (1 - (result.score || 0.5)); - + let score = fuseScore; // Recency boost const ageInDays = (now - item.dateAdded) / (1000 * 60 * 60 * 24); const recencyBoost = sortByRecent ? 1 / (ageInDays + 1) : 0; score += recencyBoost; - - // Boost for exact text matches (especially at the start) - const textLower = item.text.toLowerCase(); - if (textLower.startsWith(queryLower)) { - score += 5; // Strong boost for prefix matches - } else if (textLower.includes(queryLower)) { - score += 2; // Boost for substring matches + + // Lexical title bonus — sticky across adjacent keystrokes so a strong + // title prefix match like `world wa` doesn't disappear from the top once + // vector reranking kicks in. + const lexicalQuality = getLexicalMatchQuality(item, queryLower); + if (lexicalQuality > 0) { + score += lexicalQuality; + // Curated-content boost: assessments and assignments with a strong + // title match should be elevated further, since they are the items + // users are most often hunting for. 
+ if ( + lexicalQuality >= STRONG_LEXICAL_THRESHOLD && + (item.category === "assignments" || item.category === "assessments") + ) { + score += 4; + } } - - // Boost for category matches + + // Category match (small nudge) if (item.category.toLowerCase().includes(queryLower)) { score += 1; } @@ -218,37 +248,34 @@ export function searchDynamicItems( matches: result.matches, }; }); - + // Add additional matches from simple substring search additionalMatches.forEach((item) => { - // Check if already in results if (!results.find(r => r.id === item.id)) { - const textLower = item.text.toLowerCase(); let score = 5; // Base score for substring matches - - // Boost for prefix matches - if (textLower.startsWith(queryLower)) { - score += 5; - } - - // Recency boost + + const lexicalQuality = getLexicalMatchQuality(item, queryLower); + score += lexicalQuality; + const ageInDays = (now - item.dateAdded) / (1000 * 60 * 60 * 24); const recencyBoost = sortByRecent ? 1 / (ageInDays + 1) : 0; score += recencyBoost; - + results.push({ id: item.id, type: "dynamic" as const, score, item, + matches: undefined, }); } }); - + // Sort by score and return top results return results.sort((a, b) => b.score - a.score).slice(0, limit); } + export async function performSearch( query: string, commandsFuse: Fuse, @@ -286,12 +313,37 @@ export async function performSearch( sortByRecent, ); + // Step 2b: Always include strong lexical title matches, even if Fuse + // missed them with the current threshold. This is the safety net that + // stops `world wa` from dropping a `World War 2 Essay` assessment that + // `world w` happily showed. 
+ const allItems = Array.from(dynamicIdToItemMap.values()); + const seen = new Set(bm25Results.map((r) => r.id)); + const lexicalAdds: CombinedResult[] = []; + for (const item of allItems) { + if (seen.has(item.id)) continue; + if (!isStrongLexicalMatch(item, trimmedQuery)) continue; + const quality = getLexicalMatchQuality(item, trimmedQuery); + let score = 6 + quality; + if (item.category === "assignments" || item.category === "assessments") { + score += 4; + } + lexicalAdds.push({ + id: item.id, + type: "dynamic" as const, + score, + item, + matches: undefined, + }); + } + if (lexicalAdds.length > 0) { + bm25Results.push(...lexicalAdds); + bm25Results.sort((a, b) => b.score - a.score); + } + // Step 3: Apply hybrid search (BM25 + Vector reranking + boosting) if (trimmedQuery.length > 2 && bm25Results.length > 0) { try { - // Get all items for expansion - const allItems = Array.from(dynamicIdToItemMap.values()); - // Apply hybrid search with expansion dynamicResults = await hybridSearchWithExpansion( bm25Results, diff --git a/src/plugins/built-in/globalSearch/src/search/vector/vectorSearch.ts b/src/plugins/built-in/globalSearch/src/search/vector/vectorSearch.ts index 59013d60..a56a5902 100644 --- a/src/plugins/built-in/globalSearch/src/search/vector/vectorSearch.ts +++ b/src/plugins/built-in/globalSearch/src/search/vector/vectorSearch.ts @@ -40,7 +40,6 @@ export interface VectorSearchResult extends SearchResult { // Cache for query embeddings to avoid recomputing const embeddingCache = new Map(); -const EMBEDDING_CACHE_TTL = 1000 * 60 * 30; // 30 minutes const MAX_EMBEDDING_CACHE_SIZE = 50; function getCachedEmbedding(query: string): number[] | null { @@ -55,7 +54,9 @@ function setCachedEmbedding(query: string, embedding: number[]) { // Limit cache size if (embeddingCache.size >= MAX_EMBEDDING_CACHE_SIZE) { const firstKey = embeddingCache.keys().next().value; - embeddingCache.delete(firstKey); + if (firstKey !== undefined) { + embeddingCache.delete(firstKey); + } 
} embeddingCache.set(query, embedding); } diff --git a/src/plugins/built-in/globalSearch/src/utils/versionCheck.ts b/src/plugins/built-in/globalSearch/src/utils/versionCheck.ts index 31f9aa3d..2c4e9b04 100644 --- a/src/plugins/built-in/globalSearch/src/utils/versionCheck.ts +++ b/src/plugins/built-in/globalSearch/src/utils/versionCheck.ts @@ -1,4 +1,5 @@ import browser from "webextension-polyfill"; +import { resetSearchIndexes } from "../indexing/resetIndexes"; const VERSION_STORAGE_KEY = "betterseqta-global-search-version"; const VERSION_CACHE_KEY = "betterseqta-global-search-cache-version"; @@ -40,34 +41,53 @@ export function storeVersion(version: string): void { } /** - * Checks if the extension has been updated and clears caches if needed - * Returns true if an update was detected + * Checks if the extension has been updated and clears caches + resets the + * search index if needed. + * + * The reset is intentionally aggressive: every manifest version bump + * triggers a full IndexedDB wipe so changes to indexer extraction logic, + * job sets, or item shape can never serve stale results from an older + * build. The next indexing pass will repopulate from scratch in the + * background. Re-population is bounded by the per-job rate limits in + * `api.ts` so it can't hammer SEQTA after an update. + * + * Returns true if an update was detected. */ export async function checkAndHandleUpdate(): Promise { const currentVersion = getCurrentVersion(); const storedVersion = getStoredVersion(); - - // If no stored version, this is first run - store current version + + // First run: just remember the version, don't reset (the user likely + // just installed the extension; the index is already empty). 
if (!storedVersion) { - console.debug(`[Version Check] First run detected, storing version ${currentVersion}`); + console.debug( + `[Version Check] First run detected, storing version ${currentVersion}`, + ); storeVersion(currentVersion); return false; } - - // If versions match, no update + if (storedVersion === currentVersion) { return false; } - - // Version mismatch detected - extension was updated - console.log(`[Version Check] Extension updated from ${storedVersion} to ${currentVersion}, clearing caches...`); - - // Clear all caches + + console.log( + `[Version Check] Extension updated from ${storedVersion} to ${currentVersion}, resetting search index...`, + ); + await clearAllCaches(); - - // Store new version + + try { + await resetSearchIndexes(); + console.log( + "[Version Check] Search index reset; next indexing pass will repopulate from scratch.", + ); + } catch (e) { + console.warn("[Version Check] resetSearchIndexes failed:", e); + } + storeVersion(currentVersion); - + return true; }