From d04f9b73689674246bab1764300a92fe9d215d16 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 14:58:19 +0200 Subject: [PATCH 01/58] feat(documents): add web crawl source type and crawl URL contract --- apps/api/src/domains/documents/document.entity.ts | 1 + packages/api-contracts/src/documents/documents.dto.ts | 11 +++++++++++ .../api-contracts/src/documents/documents.routes.ts | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/apps/api/src/domains/documents/document.entity.ts b/apps/api/src/domains/documents/document.entity.ts index 2a83d784..e3fb73ca 100644 --- a/apps/api/src/domains/documents/document.entity.ts +++ b/apps/api/src/domains/documents/document.entity.ts @@ -41,6 +41,7 @@ export class Document extends ConnectEntityBase { | "extraction" | "evaluationExtractionDataset" | "evaluationExtractionRun" + | "webCrawl" @Column({ name: "embedding_status", nullable: false, default: "pending" }) embeddingStatus!: "pending" | "queued" | "processing" | "completed" | "failed" diff --git a/packages/api-contracts/src/documents/documents.dto.ts b/packages/api-contracts/src/documents/documents.dto.ts index a2d40a1f..f1f04f68 100644 --- a/packages/api-contracts/src/documents/documents.dto.ts +++ b/packages/api-contracts/src/documents/documents.dto.ts @@ -9,6 +9,7 @@ export type DocumentSourceType = | "extraction" | "evaluationExtractionDataset" | "evaluationExtractionRun" + | "webCrawl" export type DocumentEmbeddingStatus = "pending" | "queued" | "processing" | "completed" | "failed" export type DocumentEmbeddingStatusChangedEventPayload = { type: typeof DOCUMENT_EMBEDDING_STATUS_CHANGED_CHANNEL_DTO @@ -51,11 +52,21 @@ export type DocumentDto = { mimeType?: MimeTypes size?: number storageRelativePath?: string + sourceUrl?: string | null embeddingStatus: DocumentEmbeddingStatus embeddingError: string | null tagIds: DocumentTagDto["id"][] } +export type CrawlUrlRequestDto = { + url: string + limit?: number +} + +export type CrawlUrlResponseDto = { 
+ message: string +} + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types export enum MimeTypes { aac = "audio/aac", diff --git a/packages/api-contracts/src/documents/documents.routes.ts b/packages/api-contracts/src/documents/documents.routes.ts index d20eb5a5..f6fca448 100644 --- a/packages/api-contracts/src/documents/documents.routes.ts +++ b/packages/api-contracts/src/documents/documents.routes.ts @@ -2,6 +2,8 @@ import type { DocumentTagsUpdateFieldsDto } from "../document-tags/document-tag. import type { RequestPayload, ResponseData, SuccessResponseDTO } from "../generic" import { defineRoute } from "../helpers" import type { + CrawlUrlRequestDto, + CrawlUrlResponseDto, DocumentDto, DocumentUploadOptionalTagFields, PresignFileRequestItemDto, @@ -53,6 +55,10 @@ export const DocumentsRoutes = { method: "post", path: "organizations/:organizationId/projects/:projectId/documents/:documentId/reprocess", }), + crawlUrl: defineRoute, RequestPayload>({ + method: "post", + path: "organizations/:organizationId/projects/:projectId/documents/crawl-url", + }), // Streaming responses are sent as text/event-stream (SSE) and do not follow ResponseData. 
streamEmbeddingStatus: defineRoute>({ method: "get", From ac36bc2d115ed1f7d501ea8ac033b344b7432404 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 15:48:38 +0200 Subject: [PATCH 02/58] feat(documents): add website crawl feature to frontend --- .../documents/components/CrawlUrlButton.tsx | 105 ++++++++++++++++++ .../documents/documents.middleware.ts | 25 +++++ .../features/documents/documents.spi.ts | 6 + .../features/documents/documents.thunks.ts | 13 +++ .../documents/external/documents.api.ts | 9 ++ .../documents/locales/document.en.json | 8 ++ .../documents/locales/document.fr.json | 8 ++ apps/web/src/studio/routes/DocumentsRoute.tsx | 4 +- 8 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx diff --git a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx new file mode 100644 index 00000000..85fca5c1 --- /dev/null +++ b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx @@ -0,0 +1,105 @@ +import { Button } from "@caseai-connect/ui/shad/button" +import { + Dialog, + DialogContent, + DialogDescription, + DialogHeader, + DialogTitle, + DialogTrigger, +} from "@caseai-connect/ui/shad/dialog" +import { Field, FieldGroup, FieldLabel } from "@caseai-connect/ui/shad/field" +import { Input } from "@caseai-connect/ui/shad/input" +import { GlobeIcon, Loader2Icon } from "lucide-react" +import { useState } from "react" +import { useTranslation } from "react-i18next" +import { useAppDispatch } from "@/common/store/hooks" +import { crawlUrl } from "../documents.thunks" + +export function CrawlUrlButton() { + const [open, setOpen] = useState(false) + + return ( + + + + + + setOpen(false)} /> + + + ) +} + +function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { + const dispatch = useAppDispatch() + const { t } = useTranslation("document") + const [url, setUrl] = 
useState("") + const [limit, setLimit] = useState(10) + const [isSubmitting, setIsSubmitting] = useState(false) + + const isValidUrl = (() => { + try { + new URL(url) + return true + } catch { + return false + } + })() + + const handleSubmit = async (event: React.FormEvent) => { + event.preventDefault() + if (!isValidUrl || isSubmitting) return + + setIsSubmitting(true) + try { + await dispatch(crawlUrl({ url, limit })).unwrap() + onSuccess() + } finally { + setIsSubmitting(false) + } + } + + return ( +
+ + {t("document:crawl.title")} + {t("document:crawl.description")} + +
+ + + {t("document:crawl.urlLabel")} + setUrl(event.target.value)} + required + /> + + + {t("document:crawl.limitLabel")} + setLimit(Number(event.target.value))} + /> + + +
+ +
+
+
+ ) +} diff --git a/apps/web/src/studio/features/documents/documents.middleware.ts b/apps/web/src/studio/features/documents/documents.middleware.ts index 9ca2ada7..ffbe91ed 100644 --- a/apps/web/src/studio/features/documents/documents.middleware.ts +++ b/apps/web/src/studio/features/documents/documents.middleware.ts @@ -10,6 +10,7 @@ import { import { selectUploaderState } from "./documents.selectors" import { documentsActions } from "./documents.slice" import { + crawlUrl, deleteDocument, listDocuments, updateDocument, @@ -70,6 +71,7 @@ function registerListeners() { // Document changes uploadDocument.fulfilled, uploadDocuments.fulfilled, + crawlUrl.fulfilled, updateDocument.fulfilled, deleteDocument.fulfilled, // DocumentTag changes @@ -200,6 +202,29 @@ function registerListeners() { ) }, }) + + listenerMiddleware.startListening({ + actionCreator: crawlUrl.fulfilled, + effect: async (action, listenerApi) => { + listenerApi.dispatch( + notificationsActions.show({ + title: action.payload.message, + type: "success", + }), + ) + }, + }) + listenerMiddleware.startListening({ + actionCreator: crawlUrl.rejected, + effect: async (_, listenerApi) => { + listenerApi.dispatch( + notificationsActions.show({ + title: "Website crawl failed", + type: "error", + }), + ) + }, + }) } export const documentsMiddleware = { listenerMiddleware, registerListeners } diff --git a/apps/web/src/studio/features/documents/documents.spi.ts b/apps/web/src/studio/features/documents/documents.spi.ts index f0d45788..36e2f1d0 100644 --- a/apps/web/src/studio/features/documents/documents.spi.ts +++ b/apps/web/src/studio/features/documents/documents.spi.ts @@ -50,4 +50,10 @@ export interface IDocumentsSpi { signal?: AbortSignal onStatusChanged: (event: DocumentEmbeddingStatusChangedEvent) => void }): Promise + crawlUrl(params: { + organizationId: string + projectId: string + url: string + limit?: number + }): Promise<{ message: string }> } diff --git 
a/apps/web/src/studio/features/documents/documents.thunks.ts b/apps/web/src/studio/features/documents/documents.thunks.ts index eaae3c28..5129fd86 100644 --- a/apps/web/src/studio/features/documents/documents.thunks.ts +++ b/apps/web/src/studio/features/documents/documents.thunks.ts @@ -155,6 +155,19 @@ export const getDocumentTemporaryUrl = createAsyncThunk< return await services.documents.getTemporaryUrl({ organizationId, projectId, documentId }) }) +export const crawlUrl = createAsyncThunk< + { message: string }, + { url: string; limit?: number }, + ThunkConfig +>("documents/crawlUrl", async ({ url, limit }, { extra: { services }, getState }) => { + const state = getState() + const { organizationId, projectId } = getCurrentIds({ + state, + wantedIds: ["organizationId", "projectId"], + }) + return await services.documents.crawlUrl({ organizationId, projectId, url, limit }) +}) + export const streamDocumentEmbeddingStatuses = createAsyncThunk( "documents/streamEmbeddingStatus", async (_, { extra: { services }, getState, dispatch, signal }) => { diff --git a/apps/web/src/studio/features/documents/external/documents.api.ts b/apps/web/src/studio/features/documents/external/documents.api.ts index eef74cac..77995a76 100644 --- a/apps/web/src/studio/features/documents/external/documents.api.ts +++ b/apps/web/src/studio/features/documents/external/documents.api.ts @@ -126,6 +126,14 @@ export default { onStatusChanged, }) }, + crawlUrl: async ({ organizationId, projectId, url, limit }) => { + const axios = getAxiosInstance() + const response = await axios.post( + DocumentsRoutes.crawlUrl.getPath({ organizationId, projectId }), + { payload: { url, limit } } satisfies typeof DocumentsRoutes.crawlUrl.request, + ) + return response.data.data + }, } satisfies IDocumentsSpi function toDocument(dto: DocumentDto): Document { @@ -140,6 +148,7 @@ function toDocument(dto: DocumentDto): Document { projectId: dto.projectId, size: dto.size, storageRelativePath: dto.storageRelativePath, 
+ sourceUrl: dto.sourceUrl, embeddingStatus: dto.embeddingStatus, embeddingError: dto.embeddingError ?? null, title: dto.title, diff --git a/apps/web/src/studio/features/documents/locales/document.en.json b/apps/web/src/studio/features/documents/locales/document.en.json index 7e87d8e6..3d1e1a0b 100644 --- a/apps/web/src/studio/features/documents/locales/document.en.json +++ b/apps/web/src/studio/features/documents/locales/document.en.json @@ -32,6 +32,14 @@ "title": "Delete {{documentTitle}}", "description": "Are you sure you want to delete this document? This action cannot be undone." }, + "crawl": { + "button": "Crawl Website", + "title": "Crawl a Website", + "description": "Enter a URL to crawl. Each page will be added as a document.", + "urlLabel": "Website URL", + "limitLabel": "Max pages to crawl", + "submit": "Start Crawling" + }, "reprocess": { "cta": "Reprocess" }, diff --git a/apps/web/src/studio/features/documents/locales/document.fr.json b/apps/web/src/studio/features/documents/locales/document.fr.json index 4166581c..04fc56b0 100644 --- a/apps/web/src/studio/features/documents/locales/document.fr.json +++ b/apps/web/src/studio/features/documents/locales/document.fr.json @@ -32,6 +32,14 @@ "title": "Supprimer {{documentTitle}}", "description": "Êtes-vous sûr de vouloir supprimer ce document ? Cette action est irréversible." }, + "crawl": { + "button": "Explorer un site web", + "title": "Explorer un site web", + "description": "Entrez une URL à explorer. 
Chaque page sera ajoutée en tant que document.", + "urlLabel": "URL du site web", + "limitLabel": "Nombre maximum de pages", + "submit": "Lancer l'exploration" + }, "reprocess": { "cta": "Relancer le traitement" }, diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index 622b69f3..b4235083 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -53,6 +53,7 @@ import { } from "@/studio/features/document-tags/document-tags.helpers" import type { DocumentTag } from "@/studio/features/document-tags/document-tags.models" import { selectDocumentTagsData } from "@/studio/features/document-tags/document-tags.selectors" +import { CrawlUrlButton } from "@/studio/features/documents/components/CrawlUrlButton" import { DocumentTagPicker } from "@/studio/features/documents/components/DocumentTagPicker" import { EmbeddingStatusBadge } from "@/studio/features/documents/components/EmbeddingStatusBadge" import { EmptyDocument } from "@/studio/features/documents/components/EmptyDocument" @@ -107,7 +108,8 @@ function WithData({ title={t("document:documents")} description={t("document:list.description")} action={ -
+
+
From ef30d3e73da5ad75550d77389a3de230c6c5b4a7 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 16:48:00 +0200 Subject: [PATCH 03/58] feat(api): add Spider integration for web crawling --- .../external/spider/spider-client.service.ts | 41 +++++++++++++++++++ .../src/external/spider/spider.constants.ts | 11 +++++ 2 files changed, 52 insertions(+) create mode 100644 apps/api/src/external/spider/spider-client.service.ts create mode 100644 apps/api/src/external/spider/spider.constants.ts diff --git a/apps/api/src/external/spider/spider-client.service.ts b/apps/api/src/external/spider/spider-client.service.ts new file mode 100644 index 00000000..bfb4e49e --- /dev/null +++ b/apps/api/src/external/spider/spider-client.service.ts @@ -0,0 +1,41 @@ +import { Injectable, Logger } from "@nestjs/common" +import { Spider } from "@spider-cloud/spider-client" +import { resolveSpiderApiKey } from "./spider.constants" + +export type CrawledPage = { + url: string + markdown: string +} + +@Injectable() +export class SpiderClientService { + private readonly logger = new Logger(SpiderClientService.name) + + async crawlUrl(params: { url: string; limit: number }): Promise { + const apiKey = resolveSpiderApiKey() + const spider = new Spider({ apiKey }) + + this.logger.log(`Crawling ${params.url} with limit ${params.limit}`) + + const response = await spider.crawlUrl(params.url, { + limit: params.limit, + return_format: "markdown", + metadata: true, + }) + + if (!response) { + this.logger.warn(`Spider returned no response for ${params.url}`) + return [] + } + + const pages = response + .filter((page) => page.content && page.content.trim().length > 0) + .map((page) => ({ + url: page.url ?? params.url, + markdown: page.content ?? 
"", + })) + + this.logger.log(`Crawled ${pages.length} pages from ${params.url}`) + return pages + } +} diff --git a/apps/api/src/external/spider/spider.constants.ts b/apps/api/src/external/spider/spider.constants.ts new file mode 100644 index 00000000..2fcc87eb --- /dev/null +++ b/apps/api/src/external/spider/spider.constants.ts @@ -0,0 +1,11 @@ +export const SPIDER_API_KEY_ENV = "SPIDER_API_KEY" +export const DEFAULT_CRAWL_LIMIT = 10 +export const MAX_CRAWL_LIMIT = 50 + +export function resolveSpiderApiKey(): string { + const apiKey = process.env[SPIDER_API_KEY_ENV] + if (!apiKey) { + throw new Error(`${SPIDER_API_KEY_ENV} environment variable is not set`) + } + return apiKey +} From 9bb84d417630e86c54dfb4d9835c6a952ead6fe3 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 16:50:45 +0200 Subject: [PATCH 04/58] feat(documents): add crawling domain logic --- .../bull-mq-url-crawling-batch.service.ts | 20 ++++++ .../crawling/url-crawling-batch.interface.ts | 7 ++ .../crawling/url-crawling-batch.module.ts | 30 ++++++++ .../url-crawling-processor.service.ts | 69 +++++++++++++++++++ .../crawling/url-crawling-workers.module.ts | 37 ++++++++++ .../crawling/url-crawling.constants.ts | 5 ++ .../documents/crawling/url-crawling.types.ts | 8 +++ .../documents/crawling/url-crawling.worker.ts | 42 +++++++++++ 8 files changed, 218 insertions(+) create mode 100644 apps/api/src/domains/documents/crawling/bull-mq-url-crawling-batch.service.ts create mode 100644 apps/api/src/domains/documents/crawling/url-crawling-batch.interface.ts create mode 100644 apps/api/src/domains/documents/crawling/url-crawling-batch.module.ts create mode 100644 apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts create mode 100644 apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts create mode 100644 apps/api/src/domains/documents/crawling/url-crawling.constants.ts create mode 100644 apps/api/src/domains/documents/crawling/url-crawling.types.ts 
create mode 100644 apps/api/src/domains/documents/crawling/url-crawling.worker.ts diff --git a/apps/api/src/domains/documents/crawling/bull-mq-url-crawling-batch.service.ts b/apps/api/src/domains/documents/crawling/bull-mq-url-crawling-batch.service.ts new file mode 100644 index 00000000..50bfd19d --- /dev/null +++ b/apps/api/src/domains/documents/crawling/bull-mq-url-crawling-batch.service.ts @@ -0,0 +1,20 @@ +import { InjectQueue } from "@nestjs/bullmq" +import { Injectable, Logger } from "@nestjs/common" +import type { Queue } from "bullmq" +import { URL_CRAWLING_JOB_NAME, URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants" +import type { CrawlUrlJobPayload } from "./url-crawling.types" + +@Injectable() +export class BullMqUrlCrawlingBatchService { + private readonly logger = new Logger(BullMqUrlCrawlingBatchService.name) + + constructor( + @InjectQueue(URL_CRAWLING_QUEUE_NAME) + private readonly urlCrawlingQueue: Queue, + ) {} + + async enqueueCrawlUrl(payload: CrawlUrlJobPayload): Promise { + this.logger.log(`Enqueuing URL crawl job ${JSON.stringify(payload)}`) + await this.urlCrawlingQueue.add(URL_CRAWLING_JOB_NAME, payload) + } +} diff --git a/apps/api/src/domains/documents/crawling/url-crawling-batch.interface.ts b/apps/api/src/domains/documents/crawling/url-crawling-batch.interface.ts new file mode 100644 index 00000000..ab95fbd4 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/url-crawling-batch.interface.ts @@ -0,0 +1,7 @@ +import type { CrawlUrlJobPayload } from "./url-crawling.types" + +export const URL_CRAWLING_BATCH_SERVICE = "URL_CRAWLING_BATCH_SERVICE" + +export interface UrlCrawlingBatchService { + enqueueCrawlUrl(payload: CrawlUrlJobPayload): Promise +} diff --git a/apps/api/src/domains/documents/crawling/url-crawling-batch.module.ts b/apps/api/src/domains/documents/crawling/url-crawling-batch.module.ts new file mode 100644 index 00000000..22928f55 --- /dev/null +++ 
b/apps/api/src/domains/documents/crawling/url-crawling-batch.module.ts @@ -0,0 +1,30 @@ +import { BullModule } from "@nestjs/bullmq" +import { Module } from "@nestjs/common" +import { ConfigModule } from "@nestjs/config" +import { getDocumentEmbeddingsBullMqConnection } from "../embeddings/document-embeddings-bullmq.config" +import { BullMqUrlCrawlingBatchService } from "./bull-mq-url-crawling-batch.service" +import { URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants" +import { URL_CRAWLING_BATCH_SERVICE } from "./url-crawling-batch.interface" + +@Module({ + imports: [ + BullModule.forRootAsync({ + imports: [ConfigModule], + useFactory: () => ({ + connection: getDocumentEmbeddingsBullMqConnection(), + }), + }), + BullModule.registerQueue({ + name: URL_CRAWLING_QUEUE_NAME, + }), + ], + providers: [ + BullMqUrlCrawlingBatchService, + { + provide: URL_CRAWLING_BATCH_SERVICE, + useExisting: BullMqUrlCrawlingBatchService, + }, + ], + exports: [URL_CRAWLING_BATCH_SERVICE], +}) +export class UrlCrawlingBatchModule {} diff --git a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts new file mode 100644 index 00000000..e1202623 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts @@ -0,0 +1,69 @@ +import { randomUUID } from "node:crypto" +import { Inject, Injectable, Logger } from "@nestjs/common" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { SpiderClientService } from "@/external/spider/spider-client.service" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { DocumentsService } from "../documents.service" +import { + DOCUMENT_EMBEDDINGS_BATCH_SERVICE, + type DocumentEmbeddingsBatchService, +} from "../embeddings/document-embeddings-batch.interface" +import type { CrawlUrlJobPayload } from "./url-crawling.types" + +@Injectable() +export class 
UrlCrawlingProcessorService { + private readonly logger = new Logger(UrlCrawlingProcessorService.name) + + constructor( + private readonly spiderClientService: SpiderClientService, + private readonly documentsService: DocumentsService, + @Inject(DOCUMENT_EMBEDDINGS_BATCH_SERVICE) + private readonly embeddingsBatchService: DocumentEmbeddingsBatchService, + ) {} + + async processCrawlJob(payload: CrawlUrlJobPayload): Promise { + this.logger.log(`Processing crawl job for ${payload.url} (limit: ${payload.limit})`) + + const pages = await this.spiderClientService.crawlUrl({ + url: payload.url, + limit: payload.limit, + }) + + this.logger.log(`Crawled ${pages.length} pages from ${payload.url}`) + + const connectScope = { + organizationId: payload.organizationId, + projectId: payload.projectId, + } + + for (const page of pages) { + const documentId = randomUUID() + + const document = await this.documentsService.createDocument({ + connectScope, + documentId, + uploadStatus: "uploaded", + fields: { + title: page.url, + content: page.markdown, + mimeType: "text/html", + sourceType: "webCrawl", + size: Buffer.byteLength(page.markdown, "utf-8"), + fileName: null as unknown as string, + storageRelativePath: null as unknown as string, + }, + }) + + await this.embeddingsBatchService.enqueueCreateEmbeddingsForDocument({ + documentId: document.id, + organizationId: payload.organizationId, + projectId: payload.projectId, + uploadedByUserId: payload.requestedByUserId, + origin: "web-crawl", + currentTraceId: payload.currentTraceId, + }) + + this.logger.log(`Created document ${document.id} for page ${page.url}`) + } + } +} diff --git a/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts b/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts new file mode 100644 index 00000000..0c80dd9e --- /dev/null +++ b/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts @@ -0,0 +1,37 @@ +import { BullModule } from "@nestjs/bullmq" +import { 
Module } from "@nestjs/common" +import { ConfigModule } from "@nestjs/config" +import { TypeOrmModule } from "@nestjs/typeorm" +import { ALL_ENTITIES } from "@/common/all-entities" +import { SpiderClientService } from "@/external/spider/spider-client.service" +import { DocumentsService } from "../documents.service" +import { DocumentEmbeddingsBatchModule } from "../embeddings/document-embeddings-batch.module" +import { getDocumentEmbeddingsBullMqConnection } from "../embeddings/document-embeddings-bullmq.config" +import { DocumentTagsService } from "../tags/document-tags.service" +import { URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants" +import { UrlCrawlingWorker } from "./url-crawling.worker" +import { UrlCrawlingProcessorService } from "./url-crawling-processor.service" + +@Module({ + imports: [ + BullModule.forRootAsync({ + imports: [ConfigModule], + useFactory: () => ({ + connection: getDocumentEmbeddingsBullMqConnection(), + }), + }), + BullModule.registerQueue({ + name: URL_CRAWLING_QUEUE_NAME, + }), + TypeOrmModule.forFeature(ALL_ENTITIES), + DocumentEmbeddingsBatchModule, + ], + providers: [ + UrlCrawlingWorker, + UrlCrawlingProcessorService, + SpiderClientService, + DocumentsService, + DocumentTagsService, + ], +}) +export class UrlCrawlingWorkersModule {} diff --git a/apps/api/src/domains/documents/crawling/url-crawling.constants.ts b/apps/api/src/domains/documents/crawling/url-crawling.constants.ts new file mode 100644 index 00000000..c0b4010a --- /dev/null +++ b/apps/api/src/domains/documents/crawling/url-crawling.constants.ts @@ -0,0 +1,5 @@ +const DEFAULT_URL_CRAWLING_QUEUE_NAME = "url-crawling" + +export const URL_CRAWLING_QUEUE_NAME = + process.env.URL_CRAWLING_QUEUE_NAME ?? 
DEFAULT_URL_CRAWLING_QUEUE_NAME +export const URL_CRAWLING_JOB_NAME = "crawl-url" diff --git a/apps/api/src/domains/documents/crawling/url-crawling.types.ts b/apps/api/src/domains/documents/crawling/url-crawling.types.ts new file mode 100644 index 00000000..ba4d3a59 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/url-crawling.types.ts @@ -0,0 +1,8 @@ +export type CrawlUrlJobPayload = { + url: string + limit: number + organizationId: string + projectId: string + requestedByUserId: string + currentTraceId: string +} diff --git a/apps/api/src/domains/documents/crawling/url-crawling.worker.ts b/apps/api/src/domains/documents/crawling/url-crawling.worker.ts new file mode 100644 index 00000000..c5186138 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/url-crawling.worker.ts @@ -0,0 +1,42 @@ +import { OnWorkerEvent, Processor, WorkerHost } from "@nestjs/bullmq" +import { Logger } from "@nestjs/common" +import type { Job } from "bullmq" +import { URL_CRAWLING_JOB_NAME, URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants" +import type { CrawlUrlJobPayload } from "./url-crawling.types" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { UrlCrawlingProcessorService } from "./url-crawling-processor.service" + +@Processor(URL_CRAWLING_QUEUE_NAME) +export class UrlCrawlingWorker extends WorkerHost { + private readonly logger = new Logger(UrlCrawlingWorker.name) + + constructor(private readonly crawlingProcessorService: UrlCrawlingProcessorService) { + super() + } + + async process(job: Job): Promise { + if (job.name !== URL_CRAWLING_JOB_NAME) { + return + } + + await this.crawlingProcessorService.processCrawlJob(job.data) + } + + @OnWorkerEvent("active") + onActive(job: Job): void { + this.logger.log(`Job active: ${job.name} (${job.id})`) + } + + @OnWorkerEvent("completed") + onCompleted(job: Job): void { + this.logger.log(`Job completed: ${job.name} (${job.id})`) + } + + @OnWorkerEvent("failed") + onFailed(job: Job 
| undefined, error: Error): void { + this.logger.error( + `Job failed: ${job?.name ?? "unknown"} (${job?.id ?? "unknown"})`, + error.stack, + ) + } +} From 1d52255a21d3cc4b6a7a1ef8d9a9f5464c29c368 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 16:51:39 +0200 Subject: [PATCH 05/58] feat(documents): wire crawling into documents service and embeddings pipeline --- .../domains/documents/documents.controller.ts | 39 +++++++++++++++++++ .../src/domains/documents/documents.module.ts | 2 + .../domains/documents/documents.service.ts | 4 +- .../document-embeddings-processor.service.ts | 6 +++ .../embeddings/document-embeddings.types.ts | 2 +- 5 files changed, 51 insertions(+), 2 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index d9d9579a..84fb3587 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -46,6 +46,10 @@ import type { MulterFile } from "@/common/types" import { TrackActivity } from "@/domains/activities/track-activity.decorator" import { JwtAuthGuard } from "@/domains/auth/jwt-auth.guard" import { UserGuard } from "@/domains/users/user.guard" +import { + URL_CRAWLING_BATCH_SERVICE, + type UrlCrawlingBatchService, +} from "./crawling/url-crawling-batch.interface" import type { Document } from "./document.entity" import { DocumentsGuard } from "./documents.guard" import { @@ -73,6 +77,8 @@ export class DocumentsController { private readonly fileStorageService: IFileStorage, @Inject(DOCUMENT_EMBEDDINGS_BATCH_SERVICE) private readonly documentEmbeddingsBatchService: DocumentEmbeddingsBatchService, + @Inject(URL_CRAWLING_BATCH_SERVICE) + private readonly urlCrawlingBatchService: UrlCrawlingBatchService, private readonly documentsService: DocumentsService, private readonly documentEmbeddingStatusStreamService: DocumentEmbeddingStatusStreamService, ) {} @@ -357,6 +363,39 @@ export 
class DocumentsController { return { data: { url } } } + @CheckPolicy((policy) => policy.canCreate()) + @Post(DocumentsRoutes.crawlUrl.path) + @TrackActivity({ action: "document.crawlUrl" }) + @HttpCode(HttpStatus.ACCEPTED) + async crawlUrl( + @Body() { payload }: typeof DocumentsRoutes.crawlUrl.request, + @Request() req: EndpointRequestWithProject, + ): Promise { + try { + new URL(payload.url) + } catch { + throw new UnprocessableEntityException("Invalid URL.") + } + + const limit = Math.min(Math.max(payload.limit ?? 10, 1), 50) + const connectScope = getRequiredConnectScope(req) + + await this.urlCrawlingBatchService.enqueueCrawlUrl({ + url: payload.url, + limit, + organizationId: connectScope.organizationId, + projectId: connectScope.projectId, + requestedByUserId: req.user.id, + currentTraceId: v4(), + }) + + return { + data: { + message: `Crawling ${payload.url} (up to ${limit} pages). Documents will appear as they are processed.`, + }, + } + } + @CheckPolicy((policy) => policy.canList()) @Sse(DocumentsRoutes.streamEmbeddingStatus.path, { method: 0 /* GET */ }) streamEmbeddingStatus( diff --git a/apps/api/src/domains/documents/documents.module.ts b/apps/api/src/domains/documents/documents.module.ts index 458917de..7ae2a2d6 100644 --- a/apps/api/src/domains/documents/documents.module.ts +++ b/apps/api/src/domains/documents/documents.module.ts @@ -14,6 +14,7 @@ import { ProjectMembership } from "@/domains/projects/memberships/project-member import { Project } from "@/domains/projects/project.entity" import { ProjectsModule } from "@/domains/projects/projects.module" import { UsersModule } from "@/domains/users/users.module" +import { UrlCrawlingBatchModule } from "./crawling/url-crawling-batch.module" import { Document } from "./document.entity" import { DocumentsController } from "./documents.controller" import { DocumentsGuard } from "./documents.guard" @@ -55,6 +56,7 @@ import { DocumentTagsModule } from "./tags/document-tags.module" AuthModule, 
StorageModule, DocumentEmbeddingsBatchModule, + UrlCrawlingBatchModule, ], providers: [ DocumentsService, diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 5aae1055..2d7b5f96 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -30,7 +30,8 @@ export class DocumentsService { fields: Pick< Document, "fileName" | "mimeType" | "size" | "storageRelativePath" | "title" | "sourceType" - > + > & + Partial> uploadStatus: "pending" | "uploaded" tagIds?: string[] }): Promise { @@ -42,6 +43,7 @@ export class DocumentsService { storageRelativePath: fields.storageRelativePath, title: fields.title ?? fields.fileName, sourceType: fields.sourceType, + content: fields.content, uploadStatus, }) diff --git a/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts b/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts index 3d5369bf..b44763b2 100644 --- a/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts +++ b/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts @@ -93,6 +93,12 @@ export class DocumentEmbeddingsProcessorService { chunks: string[] extractionEngine: DocumentExtractionEngine }> { + if (document.content && !document.storageRelativePath) { + const chunks = this.splitTextForEmbeddings(document.content) + this.logger.log(`Split document ${document.id} (from content) into ${chunks.length} chunks`) + return { chunks, extractionEngine: "web-crawl" } + } + const fileBuffer = await this.fileStorage.readFile(document.storageRelativePath) const extractionResult = await this.textExtractorService.extract(fileBuffer, document.mimeType) const chunks = extractionResult.chunks ?? 
this.splitTextForEmbeddings(extractionResult.text) diff --git a/apps/api/src/domains/documents/embeddings/document-embeddings.types.ts b/apps/api/src/domains/documents/embeddings/document-embeddings.types.ts index 850f0755..37d50ba5 100644 --- a/apps/api/src/domains/documents/embeddings/document-embeddings.types.ts +++ b/apps/api/src/domains/documents/embeddings/document-embeddings.types.ts @@ -5,7 +5,7 @@ export type CreateDocumentEmbeddingsJobPayload = { organizationId: string projectId: string uploadedByUserId: string - origin: "document-upload" + origin: "document-upload" | "web-crawl" currentTraceId: string } From 7603ff07eaf6048d1f571df05ab1b02d351ba3b3 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 16:53:26 +0200 Subject: [PATCH 06/58] chore(api): register crawling module and add dependencies --- apps/api/package.json | 1 + apps/api/src/workers-app.module.ts | 2 ++ 2 files changed, 3 insertions(+) diff --git a/apps/api/package.json b/apps/api/package.json index 5dd22a36..383f6ffc 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -67,6 +67,7 @@ "@opentelemetry/sdk-metrics": "^2.6.1", "@opentelemetry/sdk-node": "^0.214.0", "@opentelemetry/sdk-trace-base": "^2.6.1", + "@spider-cloud/spider-client": "^0.2.0", "ai": "^6.0.87", "axios": "^1.12.2", "bullmq": "^5.70.2", diff --git a/apps/api/src/workers-app.module.ts b/apps/api/src/workers-app.module.ts index 0f331e0e..f0d7dbf5 100644 --- a/apps/api/src/workers-app.module.ts +++ b/apps/api/src/workers-app.module.ts @@ -5,6 +5,7 @@ import { TypeOrmModule } from "@nestjs/typeorm" import { getBullMqConnection } from "./bullmq.config" import { WorkersHealthModule } from "./common/workers-health/workers-health.module" import typeorm from "./config/typeorm" +import { UrlCrawlingWorkersModule } from "./domains/documents/crawling/url-crawling-workers.module" import { DocumentEmbeddingsWorkersModule } from "./domains/documents/embeddings/document-embeddings-workers.module" import { 
StorageModule } from "./domains/documents/storage/storage.module" import { EvaluationExtractionRunWorkersModule } from "./domains/evaluations/extraction/runs/evaluation-extraction-run-workers.module" @@ -28,6 +29,7 @@ import { EvaluationExtractionRunWorkersModule } from "./domains/evaluations/extr }), DocumentEmbeddingsWorkersModule, EvaluationExtractionRunWorkersModule, + UrlCrawlingWorkersModule, StorageModule, WorkersHealthModule, ], From 02a119bf312cb3894c89b34a52cd0c76b060fc40 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 16 Apr 2026 17:10:06 +0200 Subject: [PATCH 07/58] fix(documents): update document service and chunk retrieval for crawl sources --- apps/api/src/domains/documents/documents.service.ts | 5 ++++- .../documents/embeddings/document-chunk-retrieval.service.ts | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 2d7b5f96..251f06b8 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -80,7 +80,10 @@ export class DocumentsService { async listDocuments(connectScope: RequiredConnectScope): Promise { return ( await this.documentConnectRepository.find(connectScope, { - where: { sourceType: "project", uploadStatus: "uploaded" }, + where: [ + { sourceType: "project", uploadStatus: "uploaded" }, + { sourceType: "webCrawl", uploadStatus: "uploaded" }, + ], relations: ["tags"], }) )?.sort(this.sortNewestFirst) diff --git a/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts b/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts index 925f5f2e..e08d4d0e 100644 --- a/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts +++ b/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts @@ -114,7 +114,9 @@ export class DocumentChunkRetrievalService { 
.andWhere("document.embedding_status = :embeddingStatus", { embeddingStatus: "completed", }) - .andWhere("document.source_type = :projectSourceType", { projectSourceType: "project" }) + .andWhere("document.source_type IN (:...allowedSourceTypes)", { + allowedSourceTypes: ["project", "webCrawl"], + }) .andWhere("chunk.deleted_at IS NULL") .andWhere("embedding.deleted_at IS NULL") .andWhere("document.deleted_at IS NULL") From d923591b8daf0a910236a68a20a80d4f05ff29b8 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 11:54:51 +0200 Subject: [PATCH 08/58] feat(documents): ensure chunks never span across crawled pages --- .../url-crawling-processor.service.ts | 58 ++++++++++--------- .../document-embeddings-processor.service.ts | 11 +++- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts index e1202623..599a4e6c 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts @@ -36,34 +36,40 @@ export class UrlCrawlingProcessorService { projectId: payload.projectId, } - for (const page of pages) { - const documentId = randomUUID() + const contentPages = pages.map((page) => ({ + url: page.url, + markdown: page.markdown, + })) + const contentJson = JSON.stringify(contentPages) - const document = await this.documentsService.createDocument({ - connectScope, - documentId, - uploadStatus: "uploaded", - fields: { - title: page.url, - content: page.markdown, - mimeType: "text/html", - sourceType: "webCrawl", - size: Buffer.byteLength(page.markdown, "utf-8"), - fileName: null as unknown as string, - storageRelativePath: null as unknown as string, - }, - }) + const documentId = randomUUID() - await this.embeddingsBatchService.enqueueCreateEmbeddingsForDocument({ - documentId: document.id, - 
organizationId: payload.organizationId, - projectId: payload.projectId, - uploadedByUserId: payload.requestedByUserId, - origin: "web-crawl", - currentTraceId: payload.currentTraceId, - }) + const document = await this.documentsService.createDocument({ + connectScope, + documentId, + uploadStatus: "uploaded", + fields: { + title: payload.url, + content: contentJson, + mimeType: "text/html", + sourceType: "webCrawl", + size: Buffer.byteLength(contentJson, "utf-8"), + fileName: null as unknown as string, + storageRelativePath: null as unknown as string, + }, + }) - this.logger.log(`Created document ${document.id} for page ${page.url}`) - } + await this.embeddingsBatchService.enqueueCreateEmbeddingsForDocument({ + documentId: document.id, + organizationId: payload.organizationId, + projectId: payload.projectId, + uploadedByUserId: payload.requestedByUserId, + origin: "web-crawl", + currentTraceId: payload.currentTraceId, + }) + + this.logger.log( + `Created document ${document.id} from ${pages.length} pages crawled at ${payload.url}`, + ) } } diff --git a/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts b/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts index b44763b2..00fb9d79 100644 --- a/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts +++ b/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts @@ -94,7 +94,7 @@ export class DocumentEmbeddingsProcessorService { extractionEngine: DocumentExtractionEngine }> { if (document.content && !document.storageRelativePath) { - const chunks = this.splitTextForEmbeddings(document.content) + const chunks = this.splitWebCrawlContent(document.content) this.logger.log(`Split document ${document.id} (from content) into ${chunks.length} chunks`) return { chunks, extractionEngine: "web-crawl" } } @@ -121,6 +121,15 @@ export class DocumentEmbeddingsProcessorService { .filter((chunk) => chunk.length > 0) } + 
private splitWebCrawlContent(content: string): string[] { + try { + const pages: { url: string; markdown: string }[] = JSON.parse(content) + return pages.flatMap((page) => this.splitTextForEmbeddings(page.markdown)) + } catch { + return this.splitTextForEmbeddings(content) + } + } + private async generateEmbeddingsByModel(chunks: string[]): Promise> { const { project, location } = resolveVertexConfig() const embeddingModelNames = resolveEmbeddingModelNames() From e569a7251e6ed159259f15b062db02091ca0a247 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 11:55:56 +0200 Subject: [PATCH 09/58] fix(spider): flatten nested array response from Spider API --- apps/api/src/external/spider/spider-client.service.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/external/spider/spider-client.service.ts b/apps/api/src/external/spider/spider-client.service.ts index bfb4e49e..f4827d11 100644 --- a/apps/api/src/external/spider/spider-client.service.ts +++ b/apps/api/src/external/spider/spider-client.service.ts @@ -28,7 +28,14 @@ export class SpiderClientService { return [] } - const pages = response + // Spider may return a nested array — flatten it + const flatResponse = response.flat() + + this.logger.debug( + `Spider flat response: ${flatResponse.length} items, keys: ${flatResponse.length > 0 && flatResponse[0] ? Object.keys(flatResponse[0]).join(", ") : "N/A"}`, + ) + + const pages = flatResponse .filter((page) => page.content && page.content.trim().length > 0) .map((page) => ({ url: page.url ?? 
params.url, From 647990156dcb2a8abda71644357f0a1c5cc0cc49 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 12:50:52 +0200 Subject: [PATCH 10/58] feat(documents): add crawled sub-pages dropdown in documents list --- .../documents/documents-stream-events.spec.ts | 1 + .../documents/external/documents.api.ts | 1 + apps/web/src/studio/routes/DocumentsRoute.tsx | 103 ++++++++++++++---- 3 files changed, 86 insertions(+), 19 deletions(-) diff --git a/apps/web/src/studio/features/documents/documents-stream-events.spec.ts b/apps/web/src/studio/features/documents/documents-stream-events.spec.ts index 99dc8dad..d39c391b 100644 --- a/apps/web/src/studio/features/documents/documents-stream-events.spec.ts +++ b/apps/web/src/studio/features/documents/documents-stream-events.spec.ts @@ -17,6 +17,7 @@ function buildDocument(documentId: string): Document { mimeType: MimeTypes.pdf, size: 123, storageRelativePath: "/documents/file.pdf", + sourceType: "project", embeddingStatus: "processing", embeddingError: null, tagIds: [], diff --git a/apps/web/src/studio/features/documents/external/documents.api.ts b/apps/web/src/studio/features/documents/external/documents.api.ts index 77995a76..61217998 100644 --- a/apps/web/src/studio/features/documents/external/documents.api.ts +++ b/apps/web/src/studio/features/documents/external/documents.api.ts @@ -148,6 +148,7 @@ function toDocument(dto: DocumentDto): Document { projectId: dto.projectId, size: dto.size, storageRelativePath: dto.storageRelativePath, + sourceType: dto.sourceType, sourceUrl: dto.sourceUrl, embeddingStatus: dto.embeddingStatus, embeddingError: dto.embeddingError ?? 
null, diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index b4235083..7045d17a 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -27,11 +27,15 @@ import { TableHeader, TableRow, } from "@caseai-connect/ui/shad/table" +import { Collapsible, CollapsibleTrigger } from "@caseai-connect/ui/shad/collapsible" import { + ChevronDownIcon, ChevronRightIcon, CloudAlertIcon, EllipsisVerticalIcon, + ExternalLinkIcon, FileDownIcon, + GlobeIcon, InfoIcon, Loader2Icon, PencilIcon, @@ -149,6 +153,19 @@ function WithData({ ) } +function parseCrawledPages(content?: string): { url: string; markdown: string }[] | null { + if (!content) return null + try { + const parsed = JSON.parse(content) + if (Array.isArray(parsed) && parsed.length > 0 && parsed[0].url && parsed[0].markdown) { + return parsed + } + } catch { + // not JSON, not a crawl document + } + return null +} + function DocumentRow({ document, documentTags, @@ -157,27 +174,75 @@ function DocumentRow({ documentTags: DocumentTag[] }) { const date = buildSince(document.updatedAt) + const crawledPages = + document.sourceType === "webCrawl" ? parseCrawledPages(document.content) : null + + const [isOpen, setIsOpen] = useState(false) return ( - - {document.title} - -
- {document.tagIds.map((tagId) => ( - - {getTagFullPath(documentTags, tagId)} - - ))} -
-
- - - - {date} - - - -
+ <> + + +
+ {crawledPages && crawledPages.length > 0 ? ( + + + + + + ) : null} +
+ {crawledPages ? : null} + {document.title} + {crawledPages ? ( + + {crawledPages.length} pages + + ) : null} +
+
+
+ +
+ {document.tagIds.map((tagId) => ( + + {getTagFullPath(documentTags, tagId)} + + ))} +
+
+ + + + {date} + + + +
+ {crawledPages && isOpen + ? crawledPages.map((page) => ( + + + + + {page.url} + + + + )) + : null} + ) } From 9f7fe950f9943f93c0bf15db2feee4c63074b51f Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 12:51:56 +0200 Subject: [PATCH 11/58] feat(documents): expose sourceType in document API response --- apps/api/src/domains/documents/documents.controller.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 84fb3587..b4700a2a 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -427,6 +427,7 @@ function toDocumentDto(entity: Document): DocumentDto { mimeType: entity.mimeType as MimeTypes, size: entity.size, storageRelativePath: entity.storageRelativePath, + sourceType: entity.sourceType, embeddingStatus: entity.embeddingStatus, embeddingError: entity.embeddingError ?? null, tagIds: entity.tags?.map((tag) => tag.id) || [], From 36a1bc4ea9fa1d15357e0238f66633c97a1edd90 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 12:52:50 +0200 Subject: [PATCH 12/58] feat(documents): add sourceType to DocumentDto --- packages/api-contracts/src/documents/documents.dto.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/api-contracts/src/documents/documents.dto.ts b/packages/api-contracts/src/documents/documents.dto.ts index f1f04f68..203e853b 100644 --- a/packages/api-contracts/src/documents/documents.dto.ts +++ b/packages/api-contracts/src/documents/documents.dto.ts @@ -52,6 +52,7 @@ export type DocumentDto = { mimeType?: MimeTypes size?: number storageRelativePath?: string + sourceType: DocumentSourceType sourceUrl?: string | null embeddingStatus: DocumentEmbeddingStatus embeddingError: string | null From 99c03249db445203f75a7bd01b9193f715b1bd17 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 13:26:11 +0200 Subject: [PATCH 
13/58] feat(documents): create crawl document eagerly so it appears instantly in the UI --- .../url-crawling-processor.service.ts | 23 +++++-------------- .../documents/crawling/url-crawling.types.ts | 1 + .../domains/documents/documents.controller.ts | 16 +++++++++++++ .../domains/documents/documents.service.ts | 20 ++++++++++++++++ 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts index 599a4e6c..de255c2e 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts @@ -1,4 +1,3 @@ -import { randomUUID } from "node:crypto" import { Inject, Injectable, Logger } from "@nestjs/common" // biome-ignore lint/style/useImportType: Required at runtime for NestJS DI import { SpiderClientService } from "@/external/spider/spider-client.service" @@ -42,25 +41,15 @@ export class UrlCrawlingProcessorService { })) const contentJson = JSON.stringify(contentPages) - const documentId = randomUUID() - - const document = await this.documentsService.createDocument({ + await this.documentsService.updateContent({ connectScope, - documentId, - uploadStatus: "uploaded", - fields: { - title: payload.url, - content: contentJson, - mimeType: "text/html", - sourceType: "webCrawl", - size: Buffer.byteLength(contentJson, "utf-8"), - fileName: null as unknown as string, - storageRelativePath: null as unknown as string, - }, + documentId: payload.documentId, + content: contentJson, + size: Buffer.byteLength(contentJson, "utf-8"), }) await this.embeddingsBatchService.enqueueCreateEmbeddingsForDocument({ - documentId: document.id, + documentId: payload.documentId, organizationId: payload.organizationId, projectId: payload.projectId, uploadedByUserId: payload.requestedByUserId, @@ -69,7 +58,7 @@ export class UrlCrawlingProcessorService 
{ }) this.logger.log( - `Created document ${document.id} from ${pages.length} pages crawled at ${payload.url}`, + `Updated document ${payload.documentId} with ${pages.length} pages crawled at ${payload.url}`, ) } } diff --git a/apps/api/src/domains/documents/crawling/url-crawling.types.ts b/apps/api/src/domains/documents/crawling/url-crawling.types.ts index ba4d3a59..1f1cc8ff 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling.types.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling.types.ts @@ -1,4 +1,5 @@ export type CrawlUrlJobPayload = { + documentId: string url: string limit: number organizationId: string diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index b4700a2a..0c2cc858 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -380,7 +380,23 @@ export class DocumentsController { const limit = Math.min(Math.max(payload.limit ?? 
10, 1), 50) const connectScope = getRequiredConnectScope(req) + const documentId = v4() + await this.documentsService.createDocument({ + connectScope, + documentId, + uploadStatus: "uploaded", + fields: { + title: payload.url, + mimeType: "text/html", + sourceType: "webCrawl", + size: 0, + fileName: null as unknown as string, + storageRelativePath: null as unknown as string, + }, + }) + await this.urlCrawlingBatchService.enqueueCrawlUrl({ + documentId, url: payload.url, limit, organizationId: connectScope.organizationId, diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 251f06b8..78154997 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -156,6 +156,26 @@ export class DocumentsService { return this.documentConnectRepository.saveOne(document) } + async updateContent({ + connectScope, + documentId, + content, + size, + }: { + connectScope: RequiredConnectScope + documentId: string + content: string + size: number + }): Promise { + const document = await this.documentConnectRepository.getOneById(connectScope, documentId) + if (!document) { + throw new NotFoundException(`Document with id ${documentId} not found`) + } + document.content = content + document.size = size + return this.documentConnectRepository.saveOne(document) + } + async saveOne(document: Document): Promise { return this.documentConnectRepository.saveOne(document) } From f05f4cfc13c13119324dcfa3e0b64dffd20c4be8 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Fri, 17 Apr 2026 13:29:49 +0200 Subject: [PATCH 14/58] feat(documents): show globe icon immediately and refetch on crawl completion for dropdown --- .../features/documents/documents.middleware.ts | 17 ++++++++++++++++- apps/web/src/studio/routes/DocumentsRoute.tsx | 11 ++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git 
a/apps/web/src/studio/features/documents/documents.middleware.ts b/apps/web/src/studio/features/documents/documents.middleware.ts index ffbe91ed..77bb8ba6 100644 --- a/apps/web/src/studio/features/documents/documents.middleware.ts +++ b/apps/web/src/studio/features/documents/documents.middleware.ts @@ -1,6 +1,7 @@ import { createListenerMiddleware, isAnyOf } from "@reduxjs/toolkit" import { notificationsActions } from "@/common/features/notifications/notifications.slice" import { hasProjectChanged } from "@/common/features/projects/projects.selectors" +import { ADS } from "@/common/store/async-data-status" import type { AppDispatch, RootState } from "@/common/store/types" import { createDocumentTag, @@ -53,8 +54,22 @@ function registerListeners() { listenerMiddleware.startListening({ actionCreator: documentsActions.patchDocumentEmbeddingStatus, - effect: async (_, listenerApi) => { + effect: async (action, listenerApi) => { syncDocumentEmbeddingStatusStreamWithDocuments(listenerApi) + + // Refetch documents when a webCrawl document finishes embedding + // so the content (crawled pages) is available for the dropdown + if (action.payload.embeddingStatus === "completed") { + const state = listenerApi.getState() + if (ADS.isFulfilled(state.studio.documents.data)) { + const document = state.studio.documents.data.value.find( + (document) => document.id === action.payload.documentId, + ) + if (document?.sourceType === "webCrawl") { + listenerApi.dispatch(listDocuments()) + } + } + } }, }) diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index 7045d17a..602add5b 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -174,8 +174,9 @@ function DocumentRow({ documentTags: DocumentTag[] }) { const date = buildSince(document.updatedAt) - const crawledPages = - document.sourceType === "webCrawl" ? 
parseCrawledPages(document.content) : null + const isWebCrawl = document.sourceType === "webCrawl" + const crawledPages = isWebCrawl ? parseCrawledPages(document.content) : null + const hasPages = crawledPages && crawledPages.length > 0 const [isOpen, setIsOpen] = useState(false) @@ -184,7 +185,7 @@ function DocumentRow({
- {crawledPages && crawledPages.length > 0 ? ( + {hasPages ? ( - - - - {sources.map((source) => ( -
- {source.chunks.map((chunk) => ( - - {chunk.partialContent} - - ))} -
- ))} + + {sources.length > 1 && } + {sources.map((source) => { + const isWebCrawl = source.documentSourceType === "webCrawl" + return ( +
+ {source.documentTitle ? ( +
+ {isWebCrawl ? ( + + ) : ( + + )} + {source.documentTitle} +
+ ) : null} + {source.chunks.map((chunk) => ( + + {chunk.partialContent} + + ))} +
+ ) + })}
) From 5d292232fe8323d2428ecb20be1e607904656e6a Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 21 Apr 2026 11:42:06 +0200 Subject: [PATCH 19/58] feat(documents): return documentSourceType in retrieved chunks --- .../documents/embeddings/document-chunk-retrieval.service.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts b/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts index e08d4d0e..80009ce2 100644 --- a/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts +++ b/apps/api/src/domains/documents/embeddings/document-chunk-retrieval.service.ts @@ -13,6 +13,7 @@ export type RetrievedDocumentChunk = { documentId: string documentTitle: string documentFileName: string | null + documentSourceType: string chunkIndex: number content: string distance: number @@ -97,6 +98,7 @@ export class DocumentChunkRetrievalService { .addSelect("chunk.document_id", "documentId") .addSelect("document.title", "documentTitle") .addSelect("document.file_name", "documentFileName") + .addSelect("document.source_type", "documentSourceType") .addSelect("chunk.chunk_index", "chunkIndex") .addSelect("chunk.content", "content") .addSelect("embedding.model_name", "modelName") From e45415b04d9fa053be6c337fab0d574d0d052a42 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 11:19:28 +0200 Subject: [PATCH 20/58] feat(documents): crawl full websites and surface crawl failures --- .../url-crawling-processor.service.ts | 85 ++++++++++++------- .../crawling/url-crawling-workers.module.ts | 6 +- .../documents/crawling/url-crawling.types.ts | 1 - .../web-source-embeddings-batch.module.ts | 2 +- .../web-source-embeddings-workers.module.ts | 6 +- 5 files changed, 62 insertions(+), 38 deletions(-) diff --git a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts 
index e0c3e16c..f3bd1883 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts @@ -3,11 +3,13 @@ import { Inject, Injectable, Logger } from "@nestjs/common" import { SpiderClientService } from "@/external/spider/spider-client.service" // biome-ignore lint/style/useImportType: Required at runtime for NestJS DI import { DocumentsService } from "../documents.service" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { DocumentEmbeddingStatusNotifierService } from "../embeddings/document-embedding-status-notifier.service" +import type { CrawlUrlJobPayload } from "./url-crawling.types" import { WEB_SOURCE_EMBEDDINGS_BATCH_SERVICE, type WebSourceEmbeddingsBatchService, } from "./web-source-embeddings-batch.interface" -import type { CrawlUrlJobPayload } from "./url-crawling.types" @Injectable() export class UrlCrawlingProcessorService { @@ -16,49 +18,70 @@ export class UrlCrawlingProcessorService { constructor( private readonly spiderClientService: SpiderClientService, private readonly documentsService: DocumentsService, + private readonly embeddingStatusNotifierService: DocumentEmbeddingStatusNotifierService, @Inject(WEB_SOURCE_EMBEDDINGS_BATCH_SERVICE) private readonly embeddingsBatchService: WebSourceEmbeddingsBatchService, ) {} async processCrawlJob(payload: CrawlUrlJobPayload): Promise { - this.logger.log(`Processing crawl job for ${payload.url} (limit: ${payload.limit})`) - - const pages = await this.spiderClientService.crawlUrl({ - url: payload.url, - limit: payload.limit, - }) - - this.logger.log(`Crawled ${pages.length} pages from ${payload.url}`) + this.logger.log(`Processing full-site crawl job for ${payload.url}`) const connectScope = { organizationId: payload.organizationId, projectId: payload.projectId, } - const contentPages = pages.map((page) => ({ - url: page.url, - markdown: page.markdown, - })) - const 
contentJson = JSON.stringify(contentPages) + try { + const pages = await this.spiderClientService.crawlUrl({ url: payload.url }) - await this.documentsService.updateContent({ - connectScope, - documentId: payload.documentId, - content: contentJson, - size: Buffer.byteLength(contentJson, "utf-8"), - }) + this.logger.log(`Crawled ${pages.length} pages from ${payload.url}`) - await this.embeddingsBatchService.enqueueCreateEmbeddingsForDocument({ - documentId: payload.documentId, - organizationId: payload.organizationId, - projectId: payload.projectId, - uploadedByUserId: payload.requestedByUserId, - origin: "web-crawl", - currentTraceId: payload.currentTraceId, - }) + const contentPages = pages.map((page) => ({ + url: page.url, + markdown: page.markdown, + })) + const contentJson = JSON.stringify(contentPages) + + await this.documentsService.updateContent({ + connectScope, + documentId: payload.documentId, + content: contentJson, + size: Buffer.byteLength(contentJson, "utf-8"), + }) - this.logger.log( - `Updated document ${payload.documentId} with ${pages.length} pages crawled at ${payload.url}`, - ) + await this.embeddingsBatchService.enqueueCreateEmbeddingsForDocument({ + documentId: payload.documentId, + organizationId: payload.organizationId, + projectId: payload.projectId, + uploadedByUserId: payload.requestedByUserId, + origin: "web-crawl", + currentTraceId: payload.currentTraceId, + }) + + this.logger.log( + `Updated document ${payload.documentId} with ${pages.length} pages crawled at ${payload.url}`, + ) + } catch (error) { + this.logger.error(`Crawl failed for ${payload.url}: ${(error as Error).message}`) + try { + const failed = await this.documentsService.updateEmbeddingStatus({ + connectScope, + documentId: payload.documentId, + status: "failed", + }) + await this.embeddingStatusNotifierService.notifyEmbeddingStatusChanged({ + documentId: failed.id, + organizationId: failed.organizationId, + projectId: failed.projectId, + embeddingStatus: 
failed.embeddingStatus, + updatedAt: failed.updatedAt.getTime(), + }) + } catch (notifyError) { + this.logger.error( + `Failed to mark document ${payload.documentId} as failed: ${(notifyError as Error).message}`, + ) + } + throw error + } } } diff --git a/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts b/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts index 9a3c791b..27ba694d 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts @@ -2,15 +2,16 @@ import { BullModule } from "@nestjs/bullmq" import { Module } from "@nestjs/common" import { ConfigModule } from "@nestjs/config" import { TypeOrmModule } from "@nestjs/typeorm" +import { getBullMqConnection } from "@/bullmq.config" import { ALL_ENTITIES } from "@/common/all-entities" import { SpiderClientService } from "@/external/spider/spider-client.service" import { DocumentsService } from "../documents.service" -import { WebSourceEmbeddingsBatchModule } from "./web-source-embeddings-batch.module" -import { getBullMqConnection } from "@/bullmq.config" +import { DocumentEmbeddingStatusNotifierService } from "../embeddings/document-embedding-status-notifier.service" import { DocumentTagsService } from "../tags/document-tags.service" import { URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants" import { UrlCrawlingWorker } from "./url-crawling.worker" import { UrlCrawlingProcessorService } from "./url-crawling-processor.service" +import { WebSourceEmbeddingsBatchModule } from "./web-source-embeddings-batch.module" @Module({ imports: [ @@ -32,6 +33,7 @@ import { UrlCrawlingProcessorService } from "./url-crawling-processor.service" SpiderClientService, DocumentsService, DocumentTagsService, + DocumentEmbeddingStatusNotifierService, ], }) export class UrlCrawlingWorkersModule {} diff --git a/apps/api/src/domains/documents/crawling/url-crawling.types.ts 
b/apps/api/src/domains/documents/crawling/url-crawling.types.ts index 1f1cc8ff..d9b2a1eb 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling.types.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling.types.ts @@ -1,7 +1,6 @@ export type CrawlUrlJobPayload = { documentId: string url: string - limit: number organizationId: string projectId: string requestedByUserId: string diff --git a/apps/api/src/domains/documents/crawling/web-source-embeddings-batch.module.ts b/apps/api/src/domains/documents/crawling/web-source-embeddings-batch.module.ts index 3148a557..a8a8449a 100644 --- a/apps/api/src/domains/documents/crawling/web-source-embeddings-batch.module.ts +++ b/apps/api/src/domains/documents/crawling/web-source-embeddings-batch.module.ts @@ -1,8 +1,8 @@ import { BullModule } from "@nestjs/bullmq" import { Module } from "@nestjs/common" import { BullMqWebSourceEmbeddingsBatchService } from "./bull-mq-web-source-embeddings-batch.service" -import { WEB_SOURCE_EMBEDDINGS_BATCH_SERVICE } from "./web-source-embeddings-batch.interface" import { WEB_SOURCE_EMBEDDINGS_QUEUE_NAME } from "./web-source-embeddings.constants" +import { WEB_SOURCE_EMBEDDINGS_BATCH_SERVICE } from "./web-source-embeddings-batch.interface" @Module({ imports: [ diff --git a/apps/api/src/domains/documents/crawling/web-source-embeddings-workers.module.ts b/apps/api/src/domains/documents/crawling/web-source-embeddings-workers.module.ts index 464cc8cf..d050c50c 100644 --- a/apps/api/src/domains/documents/crawling/web-source-embeddings-workers.module.ts +++ b/apps/api/src/domains/documents/crawling/web-source-embeddings-workers.module.ts @@ -2,14 +2,14 @@ import { BullModule } from "@nestjs/bullmq" import { Module } from "@nestjs/common" import { ConfigModule } from "@nestjs/config" import { TypeOrmModule } from "@nestjs/typeorm" -import { ALL_ENTITIES } from "@/common/all-entities" import { getBullMqConnection } from "@/bullmq.config" +import { ALL_ENTITIES } from 
"@/common/all-entities" import { DocumentsService } from "../documents.service" -import { StorageModule } from "../storage/storage.module" -import { DocumentTagsService } from "../tags/document-tags.service" import { DocumentEmbeddingStatusNotifierService } from "../embeddings/document-embedding-status-notifier.service" import { DocumentEmbeddingsProcessorService } from "../embeddings/document-embeddings-processor.service" import { DocumentTextExtractorService } from "../embeddings/document-text-extractor.service" +import { StorageModule } from "../storage/storage.module" +import { DocumentTagsService } from "../tags/document-tags.service" import { WEB_SOURCE_EMBEDDINGS_QUEUE_NAME } from "./web-source-embeddings.constants" import { WebSourceEmbeddingsWorker } from "./web-source-embeddings.worker" From 69c821d5022568fd482f00c1f170d7117b5212fb Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 11:20:22 +0200 Subject: [PATCH 21/58] feat(documents): accept crawl requests without a page limit --- .../domains/documents/documents.controller.ts | 4 +--- .../src/domains/documents/documents.service.ts | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 0c2cc858..3e12a1df 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -377,7 +377,6 @@ export class DocumentsController { throw new UnprocessableEntityException("Invalid URL.") } - const limit = Math.min(Math.max(payload.limit ?? 
10, 1), 50) const connectScope = getRequiredConnectScope(req) const documentId = v4() @@ -398,7 +397,6 @@ export class DocumentsController { await this.urlCrawlingBatchService.enqueueCrawlUrl({ documentId, url: payload.url, - limit, organizationId: connectScope.organizationId, projectId: connectScope.projectId, requestedByUserId: req.user.id, @@ -407,7 +405,7 @@ export class DocumentsController { return { data: { - message: `Crawling ${payload.url} (up to ${limit} pages). Documents will appear as they are processed.`, + message: `Crawling ${payload.url}. Documents will appear as they are processed.`, }, } } diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 78154997..a09828e6 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -180,6 +180,23 @@ export class DocumentsService { return this.documentConnectRepository.saveOne(document) } + async updateEmbeddingStatus({ + connectScope, + documentId, + status, + }: { + connectScope: RequiredConnectScope + documentId: string + status: Document["embeddingStatus"] + }): Promise { + const document = await this.documentConnectRepository.getOneById(connectScope, documentId) + if (!document) { + throw new NotFoundException(`Document with id ${documentId} not found`) + } + document.embeddingStatus = status + return this.documentConnectRepository.saveOne(document) + } + async deleteDocument({ connectScope, documentId, From 53fe5b00c800716fb98447e834f67c491097db43 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 11:21:12 +0200 Subject: [PATCH 22/58] feat(spider): unlimited full-site crawl --- apps/api/src/external/spider/spider-client.service.ts | 6 +++--- apps/api/src/external/spider/spider.constants.ts | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/apps/api/src/external/spider/spider-client.service.ts 
b/apps/api/src/external/spider/spider-client.service.ts index f4827d11..fc80d4d1 100644 --- a/apps/api/src/external/spider/spider-client.service.ts +++ b/apps/api/src/external/spider/spider-client.service.ts @@ -11,14 +11,14 @@ export type CrawledPage = { export class SpiderClientService { private readonly logger = new Logger(SpiderClientService.name) - async crawlUrl(params: { url: string; limit: number }): Promise { + async crawlUrl(params: { url: string }): Promise { const apiKey = resolveSpiderApiKey() const spider = new Spider({ apiKey }) - this.logger.log(`Crawling ${params.url} with limit ${params.limit}`) + this.logger.log(`Crawling ${params.url} (full site, no page limit)`) const response = await spider.crawlUrl(params.url, { - limit: params.limit, + limit: 0, return_format: "markdown", metadata: true, }) diff --git a/apps/api/src/external/spider/spider.constants.ts b/apps/api/src/external/spider/spider.constants.ts index 2fcc87eb..879158d1 100644 --- a/apps/api/src/external/spider/spider.constants.ts +++ b/apps/api/src/external/spider/spider.constants.ts @@ -1,6 +1,4 @@ export const SPIDER_API_KEY_ENV = "SPIDER_API_KEY" -export const DEFAULT_CRAWL_LIMIT = 10 -export const MAX_CRAWL_LIMIT = 50 export function resolveSpiderApiKey(): string { const apiKey = process.env[SPIDER_API_KEY_ENV] From b4a9fd16d016a7a763af7013e6cada88e366298e Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 11:22:05 +0200 Subject: [PATCH 23/58] feat(web): full-site crawl with dedicated crawling state --- .../documents/components/CrawlUrlButton.tsx | 14 +---- .../components/DocumentDetailsSheet.tsx | 5 +- .../documents/components/DocumentItem.tsx | 4 +- .../components/EmbeddingStatusBadge.tsx | 60 ++++++++++++++----- .../features/documents/documents.spi.ts | 1 - .../features/documents/documents.thunks.ts | 23 ++++--- .../documents/external/documents.api.ts | 4 +- .../documents/locales/document.en.json | 8 ++- .../documents/locales/document.fr.json | 8 ++- 9 files 
changed, 76 insertions(+), 51 deletions(-) diff --git a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx index 85fca5c1..2bce62f0 100644 --- a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx +++ b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx @@ -37,7 +37,6 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { const dispatch = useAppDispatch() const { t } = useTranslation("document") const [url, setUrl] = useState("") - const [limit, setLimit] = useState(10) const [isSubmitting, setIsSubmitting] = useState(false) const isValidUrl = (() => { @@ -55,7 +54,7 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { setIsSubmitting(true) try { - await dispatch(crawlUrl({ url, limit })).unwrap() + await dispatch(crawlUrl({ url })).unwrap() onSuccess() } finally { setIsSubmitting(false) @@ -81,17 +80,6 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { required /> - - {t("document:crawl.limitLabel")} - setLimit(Number(event.target.value))} - /> -
- + {date} @@ -402,7 +405,10 @@ function DocumentActions({
{t("document:props.embeddingStatus")}: - +
{document.embeddingError && ( Date: Thu, 23 Apr 2026 12:09:10 +0200 Subject: [PATCH 25/58] feat(documents): stream live crawl progress via SSE --- ...ocument-crawl-progress-notifier.service.ts | 29 +++++++++++++++++++ .../document-crawl-progress-stream.service.ts | 16 ++++++++++ .../document-crawl-progress.constants.ts | 3 ++ .../url-crawling-processor.service.ts | 25 +++++++++++++++- .../crawling/url-crawling-workers.module.ts | 2 ++ 5 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/domains/documents/crawling/document-crawl-progress-notifier.service.ts create mode 100644 apps/api/src/domains/documents/crawling/document-crawl-progress-stream.service.ts create mode 100644 apps/api/src/domains/documents/crawling/document-crawl-progress.constants.ts diff --git a/apps/api/src/domains/documents/crawling/document-crawl-progress-notifier.service.ts b/apps/api/src/domains/documents/crawling/document-crawl-progress-notifier.service.ts new file mode 100644 index 00000000..cd21c1e9 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/document-crawl-progress-notifier.service.ts @@ -0,0 +1,29 @@ +import { Injectable } from "@nestjs/common" +import { InjectDataSource } from "@nestjs/typeorm" +import type { DataSource } from "typeorm" +import { PostgresStatusNotifierService } from "@/common/sse/postgres-status-notifier.service" +import { DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL } from "./document-crawl-progress.constants" + +@Injectable() +export class DocumentCrawlProgressNotifierService extends PostgresStatusNotifierService { + constructor(@InjectDataSource() dataSource: DataSource) { + super(dataSource, DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL) + } + + async notifyCrawlProgress(params: { + documentId: string + organizationId: string + projectId: string + pagesCrawled: number + updatedAt: number + }): Promise { + await this.notify({ + type: DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL, + documentId: params.documentId, + organizationId: 
params.organizationId, + projectId: params.projectId, + pagesCrawled: params.pagesCrawled, + updatedAt: params.updatedAt, + }) + } +} diff --git a/apps/api/src/domains/documents/crawling/document-crawl-progress-stream.service.ts b/apps/api/src/domains/documents/crawling/document-crawl-progress-stream.service.ts new file mode 100644 index 00000000..e635fc6c --- /dev/null +++ b/apps/api/src/domains/documents/crawling/document-crawl-progress-stream.service.ts @@ -0,0 +1,16 @@ +import type { DocumentCrawlProgressChangedEventDto } from "@caseai-connect/api-contracts" +import { Injectable } from "@nestjs/common" +import { PostgresStatusStreamService } from "@/common/sse/postgres-status-stream.service" +import { DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL } from "./document-crawl-progress.constants" + +@Injectable() +export class DocumentCrawlProgressStreamService extends PostgresStatusStreamService { + constructor() { + super({ + channel: DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL, + expectedType: DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL, + serviceName: DocumentCrawlProgressStreamService.name, + isExpectedEvent: (payload) => payload.type === DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL, + }) + } +} diff --git a/apps/api/src/domains/documents/crawling/document-crawl-progress.constants.ts b/apps/api/src/domains/documents/crawling/document-crawl-progress.constants.ts new file mode 100644 index 00000000..99ec4109 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/document-crawl-progress.constants.ts @@ -0,0 +1,3 @@ +import { DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO } from "@caseai-connect/api-contracts" + +export const DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL = DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO diff --git a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts index f3bd1883..307ae7f6 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts 
+++ b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts @@ -5,6 +5,8 @@ import { SpiderClientService } from "@/external/spider/spider-client.service" import { DocumentsService } from "../documents.service" // biome-ignore lint/style/useImportType: Required at runtime for NestJS DI import { DocumentEmbeddingStatusNotifierService } from "../embeddings/document-embedding-status-notifier.service" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { DocumentCrawlProgressNotifierService } from "./document-crawl-progress-notifier.service" import type { CrawlUrlJobPayload } from "./url-crawling.types" import { WEB_SOURCE_EMBEDDINGS_BATCH_SERVICE, @@ -19,6 +21,7 @@ export class UrlCrawlingProcessorService { private readonly spiderClientService: SpiderClientService, private readonly documentsService: DocumentsService, private readonly embeddingStatusNotifierService: DocumentEmbeddingStatusNotifierService, + private readonly crawlProgressNotifierService: DocumentCrawlProgressNotifierService, @Inject(WEB_SOURCE_EMBEDDINGS_BATCH_SERVICE) private readonly embeddingsBatchService: WebSourceEmbeddingsBatchService, ) {} @@ -31,8 +34,28 @@ export class UrlCrawlingProcessorService { projectId: payload.projectId, } + let pagesCrawled = 0 + try { - const pages = await this.spiderClientService.crawlUrl({ url: payload.url }) + const pages = await this.spiderClientService.crawlUrl({ + url: payload.url, + onPage: () => { + pagesCrawled += 1 + this.crawlProgressNotifierService + .notifyCrawlProgress({ + documentId: payload.documentId, + organizationId: payload.organizationId, + projectId: payload.projectId, + pagesCrawled, + updatedAt: Date.now(), + }) + .catch((error) => { + this.logger.error( + `Failed to emit crawl progress for ${payload.documentId}: ${(error as Error).message}`, + ) + }) + }, + }) this.logger.log(`Crawled ${pages.length} pages from ${payload.url}`) diff --git 
a/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts b/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts index 27ba694d..7f8e1658 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling-workers.module.ts @@ -8,6 +8,7 @@ import { SpiderClientService } from "@/external/spider/spider-client.service" import { DocumentsService } from "../documents.service" import { DocumentEmbeddingStatusNotifierService } from "../embeddings/document-embedding-status-notifier.service" import { DocumentTagsService } from "../tags/document-tags.service" +import { DocumentCrawlProgressNotifierService } from "./document-crawl-progress-notifier.service" import { URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants" import { UrlCrawlingWorker } from "./url-crawling.worker" import { UrlCrawlingProcessorService } from "./url-crawling-processor.service" @@ -34,6 +35,7 @@ import { WebSourceEmbeddingsBatchModule } from "./web-source-embeddings-batch.mo DocumentsService, DocumentTagsService, DocumentEmbeddingStatusNotifierService, + DocumentCrawlProgressNotifierService, ], }) export class UrlCrawlingWorkersModule {} From 4d15f3aa18aef4d3519670eb7c74e843fc703844 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 12:10:34 +0200 Subject: [PATCH 26/58] feat(web): live crawl progress badge --- .../components/DocumentDetailsSheet.tsx | 4 ++ .../documents/components/DocumentItem.tsx | 9 +++- .../documents/documents-stream-status.ts | 35 ++++++++++++--- .../documents/documents.middleware.ts | 16 +++++++ .../features/documents/documents.models.ts | 6 +++ .../features/documents/documents.selectors.ts | 15 +++++++ .../features/documents/documents.slice.ts | 43 +++++++++++++++++++ .../features/documents/documents.spi.ts | 12 +++++- .../features/documents/documents.thunks.ts | 25 +++++++++++ .../documents/external/documents-streaming.ts | 40 ++++++++++++++++- 
.../documents/external/documents.api.ts | 10 ++++- .../documents/locales/document.en.json | 1 + .../documents/locales/document.fr.json | 1 + 13 files changed, 208 insertions(+), 9 deletions(-) diff --git a/apps/web/src/studio/features/documents/components/DocumentDetailsSheet.tsx b/apps/web/src/studio/features/documents/components/DocumentDetailsSheet.tsx index 55f2fb62..c3ed0143 100644 --- a/apps/web/src/studio/features/documents/components/DocumentDetailsSheet.tsx +++ b/apps/web/src/studio/features/documents/components/DocumentDetailsSheet.tsx @@ -9,10 +9,12 @@ import { import { InfoIcon } from "lucide-react" import { useTranslation } from "react-i18next" import { MarkdownWrapper } from "@/common/features/agents/agent-sessions/shared/agent-session-messages/components/MarkdownWrapper" +import { useAppSelector } from "@/common/store/hooks" import { buildDate } from "@/common/utils/build-date" import { DocumentTagItem } from "@/studio/features/document-tags/components/DocumentTagItem" import type { DocumentTag } from "@/studio/features/document-tags/document-tags.models" import type { Document } from "@/studio/features/documents/documents.models" +import { selectCrawlProgressByDocumentId } from "@/studio/features/documents/documents.selectors" import { EmbeddingStatusBadge } from "./EmbeddingStatusBadge" export function DocumentDetailsSheet({ @@ -23,6 +25,7 @@ export function DocumentDetailsSheet({ documentTags: DocumentTag[] }) { const { t } = useTranslation("document", { keyPrefix: "props" }) + const pagesCrawled = useAppSelector(selectCrawlProgressByDocumentId)[document.id] return ( @@ -47,6 +50,7 @@ export function DocumentDetailsSheet({
{document.embeddingError && ( diff --git a/apps/web/src/studio/features/documents/components/DocumentItem.tsx b/apps/web/src/studio/features/documents/components/DocumentItem.tsx index 24a4bc9f..78ddc333 100644 --- a/apps/web/src/studio/features/documents/components/DocumentItem.tsx +++ b/apps/web/src/studio/features/documents/components/DocumentItem.tsx @@ -1,7 +1,9 @@ import { GridItem } from "@/common/components/grid/Grid" +import { useAppSelector } from "@/common/store/hooks" import { buildSince } from "@/common/utils/build-date" import type { DocumentTag } from "@/studio/features/document-tags/document-tags.models" import type { Document } from "@/studio/features/documents/documents.models" +import { selectCrawlProgressByDocumentId } from "@/studio/features/documents/documents.selectors" import { DocumentDeletor } from "./DocumentDeletor" import { DocumentDetailsSheet } from "./DocumentDetailsSheet" import { DocumentEditor } from "./DocumentEditor" @@ -18,12 +20,17 @@ export function DocumentItem({ documentTags: DocumentTag[] }) { const date = buildSince(document.updatedAt) + const pagesCrawled = useAppSelector(selectCrawlProgressByDocumentId)[document.id] return ( + } title={
{document.title}
} description={date} diff --git a/apps/web/src/studio/features/documents/documents-stream-status.ts b/apps/web/src/studio/features/documents/documents-stream-status.ts index 6f143056..07b3eca6 100644 --- a/apps/web/src/studio/features/documents/documents-stream-status.ts +++ b/apps/web/src/studio/features/documents/documents-stream-status.ts @@ -1,10 +1,16 @@ import { createStreamStatusManager } from "@/common/sse/stream-status-manager" import type { AppDispatch, RootState } from "@/common/store/types" import { + selectHasDocumentsCrawling, selectHasDocumentsInProgress, + selectIsCrawlProgressStreamActive, selectIsEmbeddingStatusStreamActive, } from "./documents.selectors" -import { listDocuments, streamDocumentEmbeddingStatuses } from "./documents.thunks" +import { + listDocuments, + streamDocumentCrawlProgresses, + streamDocumentEmbeddingStatuses, +} from "./documents.thunks" type AbortableStreamTask = { abort: () => void; unwrap: () => Promise } type StreamListenerApi = { @@ -12,7 +18,7 @@ type StreamListenerApi = { getState: () => RootState } -const manager = createStreamStatusManager({ +const embeddingManager = createStreamStatusManager({ selectIsStreamActive: selectIsEmbeddingStatusStreamActive, selectHasItemsInProgress: selectHasDocumentsInProgress, dispatchStreamThunk: (listenerApi) => @@ -20,14 +26,27 @@ const manager = createStreamStatusManager({ dispatchRefresh: (listenerApi) => listenerApi.dispatch(listDocuments()), }) +const crawlProgressManager = createStreamStatusManager({ + selectIsStreamActive: selectIsCrawlProgressStreamActive, + selectHasItemsInProgress: selectHasDocumentsCrawling, + dispatchStreamThunk: (listenerApi) => + listenerApi.dispatch(streamDocumentCrawlProgresses()) as unknown as AbortableStreamTask, + dispatchRefresh: (listenerApi) => listenerApi.dispatch(listDocuments()), +}) + export function stopDocumentEmbeddingStatusStream() { - manager.stop() + embeddingManager.stop() +} + +export function stopDocumentCrawlProgressStream() { + 
crawlProgressManager.stop() } export function syncDocumentEmbeddingStatusStreamWithDocuments( listenerApi: StreamListenerApi, ): void { - manager.sync(listenerApi) + embeddingManager.sync(listenerApi) + crawlProgressManager.sync(listenerApi) } export async function handleDocumentsContextChanged(listenerApi: StreamListenerApi): Promise { @@ -38,5 +57,11 @@ export async function handleDocumentsContextChanged(listenerApi: StreamListenerA export async function startDocumentEmbeddingStatusStream( listenerApi: StreamListenerApi, ): Promise { - await manager.start(listenerApi) + await embeddingManager.start(listenerApi) +} + +export async function startDocumentCrawlProgressStream( + listenerApi: StreamListenerApi, +): Promise { + await crawlProgressManager.start(listenerApi) } diff --git a/apps/web/src/studio/features/documents/documents.middleware.ts b/apps/web/src/studio/features/documents/documents.middleware.ts index 77bb8ba6..7ec78203 100644 --- a/apps/web/src/studio/features/documents/documents.middleware.ts +++ b/apps/web/src/studio/features/documents/documents.middleware.ts @@ -20,7 +20,9 @@ import { } from "./documents.thunks" import { handleDocumentsContextChanged, + startDocumentCrawlProgressStream, startDocumentEmbeddingStatusStream, + stopDocumentCrawlProgressStream, stopDocumentEmbeddingStatusStream, syncDocumentEmbeddingStatusStreamWithDocuments, } from "./documents-stream-status" @@ -52,6 +54,20 @@ function registerListeners() { }, }) + listenerMiddleware.startListening({ + actionCreator: documentsActions.startCrawlProgressStream, + effect: async (_, listenerApi) => { + await startDocumentCrawlProgressStream(listenerApi) + }, + }) + + listenerMiddleware.startListening({ + actionCreator: documentsActions.stopCrawlProgressStream, + effect: async () => { + stopDocumentCrawlProgressStream() + }, + }) + listenerMiddleware.startListening({ actionCreator: documentsActions.patchDocumentEmbeddingStatus, effect: async (action, listenerApi) => { diff --git 
a/apps/web/src/studio/features/documents/documents.models.ts b/apps/web/src/studio/features/documents/documents.models.ts index 42c0a1c8..9f12de87 100644 --- a/apps/web/src/studio/features/documents/documents.models.ts +++ b/apps/web/src/studio/features/documents/documents.models.ts @@ -9,3 +9,9 @@ export type DocumentEmbeddingStatusChangedEvent = { embeddingError: Document["embeddingError"] updatedAt: number } + +export type DocumentCrawlProgressEvent = { + documentId: string + pagesCrawled: number + updatedAt: number +} diff --git a/apps/web/src/studio/features/documents/documents.selectors.ts b/apps/web/src/studio/features/documents/documents.selectors.ts index 18f60cc5..df3e4694 100644 --- a/apps/web/src/studio/features/documents/documents.selectors.ts +++ b/apps/web/src/studio/features/documents/documents.selectors.ts @@ -44,3 +44,18 @@ export const selectHasDocumentsInProgress = createSelector( ) }, ) + +export const selectIsCrawlProgressStreamActive = (state: RootState) => + state.studio.documents.crawlProgressStream.isActive + +export const selectHasDocumentsCrawling = createSelector([selectDocumentsData], (documentsData) => { + if (!ADS.isFulfilled(documentsData)) { + return false + } + return documentsData.value.some( + (document) => document.sourceType === "webCrawl" && document.embeddingStatus === "pending", + ) +}) + +export const selectCrawlProgressByDocumentId = (state: RootState) => + state.studio.documents.crawlProgressByDocumentId diff --git a/apps/web/src/studio/features/documents/documents.slice.ts b/apps/web/src/studio/features/documents/documents.slice.ts index fe808ee0..3314c873 100644 --- a/apps/web/src/studio/features/documents/documents.slice.ts +++ b/apps/web/src/studio/features/documents/documents.slice.ts @@ -13,11 +13,16 @@ type UploaderState = { type EmbeddingStatusStreamState = { isActive: boolean } +type CrawlProgressStreamState = { + isActive: boolean +} interface State { currentDocumentId: string | null data: AsyncData uploader: 
UploaderState embeddingStatusStream: EmbeddingStatusStreamState + crawlProgressStream: CrawlProgressStreamState + crawlProgressByDocumentId: Record } const initialState: State = { @@ -32,6 +37,10 @@ const initialState: State = { embeddingStatusStream: { isActive: false, }, + crawlProgressStream: { + isActive: false, + }, + crawlProgressByDocumentId: {}, } function mergeDocumentsByUpdatedAt({ @@ -111,6 +120,26 @@ const slice = createSlice({ document.embeddingStatus = action.payload.embeddingStatus document.embeddingError = action.payload.embeddingError document.updatedAt = action.payload.updatedAt + if ( + action.payload.embeddingStatus === "completed" || + action.payload.embeddingStatus === "failed" + ) { + delete state.crawlProgressByDocumentId[action.payload.documentId] + } + }, + startCrawlProgressStream: (state) => { + state.crawlProgressStream.isActive = true + }, + stopCrawlProgressStream: (state) => { + state.crawlProgressStream.isActive = false + }, + patchDocumentCrawlProgress: ( + state, + action: PayloadAction<{ documentId: string; pagesCrawled: number }>, + ) => { + const previous = state.crawlProgressByDocumentId[action.payload.documentId] ?? 
0 + if (action.payload.pagesCrawled < previous) return + state.crawlProgressByDocumentId[action.payload.documentId] = action.payload.pagesCrawled }, }, extraReducers: (builder) => { @@ -132,6 +161,20 @@ const slice = createSlice({ error: null, value: mergedDocuments, } + + const stillCrawling = new Set( + mergedDocuments + .filter( + (document) => + document.sourceType === "webCrawl" && document.embeddingStatus === "pending", + ) + .map((document) => document.id), + ) + for (const documentId of Object.keys(state.crawlProgressByDocumentId)) { + if (!stillCrawling.has(documentId)) { + delete state.crawlProgressByDocumentId[documentId] + } + } }) .addCase(listDocuments.rejected, (state, action) => { state.data.status = ADS.Error diff --git a/apps/web/src/studio/features/documents/documents.spi.ts b/apps/web/src/studio/features/documents/documents.spi.ts index 89d24770..6865ec93 100644 --- a/apps/web/src/studio/features/documents/documents.spi.ts +++ b/apps/web/src/studio/features/documents/documents.spi.ts @@ -1,6 +1,10 @@ import type { DocumentSourceType } from "@caseai-connect/api-contracts" import type { DocumentTagsUpdateFields } from "@/studio/features/document-tags/document-tags.models" -import type { Document, DocumentEmbeddingStatusChangedEvent } from "./documents.models" +import type { + Document, + DocumentCrawlProgressEvent, + DocumentEmbeddingStatusChangedEvent, +} from "./documents.models" export interface IDocumentsSpi { getAll(params: { organizationId: string; projectId: string }): Promise @@ -50,6 +54,12 @@ export interface IDocumentsSpi { signal?: AbortSignal onStatusChanged: (event: DocumentEmbeddingStatusChangedEvent) => void }): Promise + streamCrawlProgress(params: { + organizationId: string + projectId: string + signal?: AbortSignal + onProgressChanged: (event: DocumentCrawlProgressEvent) => void + }): Promise crawlUrl(params: { organizationId: string projectId: string diff --git a/apps/web/src/studio/features/documents/documents.thunks.ts 
b/apps/web/src/studio/features/documents/documents.thunks.ts index 7888c7a6..b8f5c0ed 100644 --- a/apps/web/src/studio/features/documents/documents.thunks.ts +++ b/apps/web/src/studio/features/documents/documents.thunks.ts @@ -167,6 +167,31 @@ export const crawlUrl = createAsyncThunk<{ message: string }, { url: string }, T }, ) +export const streamDocumentCrawlProgresses = createAsyncThunk( + "documents/streamCrawlProgress", + async (_, { extra: { services }, getState, dispatch, signal }) => { + const state = getState() + const { organizationId, projectId } = getCurrentIds({ + state, + wantedIds: ["organizationId", "projectId"], + }) + + await services.documents.streamCrawlProgress({ + organizationId, + projectId, + signal, + onProgressChanged: ({ documentId, pagesCrawled }) => { + dispatch( + documentsActions.patchDocumentCrawlProgress({ + documentId, + pagesCrawled, + }), + ) + }, + }) + }, +) + export const streamDocumentEmbeddingStatuses = createAsyncThunk( "documents/streamEmbeddingStatus", async (_, { extra: { services }, getState, dispatch, signal }) => { diff --git a/apps/web/src/studio/features/documents/external/documents-streaming.ts b/apps/web/src/studio/features/documents/external/documents-streaming.ts index 401bf8f9..82b6d0fc 100644 --- a/apps/web/src/studio/features/documents/external/documents-streaming.ts +++ b/apps/web/src/studio/features/documents/external/documents-streaming.ts @@ -1,10 +1,15 @@ import { + DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO, DOCUMENT_EMBEDDING_STATUS_CHANGED_CHANNEL_DTO, + type DocumentCrawlProgressChangedEventDto, type DocumentEmbeddingStatusChangedEventDto, DocumentsRoutes, } from "@caseai-connect/api-contracts" import { readSSEStream, type SSEStreamConfig } from "@/common/sse/sse-stream-reader" -import type { DocumentEmbeddingStatusChangedEvent } from "../documents.models" +import type { + DocumentCrawlProgressEvent, + DocumentEmbeddingStatusChangedEvent, +} from "../documents.models" const 
documentEmbeddingSSEConfig: SSEStreamConfig< DocumentEmbeddingStatusChangedEventDto, @@ -39,3 +44,36 @@ export async function streamDocumentEmbeddingStatus(params: { onStatusChanged: params.onStatusChanged, }) } + +const documentCrawlProgressSSEConfig: SSEStreamConfig< + DocumentCrawlProgressChangedEventDto, + DocumentCrawlProgressEvent +> = { + label: "DocumentsCrawlProgress", + getStreamPath: (params) => + DocumentsRoutes.streamCrawlProgress.getPath({ + organizationId: params.organizationId, + projectId: params.projectId, + }), + isExpectedEvent: (dto) => dto.type === DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO, + fromDto: (dto) => ({ + documentId: dto.documentId, + pagesCrawled: dto.pagesCrawled, + updatedAt: dto.updatedAt, + }), +} + +export async function streamDocumentCrawlProgress(params: { + organizationId: string + projectId: string + signal?: AbortSignal + onProgressChanged: (event: DocumentCrawlProgressEvent) => void +}): Promise { + return readSSEStream({ + config: documentCrawlProgressSSEConfig, + organizationId: params.organizationId, + projectId: params.projectId, + signal: params.signal, + onStatusChanged: params.onProgressChanged, + }) +} diff --git a/apps/web/src/studio/features/documents/external/documents.api.ts b/apps/web/src/studio/features/documents/external/documents.api.ts index c3f34629..8b110b3a 100644 --- a/apps/web/src/studio/features/documents/external/documents.api.ts +++ b/apps/web/src/studio/features/documents/external/documents.api.ts @@ -6,7 +6,7 @@ import { import { getAxiosInstance } from "@/external/axios" import type { Document } from "../documents.models" import type { IDocumentsSpi } from "../documents.spi" -import { streamDocumentEmbeddingStatus } from "./documents-streaming" +import { streamDocumentCrawlProgress, streamDocumentEmbeddingStatus } from "./documents-streaming" export default { getAll: async ({ organizationId, projectId }) => { @@ -126,6 +126,14 @@ export default { onStatusChanged, }) }, + streamCrawlProgress: 
async ({ organizationId, projectId, signal, onProgressChanged }) => { + await streamDocumentCrawlProgress({ + organizationId, + projectId, + signal, + onProgressChanged, + }) + }, crawlUrl: async ({ organizationId, projectId, url }) => { const axios = getAxiosInstance() const response = await axios.post( diff --git a/apps/web/src/studio/features/documents/locales/document.en.json b/apps/web/src/studio/features/documents/locales/document.en.json index 83133e52..9678c94d 100644 --- a/apps/web/src/studio/features/documents/locales/document.en.json +++ b/apps/web/src/studio/features/documents/locales/document.en.json @@ -21,6 +21,7 @@ "completed": "Ready", "failed": "Failed", "crawling": "Crawling", + "crawlingWithCount": "Crawling — {{count}} pages", "embedding": "Embedding", "ready": "Ready" }, diff --git a/apps/web/src/studio/features/documents/locales/document.fr.json b/apps/web/src/studio/features/documents/locales/document.fr.json index d3061cd9..9ac52fdb 100644 --- a/apps/web/src/studio/features/documents/locales/document.fr.json +++ b/apps/web/src/studio/features/documents/locales/document.fr.json @@ -21,6 +21,7 @@ "completed": "Prêt", "failed": "Échoué", "crawling": "Exploration", + "crawlingWithCount": "Exploration — {{count}} pages", "embedding": "Indexation", "ready": "Prêt" }, From abc21847444b86f61c35ee3ea031aff110d2ede4 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 12:11:17 +0200 Subject: [PATCH 27/58] feat(api-contracts): add document_crawl_progress_changed channel --- .../api-contracts/src/documents/documents.dto.ts | 12 ++++++++++++ .../api-contracts/src/documents/documents.routes.ts | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/packages/api-contracts/src/documents/documents.dto.ts b/packages/api-contracts/src/documents/documents.dto.ts index 7cdfb54e..60624f66 100644 --- a/packages/api-contracts/src/documents/documents.dto.ts +++ b/packages/api-contracts/src/documents/documents.dto.ts @@ -2,6 +2,7 @@ import type { 
DocumentTagDto } from "../document-tags/document-tag.dto" import type { TimeType } from "../generic" export const DOCUMENT_EMBEDDING_STATUS_CHANGED_CHANNEL_DTO = "document_embedding_status_changed" +export const DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO = "document_crawl_progress_changed" export type DocumentSourceType = | "project" @@ -23,6 +24,17 @@ export type DocumentEmbeddingStatusChangedEventPayload = { export type DocumentEmbeddingStatusChangedEventDto = MessageEvent & DocumentEmbeddingStatusChangedEventPayload +export type DocumentCrawlProgressChangedEventPayload = { + type: typeof DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO + documentId: string + organizationId: string + projectId: string + pagesCrawled: number + updatedAt: TimeType +} +export type DocumentCrawlProgressChangedEventDto = MessageEvent & + DocumentCrawlProgressChangedEventPayload + export type PresignFileRequestItemDto = { fileName: string mimeType: MimeTypes diff --git a/packages/api-contracts/src/documents/documents.routes.ts b/packages/api-contracts/src/documents/documents.routes.ts index f6fca448..84774672 100644 --- a/packages/api-contracts/src/documents/documents.routes.ts +++ b/packages/api-contracts/src/documents/documents.routes.ts @@ -64,4 +64,8 @@ export const DocumentsRoutes = { method: "get", path: "organizations/:organizationId/projects/:projectId/documents/embedding-status/stream", }), + streamCrawlProgress: defineRoute>({ + method: "get", + path: "organizations/:organizationId/projects/:projectId/documents/crawl-progress/stream", + }), } From 482bf98ee0f7c50e6d7f4159bd8565630acc170e Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 12:12:37 +0200 Subject: [PATCH 28/58] feat(documents): stream Spider pages + SSE endpoint --- .../domains/documents/documents.controller.ts | 20 ++++++++ .../src/domains/documents/documents.module.ts | 2 + .../external/spider/spider-client.service.ts | 50 +++++++++---------- 3 files changed, 46 insertions(+), 26 deletions(-) diff 
--git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 3e12a1df..b3e3dac2 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -1,4 +1,5 @@ import { + type DocumentCrawlProgressChangedEventDto, type DocumentDto, type DocumentEmbeddingStatusChangedEventDto, type DocumentSourceType, @@ -46,6 +47,8 @@ import type { MulterFile } from "@/common/types" import { TrackActivity } from "@/domains/activities/track-activity.decorator" import { JwtAuthGuard } from "@/domains/auth/jwt-auth.guard" import { UserGuard } from "@/domains/users/user.guard" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { DocumentCrawlProgressStreamService } from "./crawling/document-crawl-progress-stream.service" import { URL_CRAWLING_BATCH_SERVICE, type UrlCrawlingBatchService, @@ -81,6 +84,7 @@ export class DocumentsController { private readonly urlCrawlingBatchService: UrlCrawlingBatchService, private readonly documentsService: DocumentsService, private readonly documentEmbeddingStatusStreamService: DocumentEmbeddingStatusStreamService, + private readonly documentCrawlProgressStreamService: DocumentCrawlProgressStreamService, ) {} @CheckPolicy((policy) => policy.canCreate()) @@ -425,6 +429,22 @@ export class DocumentsController { map((event) => ({ ...event, data: JSON.stringify(event) })), ) } + + @CheckPolicy((policy) => policy.canList()) + @Sse(DocumentsRoutes.streamCrawlProgress.path, { method: 0 /* GET */ }) + streamCrawlProgress( + @Request() req: EndpointRequestWithProject, + ): Observable { + const connectScope = getRequiredConnectScope(req) + return this.documentCrawlProgressStreamService.events$.pipe( + filter( + (event) => + event.organizationId === connectScope.organizationId && + event.projectId === connectScope.projectId, + ), + map((event) => ({ ...event, data: JSON.stringify(event) })), + ) + } } 
function toDocumentDto(entity: Document): DocumentDto { diff --git a/apps/api/src/domains/documents/documents.module.ts b/apps/api/src/domains/documents/documents.module.ts index 7ae2a2d6..fd5d6e05 100644 --- a/apps/api/src/domains/documents/documents.module.ts +++ b/apps/api/src/domains/documents/documents.module.ts @@ -14,6 +14,7 @@ import { ProjectMembership } from "@/domains/projects/memberships/project-member import { Project } from "@/domains/projects/project.entity" import { ProjectsModule } from "@/domains/projects/projects.module" import { UsersModule } from "@/domains/users/users.module" +import { DocumentCrawlProgressStreamService } from "./crawling/document-crawl-progress-stream.service" import { UrlCrawlingBatchModule } from "./crawling/url-crawling-batch.module" import { Document } from "./document.entity" import { DocumentsController } from "./documents.controller" @@ -61,6 +62,7 @@ import { DocumentTagsModule } from "./tags/document-tags.module" providers: [ DocumentsService, DocumentEmbeddingStatusStreamService, + DocumentCrawlProgressStreamService, DocumentChunkRetrievalService, DocumentsGuard, ResourceContextGuard, diff --git a/apps/api/src/external/spider/spider-client.service.ts b/apps/api/src/external/spider/spider-client.service.ts index fc80d4d1..07f91844 100644 --- a/apps/api/src/external/spider/spider-client.service.ts +++ b/apps/api/src/external/spider/spider-client.service.ts @@ -11,37 +11,35 @@ export type CrawledPage = { export class SpiderClientService { private readonly logger = new Logger(SpiderClientService.name) - async crawlUrl(params: { url: string }): Promise { + async crawlUrl(params: { + url: string + onPage?: (page: CrawledPage) => void + }): Promise { const apiKey = resolveSpiderApiKey() const spider = new Spider({ apiKey }) - this.logger.log(`Crawling ${params.url} (full site, no page limit)`) - - const response = await spider.crawlUrl(params.url, { - limit: 0, - return_format: "markdown", - metadata: true, - }) - - if 
(!response) { - this.logger.warn(`Spider returned no response for ${params.url}`) - return [] - } - - // Spider may return a nested array — flatten it - const flatResponse = response.flat() - - this.logger.debug( - `Spider flat response: ${flatResponse.length} items, keys: ${flatResponse.length > 0 && flatResponse[0] ? Object.keys(flatResponse[0]).join(", ") : "N/A"}`, + this.logger.log(`Streaming full-site crawl of ${params.url}`) + + const pages: CrawledPage[] = [] + + await spider.crawlUrl( + params.url, + { limit: 0, return_format: "markdown", metadata: true }, + true, + (chunk) => { + const items = Array.isArray(chunk) ? chunk : [chunk] + for (const item of items) { + if (!item?.content || item.content.trim().length === 0) continue + const page: CrawledPage = { + url: item.url ?? params.url, + markdown: item.content, + } + pages.push(page) + params.onPage?.(page) + } + }, ) - const pages = flatResponse - .filter((page) => page.content && page.content.trim().length > 0) - .map((page) => ({ - url: page.url ?? params.url, - markdown: page.content ?? 
"", - })) - this.logger.log(`Crawled ${pages.length} pages from ${params.url}`) return pages } From efdd54917d5f97c5e84ff0664747e835b8a21081 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 12:13:19 +0200 Subject: [PATCH 29/58] fix(web): open crawl progress stream on mount --- apps/web/src/studio/routes/DocumentsRoute.tsx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index ee7e793e..a791c417 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -64,6 +64,7 @@ import { EmptyDocument } from "@/studio/features/documents/components/EmptyDocum import { UploadDocumentsButton } from "@/studio/features/documents/components/UploadDocumentsButton" import type { Document } from "@/studio/features/documents/documents.models" import { + selectCrawlProgressByDocumentId, selectDocumentsData, selectUploaderState, } from "@/studio/features/documents/documents.selectors" @@ -177,6 +178,7 @@ function DocumentRow({ const isWebCrawl = document.sourceType === "webCrawl" const crawledPages = isWebCrawl ? parseCrawledPages(document.content) : null const hasPages = crawledPages && crawledPages.length > 0 + const pagesCrawled = useAppSelector(selectCrawlProgressByDocumentId)[document.id] const [isOpen, setIsOpen] = useState(false) @@ -222,6 +224,7 @@ function DocumentRow({
{date} @@ -283,6 +286,7 @@ function DocumentActions({ const dispatch = useAppDispatch() const { t } = useTranslation() const [activeAction, setActiveAction] = useState<"delete" | "edit" | "details" | null>(null) + const pagesCrawled = useAppSelector(selectCrawlProgressByDocumentId)[document.id] const handleDownload = async () => { const result = await dispatch(getDocumentTemporaryUrl({ documentId: document.id })).unwrap() @@ -408,6 +412,7 @@ function DocumentActions({
{document.embeddingError && ( @@ -546,8 +551,10 @@ function useDocumentEmbeddingStatusStream() { useEffect(() => { dispatch(documentsActions.startEmbeddingStatusStream()) + dispatch(documentsActions.startCrawlProgressStream()) return () => { dispatch(documentsActions.stopEmbeddingStatusStream()) + dispatch(documentsActions.stopCrawlProgressStream()) } }, [dispatch]) } From 1d6b53e2090896c7a2641c93e3fec7cf2ca5f7e1 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 12:30:19 +0200 Subject: [PATCH 30/58] fix(web): truncate crawled sub-page URLs --- apps/web/src/studio/routes/DocumentsRoute.tsx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index a791c417..fa3549a4 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -235,15 +235,16 @@ function DocumentRow({ {crawledPages && isOpen ? crawledPages.map((page) => ( - + - {page.url} + {page.url} From e3c968ad35a8dbcba5fad61f79c3944dcb6c7793 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 23 Apr 2026 12:46:14 +0200 Subject: [PATCH 31/58] fix(web): keep feature-flag gate mounted while projects load --- apps/web/src/common/components/RestrictedFeature.tsx | 3 ++- apps/web/src/common/hooks/use-feature-flags.ts | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/web/src/common/components/RestrictedFeature.tsx b/apps/web/src/common/components/RestrictedFeature.tsx index 02bda9fb..0377ff10 100644 --- a/apps/web/src/common/components/RestrictedFeature.tsx +++ b/apps/web/src/common/components/RestrictedFeature.tsx @@ -8,7 +8,8 @@ export function RestrictedFeature({ feature: FeatureFlagKey children: React.ReactNode }) { - const { hasFeature } = useFeatureFlags() + const { hasFeature, isLoading } = useFeatureFlags() + if (isLoading) return null if (!hasFeature(feature)) return null return <>{children} } diff 
--git a/apps/web/src/common/hooks/use-feature-flags.ts b/apps/web/src/common/hooks/use-feature-flags.ts index b47abf49..d87066ec 100644 --- a/apps/web/src/common/hooks/use-feature-flags.ts +++ b/apps/web/src/common/hooks/use-feature-flags.ts @@ -14,11 +14,15 @@ export function useFeatureFlags(project?: Project) { if (project) { return { hasFeature: (feature: FeatureFlagKey): boolean => check(project.featureFlags || [], feature), + isLoading: false, } } else { - if (!ADS.isFulfilled(p)) return { hasFeature: () => false } + if (!ADS.isFulfilled(p)) { + return { hasFeature: () => false, isLoading: ADS.isLoading(p) || ADS.isUninitialized(p) } + } return { hasFeature: (feature: FeatureFlagKey): boolean => check(p.value.featureFlags || [], feature), + isLoading: false, } } } From 5ca0337283d0ef624a85a1392d1df073b39918c7 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Mon, 27 Apr 2026 12:18:33 +0200 Subject: [PATCH 32/58] feat(documents): gate 'Explorer un site web' behind web_sources feature flag --- apps/web/src/studio/routes/DocumentsRoute.tsx | 5 ++++- .../api-contracts/src/feature-flags/feature-flags.dto.ts | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index fa3549a4..9f278d8f 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -47,6 +47,7 @@ import { useEffect, useReducer, useState } from "react" import { useTranslation } from "react-i18next" import { useNavigate } from "react-router-dom" import { GridHeader } from "@/common/components/grid/Grid" +import { RestrictedFeature } from "@/common/components/RestrictedFeature" import { MarkdownWrapper } from "@/common/features/agents/agent-sessions/shared/agent-session-messages/components/MarkdownWrapper" import { useAppDispatch, useAppSelector } from "@/common/store/hooks" import { buildDate, buildSince } from "@/common/utils/build-date" @@ 
-114,7 +115,9 @@ function WithData({ description={t("document:list.description")} action={
- + + +
diff --git a/packages/api-contracts/src/feature-flags/feature-flags.dto.ts b/packages/api-contracts/src/feature-flags/feature-flags.dto.ts index 50d5971a..5947a9da 100644 --- a/packages/api-contracts/src/feature-flags/feature-flags.dto.ts +++ b/packages/api-contracts/src/feature-flags/feature-flags.dto.ts @@ -16,6 +16,10 @@ export const FeatureFlags = [ key: "project-analytics", description: "View project-level analytics and usage charts in the studio.", }, + { + key: "web_sources", + description: "Crawl a website and index its pages as documents.", + }, ] as const export type FeatureFlagKey = (typeof FeatureFlags)[number]["key"] export type FeatureFlagsDto = FeatureFlagKey[] From 4134e55a05cee9619087d16042e99ee2d5dea421 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Mon, 27 Apr 2026 12:44:01 +0200 Subject: [PATCH 33/58] feat(documents): add source-type filter tabs to separate uploaded documents from web sources --- .../documents/locales/document.en.json | 5 ++++ .../documents/locales/document.fr.json | 5 ++++ apps/web/src/studio/routes/DocumentsRoute.tsx | 30 +++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/apps/web/src/studio/features/documents/locales/document.en.json b/apps/web/src/studio/features/documents/locales/document.en.json index 9678c94d..1251a21e 100644 --- a/apps/web/src/studio/features/documents/locales/document.en.json +++ b/apps/web/src/studio/features/documents/locales/document.en.json @@ -53,6 +53,11 @@ "description": "Upload your first document to get started." 
} }, + "filter": { + "all": "All", + "uploaded": "Documents", + "webSources": "Web sources" + }, "upload": { "tagDialog": { "title": "Tags for this upload", diff --git a/apps/web/src/studio/features/documents/locales/document.fr.json b/apps/web/src/studio/features/documents/locales/document.fr.json index 9ac52fdb..bf9e2b88 100644 --- a/apps/web/src/studio/features/documents/locales/document.fr.json +++ b/apps/web/src/studio/features/documents/locales/document.fr.json @@ -53,6 +53,11 @@ "description": "Téléversez votre premier document pour commencer." } }, + "filter": { + "all": "Tous", + "uploaded": "Documents", + "webSources": "Sites web" + }, "upload": { "tagDialog": { "title": "Tags pour cet envoi", diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index 9f278d8f..f850e5e6 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -28,6 +28,7 @@ import { TableHeader, TableRow, } from "@caseai-connect/ui/shad/table" +import { Tabs, TabsList, TabsTrigger } from "@caseai-connect/ui/shad/tabs" import { ChevronDownIcon, ChevronRightIcon, @@ -49,6 +50,8 @@ import { useNavigate } from "react-router-dom" import { GridHeader } from "@/common/components/grid/Grid" import { RestrictedFeature } from "@/common/components/RestrictedFeature" import { MarkdownWrapper } from "@/common/features/agents/agent-sessions/shared/agent-session-messages/components/MarkdownWrapper" +import { useGetPath } from "@/common/hooks/use-build-path" +import { useFeatureFlags } from "@/common/hooks/use-feature-flags" import { useAppDispatch, useAppSelector } from "@/common/store/hooks" import { buildDate, buildSince } from "@/common/utils/build-date" import { generateId } from "@/common/utils/generate-id" @@ -102,6 +105,14 @@ function WithData({ }) { const navigate = useNavigate() const { t } = useTranslation() + const { getPath } = useGetPath() + const { hasFeature } = useFeatureFlags() 
+ const [activeTab, setActiveTab] = useState<"all" | "project" | "webCrawl">("all") + + const visibleDocuments = + activeTab === "all" + ? documents + : documents.filter((document) => document.sourceType === activeTab) const handleBack = () => { navigate(-1) @@ -124,9 +135,24 @@ function WithData({ } /> + {hasFeature("web_sources") && ( +
+ setActiveTab(value as typeof activeTab)} + > + + {t("document:filter.all")} + {t("document:filter.uploaded")} + {t("document:filter.webSources")} + + +
+ )} +
- {documents.length === 0 ? ( + {visibleDocuments.length === 0 ? ( ) : ( @@ -146,7 +172,7 @@ function WithData({ - {documents.map((document) => ( + {visibleDocuments.map((document) => ( ))} From 5051982278b7947c701db8f501424fff72d96ecb Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Mon, 27 Apr 2026 13:06:37 +0200 Subject: [PATCH 34/58] feat(documents): add optional name field to crawl URL and file upload modals --- .../domains/documents/documents.controller.ts | 2 +- .../documents/components/CrawlUrlButton.tsx | 13 ++++++++- .../components/UploadDocumentsButton.tsx | 19 ++++++++++++- .../features/documents/documents.spi.ts | 2 ++ .../features/documents/documents.thunks.ts | 28 ++++++++++--------- .../documents/external/documents.api.ts | 16 ++++++++--- .../documents/locales/document.en.json | 6 +++- .../documents/locales/document.fr.json | 6 +++- apps/web/src/studio/routes/DocumentsRoute.tsx | 9 +++--- .../src/documents/documents.dto.ts | 1 + 10 files changed, 75 insertions(+), 27 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index b3e3dac2..c1063225 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -389,7 +389,7 @@ export class DocumentsController { documentId, uploadStatus: "uploaded", fields: { - title: payload.url, + title: payload.name ?? 
payload.url, mimeType: "text/html", sourceType: "webCrawl", size: 0, diff --git a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx index 2bce62f0..8d312706 100644 --- a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx +++ b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx @@ -37,6 +37,7 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { const dispatch = useAppDispatch() const { t } = useTranslation("document") const [url, setUrl] = useState("") + const [name, setName] = useState("") const [isSubmitting, setIsSubmitting] = useState(false) const isValidUrl = (() => { @@ -54,7 +55,7 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { setIsSubmitting(true) try { - await dispatch(crawlUrl({ url })).unwrap() + await dispatch(crawlUrl({ url, name: name.trim() || undefined })).unwrap() onSuccess() } finally { setIsSubmitting(false) @@ -80,6 +81,16 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { required /> + + {t("document:crawl.nameLabel")} + setName(event.target.value)} + /> +
@@ -186,9 +195,11 @@ function parseCrawledPages(content?: string): { url: string; markdown: string }[ function DocumentRow({ document, documentTags, + showPages, }: { document: Document documentTags: DocumentTag[] + showPages?: boolean }) { const date = buildSince(document.updatedAt) const isWebCrawl = document.sourceType === "webCrawl" @@ -222,9 +233,11 @@ function DocumentRow({
- - {hasPages ? crawledPages.length : "—"} - + {showPages && ( + + {hasPages ? crawledPages.length : "—"} + + )}
{document.tagIds.map((tagId) => ( From 8d3bf79ac23cdf9d76ded7bde5922262abc107f5 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Wed, 29 Apr 2026 15:14:32 +0200 Subject: [PATCH 40/58] fix(crawling): fix the re-crawl feature --- .../src/domains/documents/document.entity.ts | 3 + .../src/domains/documents/document.factory.ts | 1 + .../domains/documents/documents.controller.ts | 61 +++++++++++++++++++ .../src/domains/documents/documents.module.ts | 2 + .../domains/documents/documents.service.ts | 20 +++++- 5 files changed, 86 insertions(+), 1 deletion(-) diff --git a/apps/api/src/domains/documents/document.entity.ts b/apps/api/src/domains/documents/document.entity.ts index e3fb73ca..37e11a5a 100644 --- a/apps/api/src/domains/documents/document.entity.ts +++ b/apps/api/src/domains/documents/document.entity.ts @@ -34,6 +34,9 @@ export class Document extends ConnectEntityBase { @Column({ name: "storage_relative_path", nullable: true }) storageRelativePath!: string + @Column({ name: "source_url", type: "text", nullable: true }) + sourceUrl!: string | null + @Column({ name: "source_type", nullable: false }) sourceType!: | "project" diff --git a/apps/api/src/domains/documents/document.factory.ts b/apps/api/src/domains/documents/document.factory.ts index 8ba87be4..63bd5cb3 100644 --- a/apps/api/src/domains/documents/document.factory.ts +++ b/apps/api/src/domains/documents/document.factory.ts @@ -37,6 +37,7 @@ export const documentFactory = DocumentFactory.define(({ sequence, params, trans size: params.size || 1024, storageRelativePath: params.storageRelativePath || `documents/file_${sequence}.txt`, sourceType: params.sourceType || "project", + sourceUrl: params.sourceUrl ?? null, embeddingStatus: params.embeddingStatus || "pending", embeddingError: params.embeddingError ?? null, extractionEngine: params.extractionEngine ?? 
null, diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index c1063225..2a125a83 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -68,6 +68,8 @@ import { DOCUMENT_EMBEDDINGS_BATCH_SERVICE, type DocumentEmbeddingsBatchService, } from "./embeddings/document-embeddings-batch.interface" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { DocumentEmbeddingStatusNotifierService } from "./embeddings/document-embedding-status-notifier.service" import { FILE_STORAGE_SERVICE, type IFileStorage } from "./storage/file-storage.interface" const mega = 1024 @@ -85,6 +87,7 @@ export class DocumentsController { private readonly documentsService: DocumentsService, private readonly documentEmbeddingStatusStreamService: DocumentEmbeddingStatusStreamService, private readonly documentCrawlProgressStreamService: DocumentCrawlProgressStreamService, + private readonly documentEmbeddingStatusNotifierService: DocumentEmbeddingStatusNotifierService, ) {} @CheckPolicy((policy) => policy.canCreate()) @@ -392,6 +395,7 @@ export class DocumentsController { title: payload.name ?? 
payload.url, mimeType: "text/html", sourceType: "webCrawl", + sourceUrl: payload.url, size: 0, fileName: null as unknown as string, storageRelativePath: null as unknown as string, @@ -414,6 +418,63 @@ export class DocumentsController { } } + @CheckPolicy((policy) => policy.canUpdate()) + @Post(DocumentsRoutes.reCrawlUrl.path) + @TrackActivity({ action: "document.reCrawlUrl", entityFrom: "document" }) + @AddContext("document") + @HttpCode(HttpStatus.ACCEPTED) + async reCrawlUrl( + @Request() req: EndpointRequestWithDocument, + ): Promise { + const document = req.document + + if (document.sourceType !== "webCrawl") { + throw new UnprocessableEntityException("Document is not a web crawl source.") + } + + // sourceUrl may be null for documents crawled before source URL tracking was added — + // fall back to title, which equals the original URL when no custom name was given. + const urlToRecrawl = document.sourceUrl ?? document.title + try { + new URL(urlToRecrawl) + } catch { + throw new UnprocessableEntityException( + "Source URL not available for this document. Please delete it and crawl the website again.", + ) + } + + const connectScope = getRequiredConnectScope(req) + + const reset = await this.documentsService.resetForRecrawl({ + connectScope, + documentId: document.id, + }) + + await this.documentEmbeddingStatusNotifierService.notifyEmbeddingStatusChanged({ + documentId: reset.id, + organizationId: reset.organizationId, + projectId: reset.projectId, + embeddingStatus: reset.embeddingStatus, + embeddingError: reset.embeddingError, + updatedAt: reset.updatedAt.getTime(), + }) + + await this.urlCrawlingBatchService.enqueueCrawlUrl({ + documentId: document.id, + url: urlToRecrawl, + organizationId: connectScope.organizationId, + projectId: connectScope.projectId, + requestedByUserId: req.user.id, + currentTraceId: v4(), + }) + + return { + data: { + message: `Re-crawling ${urlToRecrawl}. 
Pages will be updated as they are processed.`, + }, + } + } + @CheckPolicy((policy) => policy.canList()) @Sse(DocumentsRoutes.streamEmbeddingStatus.path, { method: 0 /* GET */ }) streamEmbeddingStatus( diff --git a/apps/api/src/domains/documents/documents.module.ts b/apps/api/src/domains/documents/documents.module.ts index fd5d6e05..e7ea02af 100644 --- a/apps/api/src/domains/documents/documents.module.ts +++ b/apps/api/src/domains/documents/documents.module.ts @@ -21,6 +21,7 @@ import { DocumentsController } from "./documents.controller" import { DocumentsGuard } from "./documents.guard" import { DocumentsService } from "./documents.service" import { DocumentChunkRetrievalService } from "./embeddings/document-chunk-retrieval.service" +import { DocumentEmbeddingStatusNotifierService } from "./embeddings/document-embedding-status-notifier.service" import { DocumentEmbeddingStatusStreamService } from "./embeddings/document-embedding-status-stream.service" import { DocumentEmbeddingsBatchModule } from "./embeddings/document-embeddings-batch.module" import { LocalPresignUploadController } from "./storage/local-presign-upload.controller" @@ -62,6 +63,7 @@ import { DocumentTagsModule } from "./tags/document-tags.module" providers: [ DocumentsService, DocumentEmbeddingStatusStreamService, + DocumentEmbeddingStatusNotifierService, DocumentCrawlProgressStreamService, DocumentChunkRetrievalService, DocumentsGuard, diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index a09828e6..36459cb3 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -31,7 +31,7 @@ export class DocumentsService { Document, "fileName" | "mimeType" | "size" | "storageRelativePath" | "title" | "sourceType" > & - Partial> + Partial> uploadStatus: "pending" | "uploaded" tagIds?: string[] }): Promise { @@ -43,6 +43,7 @@ export class DocumentsService { 
storageRelativePath: fields.storageRelativePath, title: fields.title ?? fields.fileName, sourceType: fields.sourceType, + sourceUrl: fields.sourceUrl ?? null, content: fields.content, uploadStatus, }) @@ -197,6 +198,23 @@ export class DocumentsService { return this.documentConnectRepository.saveOne(document) } + async resetForRecrawl({ + connectScope, + documentId, + }: { + connectScope: RequiredConnectScope + documentId: string + }): Promise { + const document = await this.documentConnectRepository.getOneById(connectScope, documentId) + if (!document) { + throw new NotFoundException(`Document with id ${documentId} not found`) + } + document.content = null as unknown as string + document.embeddingStatus = "pending" + document.embeddingError = null + return this.documentConnectRepository.saveOne(document) + } + async deleteDocument({ connectScope, documentId, From 72ba3ea1365cb8690da00ef64e4b1d42c0097197 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Wed, 29 Apr 2026 15:24:07 +0200 Subject: [PATCH 41/58] feat(web): add recrawl action for web source documents --- CHANGELOG.md | 1 + .../features/documents/documents.spi.ts | 5 +++++ .../features/documents/documents.thunks.ts | 13 +++++++++++ .../documents/external/documents.api.ts | 7 ++++++ .../documents/locales/document.en.json | 1 + .../documents/locales/document.fr.json | 1 + apps/web/src/studio/routes/DocumentsRoute.tsx | 22 +++++++++++++++---- .../src/documents/documents.dto.ts | 4 ++++ .../src/documents/documents.routes.ts | 5 +++++ 9 files changed, 55 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 82d59698..8c69ccb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ This project uses [CalVer](https://calver.org/) (YY.MM.Micro) for product versio ### Added - (beta) Documents sidebar entry replaced by a Sources dropdown with separate Documents and Websites sections +- (beta) Re-crawl a website from its action menu to refresh content and re-index all pages ### Changed diff 
--git a/apps/web/src/studio/features/documents/documents.spi.ts b/apps/web/src/studio/features/documents/documents.spi.ts index 3a09ec04..4637c28d 100644 --- a/apps/web/src/studio/features/documents/documents.spi.ts +++ b/apps/web/src/studio/features/documents/documents.spi.ts @@ -67,4 +67,9 @@ export interface IDocumentsSpi { url: string name?: string }): Promise<{ message: string }> + reCrawlUrl(params: { + organizationId: string + projectId: string + documentId: string + }): Promise<{ message: string }> } diff --git a/apps/web/src/studio/features/documents/documents.thunks.ts b/apps/web/src/studio/features/documents/documents.thunks.ts index 9ea1c611..1804537d 100644 --- a/apps/web/src/studio/features/documents/documents.thunks.ts +++ b/apps/web/src/studio/features/documents/documents.thunks.ts @@ -169,6 +169,19 @@ export const crawlUrl = createAsyncThunk< return await services.documents.crawlUrl({ organizationId, projectId, url, name }) }) +export const reCrawlUrl = createAsyncThunk< + { message: string }, + { documentId: string }, + ThunkConfig +>("documents/reCrawlUrl", async ({ documentId }, { extra: { services }, getState }) => { + const state = getState() + const { organizationId, projectId } = getCurrentIds({ + state, + wantedIds: ["organizationId", "projectId"], + }) + return await services.documents.reCrawlUrl({ organizationId, projectId, documentId }) +}) + export const streamDocumentCrawlProgresses = createAsyncThunk( "documents/streamCrawlProgress", async (_, { extra: { services }, getState, dispatch, signal }) => { diff --git a/apps/web/src/studio/features/documents/external/documents.api.ts b/apps/web/src/studio/features/documents/external/documents.api.ts index ff5659b4..cf50cf92 100644 --- a/apps/web/src/studio/features/documents/external/documents.api.ts +++ b/apps/web/src/studio/features/documents/external/documents.api.ts @@ -150,6 +150,13 @@ export default { ) return response.data.data }, + reCrawlUrl: async ({ organizationId, projectId, 
documentId }) => { + const axios = getAxiosInstance() + const response = await axios.post( + DocumentsRoutes.reCrawlUrl.getPath({ organizationId, projectId, documentId }), + ) + return response.data.data + }, } satisfies IDocumentsSpi function toDocument(dto: DocumentDto): Document { diff --git a/apps/web/src/studio/features/documents/locales/document.en.json b/apps/web/src/studio/features/documents/locales/document.en.json index 7dcd592c..6ed3ec84 100644 --- a/apps/web/src/studio/features/documents/locales/document.en.json +++ b/apps/web/src/studio/features/documents/locales/document.en.json @@ -38,6 +38,7 @@ "title": "Delete {{documentTitle}}", "description": "Are you sure you want to delete this document? This action cannot be undone." }, + "recrawl": "Recrawl website", "crawl": { "button": "Crawl Website", "title": "Crawl a Website", diff --git a/apps/web/src/studio/features/documents/locales/document.fr.json b/apps/web/src/studio/features/documents/locales/document.fr.json index 94f49285..29727a93 100644 --- a/apps/web/src/studio/features/documents/locales/document.fr.json +++ b/apps/web/src/studio/features/documents/locales/document.fr.json @@ -38,6 +38,7 @@ "title": "Supprimer {{documentTitle}}", "description": "Êtes-vous sûr de vouloir supprimer ce document ? Cette action est irréversible." 
}, + "recrawl": "Ré-explorer le site", "crawl": { "button": "Explorer un site web", "title": "Explorer un site web", diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index c0cd780b..5a903e55 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -39,6 +39,7 @@ import { InfoIcon, Loader2Icon, PencilIcon, + RefreshCwIcon, RotateCcwIcon, Trash2Icon, XIcon, @@ -73,6 +74,7 @@ import { documentsActions } from "@/studio/features/documents/documents.slice" import { deleteDocument, getDocumentTemporaryUrl, + reCrawlUrl, reprocessDocument, updateDocument, } from "@/studio/features/documents/documents.thunks" @@ -338,6 +340,10 @@ function DocumentActions({ dispatch(reprocessDocument({ documentId: document.id })) } + const handleReCrawl = () => { + dispatch(reCrawlUrl({ documentId: document.id })) + } + return ( <> @@ -347,10 +353,12 @@ function DocumentActions({ - - - {t("actions:downloadDocument")} - + {document.sourceType !== "webCrawl" && ( + + + {t("actions:downloadDocument")} + + )} setActiveAction("details")}> {t("actions:view")} @@ -365,6 +373,12 @@ function DocumentActions({ {t("document:reprocess.cta")} )} + {document.sourceType === "webCrawl" && ( + + + {t("document:recrawl")} + + )} setActiveAction("delete")}> diff --git a/packages/api-contracts/src/documents/documents.dto.ts b/packages/api-contracts/src/documents/documents.dto.ts index 277d4395..c54f9370 100644 --- a/packages/api-contracts/src/documents/documents.dto.ts +++ b/packages/api-contracts/src/documents/documents.dto.ts @@ -80,6 +80,10 @@ export type CrawlUrlResponseDto = { message: string } +export type ReCrawlUrlResponseDto = { + message: string +} + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types export enum MimeTypes { aac = "audio/aac", diff --git a/packages/api-contracts/src/documents/documents.routes.ts 
b/packages/api-contracts/src/documents/documents.routes.ts index 84774672..b60e5c1f 100644 --- a/packages/api-contracts/src/documents/documents.routes.ts +++ b/packages/api-contracts/src/documents/documents.routes.ts @@ -8,6 +8,7 @@ import type { DocumentUploadOptionalTagFields, PresignFileRequestItemDto, PresignFileResponseItemDto, + ReCrawlUrlResponseDto, } from "./documents.dto" export const DocumentsRoutes = { @@ -59,6 +60,10 @@ export const DocumentsRoutes = { method: "post", path: "organizations/:organizationId/projects/:projectId/documents/crawl-url", }), + reCrawlUrl: defineRoute>({ + method: "post", + path: "organizations/:organizationId/projects/:projectId/documents/:documentId/recrawl", + }), // Streaming responses are sent as text/event-stream (SSE) and do not follow ResponseData. streamEmbeddingStatus: defineRoute>({ method: "get", From 1b8112fae7648fe8ec7f468942618a3059b29aa8 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 5 May 2026 13:58:02 +0200 Subject: [PATCH 42/58] fix(api): recrawl fails for renamed web sources without stored sourceUrl --- CHANGELOG.md | 1 + .../domains/documents/documents.controller.ts | 39 +++++++++++++++---- .../domains/documents/documents.service.ts | 13 +++++++ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c69ccb5..ff85c099 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ This project uses [CalVer](https://calver.org/) (YY.MM.Micro) for product versio ### Changed ### Fixed +- (beta) Re-crawl now works correctly for renamed web sources ### Security diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 2a125a83..5bf51e75 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -63,13 +63,13 @@ import { // biome-ignore lint/style/useImportType: Required at runtime for NestJS DI import { DocumentsService } 
from "./documents.service" // biome-ignore lint/style/useImportType: Required at runtime for NestJS DI +import { DocumentEmbeddingStatusNotifierService } from "./embeddings/document-embedding-status-notifier.service" +// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI import { DocumentEmbeddingStatusStreamService } from "./embeddings/document-embedding-status-stream.service" import { DOCUMENT_EMBEDDINGS_BATCH_SERVICE, type DocumentEmbeddingsBatchService, } from "./embeddings/document-embeddings-batch.interface" -// biome-ignore lint/style/useImportType: Required at runtime for NestJS DI -import { DocumentEmbeddingStatusNotifierService } from "./embeddings/document-embedding-status-notifier.service" import { FILE_STORAGE_SERVICE, type IFileStorage } from "./storage/file-storage.interface" const mega = 1024 @@ -432,12 +432,10 @@ export class DocumentsController { throw new UnprocessableEntityException("Document is not a web crawl source.") } - // sourceUrl may be null for documents crawled before source URL tracking was added — - // fall back to title, which equals the original URL when no custom name was given. - const urlToRecrawl = document.sourceUrl ?? document.title - try { - new URL(urlToRecrawl) - } catch { + const urlToRecrawl = + document.sourceUrl ?? resolveSourceUrlFallback(document.title, document.content) + + if (!urlToRecrawl) { throw new UnprocessableEntityException( "Source URL not available for this document. Please delete it and crawl the website again.", ) @@ -508,6 +506,30 @@ export class DocumentsController { } } +function resolveSourceUrlFallback(title: string, content: string | null): string | null { + // 1. Title may be the original URL if the document was never renamed. + try { + new URL(title) + return title + } catch { + // title is an alias, not a URL + } + // 2. Extract the shortest URL from crawled content — typically the root entry point. 
+ if (content) { + try { + const pages: { url?: string }[] = JSON.parse(content) + const urls = pages.map((page) => page.url).filter((url): url is string => Boolean(url)) + if (urls.length > 0) { + urls.sort((a, b) => a.length - b.length) + return urls[0] ?? null + } + } catch { + // malformed content + } + } + return null +} + function toDocumentDto(entity: Document): DocumentDto { return { id: entity.id, @@ -523,6 +545,7 @@ function toDocumentDto(entity: Document): DocumentDto { size: entity.size, storageRelativePath: entity.storageRelativePath, sourceType: entity.sourceType, + sourceUrl: entity.sourceUrl ?? null, embeddingStatus: entity.embeddingStatus, embeddingError: entity.embeddingError ?? null, tagIds: entity.tags?.map((tag) => tag.id) || [], diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 36459cb3..73899eca 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -142,6 +142,19 @@ export class DocumentsService { throw new NotFoundException(`Document with id ${documentId} not found`) } + if ( + fieldsToUpdate.title !== undefined && + document.sourceType === "webCrawl" && + document.sourceUrl === null + ) { + try { + new URL(document.title) + document.sourceUrl = document.title + } catch { + // title is not a URL (already an alias) — nothing to backfill + } + } + if (fieldsToUpdate.title !== undefined) { document.title = fieldsToUpdate.title } From dcb4e0fd19b935f959de24dd64ba84c7bd7b0af1 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 7 May 2026 09:41:39 +0200 Subject: [PATCH 43/58] fix: package-lock.json push for workers smoke gh action error --- package-lock.json | 970 +++++++++++++++++++++------------------------- 1 file changed, 436 insertions(+), 534 deletions(-) diff --git a/package-lock.json b/package-lock.json index fd9d3abc..f36ddc00 100644 --- a/package-lock.json +++ b/package-lock.json @@ -109,53 
+109,6 @@ "typescript": "5.5.4" } }, - "apps/api/node_modules/@angular-devkit/core": { - "version": "19.2.24", - "resolved": "https://registry.npmjs.org/@angular-devkit/core/-/core-19.2.24.tgz", - "integrity": "sha512-Kd49warf6U/EyWe5BszF/eebN3zQ3bk7tgfEljAw8q/rX95UUtriJubWvp6pgzHfzBA4jwq8f+QiNZB8eBEXPA==", - "dev": true, - "license": "MIT", - "dependencies": { - "ajv": "8.18.0", - "ajv-formats": "3.0.1", - "jsonc-parser": "3.3.1", - "picomatch": "4.0.4", - "rxjs": "7.8.1", - "source-map": "0.7.4" - }, - "engines": { - "node": "^18.19.1 || ^20.11.1 || >=22.0.0", - "npm": "^6.11.0 || ^7.5.6 || >=8.0.0", - "yarn": ">= 1.13.0" - }, - "peerDependencies": { - "chokidar": "^4.0.0" - }, - "peerDependenciesMeta": { - "chokidar": { - "optional": true - } - } - }, - "apps/api/node_modules/@angular-devkit/schematics": { - "version": "19.2.24", - "resolved": "https://registry.npmjs.org/@angular-devkit/schematics/-/schematics-19.2.24.tgz", - "integrity": "sha512-lnw+ZM1Io+cJAkReC0NPDjqObL8NtKzKIkdgEEKC8CUmkhurYhedbicN8Y8NYHgG1uLd2GozW3+/QqPRZaN+Lw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@angular-devkit/core": "19.2.24", - "jsonc-parser": "3.3.1", - "magic-string": "0.30.17", - "ora": "5.4.1", - "rxjs": "7.8.1" - }, - "engines": { - "node": "^18.19.1 || ^20.11.1 || >=22.0.0", - "npm": "^6.11.0 || ^7.5.6 || >=8.0.0", - "yarn": ">= 1.13.0" - } - }, "apps/api/node_modules/@angular-devkit/schematics-cli": { "version": "19.2.24", "resolved": "https://registry.npmjs.org/@angular-devkit/schematics-cli/-/schematics-cli-19.2.24.tgz", @@ -209,18 +162,6 @@ } } }, - "apps/api/node_modules/@bull-board/api": { - "version": "6.21.3", - "resolved": "https://registry.npmjs.org/@bull-board/api/-/api-6.21.3.tgz", - "integrity": "sha512-FoQO+0MgZsPrQX9WLZx0KpINamJY48FUU+OyMcZxx9mQWCwsdak45V/uBgQrTYB3GaF5oGA0SxPXEp4RHwj36A==", - "license": "MIT", - "dependencies": { - "redis-info": "^3.1.0" - }, - "peerDependencies": { - "@bull-board/ui": "6.21.3" - } - }, 
"apps/api/node_modules/@bull-board/nestjs": { "version": "6.21.3", "resolved": "https://registry.npmjs.org/@bull-board/nestjs/-/nestjs-6.21.3.tgz", @@ -235,15 +176,6 @@ "rxjs": "^7.8.1" } }, - "apps/api/node_modules/@bull-board/ui": { - "version": "6.21.3", - "resolved": "https://registry.npmjs.org/@bull-board/ui/-/ui-6.21.3.tgz", - "integrity": "sha512-s/PLBJab8cnoQAGVqjQb0v4oGe0KgB4aQ5G5g93doxzXB/D+wkXNL9P9+zLWLldBJXE57jL4CR99ttDCIiyNHw==", - "license": "MIT", - "dependencies": { - "@bull-board/api": "6.21.3" - } - }, "apps/api/node_modules/@llamaindex/core": { "version": "0.6.23", "resolved": "https://registry.npmjs.org/@llamaindex/core/-/core-0.6.23.tgz", @@ -368,89 +300,6 @@ "node": ">=14.17" } }, - "apps/api/node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "apps/api/node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "dev": true, - "license": "MIT", - "dependencies": { - "mime-db": "1.52.0" - }, - "engines": { - "node": ">= 0.6" - } - }, - "apps/api/node_modules/rxjs": { - "version": "7.8.1", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", - "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.1.0" - } - }, - "apps/api/node_modules/schema-utils": { - "version": "4.3.3", - "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.3.tgz", - "integrity": "sha512-eflK8wEtyOE6+hsaRVPxvUKYCpRgzLqDTb8krvAsRIwOGlHoSgYLgBXoubGgLd2fT41/OUYdb48v4k4WWHQurA==", - "dev": 
true, - "license": "MIT", - "dependencies": { - "@types/json-schema": "^7.0.9", - "ajv": "^8.9.0", - "ajv-formats": "^2.1.1", - "ajv-keywords": "^5.1.0" - }, - "engines": { - "node": ">= 10.13.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - } - }, - "apps/api/node_modules/schema-utils/node_modules/ajv-formats": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz", - "integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==", - "dev": true, - "license": "MIT", - "dependencies": { - "ajv": "^8.0.0" - }, - "peerDependencies": { - "ajv": "^8.0.0" - }, - "peerDependenciesMeta": { - "ajv": { - "optional": true - } - } - }, - "apps/api/node_modules/schema-utils/node_modules/ajv-keywords": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-5.1.0.tgz", - "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3" - }, - "peerDependencies": { - "ajv": "^8.8.2" - } - }, "apps/api/node_modules/typescript": { "version": "5.5.4", "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.5.4.tgz", @@ -465,55 +314,6 @@ "node": ">=14.17" } }, - "apps/api/node_modules/webpack": { - "version": "5.106.0", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.106.0.tgz", - "integrity": "sha512-Pkx5joZ9RrdgO5LBkyX1L2ZAJeK/Taz3vqZ9CbcP0wS5LEMx5QkKsEwLl29QJfihZ+DKRBFldzy1O30pJ1MDpA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/eslint-scope": "^3.7.7", - "@types/estree": "^1.0.8", - "@types/json-schema": "^7.0.15", - "@webassemblyjs/ast": "^1.14.1", - "@webassemblyjs/wasm-edit": "^1.14.1", - "@webassemblyjs/wasm-parser": "^1.14.1", - "acorn": "^8.16.0", - "acorn-import-phases": "^1.0.3", - "browserslist": "^4.28.1", - 
"chrome-trace-event": "^1.0.2", - "enhanced-resolve": "^5.20.0", - "es-module-lexer": "^2.0.0", - "eslint-scope": "5.1.1", - "events": "^3.2.0", - "glob-to-regexp": "^0.4.1", - "graceful-fs": "^4.2.11", - "json-parse-even-better-errors": "^2.3.1", - "loader-runner": "^4.3.1", - "mime-types": "^2.1.27", - "neo-async": "^2.6.2", - "schema-utils": "^4.3.3", - "tapable": "^2.3.0", - "terser-webpack-plugin": "^5.3.17", - "watchpack": "^2.5.1", - "webpack-sources": "^3.3.4" - }, - "bin": { - "webpack": "bin/webpack.js" - }, - "engines": { - "node": ">=10.13.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/webpack" - }, - "peerDependenciesMeta": { - "webpack-cli": { - "optional": true - } - } - }, "apps/web": { "name": "@caseai-connect/web", "version": "0.0.0", @@ -568,9 +368,9 @@ } }, "apps/web/node_modules/@types/node": { - "version": "24.12.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.0.tgz", - "integrity": "sha512-GYDxsZi3ChgmckRT9HPU0WEhKLP08ev/Yfcq2AstjrDASOYCSXeyjDsHg4v5t4jOj7cyDX3vmprafKlWIG9MXQ==", + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", "dev": true, "license": "MIT", "dependencies": { @@ -760,6 +560,97 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/@angular-devkit/core": { + "version": "19.2.24", + "resolved": "https://registry.npmjs.org/@angular-devkit/core/-/core-19.2.24.tgz", + "integrity": "sha512-Kd49warf6U/EyWe5BszF/eebN3zQ3bk7tgfEljAw8q/rX95UUtriJubWvp6pgzHfzBA4jwq8f+QiNZB8eBEXPA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ajv": "8.18.0", + "ajv-formats": "3.0.1", + "jsonc-parser": "3.3.1", + "picomatch": "4.0.4", + "rxjs": "7.8.1", + "source-map": "0.7.4" + }, + "engines": { + "node": "^18.19.1 || ^20.11.1 || >=22.0.0", + "npm": "^6.11.0 || ^7.5.6 || >=8.0.0", + "yarn": ">= 
1.13.0" + }, + "peerDependencies": { + "chokidar": "^4.0.0" + }, + "peerDependenciesMeta": { + "chokidar": { + "optional": true + } + } + }, + "node_modules/@angular-devkit/core/node_modules/ajv": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz", + "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/@angular-devkit/core/node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true, + "license": "MIT" + }, + "node_modules/@angular-devkit/core/node_modules/rxjs": { + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", + "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "tslib": "^2.1.0" + } + }, + "node_modules/@angular-devkit/schematics": { + "version": "19.2.24", + "resolved": "https://registry.npmjs.org/@angular-devkit/schematics/-/schematics-19.2.24.tgz", + "integrity": "sha512-lnw+ZM1Io+cJAkReC0NPDjqObL8NtKzKIkdgEEKC8CUmkhurYhedbicN8Y8NYHgG1uLd2GozW3+/QqPRZaN+Lw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@angular-devkit/core": "19.2.24", + "jsonc-parser": "3.3.1", + "magic-string": "0.30.17", + "ora": "5.4.1", + "rxjs": "7.8.1" + }, + "engines": { + "node": "^18.19.1 || ^20.11.1 || >=22.0.0", + "npm": "^6.11.0 || ^7.5.6 || >=8.0.0", + "yarn": ">= 1.13.0" + } + }, + 
"node_modules/@angular-devkit/schematics/node_modules/rxjs": { + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", + "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "tslib": "^2.1.0" + } + }, "node_modules/@auth0/auth0-auth-js": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/@auth0/auth0-auth-js/-/auth0-auth-js-1.6.0.tgz", @@ -1543,6 +1434,18 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/@bull-board/api": { + "version": "6.21.3", + "resolved": "https://registry.npmjs.org/@bull-board/api/-/api-6.21.3.tgz", + "integrity": "sha512-FoQO+0MgZsPrQX9WLZx0KpINamJY48FUU+OyMcZxx9mQWCwsdak45V/uBgQrTYB3GaF5oGA0SxPXEp4RHwj36A==", + "license": "MIT", + "dependencies": { + "redis-info": "^3.1.0" + }, + "peerDependencies": { + "@bull-board/ui": "6.21.3" + } + }, "node_modules/@bull-board/express": { "version": "6.21.3", "resolved": "https://registry.npmjs.org/@bull-board/express/-/express-6.21.3.tgz", @@ -1555,19 +1458,7 @@ "express": "^5.2.1" } }, - "node_modules/@bull-board/express/node_modules/@bull-board/api": { - "version": "6.21.3", - "resolved": "https://registry.npmjs.org/@bull-board/api/-/api-6.21.3.tgz", - "integrity": "sha512-FoQO+0MgZsPrQX9WLZx0KpINamJY48FUU+OyMcZxx9mQWCwsdak45V/uBgQrTYB3GaF5oGA0SxPXEp4RHwj36A==", - "license": "MIT", - "dependencies": { - "redis-info": "^3.1.0" - }, - "peerDependencies": { - "@bull-board/ui": "6.21.3" - } - }, - "node_modules/@bull-board/express/node_modules/@bull-board/ui": { + "node_modules/@bull-board/ui": { "version": "6.21.3", "resolved": "https://registry.npmjs.org/@bull-board/ui/-/ui-6.21.3.tgz", "integrity": "sha512-s/PLBJab8cnoQAGVqjQb0v4oGe0KgB4aQ5G5g93doxzXB/D+wkXNL9P9+zLWLldBJXE57jL4CR99ttDCIiyNHw==", @@ -4908,9 +4799,9 @@ } }, "node_modules/@llamaindex/core/node_modules/@types/node": { - "version": "24.12.0", - "resolved": 
"https://registry.npmjs.org/@types/node/-/node-24.12.0.tgz", - "integrity": "sha512-GYDxsZi3ChgmckRT9HPU0WEhKLP08ev/Yfcq2AstjrDASOYCSXeyjDsHg4v5t4jOj7cyDX3vmprafKlWIG9MXQ==", + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", "license": "MIT", "dependencies": { "undici-types": "~7.16.0" @@ -5468,6 +5359,30 @@ "yarn": ">= 1.13.0" } }, + "node_modules/@nestjs/schematics/node_modules/ajv": { + "version": "8.18.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz", + "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/@nestjs/schematics/node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true, + "license": "MIT" + }, "node_modules/@nestjs/schematics/node_modules/rxjs": { "version": "7.8.1", "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", @@ -5711,9 +5626,9 @@ } }, "node_modules/@opentelemetry/api": { - "version": "1.9.1", - "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz", - "integrity": "sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==", + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": 
"sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", "license": "Apache-2.0", "engines": { "node": ">=8.0.0" @@ -9910,9 +9825,9 @@ "license": "MIT" }, "node_modules/@types/node": { - "version": "22.19.15", - "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.15.tgz", - "integrity": "sha512-F0R/h2+dsy5wJAUe3tAU6oqa2qbWY5TpNfL/RGmo1y38hiyO1w3x2jPtt76wmuaJI4DQnOBu21cNXQ2STIUUWg==", + "version": "22.19.17", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.17.tgz", + "integrity": "sha512-wGdMcf+vPYM6jikpS/qhg6WiqSV/OhG+jeeHT/KlVqxYfD40iYJf9/AE1uQxVWFvU7MipKRkRv8NSHiCGgPr8Q==", "license": "MIT", "dependencies": { "undici-types": "~6.21.0" @@ -10086,12 +10001,13 @@ } }, "node_modules/@types/send": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/@types/send/-/send-1.2.1.tgz", - "integrity": "sha512-arsCikDvlU99zl1g69TcAB3mzZPpxgw0UQnaHeC1Nwb015xp8bknZv5rIfri9xTOcMuaVgvabfIRA7PSZVuZIQ==", + "version": "0.17.6", + "resolved": "https://registry.npmjs.org/@types/send/-/send-0.17.6.tgz", + "integrity": "sha512-Uqt8rPBE8SY0RK8JB1EzVOIZ32uqy8HwdxCnoCOsYrvnswqmFZ/k+9Ikidlk/ImhsdvBsloHbAlewb2IEBV/Og==", "dev": true, "license": "MIT", "dependencies": { + "@types/mime": "^1", "@types/node": "*" } }, @@ -10107,17 +10023,6 @@ "@types/send": "<1" } }, - "node_modules/@types/serve-static/node_modules/@types/send": { - "version": "0.17.6", - "resolved": "https://registry.npmjs.org/@types/send/-/send-0.17.6.tgz", - "integrity": "sha512-Uqt8rPBE8SY0RK8JB1EzVOIZ32uqy8HwdxCnoCOsYrvnswqmFZ/k+9Ikidlk/ImhsdvBsloHbAlewb2IEBV/Og==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/mime": "^1", - "@types/node": "*" - } - }, "node_modules/@types/stack-utils": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz", @@ -10390,6 +10295,74 @@ "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0" } }, + "node_modules/@vitest/expect": { + 
"version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-3.2.4.tgz", + "integrity": "sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/chai": "^5.2.2", + "@vitest/spy": "3.2.4", + "@vitest/utils": "3.2.4", + "chai": "^5.2.0", + "tinyrainbow": "^2.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/expect/node_modules/@vitest/pretty-format": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-3.2.4.tgz", + "integrity": "sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==", + "dev": true, + "license": "MIT", + "dependencies": { + "tinyrainbow": "^2.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/expect/node_modules/@vitest/spy": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-3.2.4.tgz", + "integrity": "sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tinyspy": "^4.0.3" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/expect/node_modules/@vitest/utils": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-3.2.4.tgz", + "integrity": "sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/pretty-format": "3.2.4", + "loupe": "^3.1.4", + "tinyrainbow": "^2.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/expect/node_modules/tinyrainbow": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-2.0.0.tgz", + 
"integrity": "sha512-op4nsTR47R6p0vMUUoYl/a+ljLFVtlfaXkLQmqfLR1qHma1h/ysYk4hEXZ880bf2CYgTskvTa/e196Vd5dDQXw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@vitest/mocker": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/@vitest/mocker/-/mocker-4.1.2.tgz", @@ -10417,16 +10390,6 @@ } } }, - "node_modules/@vitest/mocker/node_modules/@vitest/spy": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.1.2.tgz", - "integrity": "sha512-DU4fBnbVCJGNBwVA6xSToNXrkZNSiw59H8tcuUspVMsBDBST4nfvsPsEHDHGtWRRnqBERBQu7TrTKskmjqTXKA==", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://opencollective.com/vitest" - } - }, "node_modules/@vitest/mocker/node_modules/estree-walker": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz", @@ -10447,21 +10410,7 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, - "node_modules/@vitest/runner": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-4.1.2.tgz", - "integrity": "sha512-Gr+FQan34CdiYAwpGJmQG8PgkyFVmARK8/xSijia3eTFgVfpcpztWLuP6FttGNfPLJhaZVP/euvujeNYar36OQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/utils": "4.1.2", - "pathe": "^2.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/runner/node_modules/@vitest/pretty-format": { + "node_modules/@vitest/pretty-format": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.2.tgz", "integrity": "sha512-dwQga8aejqeuB+TvXCMzSQemvV9hNEtDDpgUKDzOmNQayl2OG241PSWeJwKRH3CiC+sESrmoFd49rfnq7T4RnA==", @@ -10474,16 +10423,15 @@ "url": "https://opencollective.com/vitest" } }, - "node_modules/@vitest/runner/node_modules/@vitest/utils": { + "node_modules/@vitest/runner": { "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.2.tgz", - "integrity": 
"sha512-xw2/TiX82lQHA06cgbqRKFb5lCAy3axQ4H4SoUFhUsg+wztiet+co86IAMDtF6Vm1hc7J6j09oh/rgDn+JdKIQ==", + "resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-4.1.2.tgz", + "integrity": "sha512-Gr+FQan34CdiYAwpGJmQG8PgkyFVmARK8/xSijia3eTFgVfpcpztWLuP6FttGNfPLJhaZVP/euvujeNYar36OQ==", "dev": true, "license": "MIT", "dependencies": { - "@vitest/pretty-format": "4.1.2", - "convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" + "@vitest/utils": "4.1.2", + "pathe": "^2.0.3" }, "funding": { "url": "https://opencollective.com/vitest" @@ -10496,16 +10444,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@vitest/runner/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.0.0" - } - }, "node_modules/@vitest/snapshot": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/@vitest/snapshot/-/snapshot-4.1.2.tgz", @@ -10522,34 +10460,6 @@ "url": "https://opencollective.com/vitest" } }, - "node_modules/@vitest/snapshot/node_modules/@vitest/pretty-format": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.2.tgz", - "integrity": "sha512-dwQga8aejqeuB+TvXCMzSQemvV9hNEtDDpgUKDzOmNQayl2OG241PSWeJwKRH3CiC+sESrmoFd49rfnq7T4RnA==", - "dev": true, - "license": "MIT", - "dependencies": { - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/snapshot/node_modules/@vitest/utils": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.2.tgz", - "integrity": "sha512-xw2/TiX82lQHA06cgbqRKFb5lCAy3axQ4H4SoUFhUsg+wztiet+co86IAMDtF6Vm1hc7J6j09oh/rgDn+JdKIQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.1.2", - 
"convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, "node_modules/@vitest/snapshot/node_modules/magic-string": { "version": "0.30.21", "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", @@ -10567,14 +10477,29 @@ "dev": true, "license": "MIT" }, - "node_modules/@vitest/snapshot/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "node_modules/@vitest/spy": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.1.2.tgz", + "integrity": "sha512-DU4fBnbVCJGNBwVA6xSToNXrkZNSiw59H8tcuUspVMsBDBST4nfvsPsEHDHGtWRRnqBERBQu7TrTKskmjqTXKA==", "dev": true, "license": "MIT", - "engines": { - "node": ">=14.0.0" + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/@vitest/utils": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.2.tgz", + "integrity": "sha512-xw2/TiX82lQHA06cgbqRKFb5lCAy3axQ4H4SoUFhUsg+wztiet+co86IAMDtF6Vm1hc7J6j09oh/rgDn+JdKIQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/pretty-format": "4.1.2", + "convert-source-map": "^2.0.0", + "tinyrainbow": "^3.1.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" } }, "node_modules/@vue/compiler-core": { @@ -10823,9 +10748,9 @@ } }, "node_modules/@xmldom/xmldom": { - "version": "0.9.9", - "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.9.9.tgz", - "integrity": "sha512-qycIHAucxy/LXAYIjmLmtQ8q9GPnMbnjG1KXhWm9o5sCr6pOYDATkMPiTNa6/v8eELyqOQ2FsEqeoFYmgv/gJg==", + "version": "0.9.10", + "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.9.10.tgz", + "integrity": 
"sha512-A9gOqLdi6cV4ibazAjcQufGj0B1y/vDqYrcuP6d/6x8P27gRS8643Dj9o1dEKtB6O7fwxb2FgBmJS2mX7gpvdw==", "license": "MIT", "engines": { "node": ">=14.6" @@ -10994,26 +10919,17 @@ "zod": "^3.25.76 || ^4.1.8" } }, - "node_modules/ai/node_modules/@opentelemetry/api": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", - "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", - "license": "Apache-2.0", - "engines": { - "node": ">=8.0.0" - } - }, "node_modules/ajv": { - "version": "8.18.0", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz", - "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==", + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.15.0.tgz", + "integrity": "sha512-fgFx7Hfoq60ytK2c7DhnF8jIvzYgOMxfugjLOSMHjLIPgenqa7S7oaagATUq99mV6IYvN2tRmC0wnTYX6iPbMw==", "dev": true, "license": "MIT", "dependencies": { - "fast-deep-equal": "^3.1.3", - "fast-uri": "^3.0.1", - "json-schema-traverse": "^1.0.0", - "require-from-string": "^2.0.2" + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" }, "funding": { "type": "github", @@ -11038,6 +10954,30 @@ } } }, + "node_modules/ajv-formats/node_modules/ajv": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz", + "integrity": "sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/ajv-formats/node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true, + "license": "MIT" + }, "node_modules/ajv-keywords": { "version": "3.5.2", "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.5.2.tgz", @@ -11631,9 +11571,9 @@ } }, "node_modules/brace-expansion": { - "version": "1.1.13", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.13.tgz", - "integrity": "sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==", + "version": "1.1.14", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.14.tgz", + "integrity": "sha512-MWPGfDxnyzKU7rNOW9SP/c50vi3xrmrua/+6hfPbCS2ABNWfx24vPidzvC7krjU/RTo235sV776ymlsMtGKj8g==", "devOptional": true, "license": "MIT", "dependencies": { @@ -14444,9 +14384,9 @@ "license": "MIT" }, "node_modules/fast-uri": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", - "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", + "integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", "dev": true, "funding": [ { @@ -16103,9 +16043,9 @@ "license": "MIT" }, "node_modules/immer": { - "version": "11.1.4", - "resolved": "https://registry.npmjs.org/immer/-/immer-11.1.4.tgz", - "integrity": "sha512-XREFCPo6ksxVzP4E0ekD5aMdf8WMwmdNaz6vuvxgI40UaEiu6q3p8X52aU6GdyvLY3XXX/8R7JOTXStz/nBbRw==", + "version": "11.1.6", + "resolved": "https://registry.npmjs.org/immer/-/immer-11.1.6.tgz", + "integrity": "sha512-uwrF08UBQfxk49i9WcUeCx045wjB1zXEHNJmbYHPVVspxmjwSeWCoKbB8DEIvs3XkBJV6lcRAyLaWJ2+u3MMCw==", "license": "MIT", "funding": { "type": "opencollective", @@ 
-19177,9 +19117,9 @@ } }, "node_modules/jose": { - "version": "6.2.2", - "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.2.tgz", - "integrity": "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz", + "integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/panva" @@ -19255,9 +19195,9 @@ "license": "(AFL-2.1 OR BSD-3-Clause)" }, "node_modules/json-schema-traverse": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", - "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", "dev": true, "license": "MIT" }, @@ -19367,15 +19307,6 @@ "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", "license": "MIT" }, - "node_modules/jszip/node_modules/string_decoder": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", - "license": "MIT", - "dependencies": { - "safe-buffer": "~5.1.0" - } - }, "node_modules/jwa": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", @@ -19806,9 +19737,9 @@ } }, "node_modules/llamaindex/node_modules/@types/node": { - "version": "24.12.0", - "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.0.tgz", - "integrity": 
"sha512-GYDxsZi3ChgmckRT9HPU0WEhKLP08ev/Yfcq2AstjrDASOYCSXeyjDsHg4v5t4jOj7cyDX3vmprafKlWIG9MXQ==", + "version": "24.12.2", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.12.2.tgz", + "integrity": "sha512-A1sre26ke7HDIuY/M23nd9gfB+nrmhtYyMINbjI1zHJxYteKR6qSMX56FsmjMcDb3SMcjJg5BiRRgOCC/yBD0g==", "license": "MIT", "dependencies": { "undici-types": "~7.16.0" @@ -20220,9 +20151,9 @@ } }, "node_modules/mammoth/node_modules/@xmldom/xmldom": { - "version": "0.8.12", - "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.12.tgz", - "integrity": "sha512-9k/gHF6n/pAi/9tqr3m3aqkuiNosYTurLLUtc7xQ9sxB/wm7WPygCv8GYa6mS0fLJEHhqMC1ATYhz++U/lRHqg==", + "version": "0.8.13", + "resolved": "https://registry.npmjs.org/@xmldom/xmldom/-/xmldom-0.8.13.tgz", + "integrity": "sha512-KRYzxepc14G/CEpEGc3Yn+JKaAeT63smlDr+vjB8jRfgTBBI9wRj/nkQEO+ucV8p8I9bfKLWp37uHgFrbntPvw==", "license": "MIT", "engines": { "node": ">=10.0.0" @@ -21645,6 +21576,16 @@ "integrity": "sha512-AGK2yQKIjRuqnc6VkX2Xj5d+QW8xZ87pa1UK6yA6ouUyuxfHuMP6umE5QK7UmTeOAymo+Zx1Fxiuw9rVx8taHQ==", "license": "MIT" }, + "node_modules/node-addon-api": { + "version": "8.7.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-8.7.0.tgz", + "integrity": "sha512-9MdFxmkKaOYVTV+XVRG8ArDwwQ77XIgIPyKASB1k3JPq3M8fGQQQE3YpMOrKm6g//Ktx8ivZr8xo1Qmtqub+GA==", + "license": "MIT", + "peer": true, + "engines": { + "node": "^18 || ^20 || >= 21" + } + }, "node_modules/node-domexception": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", @@ -21695,6 +21636,18 @@ } } }, + "node_modules/node-gyp-build": { + "version": "4.8.4", + "resolved": "https://registry.npmjs.org/node-gyp-build/-/node-gyp-build-4.8.4.tgz", + "integrity": "sha512-LA4ZjwlnUblHVgq0oBF3Jl/6h/Nvs5fzBLwdEF4nuxnFdsfajde4WfxtJr3CaiH+F6ewcIB/q4jQ4UzPyid+CQ==", + "license": "MIT", + "peer": true, + "bin": { + "node-gyp-build": "bin.js", + "node-gyp-build-optional": 
"optional.js", + "node-gyp-build-test": "build-test.js" + } + }, "node_modules/node-gyp-build-optional-packages": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/node-gyp-build-optional-packages/-/node-gyp-build-optional-packages-5.2.2.tgz", @@ -21977,13 +21930,13 @@ } }, "node_modules/openid-client": { - "version": "6.8.2", - "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.2.tgz", - "integrity": "sha512-uOvTCndr4udZsKihJ68H9bUICrriHdUVJ6Az+4Ns6cW55rwM5h0bjVIzDz2SxgOI84LKjFyjOFvERLzdTUROGA==", + "version": "6.8.4", + "resolved": "https://registry.npmjs.org/openid-client/-/openid-client-6.8.4.tgz", + "integrity": "sha512-QSw0BA08piujetEwfZsHoTrDpMEha7GDZDicQqVwX4u0ChCjefvjDB++TZ8BTg76UpwhzIQgdvvfgfl3HpCSAw==", "license": "MIT", "dependencies": { - "jose": "^6.1.3", - "oauth4webapi": "^3.8.4" + "jose": "^6.2.2", + "oauth4webapi": "^3.8.5" }, "funding": { "url": "https://github.com/sponsors/panva" @@ -22310,9 +22263,9 @@ } }, "node_modules/path-scurry/node_modules/lru-cache": { - "version": "11.2.7", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.2.7.tgz", - "integrity": "sha512-aY/R+aEsRelme17KGQa/1ZSIpLpNYYrhcrepKTZgE+W3WM16YMCaPwOHLHsmopZHELU0Ojin1lPVxKR0MihncA==", + "version": "11.3.6", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.3.6.tgz", + "integrity": "sha512-Gf/KoL3C/MlI7Bt0PGI9I+TeTC/I6r/csU58N4BSNc4lppLBeKsOdFYkK+dX0ABDUMJNfCHTyPpzwwO21Awd3A==", "dev": true, "license": "BlueOak-1.0.0", "engines": { @@ -22537,9 +22490,9 @@ } }, "node_modules/postcss": { - "version": "8.5.10", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.10.tgz", - "integrity": "sha512-pMMHxBOZKFU6HgAZ4eyGnwXF/EvPGGqUr0MnZ5+99485wwW41kW91A4LOGxSHhgugZmSChL5AlElNdwlNgcnLQ==", + "version": "8.5.14", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.14.tgz", + "integrity": "sha512-SoSL4+OSEtR99LHFZQiJLkT59C5B1amGO1NzTwj7TT1qCUgUO6hxOvzkOYxD+vMrXBM3XJIKzokoERdqQq/Zmg==", "funding": [ 
{ "type": "opencollective", @@ -23975,9 +23928,9 @@ } }, "node_modules/rimraf/node_modules/brace-expansion": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.3.tgz", - "integrity": "sha512-MCV/fYJEbqx68aE58kv2cA/kiky1G8vux3OR6/jbS+jIMe/6fJWa0DTzJU7dqijOWYwHi1t29FlfYI9uytqlpA==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.1.0.tgz", + "integrity": "sha512-TN1kCZAgdgweJhWWpgKYrQaMNHcDULHkWwQIspdtjV4Y5aurRdZpjAqn6yX3FPqTA9ngHCc4hJxMAMgGfve85w==", "license": "MIT", "dependencies": { "balanced-match": "^1.0.0" @@ -24119,7 +24072,6 @@ "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.2.tgz", "integrity": "sha512-dhKf903U/PQZY6boNNtAGdWbG85WAbjT/1xYoZIC7FAY0yWapOBQVsVrDl58W86//e1VpMNBtRV4MaXfdMySFA==", "license": "Apache-2.0", - "peer": true, "dependencies": { "tslib": "^2.1.0" } @@ -24264,30 +24216,6 @@ "url": "https://opencollective.com/webpack" } }, - "node_modules/schema-utils/node_modules/ajv": { - "version": "6.14.0", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.14.0.tgz", - "integrity": "sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==", - "dev": true, - "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" - } - }, - "node_modules/schema-utils/node_modules/json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", - "dev": true, - "license": "MIT" - }, "node_modules/selderee": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/selderee/-/selderee-0.11.0.tgz", @@ -24952,36 +24880,6 @@ 
"@testing-library/dom": ">=7.21.4" } }, - "node_modules/storybook/node_modules/@vitest/expect": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-3.2.4.tgz", - "integrity": "sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/chai": "^5.2.2", - "@vitest/spy": "3.2.4", - "@vitest/utils": "3.2.4", - "chai": "^5.2.0", - "tinyrainbow": "^2.0.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/storybook/node_modules/@vitest/pretty-format": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-3.2.4.tgz", - "integrity": "sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==", - "dev": true, - "license": "MIT", - "dependencies": { - "tinyrainbow": "^2.0.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, "node_modules/storybook/node_modules/@vitest/spy": { "version": "3.2.4", "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-3.2.4.tgz", @@ -24995,21 +24893,6 @@ "url": "https://opencollective.com/vitest" } }, - "node_modules/storybook/node_modules/@vitest/utils": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-3.2.4.tgz", - "integrity": "sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "3.2.4", - "loupe": "^3.1.4", - "tinyrainbow": "^2.0.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, "node_modules/storybook/node_modules/dom-accessibility-api": { "version": "0.6.3", "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.6.3.tgz", @@ -25030,26 +24913,6 @@ "node": ">=10" } }, - "node_modules/storybook/node_modules/tinyrainbow": { - 
"version": "2.0.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-2.0.0.tgz", - "integrity": "sha512-op4nsTR47R6p0vMUUoYl/a+ljLFVtlfaXkLQmqfLR1qHma1h/ysYk4hEXZ880bf2CYgTskvTa/e196Vd5dDQXw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/storybook/node_modules/tinyspy": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/tinyspy/-/tinyspy-4.0.4.tgz", - "integrity": "sha512-azl+t0z7pw/z958Gy9svOTuzqIk6xq+NSheJzn5MMWtWTFywIacg2wUlzKFGtt3cthx0r2SxMK0yzJOR0IES7Q==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.0.0" - } - }, "node_modules/stream-events": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/stream-events/-/stream-events-1.0.5.tgz", @@ -25084,14 +24947,20 @@ } }, "node_modules/string_decoder": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", - "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", "license": "MIT", "dependencies": { - "safe-buffer": "~5.2.0" + "safe-buffer": "~5.1.0" } }, + "node_modules/string_decoder/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, "node_modules/string-length": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/string-length/-/string-length-4.0.2.tgz", @@ -25650,6 +25519,23 @@ } } }, + "node_modules/terser-webpack-plugin/node_modules/ajv": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz", + "integrity": 
"sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, "node_modules/terser-webpack-plugin/node_modules/ajv-formats": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz", @@ -25696,6 +25582,13 @@ "node": ">= 10.13.0" } }, + "node_modules/terser-webpack-plugin/node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true, + "license": "MIT" + }, "node_modules/terser-webpack-plugin/node_modules/schema-utils": { "version": "4.3.3", "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-4.3.3.tgz", @@ -25815,6 +25708,26 @@ "url": "https://github.com/sponsors/SuperchupuDev" } }, + "node_modules/tinyrainbow": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", + "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/tinyspy": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/tinyspy/-/tinyspy-4.0.4.tgz", + "integrity": "sha512-azl+t0z7pw/z958Gy9svOTuzqIk6xq+NSheJzn5MMWtWTFywIacg2wUlzKFGtt3cthx0r2SxMK0yzJOR0IES7Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/tmpl": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz", @@ -25882,6 +25795,18 @@ "integrity": 
"sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", "license": "MIT" }, + "node_modules/tree-sitter": { + "version": "0.22.4", + "resolved": "https://registry.npmjs.org/tree-sitter/-/tree-sitter-0.22.4.tgz", + "integrity": "sha512-usbHZP9/oxNsUY65MQUsduGRqDHQOou1cagUSwjhoSYAmSahjQDAVsh9s+SlZkn8X8+O1FULRGwHu7AFP3kjzg==", + "hasInstallScript": true, + "license": "MIT", + "peer": true, + "dependencies": { + "node-addon-api": "^8.3.0", + "node-gyp-build": "^4.8.4" + } + }, "node_modules/trim-lines": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz", @@ -26402,9 +26327,9 @@ } }, "node_modules/typeorm/node_modules/brace-expansion": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.3.tgz", - "integrity": "sha512-MCV/fYJEbqx68aE58kv2cA/kiky1G8vux3OR6/jbS+jIMe/6fJWa0DTzJU7dqijOWYwHi1t29FlfYI9uytqlpA==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.1.0.tgz", + "integrity": "sha512-TN1kCZAgdgweJhWWpgKYrQaMNHcDULHkWwQIspdtjV4Y5aurRdZpjAqn6yX3FPqTA9ngHCc4hJxMAMgGfve85w==", "license": "MIT", "dependencies": { "balanced-match": "^1.0.0" @@ -27120,44 +27045,6 @@ "url": "https://opencollective.com/vitest" } }, - "node_modules/vitest/node_modules/@vitest/pretty-format": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.2.tgz", - "integrity": "sha512-dwQga8aejqeuB+TvXCMzSQemvV9hNEtDDpgUKDzOmNQayl2OG241PSWeJwKRH3CiC+sESrmoFd49rfnq7T4RnA==", - "dev": true, - "license": "MIT", - "dependencies": { - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/vitest/node_modules/@vitest/spy": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.1.2.tgz", - "integrity": 
"sha512-DU4fBnbVCJGNBwVA6xSToNXrkZNSiw59H8tcuUspVMsBDBST4nfvsPsEHDHGtWRRnqBERBQu7TrTKskmjqTXKA==", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/vitest/node_modules/@vitest/utils": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.2.tgz", - "integrity": "sha512-xw2/TiX82lQHA06cgbqRKFb5lCAy3axQ4H4SoUFhUsg+wztiet+co86IAMDtF6Vm1hc7J6j09oh/rgDn+JdKIQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.1.2", - "convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, "node_modules/vitest/node_modules/chai": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/chai/-/chai-6.2.2.tgz", @@ -27185,16 +27072,6 @@ "dev": true, "license": "MIT" }, - "node_modules/vitest/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.0.0" - } - }, "node_modules/void-elements": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/void-elements/-/void-elements-3.1.0.tgz", @@ -27270,6 +27147,13 @@ "node": ">= 8" } }, + "node_modules/web-tree-sitter": { + "version": "0.24.7", + "resolved": "https://registry.npmjs.org/web-tree-sitter/-/web-tree-sitter-0.24.7.tgz", + "integrity": "sha512-CdC/TqVFbXqR+C51v38hv6wOPatKEUGxa39scAeFSm98wIhZxAYonhRQPSMmfZ2w7JDI0zQDdzdmgtNk06/krQ==", + "license": "MIT", + "peer": true + }, "node_modules/webidl-conversions": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", @@ -27277,12 +27161,11 @@ "license": "BSD-2-Clause" }, "node_modules/webpack": { - "version": "5.105.4", - "resolved": 
"https://registry.npmjs.org/webpack/-/webpack-5.105.4.tgz", - "integrity": "sha512-jTywjboN9aHxFlToqb0K0Zs9SbBoW4zRUlGzI2tYNxVYcEi/IPpn+Xi4ye5jTLvX2YeLuic/IvxNot+Q1jMoOw==", + "version": "5.106.0", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.106.0.tgz", + "integrity": "sha512-Pkx5joZ9RrdgO5LBkyX1L2ZAJeK/Taz3vqZ9CbcP0wS5LEMx5QkKsEwLl29QJfihZ+DKRBFldzy1O30pJ1MDpA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@types/eslint-scope": "^3.7.7", "@types/estree": "^1.0.8", @@ -27353,13 +27236,29 @@ "dev": true, "license": "MIT" }, + "node_modules/webpack/node_modules/ajv": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz", + "integrity": "sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==", + "dev": true, + "license": "MIT", + "dependencies": { + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, "node_modules/webpack/node_modules/ajv-formats": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-2.1.1.tgz", "integrity": "sha512-Wx0Kx52hxE7C18hkMEggYlEifqWZtYaRgouJor+WMdPnQyEK13vgEWyVNup7SoeeoLMsr4kf5h6dOW11I15MUA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "ajv": "^8.0.0" }, @@ -27378,7 +27277,6 @@ "integrity": "sha512-YCS/JNFAUyr5vAuhk1DWm1CBxRHW9LbJ2ozWeemrIqpbsqKjHVxYPyi5GC0rjZIT5JxJ3virVTS8wk4i/Z+krw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "fast-deep-equal": "^3.1.3" }, @@ -27386,13 +27284,19 @@ "ajv": "^8.8.2" } }, + "node_modules/webpack/node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": 
"sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true, + "license": "MIT" + }, "node_modules/webpack/node_modules/mime-db": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">= 0.6" } @@ -27403,7 +27307,6 @@ "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "mime-db": "1.52.0" }, @@ -27417,7 +27320,6 @@ "integrity": "sha512-eflK8wEtyOE6+hsaRVPxvUKYCpRgzLqDTb8krvAsRIwOGlHoSgYLgBXoubGgLd2fT41/OUYdb48v4k4WWHQurA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@types/json-schema": "^7.0.9", "ajv": "^8.9.0", @@ -27785,9 +27687,9 @@ } }, "node_modules/zod": { - "version": "4.3.6", - "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", - "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/zod/-/zod-4.4.3.tgz", + "integrity": "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==", "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" From 111abc8d5e10597bf1937106dd6050f96a7b6832 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Thu, 7 May 2026 09:55:17 +0200 Subject: [PATCH 44/58] test(api): add e2e coverage for crawling endpoints --- .../documents/crawling/e2e-tests/auth.spec.ts | 182 ++++++++++++++++++ .../crawling/e2e-tests/crawl-url.spec.ts | 125 ++++++++++++ .../crawling/e2e-tests/recrawl-url.spec.ts | 155 +++++++++++++++ .../src/domains/documents/test-overrides.ts | 36 ++++ 4 files changed, 498 insertions(+) create mode 100644 
apps/api/src/domains/documents/crawling/e2e-tests/auth.spec.ts create mode 100644 apps/api/src/domains/documents/crawling/e2e-tests/crawl-url.spec.ts create mode 100644 apps/api/src/domains/documents/crawling/e2e-tests/recrawl-url.spec.ts diff --git a/apps/api/src/domains/documents/crawling/e2e-tests/auth.spec.ts b/apps/api/src/domains/documents/crawling/e2e-tests/auth.spec.ts new file mode 100644 index 00000000..59e8e031 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/e2e-tests/auth.spec.ts @@ -0,0 +1,182 @@ +import { randomUUID } from "node:crypto" +import { DocumentsRoutes } from "@caseai-connect/api-contracts" +import type { INestApplication } from "@nestjs/common" +import type { App } from "supertest/types" +import { AUTH_ERRORS } from "@/common/errors/auth-errors" +import { + type AllRepositories, + clearTestDatabase, + setupE2eTestDatabase, + teardownE2eTestDatabase, +} from "@/common/test/test-database" +import { removeNullish } from "@/common/utils/remove-nullish" +import { createOrganizationWithDocument } from "@/domains/organizations/organization.factory" +import { projectFactory } from "@/domains/projects/project.factory" +import { mockForeignAuth0Id } from "../../../../../test/e2e.helpers" +import { expectResponse, type Requester, testRequester } from "../../../../../test/request" +import { DocumentsModule } from "../../documents.module" +import { withCrawlingAndAuthMocks } from "../../test-overrides" + +describe("Documents Crawling - Auth", () => { + let app: INestApplication + let request: Requester + let setup: Awaited> + let repositories: AllRepositories + + let organizationId: string | null = "random-organization-id" + let projectId: string | null = "random-project-id" + let documentId: string | null = "random-document-id" + let accessToken: string | null = "token" + let auth0Id = `auth0|${randomUUID()}` + + beforeAll(async () => { + setup = await setupE2eTestDatabase({ + additionalImports: [DocumentsModule], + applyOverrides: 
(moduleBuilder) => withCrawlingAndAuthMocks(moduleBuilder, () => auth0Id), + }) + repositories = setup.getAllRepositories() + app = setup.module.createNestApplication() + await app.init() + request = testRequester(app) + }) + + beforeEach(async () => { + await clearTestDatabase(setup.dataSource) + organizationId = "random-organization-id" + projectId = "random-project-id" + documentId = "random-document-id" + accessToken = "token" + auth0Id = `auth0|${randomUUID()}` + }) + + afterAll(async () => { + await teardownE2eTestDatabase(setup) + await app.close() + }) + + const createContextForRole = async (role: "owner" | "admin" | "member" = "owner") => { + const { organization, project, document } = await createOrganizationWithDocument(repositories, { + user: { auth0Id }, + projectMembership: { role }, + document: { sourceType: "webCrawl", sourceUrl: "https://example.com" }, + }) + organizationId = organization.id + projectId = project.id + documentId = document.id + accessToken = "token" + return { organization, project, document } + } + + describe("DocumentsRoutes.crawlUrl", () => { + const subject = async () => + request({ + route: DocumentsRoutes.crawlUrl, + pathParams: removeNullish({ organizationId, projectId }), + token: accessToken ?? 
undefined, + request: { payload: { url: "https://example.com" } }, + }) + + it("requires an authentication token", async () => { + accessToken = null + expectResponse(await subject(), 401, AUTH_ERRORS.NO_ACCESS_TOKEN) + }) + it("requires a valid organization ID", async () => { + organizationId = null + expectResponse(await subject(), 400, AUTH_ERRORS.NO_ORGANIZATION_ID) + }) + it("requires a valid project ID", async () => { + await createContextForRole("owner") + projectId = randomUUID() + expectResponse(await subject(), 404) + }) + it("requires the user to be a member of the organization", async () => { + await createContextForRole("owner") + auth0Id = mockForeignAuth0Id() + expectResponse(await subject(), 401, AUTH_ERRORS.NOT_MEMBER_OF_ORG) + }) + it("doesn't allow a simple member to crawl a URL", async () => { + await createContextForRole("member") + expectResponse(await subject(), 403, AUTH_ERRORS.UNAUTHORIZED_RESOURCE) + }) + it("allows an admin to crawl a URL", async () => { + await createContextForRole("admin") + expectResponse(await subject(), 202) + }) + }) + + describe("DocumentsRoutes.reCrawlUrl", () => { + const subject = async () => + request({ + route: DocumentsRoutes.reCrawlUrl, + pathParams: removeNullish({ organizationId, projectId, documentId }), + token: accessToken ?? 
undefined, + }) + + it("requires an authentication token", async () => { + accessToken = null + expectResponse(await subject(), 401, AUTH_ERRORS.NO_ACCESS_TOKEN) + }) + it("requires a valid organization ID", async () => { + organizationId = null + expectResponse(await subject(), 400, AUTH_ERRORS.NO_ORGANIZATION_ID) + }) + it("requires a valid project ID", async () => { + await createContextForRole("owner") + projectId = randomUUID() + expectResponse(await subject(), 404) + }) + it("requires the user to be a member of the organization", async () => { + await createContextForRole("owner") + auth0Id = mockForeignAuth0Id() + expectResponse(await subject(), 401, AUTH_ERRORS.NOT_MEMBER_OF_ORG) + }) + it("requires the document to be part of the project", async () => { + const { organization } = await createContextForRole("owner") + const project2 = await repositories.projectRepository.save( + projectFactory.transient({ organization }).build(), + ) + projectId = project2.id + expectResponse(await subject(), 404) + }) + it("doesn't allow a simple member to recrawl a document", async () => { + await createContextForRole("member") + expectResponse(await subject(), 403, AUTH_ERRORS.UNAUTHORIZED_RESOURCE) + }) + it("allows an admin to recrawl a document", async () => { + await createContextForRole("admin") + expectResponse(await subject(), 202) + }) + }) + + describe("DocumentsRoutes.streamCrawlProgress", () => { + const subject = async () => + request({ + route: DocumentsRoutes.streamCrawlProgress, + pathParams: removeNullish({ organizationId, projectId }), + token: accessToken ?? 
undefined, + }) + + it("requires an authentication token", async () => { + accessToken = null + expectResponse(await subject(), 401, AUTH_ERRORS.NO_ACCESS_TOKEN) + }) + it("requires a valid organization ID", async () => { + organizationId = null + expectResponse(await subject(), 400, AUTH_ERRORS.NO_ORGANIZATION_ID) + }) + it("requires a valid project ID", async () => { + await createContextForRole("owner") + projectId = randomUUID() + expectResponse(await subject(), 404) + }) + it("requires the user to be a member of the organization", async () => { + await createContextForRole("owner") + auth0Id = mockForeignAuth0Id() + expectResponse(await subject(), 401, AUTH_ERRORS.NOT_MEMBER_OF_ORG) + }) + it("doesn't allow a simple member to stream crawl progress", async () => { + await createContextForRole("member") + expectResponse(await subject(), 403, AUTH_ERRORS.UNAUTHORIZED_RESOURCE) + }) + }) +}) diff --git a/apps/api/src/domains/documents/crawling/e2e-tests/crawl-url.spec.ts b/apps/api/src/domains/documents/crawling/e2e-tests/crawl-url.spec.ts new file mode 100644 index 00000000..bc52c429 --- /dev/null +++ b/apps/api/src/domains/documents/crawling/e2e-tests/crawl-url.spec.ts @@ -0,0 +1,125 @@ +import { DocumentsRoutes } from "@caseai-connect/api-contracts" +import type { INestApplication } from "@nestjs/common" +import type { App } from "supertest/types" +import { + type AllRepositories, + clearTestDatabase, + setupE2eTestDatabase, + teardownE2eTestDatabase, +} from "@/common/test/test-database" +import { removeNullish } from "@/common/utils/remove-nullish" +import { createOrganizationWithProject } from "@/domains/organizations/organization.factory" +import { expectResponse, type Requester, testRequester } from "../../../../../test/request" +import { Document } from "../../document.entity" +import { DocumentsModule } from "../../documents.module" +import { withCrawlingAndAuthMocks } from "../../test-overrides" +import { URL_CRAWLING_BATCH_SERVICE, type 
UrlCrawlingBatchService } from "../url-crawling-batch.interface" + +describe("Documents - crawlUrl", () => { + let app: INestApplication + let request: Requester + let setup: Awaited> + let repositories: AllRepositories + + let organizationId: string + let projectId: string + let userId: string + let accessToken: string | undefined = "token" + let auth0Id = "auth0|123" + let crawlingBatchServiceMock: { enqueueCrawlUrl: jest.MockedFunction } + + beforeAll(async () => { + setup = await setupE2eTestDatabase({ + additionalImports: [DocumentsModule], + applyOverrides: (moduleBuilder) => withCrawlingAndAuthMocks(moduleBuilder, () => auth0Id), + }) + repositories = setup.getAllRepositories() + crawlingBatchServiceMock = setup.module.get(URL_CRAWLING_BATCH_SERVICE) + app = setup.module.createNestApplication() + await app.init() + request = testRequester(app) + }) + + beforeEach(async () => { + await clearTestDatabase(setup.dataSource) + accessToken = "token" + auth0Id = "auth0|123" + crawlingBatchServiceMock.enqueueCrawlUrl.mockClear() + }) + + afterAll(async () => { + await teardownE2eTestDatabase(setup) + await app.close() + }) + + const createContext = async () => { + const { user, organization, project } = await createOrganizationWithProject(repositories, { + user: { auth0Id }, + }) + userId = user.id + organizationId = organization.id + projectId = project.id + } + + const subject = async (payload: { url: string; name?: string }) => + request({ + route: DocumentsRoutes.crawlUrl, + pathParams: removeNullish({ organizationId, projectId }), + token: accessToken, + request: { payload }, + }) + + it("creates a webCrawl document and enqueues the crawl job", async () => { + await createContext() + + const url = "https://example.com" + const response = await subject({ url }) + + expectResponse(response, 202) + expect(response.body.data.message).toContain(url) + + const documents = await repositories.documentRepository.find({ + where: { projectId, sourceType: "webCrawl" }, + 
}) + expect(documents).toHaveLength(1) + const document = documents[0] as Document + expect(document.sourceType).toBe("webCrawl") + expect(document.sourceUrl).toBe(url) + expect(document.title).toBe(url) + expect(document.mimeType).toBe("text/html") + expect(document.embeddingStatus).toBe("pending") + + expect(crawlingBatchServiceMock.enqueueCrawlUrl).toHaveBeenCalledWith( + expect.objectContaining({ + documentId: document.id, + url, + organizationId, + projectId, + requestedByUserId: userId, + }), + ) + }) + + it("uses the optional name as the document title", async () => { + await createContext() + + const url = "https://example.com" + const name = "My Documentation Site" + await subject({ url, name }) + + const documents = await repositories.documentRepository.find({ + where: { projectId, sourceType: "webCrawl" }, + }) + expect(documents[0]?.title).toBe(name) + expect(documents[0]?.sourceUrl).toBe(url) + }) + + it("rejects an invalid URL with 422", async () => { + await createContext() + + const response = await subject({ url: "not-a-valid-url" }) + + expectResponse(response, 422, "Invalid URL.") + expect(crawlingBatchServiceMock.enqueueCrawlUrl).not.toHaveBeenCalled() + }) +}) diff --git a/apps/api/src/domains/documents/crawling/e2e-tests/recrawl-url.spec.ts b/apps/api/src/domains/documents/crawling/e2e-tests/recrawl-url.spec.ts new file mode 100644 index 00000000..b03e703c --- /dev/null +++ b/apps/api/src/domains/documents/crawling/e2e-tests/recrawl-url.spec.ts @@ -0,0 +1,155 @@ +import { DocumentsRoutes } from "@caseai-connect/api-contracts" +import type { INestApplication } from "@nestjs/common" +import type { App } from "supertest/types" +import { + type AllRepositories, + clearTestDatabase, + setupE2eTestDatabase, + teardownE2eTestDatabase, +} from "@/common/test/test-database" +import { removeNullish } from "@/common/utils/remove-nullish" +import { createOrganizationWithDocument } from "@/domains/organizations/organization.factory" +import { 
expectResponse, type Requester, testRequester } from "../../../../../test/request" +import { DocumentsModule } from "../../documents.module" +import { DocumentEmbeddingStatusNotifierService } from "../../embeddings/document-embedding-status-notifier.service" +import { withCrawlingAndAuthMocks } from "../../test-overrides" +import { URL_CRAWLING_BATCH_SERVICE, type UrlCrawlingBatchService } from "../url-crawling-batch.interface" + +describe("Documents - reCrawlUrl", () => { + let app: INestApplication + let request: Requester + let setup: Awaited> + let repositories: AllRepositories + + let organizationId: string + let projectId: string + let documentId: string + let accessToken: string | undefined = "token" + let auth0Id = "auth0|123" + let crawlingBatchServiceMock: { enqueueCrawlUrl: jest.MockedFunction } + let notifierMock: { notifyEmbeddingStatusChanged: jest.MockedFunction } + + beforeAll(async () => { + setup = await setupE2eTestDatabase({ + additionalImports: [DocumentsModule], + applyOverrides: (moduleBuilder) => withCrawlingAndAuthMocks(moduleBuilder, () => auth0Id), + }) + repositories = setup.getAllRepositories() + crawlingBatchServiceMock = setup.module.get(URL_CRAWLING_BATCH_SERVICE) + notifierMock = setup.module.get(DocumentEmbeddingStatusNotifierService) + app = setup.module.createNestApplication() + await app.init() + request = testRequester(app) + }) + + beforeEach(async () => { + await clearTestDatabase(setup.dataSource) + accessToken = "token" + auth0Id = "auth0|123" + crawlingBatchServiceMock.enqueueCrawlUrl.mockClear() + notifierMock.notifyEmbeddingStatusChanged.mockClear() + }) + + afterAll(async () => { + await teardownE2eTestDatabase(setup) + await app.close() + }) + + const createContext = async (overrides?: Partial<{ sourceUrl: string | null; title: string; content: string | null; embeddingStatus: string }>) => { + const { user, organization, project, document } = await createOrganizationWithDocument( + repositories, + { + user: { auth0Id 
}, + document: { + sourceType: "webCrawl", + sourceUrl: overrides?.sourceUrl !== undefined ? overrides.sourceUrl : "https://example.com", + title: overrides?.title ?? "https://example.com", + content: overrides?.content !== undefined ? overrides.content : null, + embeddingStatus: (overrides?.embeddingStatus as "completed") ?? "completed", + }, + }, + ) + organizationId = organization.id + projectId = project.id + documentId = document.id + return { user, organization, project, document } + } + + const subject = async () => + request({ + route: DocumentsRoutes.reCrawlUrl, + pathParams: removeNullish({ organizationId, projectId, documentId }), + token: accessToken, + }) + + it("resets the document and re-enqueues the crawl job using sourceUrl", async () => { + await createContext({ sourceUrl: "https://example.com" }) + + const response = await subject() + + expectResponse(response, 202) + expect(response.body.data.message).toContain("https://example.com") + + const document = await repositories.documentRepository.findOne({ where: { id: documentId } }) + expect(document?.embeddingStatus).toBe("pending") + expect(document?.content).toBeNull() + expect(document?.embeddingError).toBeNull() + + expect(crawlingBatchServiceMock.enqueueCrawlUrl).toHaveBeenCalledWith( + expect.objectContaining({ url: "https://example.com", documentId, organizationId, projectId }), + ) + expect(notifierMock.notifyEmbeddingStatusChanged).toHaveBeenCalledWith( + expect.objectContaining({ documentId, embeddingStatus: "pending" }), + ) + }) + + it("falls back to title when sourceUrl is null and title is a valid URL", async () => { + await createContext({ sourceUrl: null, title: "https://fallback.example.com" }) + + const response = await subject() + + expectResponse(response, 202) + expect(crawlingBatchServiceMock.enqueueCrawlUrl).toHaveBeenCalledWith( + expect.objectContaining({ url: "https://fallback.example.com" }), + ) + }) + + it("falls back to shortest URL in content when sourceUrl is null 
and title is an alias", async () => { + const content = JSON.stringify([ + { url: "https://example.com/page1", markdown: "" }, + { url: "https://example.com", markdown: "" }, + ]) + await createContext({ sourceUrl: null, title: "My Site Alias", content }) + + const response = await subject() + + expectResponse(response, 202) + expect(crawlingBatchServiceMock.enqueueCrawlUrl).toHaveBeenCalledWith( + expect.objectContaining({ url: "https://example.com" }), + ) + }) + + it("rejects documents that are not webCrawl type", async () => { + const { organization, project } = await createOrganizationWithDocument(repositories, { + user: { auth0Id }, + document: { sourceType: "project" }, + }) + organizationId = organization.id + projectId = project.id + documentId = (await repositories.documentRepository.findOne({ where: { projectId } }))!.id + + const response = await subject() + + expectResponse(response, 422, "Document is not a web crawl source.") + expect(crawlingBatchServiceMock.enqueueCrawlUrl).not.toHaveBeenCalled() + }) + + it("rejects when sourceUrl is null, title is not a URL, and content is empty", async () => { + await createContext({ sourceUrl: null, title: "Just a name", content: null }) + + const response = await subject() + + expectResponse(response, 422, "Source URL not available for this document. 
Please delete it and crawl the website again.") + expect(crawlingBatchServiceMock.enqueueCrawlUrl).not.toHaveBeenCalled() + }) +}) diff --git a/apps/api/src/domains/documents/test-overrides.ts b/apps/api/src/domains/documents/test-overrides.ts index 6e217c6b..25f04a2d 100644 --- a/apps/api/src/domains/documents/test-overrides.ts +++ b/apps/api/src/domains/documents/test-overrides.ts @@ -1,5 +1,7 @@ import type { TestingModuleBuilder } from "@nestjs/testing" import { setupUserGuardForTesting } from "../../../test/e2e.helpers" +import { URL_CRAWLING_BATCH_SERVICE } from "./crawling/url-crawling-batch.interface" +import { DocumentEmbeddingStatusNotifierService } from "./embeddings/document-embedding-status-notifier.service" import { DOCUMENT_EMBEDDINGS_BATCH_SERVICE } from "./embeddings/document-embeddings-batch.interface" function createDocumentEmbeddingsBatchServiceMock() { @@ -26,3 +28,37 @@ export function withDocumentAuthAndEmbeddingsMocks( ): TestingModuleBuilder { return setupUserGuardForTesting(withDocumentEmbeddingsBatchServiceMock(moduleBuilder), getAuth0Id) } + +function createUrlCrawlingBatchServiceMock() { + return { enqueueCrawlUrl: jest.fn().mockResolvedValue(undefined) } +} + +export function withUrlCrawlingBatchServiceMock( + moduleBuilder: TestingModuleBuilder, +): TestingModuleBuilder { + return moduleBuilder + .overrideProvider(URL_CRAWLING_BATCH_SERVICE) + .useValue(createUrlCrawlingBatchServiceMock()) +} + +export function withDocumentEmbeddingStatusNotifierMock( + moduleBuilder: TestingModuleBuilder, +): TestingModuleBuilder { + return moduleBuilder + .overrideProvider(DocumentEmbeddingStatusNotifierService) + .useValue({ notifyEmbeddingStatusChanged: jest.fn().mockResolvedValue(undefined) }) +} + +export function withCrawlingAndAuthMocks( + moduleBuilder: TestingModuleBuilder, + getAuth0Id: () => string, +): TestingModuleBuilder { + return setupUserGuardForTesting( + withUrlCrawlingBatchServiceMock( + withDocumentEmbeddingsBatchServiceMock( + 
withDocumentEmbeddingStatusNotifierMock(moduleBuilder), + ), + ), + getAuth0Id, + ) +} From 7ddda1f07682956484f67719d4715dd420e6a174 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 11:19:31 +0200 Subject: [PATCH 45/58] feat: add a sourceType parameter to listDocuments and the getAll call to fetch only the relevant sources --- .../api/src/domains/documents/documents.controller.ts | 3 ++- apps/api/src/domains/documents/documents.service.ts | 5 ++--- .../documents/service-tests/list-documents.spec.ts | 11 +++++++---- .../src/studio/features/documents/documents.slice.ts | 6 ++++++ .../src/studio/features/documents/documents.spi.ts | 2 +- .../src/studio/features/documents/documents.thunks.ts | 4 +++- .../features/documents/external/documents.api.ts | 4 ++-- .../api-contracts/src/documents/documents.routes.ts | 2 +- 8 files changed, 24 insertions(+), 13 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 5bf51e75..fca2877c 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -313,8 +313,9 @@ export class DocumentsController { @Get(DocumentsRoutes.getAll.path) async getAll( @Request() req: EndpointRequestWithProject, + @Param("sourceType") sourceType: DocumentSourceType, ): Promise { - const documents = await this.documentsService.listDocuments(getRequiredConnectScope(req)) + const documents = await this.documentsService.listDocuments(getRequiredConnectScope(req), sourceType) return { data: documents.map(toDocumentDto) } } diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 73899eca..3948bcf6 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -78,12 +78,11 @@ export class DocumentsService { private sortNewestFirst = (a: Document, b: 
Document) => b.createdAt.getTime() - a.createdAt.getTime() - async listDocuments(connectScope: RequiredConnectScope): Promise { + async listDocuments(connectScope: RequiredConnectScope, sourceType: Document["sourceType"]): Promise { return ( await this.documentConnectRepository.find(connectScope, { where: [ - { sourceType: "project", uploadStatus: "uploaded" }, - { sourceType: "webCrawl", uploadStatus: "uploaded" }, + { sourceType, uploadStatus: "uploaded" } ], relations: ["tags"], }) diff --git a/apps/api/src/domains/documents/service-tests/list-documents.spec.ts b/apps/api/src/domains/documents/service-tests/list-documents.spec.ts index 17d58443..ce14134d 100644 --- a/apps/api/src/domains/documents/service-tests/list-documents.spec.ts +++ b/apps/api/src/domains/documents/service-tests/list-documents.spec.ts @@ -25,10 +25,13 @@ describe("listDocuments", () => { }) await repositories.documentRepository.save([document1, document2, deletedDocument]) - const result = await service.listDocuments({ - organizationId: organization.id, - projectId: project.id, - }) + const result = await service.listDocuments( + { + organizationId: organization.id, + projectId: project.id, + }, + "project", + ) expect(result).toHaveLength(2) expect(result.map((r) => r.title)).toContain("Document 1") diff --git a/apps/web/src/studio/features/documents/documents.slice.ts b/apps/web/src/studio/features/documents/documents.slice.ts index 3314c873..c7c472b4 100644 --- a/apps/web/src/studio/features/documents/documents.slice.ts +++ b/apps/web/src/studio/features/documents/documents.slice.ts @@ -1,4 +1,5 @@ import { createSlice, type PayloadAction } from "@reduxjs/toolkit" +import type { DocumentSourceType } from "@caseai-connect/api-contracts" import { ADS, type AsyncData, defaultAsyncData } from "@/common/store/async-data-status" import type { Document } from "./documents.models" import { listDocuments, uploadDocuments } from "./documents.thunks" @@ -18,6 +19,7 @@ type CrawlProgressStreamState 
= { } interface State { currentDocumentId: string | null + currentSourceType: DocumentSourceType | null data: AsyncData uploader: UploaderState embeddingStatusStream: EmbeddingStatusStreamState @@ -27,6 +29,7 @@ interface State { const initialState: State = { currentDocumentId: null, + currentSourceType: null, data: defaultAsyncData, uploader: { status: "idle", @@ -96,6 +99,9 @@ const slice = createSlice({ setCurrentDocumentId: (state, action: PayloadAction<{ documentId: string | null }>) => { state.currentDocumentId = action.payload.documentId }, + setCurrentSourceType: (state, action: PayloadAction<{ sourceType: DocumentSourceType | null }>) => { + state.currentSourceType = action.payload.sourceType + }, startEmbeddingStatusStream: (state) => { state.embeddingStatusStream.isActive = true }, diff --git a/apps/web/src/studio/features/documents/documents.spi.ts b/apps/web/src/studio/features/documents/documents.spi.ts index 4637c28d..7e8f49a7 100644 --- a/apps/web/src/studio/features/documents/documents.spi.ts +++ b/apps/web/src/studio/features/documents/documents.spi.ts @@ -7,7 +7,7 @@ import type { } from "./documents.models" export interface IDocumentsSpi { - getAll(params: { organizationId: string; projectId: string }): Promise + getAll(params: { organizationId: string; projectId: string; sourceType: DocumentSourceType }): Promise uploadOne(params: { organizationId: string projectId: string diff --git a/apps/web/src/studio/features/documents/documents.thunks.ts b/apps/web/src/studio/features/documents/documents.thunks.ts index 1804537d..0c7668f6 100644 --- a/apps/web/src/studio/features/documents/documents.thunks.ts +++ b/apps/web/src/studio/features/documents/documents.thunks.ts @@ -19,7 +19,9 @@ export const listDocuments = createAsyncThunk( state, wantedIds: ["organizationId", "projectId"], }) - return await services.documents.getAll({ organizationId, projectId }) + const sourceType = state.studio.documents.currentSourceType + if (!sourceType) throw new 
Error("sourceType is required to list documents") + return await services.documents.getAll({ organizationId, projectId, sourceType }) }, ) diff --git a/apps/web/src/studio/features/documents/external/documents.api.ts b/apps/web/src/studio/features/documents/external/documents.api.ts index cf50cf92..dc321a02 100644 --- a/apps/web/src/studio/features/documents/external/documents.api.ts +++ b/apps/web/src/studio/features/documents/external/documents.api.ts @@ -9,10 +9,10 @@ import type { IDocumentsSpi } from "../documents.spi" import { streamDocumentCrawlProgress, streamDocumentEmbeddingStatus } from "./documents-streaming" export default { - getAll: async ({ organizationId, projectId }) => { + getAll: async ({ organizationId, projectId, sourceType }) => { const axios = getAxiosInstance() const response = await axios.get( - DocumentsRoutes.getAll.getPath({ organizationId, projectId }), + DocumentsRoutes.getAll.getPath({ organizationId, projectId, sourceType }), ) return response.data.data.map(toDocument) }, diff --git a/packages/api-contracts/src/documents/documents.routes.ts b/packages/api-contracts/src/documents/documents.routes.ts index b60e5c1f..0ef1d6c5 100644 --- a/packages/api-contracts/src/documents/documents.routes.ts +++ b/packages/api-contracts/src/documents/documents.routes.ts @@ -35,7 +35,7 @@ export const DocumentsRoutes = { }), getAll: defineRoute>({ method: "get", - path: "organizations/:organizationId/projects/:projectId/documents", + path: "organizations/:organizationId/projects/:projectId/documents/:sourceType", }), getTemporaryUrl: defineRoute>({ method: "get", From 441436154ec11c049d66421c00d7d7da66db58cd Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 11:26:04 +0200 Subject: [PATCH 46/58] fix(front): add sourceType filter to queries to fetch sources correctly (either web sources or documents) --- apps/web/src/studio/routes/DocumentsRoute.tsx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index 5a903e55..9bc12641 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -83,7 +83,7 @@ import { DocumentTagItem } from "../features/document-tags/components/DocumentTa import { DocumentTagsSheet } from "../features/document-tags/components/DocumentTagsSheet" export function DocumentsRoute({ sourceFilter }: { sourceFilter?: "project" | "webCrawl" }) { - useDocumentEmbeddingStatusStream() + useDocumentEmbeddingStatusStream(sourceFilter) const documents = useAppSelector(selectDocumentsData) const documentTags = useAppSelector(selectDocumentTagsData) return ( @@ -588,17 +588,19 @@ function MetaField({ label, value }: { label: string; value?: string }) { ) } -function useDocumentEmbeddingStatusStream() { +function useDocumentEmbeddingStatusStream(sourceFilter?: "project" | "webCrawl") { const dispatch = useAppDispatch() useEffect(() => { + dispatch(documentsActions.setCurrentSourceType({ sourceType: sourceFilter ?? 
null })) dispatch(documentsActions.startEmbeddingStatusStream()) dispatch(documentsActions.startCrawlProgressStream()) return () => { + dispatch(documentsActions.setCurrentSourceType({ sourceType: null })) dispatch(documentsActions.stopEmbeddingStatusStream()) dispatch(documentsActions.stopCrawlProgressStream()) } - }, [dispatch]) + }, [dispatch, sourceFilter]) } function UploaderStateComp() { From fcdc266c5bf5c07e5aa5cbccdd8ceee18e0c2646 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 11:46:34 +0200 Subject: [PATCH 47/58] fix(feature flag): revert feature flag changes --- apps/web/src/common/components/RestrictedFeature.tsx | 3 +-- apps/web/src/common/hooks/use-feature-flags.ts | 10 +++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/apps/web/src/common/components/RestrictedFeature.tsx b/apps/web/src/common/components/RestrictedFeature.tsx index 0377ff10..02bda9fb 100644 --- a/apps/web/src/common/components/RestrictedFeature.tsx +++ b/apps/web/src/common/components/RestrictedFeature.tsx @@ -8,8 +8,7 @@ export function RestrictedFeature({ feature: FeatureFlagKey children: React.ReactNode }) { - const { hasFeature, isLoading } = useFeatureFlags() - if (isLoading) return null + const { hasFeature } = useFeatureFlags() if (!hasFeature(feature)) return null return <>{children} } diff --git a/apps/web/src/common/hooks/use-feature-flags.ts b/apps/web/src/common/hooks/use-feature-flags.ts index d87066ec..c23d8165 100644 --- a/apps/web/src/common/hooks/use-feature-flags.ts +++ b/apps/web/src/common/hooks/use-feature-flags.ts @@ -13,16 +13,12 @@ export function useFeatureFlags(project?: Project) { const p = useAppSelector(selectCurrentProjectData) if (project) { return { - hasFeature: (feature: FeatureFlagKey): boolean => check(project.featureFlags || [], feature), - isLoading: false, + hasFeature: (feature: FeatureFlagKey): boolean => check(project.featureFlags || [], feature) } } else { - if (!ADS.isFulfilled(p)) { - return { 
hasFeature: () => false, isLoading: ADS.isLoading(p) || ADS.isUninitialized(p) } - } + if (!ADS.isFulfilled(p)) return { hasFeature: () => false } return { - hasFeature: (feature: FeatureFlagKey): boolean => check(p.value.featureFlags || [], feature), - isLoading: false, + hasFeature: (feature: FeatureFlagKey): boolean => check(p.value.featureFlags || [], feature) } } } From 7cecac2f60dca5b2e57c32737e5436054e196c00 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 11:59:31 +0200 Subject: [PATCH 48/58] fix: adding enum on retrieveChunkSchema documentSourceType --- .../streaming/tools/retrieve-project-document-chunks.tool.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/domains/agents/shared/agent-session-messages/streaming/tools/retrieve-project-document-chunks.tool.ts b/apps/api/src/domains/agents/shared/agent-session-messages/streaming/tools/retrieve-project-document-chunks.tool.ts index 4ec2981d..a693c39d 100644 --- a/apps/api/src/domains/agents/shared/agent-session-messages/streaming/tools/retrieve-project-document-chunks.tool.ts +++ b/apps/api/src/domains/agents/shared/agent-session-messages/streaming/tools/retrieve-project-document-chunks.tool.ts @@ -30,7 +30,7 @@ const retrievedChunkSchema = z.object({ documentId: z.string(), documentTitle: z.string(), documentFileName: z.string().nullable(), - documentSourceType: z.string(), + documentSourceType: z.enum(["project", "webCrawl"]), chunkIndex: z.number().int(), content: z.string(), distance: z.number(), From 5e75817009dd188be4d3dea6fae713feb299dbb0 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 12:15:42 +0200 Subject: [PATCH 49/58] refactor: switch casing on sourceType - embedding services --- .../document-embeddings-processor.service.ts | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts 
b/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts index 00fb9d79..4dc45342 100644 --- a/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts +++ b/apps/api/src/domains/documents/embeddings/document-embeddings-processor.service.ts @@ -93,19 +93,19 @@ export class DocumentEmbeddingsProcessorService { chunks: string[] extractionEngine: DocumentExtractionEngine }> { - if (document.content && !document.storageRelativePath) { - const chunks = this.splitWebCrawlContent(document.content) - this.logger.log(`Split document ${document.id} (from content) into ${chunks.length} chunks`) - return { chunks, extractionEngine: "web-crawl" } - } - - const fileBuffer = await this.fileStorage.readFile(document.storageRelativePath) - const extractionResult = await this.textExtractorService.extract(fileBuffer, document.mimeType) - const chunks = extractionResult.chunks ?? this.splitTextForEmbeddings(extractionResult.text) - this.logger.log(`Split document ${document.id} into ${chunks.length} chunks`) - return { - chunks, - extractionEngine: extractionResult.extractionEngine, + switch (document.sourceType) { + case "webCrawl": { + const chunks = this.splitWebCrawlContent(document.content ?? "") + this.logger.log(`Split document ${document.id} (from content) into ${chunks.length} chunks`) + return { chunks, extractionEngine: "web-crawl" } + } + default: { + const fileBuffer = await this.fileStorage.readFile(document.storageRelativePath) + const extractionResult = await this.textExtractorService.extract(fileBuffer, document.mimeType) + const chunks = extractionResult.chunks ?? 
this.splitTextForEmbeddings(extractionResult.text) + this.logger.log(`Split document ${document.id} into ${chunks.length} chunks`) + return { chunks, extractionEngine: extractionResult.extractionEngine } + } } } From d68fd54af943504b90160633cf69d23c85a7a532 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 12:27:24 +0200 Subject: [PATCH 50/58] refactor: remove unnecessary ?? condition on document.sourceUrl since every web source is now synced with the new URL system --- .../domains/documents/documents.controller.ts | 30 ++----------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index fca2877c..90e6fe07 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -433,15 +433,14 @@ export class DocumentsController { throw new UnprocessableEntityException("Document is not a web crawl source.") } - const urlToRecrawl = - document.sourceUrl ?? resolveSourceUrlFallback(document.title, document.content) - - if (!urlToRecrawl) { + if (!document.sourceUrl) { throw new UnprocessableEntityException( "Source URL not available for this document. Please delete it and crawl the website again.", ) } + const urlToRecrawl = document.sourceUrl + const connectScope = getRequiredConnectScope(req) const reset = await this.documentsService.resetForRecrawl({ @@ -507,29 +506,6 @@ export class DocumentsController { } } -function resolveSourceUrlFallback(title: string, content: string | null): string | null { - // 1. Title may be the original URL if the document was never renamed. - try { - new URL(title) - return title - } catch { - // title is an alias, not a URL - } - // 2. Extract the shortest URL from crawled content — typically the root entry point. 
- if (content) { - try { - const pages: { url?: string }[] = JSON.parse(content) - const urls = pages.map((page) => page.url).filter((url): url is string => Boolean(url)) - if (urls.length > 0) { - urls.sort((a, b) => a.length - b.length) - return urls[0] ?? null - } - } catch { - // malformed content - } - } - return null -} function toDocumentDto(entity: Document): DocumentDto { return { From 702f7a5b4ad474fe4d07c6240e0a053eaa1f1ba7 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 12:42:31 +0200 Subject: [PATCH 51/58] refactor: extract a named const instead of a nested condition --- apps/api/src/domains/documents/documents.service.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 3948bcf6..0f0255da 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -141,11 +141,12 @@ export class DocumentsService { throw new NotFoundException(`Document with id ${documentId} not found`) } - if ( + const shouldBackfillSourceUrl = fieldsToUpdate.title !== undefined && document.sourceType === "webCrawl" && document.sourceUrl === null - ) { + + if (shouldBackfillSourceUrl) { try { new URL(document.title) document.sourceUrl = document.title From 5e3da2950c4e2ad82ad241d1ad5bc124e717dad0 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 12:49:09 +0200 Subject: [PATCH 52/58] refactor: add a DTO field for the crawled pages instead of parsing the content in the frontend directly --- .../domains/documents/documents.controller.ts | 19 ++++++++++++++++++- .../documents/external/documents.api.ts | 1 + apps/web/src/studio/routes/DocumentsRoute.tsx | 15 +--------------- .../src/documents/documents.dto.ts | 1 + 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts 
index 90e6fe07..53e72747 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -507,12 +507,29 @@ export class DocumentsController { } +function parseCrawledPages( + content: string | null, +): { url: string; markdown: string }[] | undefined { + if (!content) return undefined + try { + const parsed: unknown = JSON.parse(content) + if (Array.isArray(parsed) && parsed.length > 0 && parsed[0].url && parsed[0].markdown) { + return parsed as { url: string; markdown: string }[] + } + } catch { + // malformed content + } + return undefined +} + function toDocumentDto(entity: Document): DocumentDto { + const isWebCrawl = entity.sourceType === "webCrawl" return { id: entity.id, projectId: entity.projectId, title: entity.title, - content: entity.content, + content: isWebCrawl ? undefined : entity.content, + pages: isWebCrawl ? parseCrawledPages(entity.content) : undefined, fileName: entity.fileName, createdAt: entity.createdAt.getTime(), updatedAt: entity.updatedAt.getTime(), diff --git a/apps/web/src/studio/features/documents/external/documents.api.ts b/apps/web/src/studio/features/documents/external/documents.api.ts index dc321a02..d6840dc0 100644 --- a/apps/web/src/studio/features/documents/external/documents.api.ts +++ b/apps/web/src/studio/features/documents/external/documents.api.ts @@ -162,6 +162,7 @@ export default { function toDocument(dto: DocumentDto): Document { return { content: dto.content, + pages: dto.pages, createdAt: dto.createdAt, deletedAt: dto.deletedAt, fileName: dto.fileName, diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index 9bc12641..fe2564e2 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -181,19 +181,6 @@ function WithData({ ) } -function parseCrawledPages(content?: string): { url: string; markdown: string }[] | null { - if (!content) return null 
- try { - const parsed = JSON.parse(content) - if (Array.isArray(parsed) && parsed.length > 0 && parsed[0].url && parsed[0].markdown) { - return parsed - } - } catch { - // not JSON, not a crawl document - } - return null -} - function DocumentRow({ document, documentTags, @@ -205,7 +192,7 @@ function DocumentRow({ }) { const date = buildSince(document.updatedAt) const isWebCrawl = document.sourceType === "webCrawl" - const crawledPages = isWebCrawl ? parseCrawledPages(document.content) : null + const crawledPages = document.pages ?? null const hasPages = crawledPages && crawledPages.length > 0 const pagesCrawled = useAppSelector(selectCrawlProgressByDocumentId)[document.id] diff --git a/packages/api-contracts/src/documents/documents.dto.ts b/packages/api-contracts/src/documents/documents.dto.ts index c54f9370..54241a69 100644 --- a/packages/api-contracts/src/documents/documents.dto.ts +++ b/packages/api-contracts/src/documents/documents.dto.ts @@ -59,6 +59,7 @@ export type DocumentDto = { deletedAt?: TimeType title: string content?: string + pages?: { url: string; markdown: string }[] fileName?: string language: "en" | "fr" mimeType?: MimeTypes From 41496f7e51b25428d5527499a9984285e0eedca3 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 13:00:44 +0200 Subject: [PATCH 53/58] refactor: use Form in websources form --- .../documents/components/CrawlUrlButton.tsx | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx index 8d312706..d7c12ee8 100644 --- a/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx +++ b/apps/web/src/studio/features/documents/components/CrawlUrlButton.tsx @@ -9,12 +9,22 @@ import { } from "@caseai-connect/ui/shad/dialog" import { Field, FieldGroup, FieldLabel } from "@caseai-connect/ui/shad/field" import { Input } from 
"@caseai-connect/ui/shad/input" +import { zodResolver } from "@hookform/resolvers/zod" import { GlobeIcon, Loader2Icon } from "lucide-react" import { useState } from "react" +import { useForm } from "react-hook-form" import { useTranslation } from "react-i18next" +import { z } from "zod" import { useAppDispatch } from "@/common/store/hooks" import { crawlUrl } from "../documents.thunks" +const crawlUrlSchema = z.object({ + url: z.string().url(), + name: z.string(), +}) + +type CrawlUrlFormData = z.infer + export function CrawlUrlButton() { const [open, setOpen] = useState(false) @@ -36,34 +46,23 @@ export function CrawlUrlButton() { function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { const dispatch = useAppDispatch() const { t } = useTranslation("document") - const [url, setUrl] = useState("") - const [name, setName] = useState("") - const [isSubmitting, setIsSubmitting] = useState(false) - - const isValidUrl = (() => { - try { - new URL(url) - return true - } catch { - return false - } - })() - const handleSubmit = async (event: React.FormEvent) => { - event.preventDefault() - if (!isValidUrl || isSubmitting) return + const { + register, + handleSubmit, + formState: { errors, isSubmitting }, + } = useForm({ + resolver: zodResolver(crawlUrlSchema), + defaultValues: { url: "", name: "" }, + }) - setIsSubmitting(true) - try { - await dispatch(crawlUrl({ url, name: name.trim() || undefined })).unwrap() - onSuccess() - } finally { - setIsSubmitting(false) - } + const onSubmit = async (data: CrawlUrlFormData) => { + await dispatch(crawlUrl({ url: data.url, name: data.name.trim() || undefined })).unwrap() + onSuccess() } return ( -
+ {t("document:crawl.title")} {t("document:crawl.description")} @@ -76,10 +75,10 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { id="crawl-url" type="url" placeholder="https://example.com" - value={url} - onChange={(event) => setUrl(event.target.value)} - required + {...register("url")} + aria-invalid={errors.url ? "true" : "false"} /> + {errors.url &&

{errors.url.message}

} {t("document:crawl.nameLabel")} @@ -87,13 +86,12 @@ function CrawlUrlForm({ onSuccess }: { onSuccess: () => void }) { id="crawl-name" type="text" placeholder={t("document:crawl.namePlaceholder")} - value={name} - onChange={(event) => setName(event.target.value)} + {...register("name")} />
- From 1af498880cf04dfe5f556520b805f22fa4a6f95e Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 13:13:19 +0200 Subject: [PATCH 54/58] refactor(documents services): remove unnecessary error handling --- .../api/src/domains/documents/documents.service.ts | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index 0f0255da..b9e5a5d3 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -141,20 +141,6 @@ export class DocumentsService { throw new NotFoundException(`Document with id ${documentId} not found`) } - const shouldBackfillSourceUrl = - fieldsToUpdate.title !== undefined && - document.sourceType === "webCrawl" && - document.sourceUrl === null - - if (shouldBackfillSourceUrl) { - try { - new URL(document.title) - document.sourceUrl = document.title - } catch { - // title is not a URL (already an alias) — nothing to backfill - } - } - if (fieldsToUpdate.title !== undefined) { document.title = fieldsToUpdate.title } From 9deda139866cd6e69c47e020e19f25c3f4f3dfbb Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 13:15:25 +0200 Subject: [PATCH 55/58] refactor(source tool): updating source tool for web sources --- .../shared/agent-session-messages/components/SourcesTool.tsx | 2 +- apps/web/src/common/features/agents/locales/agent.en.json | 4 ++-- apps/web/src/common/features/agents/locales/agent.fr.json | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/web/src/common/features/agents/agent-sessions/shared/agent-session-messages/components/SourcesTool.tsx b/apps/web/src/common/features/agents/agent-sessions/shared/agent-session-messages/components/SourcesTool.tsx index 61dde6e6..b0f46a77 100644 --- a/apps/web/src/common/features/agents/agent-sessions/shared/agent-session-messages/components/SourcesTool.tsx +++ 
b/apps/web/src/common/features/agents/agent-sessions/shared/agent-session-messages/components/SourcesTool.tsx @@ -28,7 +28,7 @@ export function SourcesTool({ diff --git a/apps/web/src/common/features/agents/locales/agent.en.json b/apps/web/src/common/features/agents/locales/agent.en.json index 29e1c86d..5fffc188 100644 --- a/apps/web/src/common/features/agents/locales/agent.en.json +++ b/apps/web/src/common/features/agents/locales/agent.en.json @@ -98,8 +98,8 @@ "form": "Form", "sources": "Sources" }, - "source_one": "Source", - "source_other": "Sources", + "source_one": "{{count}} source", + "source_other": "{{count}} sources", "source_zero": "No sources" } } diff --git a/apps/web/src/common/features/agents/locales/agent.fr.json b/apps/web/src/common/features/agents/locales/agent.fr.json index 3807303b..059a3e76 100644 --- a/apps/web/src/common/features/agents/locales/agent.fr.json +++ b/apps/web/src/common/features/agents/locales/agent.fr.json @@ -98,8 +98,8 @@ "form": "Formulaire", "sources": "Sources" }, - "source_one": "Source", - "source_other": "Sources", + "source_one": "{{count}} source", + "source_other": "{{count}} sources", "source_zero": "Aucune source" } } From 46329792fe7e481ce28fb45c1b8e0cb01029f7e5 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 13:24:49 +0200 Subject: [PATCH 56/58] refactor(documents): one-shot resetForRecrawl with direct UPDATE --- .../domains/documents/documents.controller.ts | 14 +++++++------- .../src/domains/documents/documents.service.ts | 17 ++++++++--------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 53e72747..16d5a943 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -443,18 +443,18 @@ export class DocumentsController { const connectScope = getRequiredConnectScope(req) - const reset = 
await this.documentsService.resetForRecrawl({ + await this.documentsService.resetForRecrawl({ connectScope, documentId: document.id, }) await this.documentEmbeddingStatusNotifierService.notifyEmbeddingStatusChanged({ - documentId: reset.id, - organizationId: reset.organizationId, - projectId: reset.projectId, - embeddingStatus: reset.embeddingStatus, - embeddingError: reset.embeddingError, - updatedAt: reset.updatedAt.getTime(), + documentId: document.id, + organizationId: document.organizationId, + projectId: document.projectId, + embeddingStatus: "pending", + embeddingError: null, + updatedAt: Date.now(), }) await this.urlCrawlingBatchService.enqueueCrawlUrl({ diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index b9e5a5d3..c31a5e2e 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -1,6 +1,6 @@ import { Injectable, NotFoundException } from "@nestjs/common" import { InjectRepository } from "@nestjs/typeorm" -import type { Repository } from "typeorm" +import type { Repository, UpdateResult } from "typeorm" import { ConnectRepository } from "@/common/entities/connect-repository" import type { RequiredConnectScope } from "@/common/entities/connect-required-fields" import { Document } from "./document.entity" @@ -11,7 +11,7 @@ import type { DocumentTagsUpdateFields } from "./tags/document-tags.types" @Injectable() export class DocumentsService { constructor( - @InjectRepository(Document) documentRepository: Repository, + @InjectRepository(Document) private readonly documentRepository: Repository, private readonly documentTagsService: DocumentTagsService, ) { this.documentConnectRepository = new ConnectRepository(documentRepository, "documents") @@ -203,15 +203,14 @@ export class DocumentsService { }: { connectScope: RequiredConnectScope documentId: string - }): Promise { - const document = await 
this.documentConnectRepository.getOneById(connectScope, documentId) - if (!document) { + }): Promise { + const result: UpdateResult = await this.documentRepository.update( + { id: documentId, organizationId: connectScope.organizationId, projectId: connectScope.projectId }, + { content: null as unknown as string, embeddingStatus: "pending", embeddingError: null }, + ) + if (!result.affected) { throw new NotFoundException(`Document with id ${documentId} not found`) } - document.content = null as unknown as string - document.embeddingStatus = "pending" - document.embeddingError = null - return this.documentConnectRepository.saveOne(document) } async deleteDocument({ From 93be239430865350005e0ee81d00e2c6cd704665 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: Tue, 12 May 2026 13:31:05 +0200 Subject: [PATCH 57/58] refactor(documents): replace fetch-then-save with direct UPDATE for markAsUploaded, updateContent, updateEmbeddingStatus --- .../url-crawling-processor.service.ts | 14 ++++---- .../domains/documents/documents.controller.ts | 9 +++-- .../domains/documents/documents.service.ts | 34 ++++++++++--------- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts index 69a547c2..64ec3008 100644 --- a/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts +++ b/apps/api/src/domains/documents/crawling/url-crawling-processor.service.ts @@ -87,18 +87,18 @@ export class UrlCrawlingProcessorService { } catch (error) { this.logger.error(`Crawl failed for ${payload.url}: ${(error as Error).message}`) try { - const failed = await this.documentsService.updateEmbeddingStatus({ + await this.documentsService.updateEmbeddingStatus({ connectScope, documentId: payload.documentId, status: "failed", }) await this.embeddingStatusNotifierService.notifyEmbeddingStatusChanged({ - documentId: failed.id, - 
organizationId: failed.organizationId, - projectId: failed.projectId, - embeddingStatus: failed.embeddingStatus, - embeddingError: failed.embeddingError, - updatedAt: failed.updatedAt.getTime(), + documentId: payload.documentId, + organizationId: payload.organizationId, + projectId: payload.projectId, + embeddingStatus: "failed", + embeddingError: null, + updatedAt: Date.now(), }) } catch (notifyError) { this.logger.error( diff --git a/apps/api/src/domains/documents/documents.controller.ts b/apps/api/src/domains/documents/documents.controller.ts index 16d5a943..3c1e5383 100644 --- a/apps/api/src/domains/documents/documents.controller.ts +++ b/apps/api/src/domains/documents/documents.controller.ts @@ -252,14 +252,19 @@ export class DocumentsController { payload.tagIds !== undefined && payload.tagIds.length > 0 ? payload.tagIds : undefined for (const documentId of payload.documentIds) { - let document = await this.documentsService.markAsUploaded({ connectScope, documentId }) + await this.documentsService.markAsUploaded({ connectScope, documentId }) + let document: Document if (tagIds !== undefined) { document = await this.documentsService.updateDocument({ connectScope, - documentId: document.id, + documentId, fieldsToUpdate: { tagsToAdd: tagIds }, }) + } else { + const found = await this.documentsService.findById({ connectScope, documentId }) + if (!found) throw new NotFoundException(`Document ${documentId} not found`) + document = found } if (document.sourceType === "project") { diff --git a/apps/api/src/domains/documents/documents.service.ts b/apps/api/src/domains/documents/documents.service.ts index c31a5e2e..6d44d65e 100644 --- a/apps/api/src/domains/documents/documents.service.ts +++ b/apps/api/src/domains/documents/documents.service.ts @@ -66,13 +66,14 @@ export class DocumentsService { }: { connectScope: RequiredConnectScope documentId: string - }): Promise { - const document = await this.documentConnectRepository.getOneById(connectScope, documentId) - if 
(!document) { + }): Promise { + const result: UpdateResult = await this.documentRepository.update( + { id: documentId, organizationId: connectScope.organizationId, projectId: connectScope.projectId }, + { uploadStatus: "uploaded" }, + ) + if (!result.affected) { throw new NotFoundException(`Document with id ${documentId} not found`) } - document.uploadStatus = "uploaded" - return this.documentConnectRepository.saveOne(document) } private sortNewestFirst = (a: Document, b: Document) => @@ -166,14 +167,14 @@ export class DocumentsService { documentId: string content: string size: number - }): Promise { - const document = await this.documentConnectRepository.getOneById(connectScope, documentId) - if (!document) { + }): Promise { + const result: UpdateResult = await this.documentRepository.update( + { id: documentId, organizationId: connectScope.organizationId, projectId: connectScope.projectId }, + { content, size }, + ) + if (!result.affected) { throw new NotFoundException(`Document with id ${documentId} not found`) } - document.content = content - document.size = size - return this.documentConnectRepository.saveOne(document) } async saveOne(document: Document): Promise { @@ -188,13 +189,14 @@ export class DocumentsService { connectScope: RequiredConnectScope documentId: string status: Document["embeddingStatus"] - }): Promise { - const document = await this.documentConnectRepository.getOneById(connectScope, documentId) - if (!document) { + }): Promise { + const result: UpdateResult = await this.documentRepository.update( + { id: documentId, organizationId: connectScope.organizationId, projectId: connectScope.projectId }, + { embeddingStatus: status }, + ) + if (!result.affected) { throw new NotFoundException(`Document with id ${documentId} not found`) } - document.embeddingStatus = status - return this.documentConnectRepository.saveOne(document) } async resetForRecrawl({ From 13502e6ff024096d77fa88a5c34c30e9507985d4 Mon Sep 17 00:00:00 2001 From: Thomas Jego Date: 
Tue, 12 May 2026 13:36:21 +0200 Subject: [PATCH 58/58] refactor(web): remove crawledPages intermediate, use document.pages directly + changelog update --- CHANGELOG.md | 2 +- apps/web/src/studio/routes/DocumentsRoute.tsx | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff85c099..5415abeb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ This project uses [CalVer](https://calver.org/) (YY.MM.Micro) for product versio ## [Unreleased] ### Added -- (beta) Documents sidebar entry replaced by a Sources dropdown with separate Documents and Websites sections +- (beta) Web sources crawling - (beta) Re-crawl a website from its action menu to refresh content and re-index all pages ### Changed diff --git a/apps/web/src/studio/routes/DocumentsRoute.tsx b/apps/web/src/studio/routes/DocumentsRoute.tsx index fe2564e2..9d924628 100644 --- a/apps/web/src/studio/routes/DocumentsRoute.tsx +++ b/apps/web/src/studio/routes/DocumentsRoute.tsx @@ -192,8 +192,7 @@ function DocumentRow({ }) { const date = buildSince(document.updatedAt) const isWebCrawl = document.sourceType === "webCrawl" - const crawledPages = document.pages ?? null - const hasPages = crawledPages && crawledPages.length > 0 + const hasPages = document.pages && document.pages.length > 0 const pagesCrawled = useAppSelector(selectCrawlProgressByDocumentId)[document.id] const [isOpen, setIsOpen] = useState(false) @@ -224,7 +223,7 @@ function DocumentRow({ {showPages && ( - {hasPages ? crawledPages.length : "—"} + {hasPages ? document.pages!.length : "—"} )} @@ -248,8 +247,8 @@ function DocumentRow({ - {crawledPages && isOpen - ? crawledPages.map((page) => ( + {document.pages && isOpen + ? document.pages.map((page) => (