Changes from all commits (58 commits)
d04f9b7
feat(documents): add web crawl source type and crawl URL contract
Mascode-Dev Apr 16, 2026
ac36bc2
feat(documents): add website crawl feature to frontend
Mascode-Dev Apr 16, 2026
ef30d3e
feat(api): add Spider integration for web crawling
Mascode-Dev Apr 16, 2026
9bb84d4
feat(documents): add crawling domain logic
Mascode-Dev Apr 16, 2026
1d52255
feat(documents): wire crawling into documents service and
Mascode-Dev Apr 16, 2026
7603ff0
chore(api): register crawling module and add dependencies
Mascode-Dev Apr 16, 2026
02a119b
fix(documents): update document service and chunk retrieval for crawl…
Mascode-Dev Apr 16, 2026
d923591
feat(documents): ensure chunks never span across crawled pages
Mascode-Dev Apr 17, 2026
e569a72
fix(spider): flatten nested array response from Spider API
Mascode-Dev Apr 17, 2026
6479901
feat(documents): add crawled sub-pages dropdown in documents list
Mascode-Dev Apr 17, 2026
9f7fe95
feat(documents): expose sourceType in document API response
Mascode-Dev Apr 17, 2026
36a1bc4
feat(documents): add sourceType to DocumentDto
Mascode-Dev Apr 17, 2026
99c0324
feat(documents): create crawl document eagerly so it appears
Mascode-Dev Apr 17, 2026
f05f4cf
feat(documents): show globe icon immediately and refetch on crawl com…
Mascode-Dev Apr 17, 2026
9155a45
feat(documents): add dedicated web-source-embeddings queue for crawle…
Mascode-Dev Apr 20, 2026
ff44327
chore(api): register web-source-embeddings worker module
Mascode-Dev Apr 20, 2026
dd1cb6d
feat(agents): include web-crawled sources in agent responses
Mascode-Dev Apr 21, 2026
2a10e8d
feat(web): show document title and source type icon in sources popover
Mascode-Dev Apr 21, 2026
5d29223
feat(documents): return documentSourceType in retrieved chunks
Mascode-Dev Apr 21, 2026
e45415b
feat(documents): crawl full websites and surface crawl failures
Mascode-Dev Apr 23, 2026
69c821d
feat(documents): accept crawl requests without a page limit
Mascode-Dev Apr 23, 2026
53fe5b0
feat(spider): unlimited full-site crawl
Mascode-Dev Apr 23, 2026
b4a9fd1
feat(web): full-site crawl with dedicated crawling state
Mascode-Dev Apr 23, 2026
af8abae
feat(documents): drop limit from crawl contract and pass sourceType t…
Mascode-Dev Apr 23, 2026
40cd02a
feat(documents): stream live crawl progress via SSE
Mascode-Dev Apr 23, 2026
4d15f3a
feat(web): live crawl progress badge
Mascode-Dev Apr 23, 2026
abc2184
feat(api-contracts): add document_crawl_progress_changed channel
Mascode-Dev Apr 23, 2026
482bf98
feat(documents): stream Spider pages + SSE endpoint
Mascode-Dev Apr 23, 2026
efdd549
fix(web): open crawl progress stream on mount
Mascode-Dev Apr 23, 2026
1d6b53e
fix(web): truncate crawled sub-page URLs
Mascode-Dev Apr 23, 2026
e3c968a
fix(web): keep feature-flag gate mounted while projects load
Mascode-Dev Apr 23, 2026
5ca0337
feat(documents): gate 'Explorer un site web' behind web_sources featu…
Mascode-Dev Apr 27, 2026
4134e55
feat(documents): add source-type filter tabs to separate uploaded doc…
Mascode-Dev Apr 27, 2026
5051982
feat(documents): add optional name field to crawl URL and file upload…
Mascode-Dev Apr 27, 2026
13b567c
feat(web): replace documents nav item with collapsible sources dropdown
Mascode-Dev Apr 29, 2026
665edbd
docs: changelog update
Mascode-Dev Apr 29, 2026
7270a09
feat(api): add source_url column to document and forward embeddingErr…
Mascode-Dev Apr 29, 2026
b305dc5
chore(deps): bump bull-board to 6.21.3 and express-openid-connect to …
Mascode-Dev Apr 29, 2026
50b9c65
feat(web): delete page number column for uploaded documents
Mascode-Dev Apr 29, 2026
8d3bf79
fix(crawling): fix the re-crawl feature
Mascode-Dev Apr 29, 2026
72ba3ea
feat(web): add recrawl action for web source documents
Mascode-Dev Apr 29, 2026
1b8112f
fix(api): recrawl fails for renamed web sources without stored sourceUrl
Mascode-Dev May 5, 2026
dcb4e0f
fix: package-lock.json push for workers smoke gh action error
Mascode-Dev May 7, 2026
111abc8
test(api): add e2e coverage for crawling endpoints
Mascode-Dev May 7, 2026
7ddda1f
feat: adding a sourceType parameter to add listDocuments and getAll c…
Mascode-Dev May 12, 2026
4414361
fix(front): add sourceType filter to queries to fetch sources correct…
Mascode-Dev May 12, 2026
fcdc266
fix(feature flag): revert feature flag changes
Mascode-Dev May 12, 2026
7cecac2
fix: adding enum on retrieveChunkSchema documentSourceType
Mascode-Dev May 12, 2026
5e75817
refactor: switch casing on sourceType - embedding services
Mascode-Dev May 12, 2026
d68fd54
refactor: remove unecessary ?? condition on document.sourceUrl since …
Mascode-Dev May 12, 2026
702f7a5
refactor: export const instead of imbricated condition
Mascode-Dev May 12, 2026
5e3da29
refactor: adding DTO field for page count, instead of parsing it in f…
Mascode-Dev May 12, 2026
41496f7
refactor: use Form in websources form
Mascode-Dev May 12, 2026
1af4988
refactor(documents services): remove unnecessary error handling
Mascode-Dev May 12, 2026
9deda13
refactor(source tool): updating source tool for web sources
Mascode-Dev May 12, 2026
4632979
refactor(documents): one-shot resetForRecrawl with direct UPDATE
Mascode-Dev May 12, 2026
93be239
refactor(documents): replace fetch-then-save with direct UPDATE for m…
Mascode-Dev May 12, 2026
13502e6
refactor(web): remove crawledPages intermediate, use document.pages d…
Mascode-Dev May 12, 2026
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -8,10 +8,13 @@ This project uses [CalVer](https://calver.org/) (YY.MM.Micro) for product versio
## [Unreleased]

### Added
- (beta) Web sources crawling
- (beta) Re-crawl a website from its action menu to refresh content and re-index all pages

### Changed

### Fixed
- (beta) Re-crawl now works correctly for renamed web sources

### Security

11 changes: 6 additions & 5 deletions apps/api/package.json
@@ -43,10 +43,10 @@
"@ai-sdk/google": "^3.0.13",
"@ai-sdk/google-vertex": "^4.0.28",
"@ai-sdk/mcp": "^1.0.30",
-    "@bull-board/api": "^6.21.2",
-    "@bull-board/express": "^6.21.2",
-    "@bull-board/nestjs": "^6.21.2",
-    "@bull-board/ui": "^6.21.2",
+    "@bull-board/api": "^6.21.3",
+    "@bull-board/express": "^6.21.3",
+    "@bull-board/nestjs": "^6.21.3",
+    "@bull-board/ui": "^6.21.3",
"@caseai-connect/api-contracts": "*",
"@google-cloud/aiplatform": "^6.5.0",
"@google-cloud/opentelemetry-cloud-monitoring-exporter": "^0.21.0",
@@ -67,12 +67,13 @@
"@opentelemetry/sdk-metrics": "^2.6.1",
"@opentelemetry/sdk-node": "^0.214.0",
"@opentelemetry/sdk-trace-base": "^2.6.1",
"@spider-cloud/spider-client": "^0.2.0",
"ai": "^6.0.87",
"axios": "^1.12.2",
"bullmq": "^5.70.2",
"class-transformer": "^0.5.1",
"class-validator": "^0.14.3",
-    "express-openid-connect": "^2.20.1",
+    "express-openid-connect": "^2.20.2",
"jwks-rsa": "^3.2.0",
"langfuse": "^3.38.6",
"langfuse-core-v2": "npm:langfuse-core@3.16.2",
@@ -17,7 +17,7 @@ ${names
return `[${name}]: When the user asks about information that may exist in project documents, call the ${name} tool before answering. Use the returned chunks as primary context and avoid inventing facts not present in those chunks.`

case ToolName.Sources:
-      return `[${name}]: After using ${ToolName.RetrieveProjectDocumentChunks} tool, call the ${name} tool to provide the user with the sources of the information you used to answer their question. This will help build trust and allow the user to verify the information.`
+      return `[${name}]: You MUST call the ${name} tool whenever you use information from the ${ToolName.RetrieveProjectDocumentChunks} tool to answer the user, regardless of whether the chunks come from uploaded documents (documentSourceType="project") or crawled web pages (documentSourceType="webCrawl"). Include EVERY document whose chunks you actually used — do not omit web-crawled pages. For each source, copy the documentId, documentTitle, and documentSourceType verbatim from the retrieved chunks. Do NOT cite sources inline in your text response; the ${name} tool is the only way to show sources to the user.`

case ToolName.FillForm:
return `[${name}]: You can use the ${name} tool to fill out the form fields. Just fill out the information you have and ask the user for the missing information. You can also update previously filled information if the user changes their answer. Pass undefined for fields that are not filled yet.`
@@ -30,6 +30,7 @@ const retrievedChunkSchema = z.object({
documentId: z.string(),
documentTitle: z.string(),
documentFileName: z.string().nullable(),
documentSourceType: z.enum(["project", "webCrawl"]),
chunkIndex: z.number().int(),
content: z.string(),
distance: z.number(),
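For reference, the added `documentSourceType` field narrows every retrieved chunk to one of two source kinds. A minimal TypeScript sketch of a chunk satisfying the extended schema (field values are illustrative; zod is omitted so the sketch stands alone):

```typescript
// Sketch of the chunk shape after this change; values are illustrative.
type DocumentSourceType = "project" | "webCrawl"

function isDocumentSourceType(value: string): value is DocumentSourceType {
  return value === "project" || value === "webCrawl"
}

interface RetrievedChunk {
  documentId: string
  documentTitle: string
  documentFileName: string | null
  documentSourceType: DocumentSourceType
  chunkIndex: number
  content: string
  distance: number
}

// A crawled page has no underlying file, so documentFileName stays null.
const webChunk: RetrievedChunk = {
  documentId: "doc-1",
  documentTitle: "Pricing | example.com",
  documentFileName: null,
  documentSourceType: "webCrawl",
  chunkIndex: 0,
  content: "Example page content.",
  distance: 0.12,
}
```

The union type mirrors the `z.enum(["project", "webCrawl"])` constraint, so a downstream consumer can discriminate on `documentSourceType` without re-validating.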
@@ -14,6 +14,16 @@ export function sourcesTool({
sources: z.array(
z.object({
documentId: z.string().describe("The ID of the document to retrieve sources from."),
documentTitle: z
.string()
.optional()
.describe("The title of the source document (copy from retrieved chunks)."),
documentSourceType: z
.string()
.optional()
.describe(
"The source type of the document, e.g. 'project' for an uploaded file or 'webCrawl' for a crawled web page (copy from retrieved chunks).",
),
chunks: z
.array(
z.object({
@@ -0,0 +1,20 @@
import { InjectQueue } from "@nestjs/bullmq"
import { Injectable, Logger } from "@nestjs/common"
import type { Queue } from "bullmq"
import { URL_CRAWLING_JOB_NAME, URL_CRAWLING_QUEUE_NAME } from "./url-crawling.constants"
import type { CrawlUrlJobPayload } from "./url-crawling.types"

@Injectable()
export class BullMqUrlCrawlingBatchService {
private readonly logger = new Logger(BullMqUrlCrawlingBatchService.name)

constructor(
@InjectQueue(URL_CRAWLING_QUEUE_NAME)
private readonly urlCrawlingQueue: Queue<CrawlUrlJobPayload>,
) {}

async enqueueCrawlUrl(payload: CrawlUrlJobPayload): Promise<void> {
this.logger.log(`Enqueuing URL crawl job ${JSON.stringify(payload)}`)
await this.urlCrawlingQueue.add(URL_CRAWLING_JOB_NAME, payload)
}
}
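The service above follows the usual BullMQ pattern: a named job with a typed payload on a dedicated queue. A runnable sketch of that pattern with an in-memory stand-in for the queue, so the flow can be exercised without Redis (the exact fields of `CrawlUrlJobPayload` and the constant's value are assumptions; only the pattern comes from the service):

```typescript
// Assumed payload shape; the real CrawlUrlJobPayload lives in url-crawling.types.
type CrawlUrlJobPayload = {
  documentId: string
  organizationId: string
  projectId: string
  url: string
}

// Assumed constant value; the real one lives in url-crawling.constants.
const URL_CRAWLING_JOB_NAME = "url-crawling"

// In-memory stand-in exposing the subset of the BullMQ Queue API used above.
class InMemoryQueue<T> {
  readonly jobs: Array<{ name: string; data: T }> = []
  async add(name: string, data: T): Promise<void> {
    this.jobs.push({ name, data })
  }
}

async function enqueueCrawlUrl(
  queue: InMemoryQueue<CrawlUrlJobPayload>,
  payload: CrawlUrlJobPayload,
): Promise<void> {
  await queue.add(URL_CRAWLING_JOB_NAME, payload)
}
```

A worker registered for `URL_CRAWLING_QUEUE_NAME` would then pick each job up and run the actual Spider crawl, keeping the HTTP request path fast.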
@@ -0,0 +1,25 @@
import { InjectQueue } from "@nestjs/bullmq"
import { Injectable, Logger } from "@nestjs/common"
import type { Queue } from "bullmq"
import type { CreateDocumentEmbeddingsJobPayload } from "../embeddings/document-embeddings.types"
import {
WEB_SOURCE_EMBEDDINGS_JOB_NAME,
WEB_SOURCE_EMBEDDINGS_QUEUE_NAME,
} from "./web-source-embeddings.constants"

@Injectable()
export class BullMqWebSourceEmbeddingsBatchService {
private readonly logger = new Logger(BullMqWebSourceEmbeddingsBatchService.name)

constructor(
@InjectQueue(WEB_SOURCE_EMBEDDINGS_QUEUE_NAME)
private readonly webSourceEmbeddingsQueue: Queue<CreateDocumentEmbeddingsJobPayload>,
) {}

async enqueueCreateEmbeddingsForDocument(
payload: CreateDocumentEmbeddingsJobPayload,
): Promise<void> {
this.logger.log(`Enqueuing web source embeddings job ${JSON.stringify(payload)}`)
await this.webSourceEmbeddingsQueue.add(WEB_SOURCE_EMBEDDINGS_JOB_NAME, payload)
}
}
@@ -0,0 +1,29 @@
import { Injectable } from "@nestjs/common"
import { InjectDataSource } from "@nestjs/typeorm"
import type { DataSource } from "typeorm"
import { PostgresStatusNotifierService } from "@/common/sse/postgres-status-notifier.service"
import { DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL } from "./document-crawl-progress.constants"

@Injectable()
export class DocumentCrawlProgressNotifierService extends PostgresStatusNotifierService {
constructor(@InjectDataSource() dataSource: DataSource) {
super(dataSource, DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL)
}

async notifyCrawlProgress(params: {
documentId: string
organizationId: string
projectId: string
pagesCrawled: number
updatedAt: number
}): Promise<void> {
await this.notify({
type: DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL,
documentId: params.documentId,
organizationId: params.organizationId,
projectId: params.projectId,
pagesCrawled: params.pagesCrawled,
updatedAt: params.updatedAt,
})
}
}
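The notifier delegates to `PostgresStatusNotifierService`, which is not part of this diff; presumably it JSON-encodes the event and issues a Postgres `NOTIFY` on the channel. A sketch of what that serialization step likely looks like (the helper name and the use of `pg_notify` are assumptions, not code from this PR):

```typescript
// Hypothetical helper: build a parameterized pg_notify statement from an event.
// Using pg_notify($1, $2) avoids string-interpolating the payload into SQL.
function buildNotifySql(
  channel: string,
  event: Record<string, unknown>,
): { sql: string; params: unknown[] } {
  return { sql: "SELECT pg_notify($1, $2)", params: [channel, JSON.stringify(event)] }
}
```

Any API instance listening on the same channel can then relay the decoded payload to its connected SSE clients, which is what makes the progress stream work across multiple processes.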
@@ -0,0 +1,16 @@
import type { DocumentCrawlProgressChangedEventDto } from "@caseai-connect/api-contracts"
import { Injectable } from "@nestjs/common"
import { PostgresStatusStreamService } from "@/common/sse/postgres-status-stream.service"
import { DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL } from "./document-crawl-progress.constants"

@Injectable()
export class DocumentCrawlProgressStreamService extends PostgresStatusStreamService<DocumentCrawlProgressChangedEventDto> {
constructor() {
super({
channel: DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL,
expectedType: DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL,
serviceName: DocumentCrawlProgressStreamService.name,
isExpectedEvent: (payload) => payload.type === DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL,
})
}
}
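On the client, events from this stream can be parsed defensively before updating the progress badge. A sketch, with the event shape mirroring `DocumentCrawlProgressChangedEventDto` as used above (the SSE endpoint path in the usage comment is an assumption, since it is not shown in this hunk):

```typescript
// Mirrors the fields notified above; type doubles as the channel name.
interface CrawlProgressEvent {
  type: string
  documentId: string
  organizationId: string
  projectId: string
  pagesCrawled: number
  updatedAt: number
}

const CRAWL_PROGRESS_TYPE = "document_crawl_progress_changed"

// Returns null for foreign event types or malformed JSON instead of throwing.
function parseCrawlProgressEvent(data: string): CrawlProgressEvent | null {
  try {
    const payload = JSON.parse(data) as Partial<CrawlProgressEvent>
    if (payload.type !== CRAWL_PROGRESS_TYPE) return null
    if (typeof payload.documentId !== "string") return null
    if (typeof payload.pagesCrawled !== "number") return null
    return payload as CrawlProgressEvent
  } catch {
    return null
  }
}

// Usage (browser; endpoint path assumed):
// const source = new EventSource(`/organizations/${orgId}/projects/${projectId}/documents/crawl-progress`)
// source.onmessage = (e) => {
//   const evt = parseCrawlProgressEvent(e.data)
//   if (evt) updateBadge(evt.documentId, evt.pagesCrawled)
// }
```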
@@ -0,0 +1,3 @@
import { DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO } from "@caseai-connect/api-contracts"

export const DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL = DOCUMENT_CRAWL_PROGRESS_CHANGED_CHANNEL_DTO
182 changes: 182 additions & 0 deletions apps/api/src/domains/documents/crawling/e2e-tests/auth.spec.ts
@@ -0,0 +1,182 @@
import { randomUUID } from "node:crypto"
import { DocumentsRoutes } from "@caseai-connect/api-contracts"
import type { INestApplication } from "@nestjs/common"
import type { App } from "supertest/types"
import { AUTH_ERRORS } from "@/common/errors/auth-errors"
import {
type AllRepositories,
clearTestDatabase,
setupE2eTestDatabase,
teardownE2eTestDatabase,
} from "@/common/test/test-database"
import { removeNullish } from "@/common/utils/remove-nullish"
import { createOrganizationWithDocument } from "@/domains/organizations/organization.factory"
import { projectFactory } from "@/domains/projects/project.factory"
import { mockForeignAuth0Id } from "../../../../../test/e2e.helpers"
import { expectResponse, type Requester, testRequester } from "../../../../../test/request"
import { DocumentsModule } from "../../documents.module"
import { withCrawlingAndAuthMocks } from "../../test-overrides"

describe("Documents Crawling - Auth", () => {
let app: INestApplication<App>
let request: Requester
let setup: Awaited<ReturnType<typeof setupE2eTestDatabase>>
let repositories: AllRepositories

let organizationId: string | null = "random-organization-id"
let projectId: string | null = "random-project-id"
let documentId: string | null = "random-document-id"
let accessToken: string | null = "token"
let auth0Id = `auth0|${randomUUID()}`

beforeAll(async () => {
setup = await setupE2eTestDatabase({
additionalImports: [DocumentsModule],
applyOverrides: (moduleBuilder) => withCrawlingAndAuthMocks(moduleBuilder, () => auth0Id),
})
repositories = setup.getAllRepositories()
app = setup.module.createNestApplication()
await app.init()
request = testRequester(app)
})

beforeEach(async () => {
await clearTestDatabase(setup.dataSource)
organizationId = "random-organization-id"
projectId = "random-project-id"
documentId = "random-document-id"
accessToken = "token"
auth0Id = `auth0|${randomUUID()}`
})

afterAll(async () => {
await teardownE2eTestDatabase(setup)
await app.close()
})

const createContextForRole = async (role: "owner" | "admin" | "member" = "owner") => {
const { organization, project, document } = await createOrganizationWithDocument(repositories, {
user: { auth0Id },
projectMembership: { role },
document: { sourceType: "webCrawl", sourceUrl: "https://example.com" },
})
organizationId = organization.id
projectId = project.id
documentId = document.id
accessToken = "token"
return { organization, project, document }
}

describe("DocumentsRoutes.crawlUrl", () => {
const subject = async () =>
request({
route: DocumentsRoutes.crawlUrl,
pathParams: removeNullish({ organizationId, projectId }),
token: accessToken ?? undefined,
request: { payload: { url: "https://example.com" } },
})

it("requires an authentication token", async () => {
accessToken = null
expectResponse(await subject(), 401, AUTH_ERRORS.NO_ACCESS_TOKEN)
})
it("requires a valid organization ID", async () => {
organizationId = null
expectResponse(await subject(), 400, AUTH_ERRORS.NO_ORGANIZATION_ID)
})
it("requires a valid project ID", async () => {
await createContextForRole("owner")
projectId = randomUUID()
expectResponse(await subject(), 404)
})
it("requires the user to be a member of the organization", async () => {
await createContextForRole("owner")
auth0Id = mockForeignAuth0Id()
expectResponse(await subject(), 401, AUTH_ERRORS.NOT_MEMBER_OF_ORG)
})
it("doesn't allow a simple member to crawl a URL", async () => {
await createContextForRole("member")
expectResponse(await subject(), 403, AUTH_ERRORS.UNAUTHORIZED_RESOURCE)
})
it("allows an admin to crawl a URL", async () => {
await createContextForRole("admin")
expectResponse(await subject(), 202)
})
})

describe("DocumentsRoutes.reCrawlUrl", () => {
const subject = async () =>
request({
route: DocumentsRoutes.reCrawlUrl,
pathParams: removeNullish({ organizationId, projectId, documentId }),
token: accessToken ?? undefined,
})

it("requires an authentication token", async () => {
accessToken = null
expectResponse(await subject(), 401, AUTH_ERRORS.NO_ACCESS_TOKEN)
})
it("requires a valid organization ID", async () => {
organizationId = null
expectResponse(await subject(), 400, AUTH_ERRORS.NO_ORGANIZATION_ID)
})
it("requires a valid project ID", async () => {
await createContextForRole("owner")
projectId = randomUUID()
expectResponse(await subject(), 404)
})
it("requires the user to be a member of the organization", async () => {
await createContextForRole("owner")
auth0Id = mockForeignAuth0Id()
expectResponse(await subject(), 401, AUTH_ERRORS.NOT_MEMBER_OF_ORG)
})
it("requires the document to be part of the project", async () => {
const { organization } = await createContextForRole("owner")
const project2 = await repositories.projectRepository.save(
projectFactory.transient({ organization }).build(),
)
projectId = project2.id
expectResponse(await subject(), 404)
})
it("doesn't allow a simple member to recrawl a document", async () => {
await createContextForRole("member")
expectResponse(await subject(), 403, AUTH_ERRORS.UNAUTHORIZED_RESOURCE)
})
it("allows an admin to recrawl a document", async () => {
await createContextForRole("admin")
expectResponse(await subject(), 202)
})
})

describe("DocumentsRoutes.streamCrawlProgress", () => {
const subject = async () =>
request({
route: DocumentsRoutes.streamCrawlProgress,
pathParams: removeNullish({ organizationId, projectId }),
token: accessToken ?? undefined,
})

it("requires an authentication token", async () => {
accessToken = null
expectResponse(await subject(), 401, AUTH_ERRORS.NO_ACCESS_TOKEN)
})
it("requires a valid organization ID", async () => {
organizationId = null
expectResponse(await subject(), 400, AUTH_ERRORS.NO_ORGANIZATION_ID)
})
it("requires a valid project ID", async () => {
await createContextForRole("owner")
projectId = randomUUID()
expectResponse(await subject(), 404)
})
it("requires the user to be a member of the organization", async () => {
await createContextForRole("owner")
auth0Id = mockForeignAuth0Id()
expectResponse(await subject(), 401, AUTH_ERRORS.NOT_MEMBER_OF_ORG)
})
it("doesn't allow a simple member to stream crawl progress", async () => {
await createContextForRole("member")
expectResponse(await subject(), 403, AUTH_ERRORS.UNAUTHORIZED_RESOURCE)
})
})
})