diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 151d2b4..dd1b728 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,16 +32,17 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.18.2 + - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: 22 cache: pnpm - - name: Enable corepack - run: | - corepack enable - corepack prepare pnpm@10.18.2 --activate - name: Install dependencies run: pnpm install --frozen-lockfile @@ -90,16 +91,17 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.18.2 + - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: 22 cache: pnpm - - name: Enable corepack - run: | - corepack enable - corepack prepare pnpm@10.18.2 --activate - name: Install dependencies run: pnpm install --frozen-lockfile @@ -148,16 +150,17 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 10.18.2 + - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: 22 cache: pnpm - - name: Enable corepack - run: | - corepack enable - corepack prepare pnpm@10.18.2 --activate - name: Install dependencies run: pnpm install --frozen-lockfile diff --git a/acceptance/README.md b/acceptance/README.md index 334f3ae..55c565c 100644 --- a/acceptance/README.md +++ b/acceptance/README.md @@ -29,3 +29,4 @@ | Source Cache and Fetch Snapshot | `source-cache-and-fetch-snapshot.feature` | | Run Cache | `run-cache.feature` | | Source Intake and Fetching v0.1 | `source-intake-and-fetching.feature` | +| Source Discovery v0.1 | `source-discovery.feature` | diff --git a/acceptance/source-discovery.feature b/acceptance/source-discovery.feature new file mode 100644 index 0000000..7e94fd1 --- /dev/null +++ b/acceptance/source-discovery.feature @@ -0,0 +1,30 @@ +Feature: Source Discovery + As a TraceMap 
investigation user + I want TraceMap to discover likely source candidates from my research topic + So that investigations can proceed even when I do not paste URLs + + Scenario: Source discovery is disabled by default + Given source discovery provider is not configured + When a user starts an investigation + Then the existing manual URL intake behavior should be preserved + And the run should not require discovered sources to complete + + Scenario: Mock source discovery returns deterministic source candidates + Given the source discovery provider is set to mock + When a user starts an investigation without URLs + Then TraceMap should request discovered source candidates for the research topic + And discovered source URLs should be normalized and deduplicated + And valid discovered sources should be passed to the answer graph provider as source candidates + + Scenario: Manual URLs are prioritized over discovered URLs + Given the research topic contains manual URLs + And source discovery also returns URLs + When source candidates are built + Then manual URL candidates should appear before discovered candidates + And duplicate URLs should appear only once + + Scenario: Discovery failures do not fail the investigation + Given the source discovery provider returns a failure + When a user starts an investigation + Then the investigation should continue with any manually supplied URL candidates + And the discovery failure should be recorded as ignored source information diff --git a/specs/README.md b/specs/README.md index fb46e7a..c5f5ce0 100644 --- a/specs/README.md +++ b/specs/README.md @@ -40,3 +40,4 @@ Each feature spec should describe: | Source Cache and Fetch Snapshot | [source-cache-and-fetch-snapshot.md](./source-cache-and-fetch-snapshot.md) | | Run Cache | [run-cache.md](./run-cache.md) | | Source Intake and Fetching v0.1 | [source-intake-and-fetching.md](./source-intake-and-fetching.md) | +| Source Discovery v0.1 | [source-discovery.md](./source-discovery.md) | 
diff --git a/specs/source-discovery.md b/specs/source-discovery.md new file mode 100644 index 0000000..3b1f0d7 --- /dev/null +++ b/specs/source-discovery.md @@ -0,0 +1,88 @@ +# Source Discovery v0.1 + +## Purpose +Enable TraceMap to discover source candidates from a research topic even when the user does not provide manual URLs, while preserving the existing Source Intake / Fetch / Cache / Provider pipeline. + +## User value +- Users can start an investigation from a plain research topic and still get evidence candidates. +- Manual URLs remain first-class and prioritized when present. +- Discovery can evolve from mock to real search providers without breaking provider integration. + +## Scope +- Add `SourceDiscoveryProvider` boundary for pluggable source discovery. +- Add `disabled` and `mock` discovery providers. +- Add discovery hook from research topic (`question`) in source intake. +- Merge manual URLs and discovered URLs into one normalized, deduplicated intake list. +- Reuse existing source cache / fetch pipeline (`resolveSourceCacheForUrl`). +- Pass resulting `sourceCandidates` to answer graph providers. + +## Non-goals +- Production external search API integration. +- RAG / embeddings / reranking. +- Background job orchestration. +- DB schema changes or Prisma migrations. +- Large UI redesign. +- Major OpenAI answer graph schema changes. +- Full-text crawling. +- Dedicated persistence table for search result history. + +## Existing implementation constraints +- Keep `AnalysisRun.question` and form field `question` unchanged. +- Preserve existing Source Intake behavior for manual URLs. +- Keep run completion flow valid even when source discovery fails or yields no candidates. +- Do not bypass existing URL safety validation. + +## Provider strategy +- Environment variable switch: `TRACEMAP_SOURCE_DISCOVERY_PROVIDER=disabled|mock`. +- Default is `disabled`. +- `mock` provider must return deterministic results from the research topic. 
+- Provider boundary is designed to allow future providers (e.g. web search backends) without changing intake contracts. + +## Source candidate flow +1. Extract manual URLs from research topic. +2. Resolve source discovery provider. +3. If provider is enabled, discover additional URLs from the same topic. +4. Merge manual + discovered URLs (manual first). +5. Normalize, safety-check, dedupe by normalized URL. +6. Resolve cache/fetch metadata with `resolveSourceCacheForUrl`. +7. Build `SourceCandidate[]` and pass to answer graph provider input. + +## Deduplication rules +- Deduplicate by normalized URL. +- Manual URL candidates are evaluated before discovered candidates. +- When duplicates exist, keep first occurrence (manual precedence). +- Do not fetch or process the same normalized URL more than once. + +## Error handling +- Discovery provider failures do not fail the run. +- Per-URL cache/fetch failures do not fail the run. +- Discovery and URL failures are captured in `ignoredUrls` with reasons. +- Avoid excessive logging; never log secrets. + +## Security constraints +- Discovery outputs are treated as untrusted input. +- All discovered URLs must pass existing normalization and safety checks. +- Unsafe URLs are ignored and recorded, not fetched. +- Existing SSRF guard behavior remains authoritative. + +## Cost constraints +- `DEFAULT_DISCOVERY_MAX_RESULTS = 5`. +- `DEFAULT_SOURCE_CANDIDATE_MAX_RESULTS = 5`. +- Keep provider source context compact (no raw full HTML). +- Keep excerpt truncation behavior unchanged in answer graph providers. + +## Test requirements +- Provider resolution defaults to disabled. +- Disabled provider yields no discovered candidates. +- Mock provider is deterministic and respects maxResults. +- Intake integration preserves manual URL-only behavior. +- Discovery can produce candidates when no manual URLs exist. +- Manual URLs are prioritized over discovered URLs. +- Duplicate URLs are deduplicated. 
+- Discovery failure does not fail intake. +- Unsafe discovered URLs are ignored. +- Provider integration remains valid when sourceCandidates is empty or discovered. + +## Acceptance references +- `acceptance/source-discovery.feature` +- `acceptance/source-intake-and-fetching.feature` diff --git a/src/server/analysis/source-discovery/mock-source-discovery-provider.ts b/src/server/analysis/source-discovery/mock-source-discovery-provider.ts new file mode 100644 index 0000000..232eefe --- /dev/null +++ b/src/server/analysis/source-discovery/mock-source-discovery-provider.ts @@ -0,0 +1,35 @@ +import type { SourceDiscoveryProvider } from "@/server/analysis/source-discovery/source-discovery-provider"; + +function toTopicSlug(topic: string): string { + const normalized = topic.trim().toLowerCase().replace(/\s+/g, "-").replace(/[^a-z0-9-]/g, ""); + return normalized.slice(0, 48) || "general-investigation"; +} + +export const mockSourceDiscoveryProvider: SourceDiscoveryProvider = { + id: "mock", + async discoverSources(input) { + const slug = toTopicSlug(input.researchTopic); + const base = [ + { + title: `Mission dossier: ${slug}`, + url: `https://example.com/tracemap/mock-source-1?topic=${slug}`, + snippet: `Initial mission dossier for ${slug} with baseline facts and terminology.`, + sourceKind: "report" as const, + }, + { + title: `Primary evidence registry: ${slug}`, + url: `https://example.com/tracemap/mock-source-2?topic=${slug}`, + snippet: `Cross-checkable references and claims matrix for ${slug}.`, + sourceKind: "documentation" as const, + }, + { + title: `Risk and unknown map briefing: ${slug}`, + url: `https://example.com/tracemap/mock-source-3?topic=${slug}`, + snippet: `Potential uncertainties and unresolved questions related to ${slug}.`, + sourceKind: "news" as const, + }, + ].slice(0, Math.max(0, input.maxResults)); + + return { kind: "success", candidates: base.map((c) => ({ ...c, discoveredBy: "mock" as const })) }; + }, +}; diff --git 
a/src/server/analysis/source-discovery/resolve-source-discovery-provider.ts b/src/server/analysis/source-discovery/resolve-source-discovery-provider.ts new file mode 100644 index 0000000..ecbe9d3 --- /dev/null +++ b/src/server/analysis/source-discovery/resolve-source-discovery-provider.ts @@ -0,0 +1,15 @@ +import { mockSourceDiscoveryProvider } from "@/server/analysis/source-discovery/mock-source-discovery-provider"; +import type { SourceDiscoveryProvider } from "@/server/analysis/source-discovery/source-discovery-provider"; + +const disabledProvider: SourceDiscoveryProvider = { + id: "disabled", + async discoverSources() { + return { kind: "success", candidates: [] }; + }, +}; + +export function resolveSourceDiscoveryProvider(): SourceDiscoveryProvider { + const configured = process.env.TRACEMAP_SOURCE_DISCOVERY_PROVIDER?.trim().toLowerCase() ?? "disabled"; + if (configured === "mock") return mockSourceDiscoveryProvider; + return disabledProvider; +} diff --git a/src/server/analysis/source-discovery/source-discovery-provider.ts b/src/server/analysis/source-discovery/source-discovery-provider.ts new file mode 100644 index 0000000..f3bc409 --- /dev/null +++ b/src/server/analysis/source-discovery/source-discovery-provider.ts @@ -0,0 +1,21 @@ +export type SourceDiscoveryCandidate = { + title: string; + url: string; + snippet?: string | null; + sourceKind?: "official" | "news" | "documentation" | "report" | "paper" | "unknown"; + discoveredBy: "mock" | "manual_url" | "search_provider"; +}; + +export type SourceDiscoveryInput = { + researchTopic: string; + maxResults: number; +}; + +export type SourceDiscoveryResult = + | { kind: "success"; candidates: SourceDiscoveryCandidate[] } + | { kind: "failure"; errorMessage: string }; + +export type SourceDiscoveryProvider = { + id: "disabled" | "mock" | string; + discoverSources(input: SourceDiscoveryInput): Promise<SourceDiscoveryResult>; +}; diff --git a/src/server/analysis/source-discovery/source-discovery-service.ts
b/src/server/analysis/source-discovery/source-discovery-service.ts new file mode 100644 index 0000000..e0b71be --- /dev/null +++ b/src/server/analysis/source-discovery/source-discovery-service.ts @@ -0,0 +1,2 @@ +export const DEFAULT_DISCOVERY_MAX_RESULTS = 5; +export const DEFAULT_SOURCE_CANDIDATE_MAX_RESULTS = 5; diff --git a/src/server/analysis/source-intake/source-intake-service.ts b/src/server/analysis/source-intake/source-intake-service.ts index 116ffe1..d90ce61 100644 --- a/src/server/analysis/source-intake/source-intake-service.ts +++ b/src/server/analysis/source-intake/source-intake-service.ts @@ -1,32 +1,53 @@ import type { SourceCandidate, SourceIntakeResult } from "@/types/source-intake"; import { resolveSourceCacheForUrl } from "@/server/analysis/source-cache-service"; import { extractUrls } from "@/server/analysis/source-intake/extract-urls"; +import { resolveSourceDiscoveryProvider } from "@/server/analysis/source-discovery/resolve-source-discovery-provider"; +import { DEFAULT_DISCOVERY_MAX_RESULTS, DEFAULT_SOURCE_CANDIDATE_MAX_RESULTS } from "@/server/analysis/source-discovery/source-discovery-service"; export async function buildSourceIntakeFromQuestion(question: string): Promise<SourceIntakeResult> { - const rawUrls = extractUrls(question); - const candidates: SourceCandidate[] = []; + const manualUrls = extractUrls(question); + const discoveryProvider = resolveSourceDiscoveryProvider(); const ignoredUrls: SourceIntakeResult["ignoredUrls"] = []; + + let discoveredUrls: string[] = []; + if (discoveryProvider.id !== "disabled") { + const discovery = await discoveryProvider.discoverSources({ + researchTopic: question, + maxResults: DEFAULT_DISCOVERY_MAX_RESULTS, + }); + if (discovery.kind === "failure") { + ignoredUrls.push({ url: "[source_discovery]", reason: discovery.errorMessage }); + } else { + discoveredUrls = discovery.candidates.map((candidate) => candidate.url); + } + } + + const merged = [ + ...manualUrls.map((url) => ({ url, origin: "manual_url" as const })),
+ ...discoveredUrls.map((url) => ({ url, origin: "discovered" as const })), + ]; + + const candidates: SourceCandidate[] = []; const seen = new Set(); - for (const rawUrl of rawUrls) { + for (const item of merged) { let result; try { - result = await resolveSourceCacheForUrl(rawUrl); + result = await resolveSourceCacheForUrl(item.url); } catch (error) { ignoredUrls.push({ - url: rawUrl, + url: item.url, reason: error instanceof Error ? error.message : String(error), }); continue; } if (result.kind === "invalid") { - ignoredUrls.push({ url: rawUrl, reason: result.errorMessage }); - continue; - } - if (seen.has(result.normalizedUrl)) { + ignoredUrls.push({ url: item.url, reason: result.errorMessage }); continue; } + if (seen.has(result.normalizedUrl)) continue; seen.add(result.normalizedUrl); + candidates.push({ normalizedUrl: result.normalizedUrl, originalUrl: result.originalUrl, @@ -39,7 +60,10 @@ export async function buildSourceIntakeFromQuestion(question: string): Promise= DEFAULT_SOURCE_CANDIDATE_MAX_RESULTS) break; } return { candidates, ignoredUrls }; diff --git a/src/types/source-intake.ts b/src/types/source-intake.ts index 31870b5..632fa1e 100644 --- a/src/types/source-intake.ts +++ b/src/types/source-intake.ts @@ -1,3 +1,5 @@ +export type SourceCandidateOrigin = "manual_url" | "discovered"; + export type SourceCandidate = { normalizedUrl: string; originalUrl: string; @@ -10,6 +12,7 @@ export type SourceCandidate = { sourceCacheEntryId?: string | null; sourceFetchSnapshotId?: string | null; fetchErrorMessage?: string | null; + origin?: SourceCandidateOrigin; }; export type SourceIntakeResult = { diff --git a/tests/source-discovery-provider.test.ts b/tests/source-discovery-provider.test.ts new file mode 100644 index 0000000..d1266f3 --- /dev/null +++ b/tests/source-discovery-provider.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from "vitest"; + +import { mockSourceDiscoveryProvider } from 
"@/server/analysis/source-discovery/mock-source-discovery-provider"; +import { resolveSourceDiscoveryProvider } from "@/server/analysis/source-discovery/resolve-source-discovery-provider"; + +describe("source discovery provider", () => { + it("defaults to disabled and returns empty candidates", async () => { + delete process.env.TRACEMAP_SOURCE_DISCOVERY_PROVIDER; + const provider = resolveSourceDiscoveryProvider(); + expect(provider.id).toBe("disabled"); + await expect(provider.discoverSources({ researchTopic: "x", maxResults: 5 })).resolves.toEqual({ kind: "success", candidates: [] }); + }); + + it("mock provider is deterministic", async () => { + const a = await mockSourceDiscoveryProvider.discoverSources({ researchTopic: "Acme Revenue", maxResults: 3 }); + const b = await mockSourceDiscoveryProvider.discoverSources({ researchTopic: "Acme Revenue", maxResults: 3 }); + expect(a).toEqual(b); + }); + + it("respects maxResults", async () => { + const result = await mockSourceDiscoveryProvider.discoverSources({ researchTopic: "Acme", maxResults: 2 }); + expect(result.kind).toBe("success"); + if (result.kind === "success") expect(result.candidates).toHaveLength(2); + }); +}); diff --git a/tests/source-intake-integration.test.ts b/tests/source-intake-integration.test.ts new file mode 100644 index 0000000..33ee33a --- /dev/null +++ b/tests/source-intake-integration.test.ts @@ -0,0 +1,18 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +vi.mock("@/server/analysis/source-cache-service", () => ({ resolveSourceCacheForUrl: vi.fn() })); +vi.mock("@/server/analysis/source-discovery/resolve-source-discovery-provider", () => ({ resolveSourceDiscoveryProvider: vi.fn() })); + +describe("buildSourceIntakeFromQuestion", () => { + beforeEach(() => vi.resetAllMocks()); + + it("keeps manual url behavior", async () => { + const { resolveSourceCacheForUrl } = await import("@/server/analysis/source-cache-service"); + const { resolveSourceDiscoveryProvider } = await 
import("@/server/analysis/source-discovery/resolve-source-discovery-provider"); + vi.mocked(resolveSourceDiscoveryProvider).mockReturnValue({ id: "disabled", discoverSources: vi.fn() }); + vi.mocked(resolveSourceCacheForUrl).mockResolvedValue({ kind: "resolved", normalizedUrl: "https://example.com/a", originalUrl: "https://example.com/a", finalUrl: "https://example.com/a", excerpt: null, contentType: "text/html", httpStatus: 200, checkedAt: new Date(), sourceCacheEntryId: "cache-1", sourceFetchSnapshotId: "snap-1", verificationStatus: "verified", contentHash: null, reusedCache: false }); + const { buildSourceIntakeFromQuestion } = await import("@/server/analysis/source-intake/source-intake-service"); + const result = await buildSourceIntakeFromQuestion("check https://example.com/a"); + expect(result.candidates[0]?.origin).toBe("manual_url"); + }); +});