From 6561c4058512be92e9ccd9abeee244fb058f86d6 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Thu, 18 Jun 2026 20:31:09 -0400 Subject: [PATCH 01/11] Fix ucw_timeline/detect_emergence schema drift Both queries targeted flattened SQLite-style columns (light_topic, instinct_gut_signal, etc.) but the live Postgres cognitive_events table stores those fields inside JSONB columns (light_layer, instinct_layer). Result: 'column "light_topic" does not exist' on every call. Rewrote both queries to extract from JSONB with aliases that preserve existing row-access names. Verified against live ucw_cognitive DB. --- mcp_raw/tools/ucw_tools.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mcp_raw/tools/ucw_tools.py b/mcp_raw/tools/ucw_tools.py index ee605b4..0daac87 100644 --- a/mcp_raw/tools/ucw_tools.py +++ b/mcp_raw/tools/ucw_tools.py @@ -185,8 +185,11 @@ async def _ucw_timeline(args: Dict) -> Dict: # Build parameterized query (PostgreSQL $1, $2, ...) query = """SELECT event_id, timestamp_ns, direction, method, platform, - light_topic, light_intent, light_summary, - instinct_gut_signal, instinct_coherence + light_layer->>'topic' AS light_topic, + light_layer->>'intent' AS light_intent, + light_layer->>'summary' AS light_summary, + instinct_layer->>'gut_signal' AS instinct_gut_signal, + (instinct_layer->>'coherence_potential')::float AS instinct_coherence FROM cognitive_events WHERE 1=1""" params: list = [] idx = 1 @@ -247,8 +250,12 @@ async def _detect_emergence(args: Dict) -> Dict: async with _pool.acquire() as conn: rows = await conn.fetch( """SELECT event_id, timestamp_ns, method, platform, - light_topic, light_concepts, light_intent, - instinct_coherence, instinct_indicators, instinct_gut_signal + light_layer->>'topic' AS light_topic, + light_layer->>'concepts' AS light_concepts, + light_layer->>'intent' AS light_intent, + (instinct_layer->>'coherence_potential')::float AS instinct_coherence, + instinct_layer->>'emergence_indicators' AS instinct_indicators, + instinct_layer->>'gut_signal' AS instinct_gut_signal FROM cognitive_events ORDER BY timestamp_ns DESC LIMIT $1""", limit, From e7625e31b866ad80ab7066a8496fa5e1086c9942 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Sat, 20 Jun 2026 18:37:38 -0400 Subject: [PATCH 02/11] chore: ignore local AI context files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 0b592d8..95edfb4 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,7 @@ cli_import_log.json # Large generated visual assets (refined pipeline iterations) visual_assets/ + +# Local AI context — kept on disk, not published +CLAUDE.md +.claude/ From 4eddae6e59b1e6d3a973969b536337a0cd0c2e74 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Sat, 20 Jun 2026 19:52:55 -0400 Subject: [PATCH 03/11] chore(model-ids): sweep deprecated claude-opus-4-6 -> claude-opus-4-8 per registry --- setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index 0a90ee0..dd79986 100644 --- a/setup.sh +++ b/setup.sh @@ -153,7 +153,7 @@ if [ ! -f "$AGENT_CORE/config.json" ] || [ "$UPDATE" = true ]; then "version": "2.0", "defaults": { "auto_accept": true, - "model": "claude-opus-4-6", + "model": "claude-opus-4-8", "thinking": true, "max_parallel_sessions": 5 }, From 407421c6da2b118a2a643a6a30c87d4eae646252 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 00:10:51 -0400 Subject: [PATCH 04/11] feat(cpb): read Claude model IDs + costs from canonical registry Sweep sonnet-4-6 -> sonnet-5 in coherence_engine extractors, and rewire cpb/llm_client.py to load Claude model IDs and per-token costs from ~/.claude/config/pricing.json instead of inline literals. Hardcoded values kept only as offline fallback mirroring the current Claude 5 family. Kills the recurring manual model-id sweep on each release. --- coherence_engine/insight_extractor.py | 2 +- coherence_engine/knowledge_graph.py | 2 +- cpb/llm_client.py | 64 ++++++++++++++++++++++----- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/coherence_engine/insight_extractor.py b/coherence_engine/insight_extractor.py index ad15aa4..fff8fb9 100644 --- a/coherence_engine/insight_extractor.py +++ b/coherence_engine/insight_extractor.py @@ -28,7 +28,7 @@ # LLM provider: "anthropic" (Claude API) or "local" (ollama) LLM_PROVIDER = os.environ.get("UCW_LLM_PROVIDER", "anthropic") -ANTHROPIC_MODEL = os.environ.get("UCW_INSIGHT_MODEL", "claude-sonnet-4-6") +ANTHROPIC_MODEL = os.environ.get("UCW_INSIGHT_MODEL", "claude-sonnet-5") OLLAMA_MODEL = os.environ.get("UCW_OLLAMA_MODEL", "llama3.2") OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") diff --git a/coherence_engine/knowledge_graph.py b/coherence_engine/knowledge_graph.py index fa08097..2ff53ab 100644 --- a/coherence_engine/knowledge_graph.py +++ b/coherence_engine/knowledge_graph.py @@ -1102,7 +1102,7 @@ def __init__(self, provider: str = None): import os self._provider = provider or os.environ.get("UCW_LLM_PROVIDER", "anthropic") - self._model = os.environ.get("UCW_INSIGHT_MODEL", "claude-sonnet-4-6") + self._model = os.environ.get("UCW_INSIGHT_MODEL", "claude-sonnet-5") self._ollama_model = os.environ.get("UCW_OLLAMA_MODEL", "llama3.2") self._ollama_url = os.environ.get("OLLAMA_URL", "http://localhost:11434") diff --git a/cpb/llm_client.py b/cpb/llm_client.py index 678d761..2391ada 100644 --- a/cpb/llm_client.py +++ b/cpb/llm_client.py @@ -58,14 +58,59 @@ HOME = Path.home() CONFIG_FILE = HOME / ".agent-core" / "config.json" -# Model mappings -ANTHROPIC_MODELS = { - "opus": "claude-opus-4-20250514", - "sonnet": "claude-sonnet-4-6", - "haiku": "claude-3-5-haiku-20241022", - "default": "claude-sonnet-4-6", +# Model ID sovereignty: canonical registry is the single source of truth. +# Never hardcode model IDs — read them from ~/.claude/config/pricing.json so a +# model release/deprecation is a one-file registry edit, not a code sweep. +# The literals below are OFFLINE FALLBACKS only, used if the registry is +# unreadable; they must mirror the registry's current Claude 5 family. +PRICING_REGISTRY = HOME / ".claude" / "config" / "pricing.json" + +_FALLBACK_ANTHROPIC = { + "opus": "claude-opus-4-8", + "sonnet": "claude-sonnet-5", + "haiku": "claude-haiku-4-5-20251001", + "default": "claude-sonnet-5", +} +_FALLBACK_CLAUDE_COSTS = { + "claude-opus-4-8": (5.0, 25.0), + "claude-sonnet-5": (3.0, 15.0), + "claude-haiku-4-5-20251001": (1.0, 5.0), } + +def _load_registry() -> tuple[Dict[str, str], Dict[str, tuple]]: + """Load Claude model IDs + costs from the canonical registry. + + Returns (anthropic_models, claude_costs). Falls back to the mirrored + literals above if the registry is missing or malformed so the client + stays usable offline. + """ + try: + data = json.loads(PRICING_REGISTRY.read_text()) + models = data["models"] + anthropic_models: Dict[str, str] = {} + claude_costs: Dict[str, tuple] = {} + for tier in ("opus", "sonnet", "haiku"): + entry = models.get(tier) + if not entry or not entry.get("id"): + continue + mid = entry["id"] + anthropic_models[tier] = mid + if entry.get("input") is not None and entry.get("output") is not None: + claude_costs[mid] = (entry["input"], entry["output"]) + if "sonnet" not in anthropic_models: + raise ValueError("registry missing sonnet tier") + anthropic_models["default"] = anthropic_models["sonnet"] + return anthropic_models, claude_costs + except Exception: + return dict(_FALLBACK_ANTHROPIC), dict(_FALLBACK_CLAUDE_COSTS) + + +_ANTHROPIC_MODELS, _CLAUDE_COSTS = _load_registry() + +# Model mappings +ANTHROPIC_MODELS = _ANTHROPIC_MODELS + OPENAI_MODELS = { "gpt4": "gpt-4o", "gpt4o": "gpt-4o", @@ -79,11 +124,10 @@ "default": "gemini-1.5-pro", } -# Cost per 1M tokens (input/output) +# Cost per 1M tokens (input/output). Claude tiers sourced from the registry; +# non-Claude providers kept inline until they have a canonical registry. MODEL_COSTS = { - "claude-opus-4-20250514": (15.0, 75.0), - "claude-sonnet-4-6": (3.0, 15.0), - "claude-3-5-haiku-20241022": (0.25, 1.25), + **_CLAUDE_COSTS, "gpt-4o": (2.5, 10.0), "gpt-4o-mini": (0.15, 0.6), "gemini-1.5-pro": (1.25, 5.0), From e64dfb01f8f4f7b5cd41cfa3efe84ff3f17fa837 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 00:11:31 -0400 Subject: [PATCH 05/11] style: remove 6 extraneous f-string prefixes (ruff F541) --- graph/decay_engine.py | 6 +++--- mcp_raw/tools/oracle_tools.py | 4 ++-- mcp_raw/tools/react_synthesis.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/graph/decay_engine.py b/graph/decay_engine.py index b4ed0d6..2ad01ba 100644 --- a/graph/decay_engine.py +++ b/graph/decay_engine.py @@ -313,16 +313,16 @@ def main(): print("=== Knowledge Graph Decay Analysis ===\n") print(f"Active edges: {stats['total_active']}") print(f"Immortal (contains): {stats['immortal']}") - print(f"\nAge distribution:") + print("\nAge distribution:") for bucket, count in stats["age_distribution"].items(): bar = "█" * min(count // 10, 40) print(f" {bucket:>8}: {count:>5} {bar}") - print(f"\nDecay preview:") + print("\nDecay preview:") print(f" Would stale (90-180d): {stats['would_stale']}") print(f" Would expire (>180d): {stats['would_expire']}") print(f" Would reinforce (<30d): {stats['would_reinforce']}") print(f" Already stale: {stats['already_stale']}") - print(f"\nRelation types:") + print("\nRelation types:") for rel, count in sorted(stats["by_relation"].items(), key=lambda x: -x[1]): print(f" {rel:>15}: {count}") diff --git a/mcp_raw/tools/oracle_tools.py b/mcp_raw/tools/oracle_tools.py index e4b038d..6fa42ed 100644 --- a/mcp_raw/tools/oracle_tools.py +++ b/mcp_raw/tools/oracle_tools.py @@ -181,12 +181,12 @@ async def _vibe_config(args: Dict) -> Dict: if config.files_detected: output += f"**Files:** {', '.join(config.files_detected)}\n" - output += f"\n## Subtasks\n" + output += "\n## Subtasks\n" for i, st in enumerate(config.subtasks, 1): output += f"{i}. {st}\n" if config.personas: - output += f"\n## Graph Personas\n" + output += "\n## Graph Personas\n" for p in config.personas: output += f"- **{p['title']}** ({p['finding_count']} findings) — {p['domain']}\n" diff --git a/mcp_raw/tools/react_synthesis.py b/mcp_raw/tools/react_synthesis.py index 62f0e2b..143c746 100644 --- a/mcp_raw/tools/react_synthesis.py +++ b/mcp_raw/tools/react_synthesis.py @@ -189,7 +189,7 @@ async def _exec_knowledge_graph(query: str) -> str: else: output += f"No sessions directly matching '{query}' in graph.\n" if timeline: - output += f"**Recent sessions (for context):**\n" + output += "**Recent sessions (for context):**\n" for t in timeline[:5]: output += f" - {t['topic']} ({t['date']})\n" From 9dec00113910922fc39a81bb7d163e12f3a61bb3 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 00:14:49 -0400 Subject: [PATCH 06/11] feat(cpb): add Firecrawl Research Index as Tier-1 paper source Wire Firecrawl's research-specific paper index into TieredSearchLayer as the primary Tier-1 source, running alongside the raw arXiv client as fallback. Semantic relevance scoring, canonical/source id extraction, and approximate publish dates parsed from the arXiv YYMM id encoding feed the existing time-decay + tier-weight scoring. Papers deduped by arXiv id across providers. Keyless-capable; honors FIRECRAWL_API_KEY for higher rate limits. Docs: https://docs.firecrawl.dev/features/research --- cpb/search_layer.py | 124 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 3 deletions(-) diff --git a/cpb/search_layer.py b/cpb/search_layer.py index 46dc18b..5e68360 100644 --- a/cpb/search_layer.py +++ b/cpb/search_layer.py @@ -245,6 +245,7 @@ def __init__(self, config: Optional[dict] = None): def _load_api_keys(self): """Load API keys from config.""" import json + import os from pathlib import Path config_path = Path.home() / ".agent-core" / "config.json" @@ -253,9 +254,15 @@ def _load_api_keys(self): cfg = json.load(f) self.cohere_key = cfg.get("cohere", {}).get("api_key") self.github_token = cfg.get("github", {}).get("token") + self.firecrawl_key = cfg.get("firecrawl", {}).get("api_key") else: self.cohere_key = None self.github_token = None + self.firecrawl_key = None + + # Env overrides config; Firecrawl Research works keyless but the key + # lifts rate limits and clears the "suspicious IP" gate. + self.firecrawl_key = os.environ.get("FIRECRAWL_API_KEY", self.firecrawl_key) async def search(self, query: str, max_results_per_tier: int = 10) -> SearchContext: """ @@ -300,20 +307,34 @@ async def search(self, query: str, max_results_per_tier: int = 10) -> SearchCont return context async def _search_tier1(self, query: str, limit: int) -> list[SearchResult]: - """Search Tier 1: Primary sources.""" + """Search Tier 1: Primary sources. + + Firecrawl Research is the primary (semantically-ranked) paper source; + the raw arXiv client runs alongside as a resilient fallback. Papers are + deduped by arXiv id so the same paper is not listed twice. + """ results = [] # Parallel search across Tier 1 sources tasks = [ + self._search_firecrawl_papers(query, limit // 2), self._search_arxiv(query, limit // 2), self._search_web_tier1(query, limit // 2), ] tier1_results = await asyncio.gather(*tasks, return_exceptions=True) + seen_arxiv: set[str] = set() for result in tier1_results: - if not isinstance(result, Exception): - results.extend(result) + if isinstance(result, Exception): + continue + for item in result: + aid = self._extract_arxiv_id("", item.url) if "arxiv.org/abs/" in item.url else None + if aid: + if aid in seen_arxiv: + continue + seen_arxiv.add(aid) + results.append(item) return results @@ -398,6 +419,103 @@ async def _search_arxiv(self, query: str, limit: int) -> list[SearchResult]: return results + async def _search_firecrawl_papers( + self, query: str, limit: int + ) -> list[SearchResult]: + """ + Search papers via Firecrawl Research Index (Tier 1). + + Semantically-ranked paper search over a research-specific index. + Stronger than the raw arXiv keyword client: the API returns a + relevance `score`, canonical + source-specific ids, and covers + sources beyond arXiv. Works keyless but honors FIRECRAWL_API_KEY + for higher rate limits. + + Docs: https://docs.firecrawl.dev/features/research + """ + if not HAS_AIOHTTP: + return [] + + results: list[SearchResult] = [] + url = "https://api.firecrawl.dev/v2/search/research/papers" + params = {"query": query, "k": max(1, limit)} + headers = {} + if self.firecrawl_key: + headers["Authorization"] = f"Bearer {self.firecrawl_key}" + + try: + timeout = aiohttp.ClientTimeout(total=30) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get( + url, params=params, headers=headers + ) as resp: + if resp.status != 200: + return [] + data = await resp.json() + + if not data.get("success"): + return [] + + for paper in data.get("results", [])[:limit]: + primary_id = paper.get("primaryId", "") or "" + arxiv_id = self._extract_arxiv_id(primary_id, paper.get("ids")) + paper_url = ( + f"https://arxiv.org/abs/{arxiv_id}" + if arxiv_id + else f"https://www.semanticscholar.org/paper/{paper.get('paperId', '')}" + ) + + try: + base = float(paper.get("score", 0.0)) + except (TypeError, ValueError): + base = 0.0 + + results.append( + SearchResult( + url=paper_url, + title=paper.get("title", "Untitled"), + content=paper.get("abstract", ""), + tier=SourceTier.TIER_1, + category=SourceCategory.RESEARCH, + source_name="Firecrawl Research", + published_date=self._arxiv_id_to_date(arxiv_id), + base_relevance=base, + ) + ) + + except Exception as e: + print(f"Firecrawl paper search error: {e}") + + return results + + @staticmethod + def _extract_arxiv_id(primary_id: str, ids) -> Optional[str]: + """Pull a bare arXiv id from primaryId or the ids blob.""" + if primary_id.startswith("arxiv:"): + return primary_id.split("arxiv:", 1)[1].split("v")[0] + # ids arrives as a stringified dict like "{'arxiv': ['2605.22949']}" + if ids: + m = re.search(r"(\d{4}\.\d{4,5})", str(ids)) + if m: + return m.group(1) + return None + + @staticmethod + def _arxiv_id_to_date(arxiv_id: Optional[str]) -> Optional[datetime]: + """Approximate publish date from arXiv YYMM.NNNNN id encoding.""" + if not arxiv_id: + return None + m = re.match(r"(\d{2})(\d{2})\.", arxiv_id) + if not m: + return None + yy, mm = int(m.group(1)), int(m.group(2)) + if not 1 <= mm <= 12: + return None + try: + return datetime(2000 + yy, mm, 1) + except ValueError: + return None + def _build_arxiv_query(self, query: str) -> str: """ Build optimized arXiv query from natural language. From 5d609d6d03e437309ce725555fcd9cef4942ece6 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 00:22:28 -0400 Subject: [PATCH 07/11] feat(cpb): add Firecrawl read-paper passages for citation grounding Adds read_paper_passages() to TieredSearchLayer, backing citation-grounding checks: pull the top full-text passages in a cited paper that address a specific claim before trusting the citation. Accepts canonical paperId or source ids (e.g. arxiv:1706.03762). Firecrawl-backed, keyless-capable. --- cpb/search_layer.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/cpb/search_layer.py b/cpb/search_layer.py index 5e68360..c8d3246 100644 --- a/cpb/search_layer.py +++ b/cpb/search_layer.py @@ -488,6 +488,60 @@ async def _search_firecrawl_papers( return results + async def read_paper_passages( + self, paper_id: str, question: str, k: int = 4 + ) -> list[dict]: + """ + Read the top full-text passages in one paper that answer a question. + + Backs citation-grounding checks: before trusting that a cited paper + actually contains a claimed method/result, pull the passages that + address it. `paper_id` may be canonical (paperId) or source-specific + (e.g. "arxiv:1706.03762"). + + Returns a list of {"score": float, "text": str}, best first. + Docs: https://docs.firecrawl.dev/features/research + """ + if not HAS_AIOHTTP or not paper_id: + return [] + + from urllib.parse import quote + + url = ( + "https://api.firecrawl.dev/v2/search/research/papers/" + f"{quote(paper_id, safe=':')}" + ) + params = {"query": question, "k": max(1, k)} + headers = {} + if self.firecrawl_key: + headers["Authorization"] = f"Bearer {self.firecrawl_key}" + + try: + timeout = aiohttp.ClientTimeout(total=30) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get( + url, params=params, headers=headers + ) as resp: + if resp.status != 200: + return [] + data = await resp.json() + + if not data.get("success"): + return [] + + passages = [] + for p in data.get("passages", [])[:k]: + try: + score = float(p.get("score", 0.0)) + except (TypeError, ValueError): + score = 0.0 + passages.append({"score": score, "text": p.get("text", "")}) + return passages + + except Exception as e: + print(f"Firecrawl read-paper error: {e}") + return [] + @staticmethod def _extract_arxiv_id(primary_id: str, ids) -> Optional[str]: """Pull a bare arXiv id from primaryId or the ids blob.""" From 748a2f5cb136ca90bd7a24f8691eb2009df70bb6 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 01:00:38 -0400 Subject: [PATCH 08/11] feat(cpb): wire Firecrawl citation grounding into CriticVerifier Adds opt-in ground_citations() to the verification pipeline: for each arXiv citation in a response, pull the cited paper's passages via Firecrawl read-paper and confirm the paper is real + retrievable, attaching the top passage as evidence. Exposed via verify(ground_citations=True) and surfaced on VerificationResult (citations_grounded, grounding_evidence). Default off so the core pipeline stays hermetic; 17/17 cpb tests unchanged. --- cpb/critic_verifier.py | 84 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/cpb/critic_verifier.py b/cpb/critic_verifier.py index 16b0111..817538a 100644 --- a/cpb/critic_verifier.py +++ b/cpb/critic_verifier.py @@ -84,6 +84,10 @@ class VerificationResult: citations_found: int = 0 citations_verified: int = 0 + # Citation grounding (v2.6 — Firecrawl read-paper passage evidence) + citations_grounded: int = 0 # arXiv cites with retrievable supporting passages + grounding_evidence: List[Dict[str, Any]] = field(default_factory=list) + # Metadata verification_method: str = "precision_v2" retries_recommended: int = 0 @@ -115,6 +119,8 @@ def to_dict(self) -> Dict[str, Any]: "issues": self.issues, "citations_found": self.citations_found, "citations_verified": self.citations_verified, + "citations_grounded": self.citations_grounded, + "grounding_evidence": self.grounding_evidence, "verification_method": self.verification_method, "retries_recommended": self.retries_recommended, "feedback": self.feedback, @@ -438,6 +444,74 @@ def __init__(self): self.ground_truth_validator = get_gt_validator() self.thresholds = PRECISION_VERIFICATION_THRESHOLDS + @staticmethod + def _arxiv_ids_from_citations(citations: List[Dict[str, Any]]) -> List[str]: + """Collect unique arXiv ids from extracted citations.""" + ids = [] + for c in citations: + aid = c.get("id") if c.get("type") == "arxiv" else c.get("resolved_arxiv") + if aid and aid not in ids: + ids.append(aid) + return ids + + async def ground_citations( + self, + response: str, + sources: Optional[List[Dict[str, Any]]] = None, + question: Optional[str] = None, + max_papers: int = 5, + ) -> Dict[str, Any]: + """ + Ground the response's arXiv citations against the cited papers' + actual full text via Firecrawl read-paper passages. + + For each arXiv id cited, pull the passages that address the response's + question/thesis. A citation is "grounded" when the paper is real and + returns retrievable passages. Attaches the top passage as evidence so a + reviewer can confirm the paper supports what it's cited for. + + Additive and network-bound: called explicitly or via verify( + ground_citations=True); never runs by default so the core pipeline + stays hermetic and fast. + + Returns {checked, grounded, coverage, evidence:[...]}. + """ + citations = self.citation_extractor.extract_citations(response, sources) + arxiv_ids = self._arxiv_ids_from_citations(citations)[:max_papers] + if not arxiv_ids: + return {"checked": 0, "grounded": 0, "coverage": 0.0, "evidence": []} + + from .search_layer import get_search_layer + + layer = get_search_layer() + probe = question or response[:300] + + evidence: List[Dict[str, Any]] = [] + grounded = 0 + for aid in arxiv_ids: + passages = await layer.read_paper_passages( + f"arxiv:{aid}", probe, k=2 + ) + has_support = bool(passages) + if has_support: + grounded += 1 + evidence.append( + { + "arxiv_id": aid, + "grounded": has_support, + "top_passage": passages[0]["text"][:400] if passages else "", + "top_score": passages[0]["score"] if passages else 0.0, + } + ) + + checked = len(arxiv_ids) + return { + "checked": checked, + "grounded": grounded, + "coverage": grounded / checked if checked else 0.0, + "evidence": evidence, + } + async def verify( self, response: str, @@ -446,6 +520,7 @@ async def verify( context: Optional[str] = None, pioneer_mode: bool = False, trust_context: bool = False, + ground_citations: bool = False, ) -> VerificationResult: """ Run full verification pipeline on a response (v2.4 with mode flags). @@ -476,6 +551,13 @@ async def verify( # Verify citations against sources citations_verified = self._verify_citations(citations, sources) + # Optional: ground arXiv citations against paper full text (Firecrawl) + grounding = {"grounded": 0, "evidence": []} + if ground_citations: + grounding = await self.ground_citations( + response, sources, question=query + ) + # Calculate component scores evidence_score = await self._calculate_evidence_score( response, citations, sources @@ -645,6 +727,8 @@ async def verify( issues=issues, citations_found=citations_found, citations_verified=citations_verified, + citations_grounded=grounding["grounded"], + grounding_evidence=grounding["evidence"], retries_recommended=retries, feedback=feedback, ) From aeb921788c16740a0e0af4fabb80969848669adc Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 11:28:40 -0400 Subject: [PATCH 09/11] =?UTF-8?q?feat(cpb):=20Frontier=20Scout=20=E2=80=94?= =?UTF-8?q?=20active=20citation-neighborhood=20gap=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds related_papers() to the search layer (Firecrawl citers/similar/ references) and cpb/frontier_scout.py, a seed->expand->subtract->rank->surface loop that turns RG from passive logger into active scout. Seeds from the strongest arXiv papers already in the corpus (topic/project-scoped), expands each via citation neighborhood, subtracts everything already logged, and ranks what remains by cross-seed density. Automates RG's Thesis->Gap->Direction methodology (the Gap = new work in your neighborhood you have not seen). CLI: python3 -m cpb.frontier_scout --topic '...' --mode citers --limit 10 Live-verified: 27 new 2026 papers surfaced from 4 seeds vs 429 corpus papers. --- cpb/frontier_scout.py | 290 ++++++++++++++++++++++++++++++++++++++++++ cpb/search_layer.py | 49 +++++++ 2 files changed, 339 insertions(+) create mode 100644 cpb/frontier_scout.py diff --git a/cpb/frontier_scout.py b/cpb/frontier_scout.py new file mode 100644 index 0000000..057a600 --- /dev/null +++ b/cpb/frontier_scout.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +""" +Frontier Scout — turn ResearchGravity from a passive logger into an active scout. + +Loop: + 1. SEED — pick the strongest arXiv papers already in the corpus + (optionally scoped to a topic/project). + 2. EXPAND — for each seed, pull its citation neighborhood via Firecrawl + related-papers (citers = forward/frontier, similar = lateral). + 3. SUBTRACT — drop anything already in the corpus (the 1.2k+ arXiv ids + you've logged). What remains is, by construction, new to you. + 4. RANK — score each candidate by how many seeds surfaced it and its + structural/semantic signal, so densely-connected frontier + papers float to the top. + 5. SURFACE — "NEW in your neighborhood you haven't seen." + +This automates RG's own methodology (Thesis -> Gap -> Innovation Direction): +the Gap step is exactly "what exists in my neighborhood that I haven't logged." + +Usage: + python3 -m cpb.frontier_scout --topic "multi-agent trust" --limit 10 + python3 -m cpb.frontier_scout --project os-app --mode citers --seeds 5 + python3 -m cpb.frontier_scout --json # machine-readable +""" + +import argparse +import asyncio +import json +import os +import re +import sqlite3 +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +from .search_layer import get_search_layer + +DB_PATH = Path.home() / ".agent-core" / "storage" / "antigravity.db" +ARXIV_RE = re.compile(r"(\d{4}\.\d{4,5})") + + +@dataclass +class Seed: + arxiv_id: str + title: str + topic: str + relevance: float + + +@dataclass +class FrontierPaper: + arxiv_id: str + title: str + abstract: str + best_score: float = 0.0 + structural: float = 0.0 + seed_hits: int = 0 # how many of my seeds pointed here (density = signal) + from_seeds: list[str] = field(default_factory=list) + + @property + def frontier_score(self) -> float: + # Density across seeds dominates; semantic score breaks ties. + return self.seed_hits * 1.0 + self.best_score + + @property + def url(self) -> str: + return f"https://arxiv.org/abs/{self.arxiv_id}" + + +class FrontierScout: + def __init__(self, db_path: Path = DB_PATH): + self.db_path = db_path + self.layer = get_search_layer() + + # ---- corpus access ------------------------------------------------- + def _connect(self) -> sqlite3.Connection: + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + return conn + + def corpus_arxiv_ids(self) -> set[str]: + """Every arXiv id already logged — the subtract set.""" + ids: set[str] = set() + with self._connect() as c: + for (url,) in c.execute( + "SELECT url FROM urls WHERE url LIKE '%arxiv.org%'" + ): + m = ARXIV_RE.search(url or "") + if m: + ids.add(m.group(1)) + return ids + + def select_seeds( + self, + topic: Optional[str] = None, + project: Optional[str] = None, + limit: int = 5, + ) -> list[Seed]: + """Strongest arXiv papers in the corpus, optionally topic/project-scoped.""" + q = """ + SELECT u.url AS url, u.relevance AS relevance, + COALESCE(s.topic, u.context, '') AS topic + FROM urls u + LEFT JOIN sessions s ON s.id = u.session_id + WHERE u.url LIKE '%arxiv.org%' + """ + params: list = [] + if topic: + q += " AND (s.topic LIKE ? OR u.context LIKE ? OR u.category LIKE ?)" + like = f"%{topic}%" + params += [like, like, like] + if project: + q += " AND s.project = ?" + params.append(project) + q += " ORDER BY u.relevance DESC, u.captured_at DESC LIMIT ?" + params.append(limit * 3) # over-pull, then dedupe by arxiv id + + seeds: list[Seed] = [] + seen: set[str] = set() + with self._connect() as c: + for row in c.execute(q, params): + m = ARXIV_RE.search(row["url"] or "") + if not m: + continue + aid = m.group(1) + if aid in seen: + continue + seen.add(aid) + seeds.append( + Seed( + arxiv_id=aid, + title=(row["topic"] or "").strip()[:70] or aid, + topic=(row["topic"] or "").strip(), + relevance=float(row["relevance"] or 0), + ) + ) + if len(seeds) >= limit: + break + return seeds + + # ---- the loop ------------------------------------------------------ + async def scout( + self, + topic: Optional[str] = None, + project: Optional[str] = None, + seeds: int = 5, + mode: str = "citers", + limit: int = 10, + per_seed: int = 20, + ) -> dict: + seed_list = self.select_seeds(topic=topic, project=project, limit=seeds) + if not seed_list: + return {"seeds": [], "frontier": [], "note": "no arXiv seeds matched"} + + known = self.corpus_arxiv_ids() + intent = topic or "frontier work extending these papers" + + # EXPAND — one related-papers call per seed, in parallel + tasks = [ + self.layer.related_papers( + f"arxiv:{s.arxiv_id}", intent, mode=mode, k=per_seed + ) + for s in seed_list + ] + results = await asyncio.gather(*tasks, return_exceptions=True) + + # SUBTRACT + RANK — aggregate candidates across seeds + frontier: dict[str, FrontierPaper] = {} + for seed, res in zip(seed_list, results): + if isinstance(res, Exception): + continue + for cand in res: + aid = self._cand_arxiv_id(cand) + if not aid or aid in known or aid == seed.arxiv_id: + continue # already logged, or the seed itself + try: + score = float(cand.get("score", 0.0)) + except (TypeError, ValueError): + score = 0.0 + structural = self._signal(cand, "structural") + fp = frontier.get(aid) + if fp is None: + fp = FrontierPaper( + arxiv_id=aid, + title=cand.get("title", "Untitled"), + abstract=cand.get("abstract", "")[:280], + ) + frontier[aid] = fp + fp.seed_hits += 1 + fp.from_seeds.append(seed.arxiv_id) + fp.best_score = max(fp.best_score, score) + fp.structural = max(fp.structural, structural) + + ranked = sorted( + frontier.values(), key=lambda p: p.frontier_score, reverse=True + )[:limit] + + return { + "topic": topic, + "mode": mode, + "seeds": [{"arxiv_id": s.arxiv_id, "title": s.title} for s in seed_list], + "corpus_size": len(known), + "candidates_found": len(frontier), + "frontier": [ + { + "arxiv_id": p.arxiv_id, + "title": p.title, + "url": p.url, + "seed_hits": p.seed_hits, + "score": round(p.best_score, 4), + "from_seeds": p.from_seeds, + "abstract": p.abstract, + } + for p in ranked + ], + } + + # ---- helpers ------------------------------------------------------- + @staticmethod + def _cand_arxiv_id(cand: dict) -> Optional[str]: + pid = cand.get("primaryId", "") or "" + if pid.startswith("arxiv:"): + return pid.split("arxiv:", 1)[1].split("v")[0] + m = ARXIV_RE.search(str(cand.get("ids", ""))) + return m.group(1) if m else None + + @staticmethod + def _signal(cand: dict, key: str) -> float: + raw = cand.get("signals") + if not raw: + return 0.0 + m = re.search(rf"'{key}':\s*([\d.]+)", str(raw)) + return float(m.group(1)) if m else 0.0 + + +def _print_report(out: dict) -> None: + if not out.get("frontier"): + print(f"No new frontier papers found. {out.get('note', '')}") + if out.get("seeds"): + print("Seeds used:", ", ".join(s["arxiv_id"] for s in out["seeds"])) + return + topic = out.get("topic") or "your corpus" + print(f"\n=== FRONTIER SCOUT — new in '{topic}' ({out['mode']} mode) ===") + print( + f"Seeded from {len(out['seeds'])} of your papers · " + f"{out['candidates_found']} new candidates vs {out['corpus_size']} " + f"already logged\n" + ) + for i, p in enumerate(out["frontier"], 1): + density = f"{p['seed_hits']} seeds" if p["seed_hits"] > 1 else "1 seed" + print(f"{i}. {p['title']}") + print(f" {p['url']} · {density} · score {p['score']}") + if p["abstract"]: + print(f" {p['abstract'][:140].strip()}...") + print() + + +def main() -> None: + ap = argparse.ArgumentParser(description="Frontier Scout for ResearchGravity") + ap.add_argument("--topic", help="scope seeds + intent to a topic") + ap.add_argument("--project", help="scope seeds to a lineage project") + ap.add_argument("--seeds", type=int, default=5, help="seed papers to expand") + ap.add_argument( + "--mode", + default="citers", + choices=["citers", "similar", "references"], + help="citers=frontier watch, similar=lateral, references=foundations", + ) + ap.add_argument("--limit", type=int, default=10, help="frontier papers to surface") + ap.add_argument("--json", action="store_true", help="machine-readable output") + args = ap.parse_args() + + scout = FrontierScout() + out = asyncio.run( + scout.scout( + topic=args.topic, + project=args.project, + seeds=args.seeds, + mode=args.mode, + limit=args.limit, + ) + ) + if args.json: + print(json.dumps(out, indent=2)) + else: + _print_report(out) + + +if __name__ == "__main__": + main() diff --git a/cpb/search_layer.py b/cpb/search_layer.py index c8d3246..3e43d14 100644 --- a/cpb/search_layer.py +++ b/cpb/search_layer.py @@ -542,6 +542,55 @@ async def read_paper_passages( print(f"Firecrawl read-paper error: {e}") return [] + async def related_papers( + self, + paper_id: str, + intent: str, + mode: str = "similar", + k: int = 20, + ) -> list[dict]: + """ + Expand from a seed paper to related papers (Firecrawl Research). + + modes: + - "similar": co-citation + bibliographic-coupling neighborhood + - "citers": papers that cite the seed (forward / frontier watch) + - "references": papers the seed cites (backward / foundations) + + Candidates are ranked against `intent`. Returns raw dicts with + paperId, primaryId, title, abstract, score, and structural/semantic + signals so callers can re-rank. Backs the Frontier Scout. + """ + if not HAS_AIOHTTP or not paper_id: + return [] + + from urllib.parse import quote + + url = ( + "https://api.firecrawl.dev/v2/search/research/papers/" + f"{quote(paper_id, safe=':')}/similar" + ) + params = {"intent": intent, "mode": mode, "k": max(1, k)} + headers = {} + if self.firecrawl_key: + headers["Authorization"] = f"Bearer {self.firecrawl_key}" + + try: + timeout = aiohttp.ClientTimeout(total=30) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get( + url, params=params, headers=headers + ) as resp: + if resp.status != 200: + return [] + data = await resp.json() + if not data.get("success"): + return [] + return data.get("results", [])[:k] + except Exception as e: + print(f"Firecrawl related-papers error: {e}") + return [] + @staticmethod def _extract_arxiv_id(primary_id: str, ids) -> Optional[str]: """Pull a bare arXiv id from primaryId or the ids blob.""" From 77978afdf3cfa11c3a0a90dabda3e07ae10db90e Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 11:35:58 -0400 Subject: [PATCH 10/11] =?UTF-8?q?feat(cpb):=20Corpus=20Grounding=20Audit?= =?UTF-8?q?=20=E2=80=94=20trust=20ledger=20over=20findings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds cpb/grounding_audit.py: for every finding carrying an inline arXiv citation, pull the cited paper's passages (Firecrawl read-paper) and record whether the paper resolves + contains text addressing the claim. Produces an append-only, resumable JSONL trust ledger — grounded vs unresolved (dangling citation) per finding. Scoped to the 154 inline-cited findings (not a 33k blast), runnable in slices, skips already-audited. CLI: python3 -m cpb.grounding_audit --limit N | --report Live: 17 findings audited, 100% grounded, ~1.8s/finding, resumable. --- .gitignore | 1 + cpb/grounding_audit.py | 204 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 cpb/grounding_audit.py diff --git a/.gitignore b/.gitignore index 95edfb4..942f738 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,4 @@ visual_assets/ # Local AI context — kept on disk, not published CLAUDE.md .claude/ +grounding_ledger.jsonl diff --git a/cpb/grounding_audit.py b/cpb/grounding_audit.py new file mode 100644 index 0000000..f1df9c2 --- /dev/null +++ b/cpb/grounding_audit.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +""" +Corpus Grounding Audit — a trust ledger over ResearchGravity's findings. + +For every finding that carries an inline arXiv citation, pull the cited +paper's actual passages (Firecrawl read-paper) and check whether the paper is +real and contains text addressing the finding's claim. The result is a +per-finding trust record — a quality/epistemics audit of the corpus that no +other research system has. + +Honest labeling (retrieval can confirm presence, not adjudicate truth): + - grounded : cited paper resolves AND returns passages addressing the claim + - unresolved : paper id not found, or no passages returned (citation is a + dangling reference — the thing most worth flagging) + +Design (per house rules): + - Append-only JSONL ledger; never overwritten. + - Resumable: skips findings already in the ledger, so it can run in slices. + - Scoped + rate-friendly: only inline-cited findings, with --limit. + +Usage: + python3 -m cpb.grounding_audit --limit 15 # audit a slice + python3 -m cpb.grounding_audit # audit all remaining + python3 -m cpb.grounding_audit --report # summarize the ledger +""" + +import argparse +import asyncio +import json +import os +import re +import sqlite3 +import time +from datetime import datetime, timezone +from pathlib import Path + +from .search_layer import get_search_layer + +DB_PATH = Path.home() / ".agent-core" / "storage" / "antigravity.db" +LEDGER_PATH = Path.home() / ".agent-core" / "storage" / "grounding_ledger.jsonl" +ARXIV_RE = re.compile(r"arxiv[:\s]*(\d{4}\.\d{4,5})", re.IGNORECASE) + + +def _now() -> str: + return datetime.now(timezone.utc).isoformat() + + +def audited_ids(ledger: Path = LEDGER_PATH) -> set[str]: + """Finding ids already in the ledger (for resumability).""" + seen: set[str] = set() + if not ledger.exists(): + return seen + with ledger.open() as f: + for line in f: + try: + seen.add(str(json.loads(line)["finding_id"])) + except (json.JSONDecodeError, KeyError): + continue + return seen + + +def inline_cited_findings(db: Path = DB_PATH) -> list[dict]: + """Findings whose own text names an arXiv paper.""" + conn = sqlite3.connect(str(db)) + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT id, content, project, session_id FROM findings " + "WHERE content LIKE '%arxiv%' OR content LIKE '%arXiv%'" + ).fetchall() + conn.close() + out = [] + for r in rows: + ids = list(dict.fromkeys(ARXIV_RE.findall(r["content"] or ""))) + if ids: + out.append( + { + "finding_id": str(r["id"]), + "content": r["content"], + "project": r["project"], + "arxiv_ids": ids, + } + ) + return out + + +class GroundingAudit: + def __init__(self, db: Path = DB_PATH, ledger: Path = LEDGER_PATH): + self.db = db + self.ledger = ledger + self.layer = get_search_layer() + + async def audit_finding(self, finding: dict) -> dict: + """Ground one finding against every paper it cites.""" + claim = self._claim(finding["content"]) + per_paper = [] + grounded_any = False + for aid in finding["arxiv_ids"][:3]: # cap papers per finding + passages = await self.layer.read_paper_passages( + f"arxiv:{aid}", claim, k=2 + ) + resolved = bool(passages) + grounded_any = grounded_any or resolved + per_paper.append( + { + "arxiv_id": aid, + "resolved": resolved, + "top_score": round(passages[0]["score"], 4) if passages else 0.0, + "top_passage": passages[0]["text"][:300] if passages else "", + } + ) + return { + "finding_id": finding["finding_id"], + "project": finding["project"], + "claim": claim[:200], + "verdict": "grounded" if grounded_any else "unresolved", + "papers": per_paper, + "audited_at": _now(), + } + + async def run(self, limit: int | None = None) -> dict: + done = audited_ids(self.ledger) + pending = [f for f in inline_cited_findings(self.db) if f["finding_id"] not in done] + if limit: + pending = pending[:limit] + + self.ledger.parent.mkdir(parents=True, exist_ok=True) + grounded = unresolved = 0 + t0 = time.time() + with self.ledger.open("a") as out: + for f in pending: + rec = await self.audit_finding(f) + out.write(json.dumps(rec) + "\n") + out.flush() + if rec["verdict"] == "grounded": + grounded += 1 + else: + unresolved += 1 + + return { + "audited_now": len(pending), + "grounded": grounded, + "unresolved": unresolved, + "already_in_ledger": len(done), + "elapsed_s": round(time.time() - t0, 1), + } + + @staticmethod + def _claim(content: str) -> str: + """Use the sentence nearest the citation as the claim to verify.""" + text = re.sub(r"\s+", " ", content or "").strip() + m = ARXIV_RE.search(text) + if not m: + return text[:200] + start = max(0, m.start() - 160) + return text[start : m.start() + 40].strip() + + +def report(ledger: Path = LEDGER_PATH) -> None: + if not ledger.exists(): + print("No ledger yet. Run: python3 -m cpb.grounding_audit --limit 15") + return + total = grounded = unresolved = 0 + dangling = [] + with ledger.open() as f: + for line in f: + try: + rec = json.loads(line) + except json.JSONDecodeError: + continue + total += 1 + if rec["verdict"] == "grounded": + grounded += 1 + else: + unresolved += 1 + dangling.append(rec) + print("\n=== CORPUS GROUNDING LEDGER ===") + print(f"findings audited : {total}") + print(f"grounded : {grounded} ({grounded / total * 100:.0f}%)" if total else "") + print(f"unresolved : {unresolved} (dangling citations — flag these)") + for rec in dangling[:8]: + ids = ", ".join(p["arxiv_id"] for p in rec["papers"]) + print(f" ! finding {rec['finding_id']} [{rec.get('project') or '-'}] cites {ids}") + print(f" claim: {rec['claim'][:90]}") + + +def main() -> None: + ap = argparse.ArgumentParser(description="Corpus Grounding Audit") + ap.add_argument("--limit", type=int, help="audit at most N pending findings") + ap.add_argument("--report", action="store_true", help="summarize the ledger") + args = ap.parse_args() + + if args.report: + report() + return + + audit = GroundingAudit() + summary = asyncio.run(audit.run(limit=args.limit)) + print(json.dumps(summary, indent=2)) + print(f"\nLedger: {LEDGER_PATH}") + print("Summarize with: python3 -m cpb.grounding_audit --report") + + +if __name__ == "__main__": + main() From f2d3909a1e16f1b6ded6a5ac5152acc4229e7e74 Mon Sep 17 00:00:00 2001 From: Dico Angelo Date: Wed, 1 Jul 2026 12:46:46 -0400 Subject: [PATCH 11/11] =?UTF-8?q?feat(cpb):=20Paper->Code=20Bridge=20?= =?UTF-8?q?=E2=80=94=20link=20papers=20to=20implementation=20prior-art?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds related_github() to the search layer (Firecrawl github-history, noise- filtered) and cpb/paper_code_bridge.py: for a method/paper title or a corpus arXiv id, surface the ranked engineering record — merged PRs, issues, discussions, READMEs — the 'does it actually work in code' signal that closes RG's research->implementation lineage loop. Ranks by page-type weight x fusion score (merged_pr > issue > readme). CLI: python3 -m cpb.paper_code_bridge 'method' | --arxiv Live: 'speculative decoding' -> real llama.cpp + HF transformers PRs. --- cpb/paper_code_bridge.py | 137 +++++++++++++++++++++++++++++++++++++++ cpb/search_layer.py | 46 +++++++++++++ 2 files changed, 183 insertions(+) create mode 100644 cpb/paper_code_bridge.py diff --git a/cpb/paper_code_bridge.py b/cpb/paper_code_bridge.py new file mode 100644 index 0000000..91ab81e --- /dev/null +++ b/cpb/paper_code_bridge.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Paper -> Code Bridge — close RG's research->implementation lineage loop. + +ResearchGravity already links research sessions to implementation projects. +The missing half was the outside world's engineering record: for any paper or +method, what actually got built, what broke, what was debated. Firecrawl's +GitHub-history search (issues / PRs / discussions / READMEs) supplies it. + +Given a query (method / paper title) or an arXiv id already in the corpus, +surface the ranked implementation prior-art: working repos, known bugs, design +discussions — the "does it actually work in code" signal that a paper alone +never carries. + +Usage: + python3 -m cpb.paper_code_bridge "flash attention implementation" + python3 -m cpb.paper_code_bridge --arxiv 1706.03762 # look up title, then search + python3 -m cpb.paper_code_bridge "speculative decoding" --limit 8 --json +""" + +import argparse +import asyncio +import json +import os +import re +import sqlite3 +from pathlib import Path +from typing import Optional + +from .search_layer import get_search_layer + +DB_PATH = Path.home() / ".agent-core" / "storage" / "antigravity.db" + +# Prefer the engineering record that actually carries design signal. +PAGETYPE_WEIGHT = { + "merged_pr": 1.0, + "pull_request": 0.9, + "issue": 0.8, + "discussion": 0.7, + "readme": 0.6, +} + + +def _fusion(cand: dict) -> float: + m = re.search(r"'fusion':\s*([\d.]+)", str(cand.get("scores", ""))) + return float(m.group(1)) if m else 0.0 + + +def _arxiv_title_from_corpus(arxiv_id: str, db: Path = DB_PATH) -> Optional[str]: + """Recover a search phrase for an arXiv id from the corpus (session topic).""" + conn = sqlite3.connect(str(db)) + try: + row = conn.execute( + "SELECT COALESCE(s.topic, u.context) FROM urls u " + "LEFT JOIN sessions s ON s.id = u.session_id " + "WHERE u.url LIKE ? ORDER BY u.relevance DESC LIMIT 1", + (f"%{arxiv_id}%",), + ).fetchone() + return (row[0].strip() if row and row[0] else None) + finally: + conn.close() + + +class PaperCodeBridge: + def __init__(self, db: Path = DB_PATH): + self.db = db + self.layer = get_search_layer() + + async def bridge( + self, query: str, limit: int = 8 + ) -> list[dict]: + raw = await self.layer.related_github(query, k=limit * 2) + scored = [] + for r in raw: + weight = PAGETYPE_WEIGHT.get(r.get("pageType", ""), 0.5) + rank = weight + _fusion(r) + scored.append( + { + "repo": r.get("repo", ""), + "url": r.get("url", ""), + "kind": r.get("pageType", ""), + "title": (r.get("title") or "").strip()[:90], + "snippet": re.sub(r"\s+", " ", r.get("snippet", "")).strip()[:200], + "rank": round(rank, 4), + } + ) + scored.sort(key=lambda x: x["rank"], reverse=True) + return scored[:limit] + + async def bridge_arxiv(self, arxiv_id: str, limit: int = 8) -> dict: + title = _arxiv_title_from_corpus(arxiv_id, self.db) + query = title or arxiv_id + results = await self.bridge(f"{query} implementation", limit=limit) + return {"arxiv_id": arxiv_id, "query": query, "results": results} + + +def _print(query: str, results: list[dict]) -> None: + if not results: + print(f"No implementation prior-art found for '{query}'.") + return + print(f"\n=== PAPER -> CODE — implementation record for '{query}' ===\n") + for i, r in enumerate(results, 1): + kind = r["kind"].replace("_", " ") + print(f"{i}. [{kind}] {r['repo']}") + print(f" {r['url']}") + if r["snippet"]: + print(f" {r['snippet']}") + print() + + +def main() -> None: + ap = argparse.ArgumentParser(description="Paper -> Code Bridge") + ap.add_argument("query", nargs="?", help="method / paper title to search") + ap.add_argument("--arxiv", help="arXiv id in the corpus; look up title then search") + ap.add_argument("--limit", type=int, default=8) + ap.add_argument("--json", action="store_true") + args = ap.parse_args() + + bridge = PaperCodeBridge() + if args.arxiv: + out = asyncio.run(bridge.bridge_arxiv(args.arxiv, limit=args.limit)) + if args.json: + print(json.dumps(out, indent=2)) + else: + _print(out["query"], out["results"]) + elif args.query: + results = asyncio.run(bridge.bridge(args.query, limit=args.limit)) + if args.json: + print(json.dumps(results, indent=2)) + else: + _print(args.query, results) + else: + ap.error("provide a query or --arxiv ") + + +if __name__ == "__main__": + main() diff --git a/cpb/search_layer.py b/cpb/search_layer.py index 3e43d14..05d5812 100644 --- a/cpb/search_layer.py +++ b/cpb/search_layer.py @@ -591,6 +591,52 @@ async def related_papers( print(f"Firecrawl related-papers error: {e}") return [] + async def related_github( + self, query: str, k: int = 10, drop_noise: bool = True + ) -> list[dict]: + """ + Search GitHub history (issues / PRs / discussions / READMEs) for + implementation prior-art via Firecrawl Research. + + Backs the Paper->Code Bridge: for a paper or method, find the real + engineering record — working repos, known bugs, design debates. + When `drop_noise`, results the index flags as noise/demoted are + filtered out. Returns raw dicts (repo, url, pageType, title, snippet, + scores, ...) best-first. + """ + if not HAS_AIOHTTP or not query: + return [] + + url = "https://api.firecrawl.dev/v2/search/research/github" + params = {"query": query, "k": max(1, k * 2 if drop_noise else k)} + headers = {} + if self.firecrawl_key: + headers["Authorization"] = f"Bearer {self.firecrawl_key}" + + try: + timeout = aiohttp.ClientTimeout(total=30) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get( + url, params=params, headers=headers + ) as resp: + if resp.status != 200: + return [] + data = await resp.json() + if not data.get("success"): + return [] + results = data.get("results", []) + if drop_noise: + results = [ + r + for r in results + if r.get("policyStatus") != "demote" + and r.get("noiseKind") != "noise" + ] + return results[:k] + except Exception as e: + print(f"Firecrawl github-history error: {e}") + return [] + @staticmethod def _extract_arxiv_id(primary_id: str, ids) -> Optional[str]: """Pull a bare arXiv id from primaryId or the ids blob."""