# === backend/schemas/__init__.py ===
# Backend schemas package

# === backend/schemas/code_collection_schema.py ===
"""
Milvus schema definition for the code_collection.

Stores chunked and embedded code from kubeflow/manifests repository.
Supports Python, Go, YAML, and Markdown file types.
Uses HNSW index with COSINE metric for fast ANN retrieval.

Dimension defaults to 384 (all-MiniLM-L6-v2). Override via EMBEDDING_MODEL env var.
"""

import logging
import os
import sys

from pymilvus import CollectionSchema, DataType, FieldSchema

# Allow imports from project root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pipelines.shared.embedding_utils import get_embedding_dimension

logger = logging.getLogger(__name__)

COLLECTION_NAME = "code_collection"


def get_code_fields(dim: "int | None" = None) -> "list[FieldSchema]":
    """Define the field schema for code_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        List of FieldSchema objects.
    """
    if dim is None:
        # Dimension depends on the configured embedding model (default 384).
        dim = get_embedding_dimension()

    fields = [
        FieldSchema(
            name="chunk_id",
            dtype=DataType.VARCHAR,
            max_length=128,
            is_primary=True,
            description="Unique chunk identifier (hash of file_path + symbol + index)",
        ),
        FieldSchema(
            name="file_path",
            dtype=DataType.VARCHAR,
            max_length=512,
            description="Relative file path within the repository",
        ),
        FieldSchema(
            name="extension",
            dtype=DataType.VARCHAR,
            max_length=16,
            description="File extension (e.g., .py, .go, .yaml)",
        ),
        FieldSchema(
            name="language",
            dtype=DataType.VARCHAR,
            max_length=32,
            description="Programming language (python, go, yaml, markdown)",
        ),
        FieldSchema(
            name="symbol_name",
            dtype=DataType.VARCHAR,
            max_length=256,
            description="Function/class/struct/resource name",
        ),
        FieldSchema(
            name="folder_context",
            dtype=DataType.VARCHAR,
            max_length=128,
            description="Top-level folder for domain context (e.g., apps, common)",
        ),
        FieldSchema(
            name="chunk_text",
            dtype=DataType.VARCHAR,
            max_length=8192,
            description="The actual code/content chunk text",
        ),
        FieldSchema(
            name="start_line",
            dtype=DataType.INT64,
            description="Starting line number in the source file",
        ),
        FieldSchema(
            name="end_line",
            dtype=DataType.INT64,
            description="Ending line number in the source file",
        ),
        FieldSchema(
            name="commit_sha",
            dtype=DataType.VARCHAR,
            max_length=64,
            description="Git commit SHA for provenance tracking",
        ),
        FieldSchema(
            name="chunk_index",
            dtype=DataType.INT64,
            description="Index of this chunk within the file (for compatibility)",
        ),
        FieldSchema(
            name="embedding",
            dtype=DataType.FLOAT_VECTOR,
            dim=dim,
            description=f"Dense embedding vector ({dim} dimensions)",
        ),
    ]
    return fields


def get_code_schema(dim: "int | None" = None) -> CollectionSchema:
    """Create the full CollectionSchema for code_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        CollectionSchema object.
    """
    fields = get_code_fields(dim)
    schema = CollectionSchema(
        fields=fields,
        description="Kubeflow manifests code chunks with embeddings for RAG retrieval",
    )
    return schema


def get_code_index_params() -> dict:
    """Get the HNSW index parameters for the embedding field.

    Returns:
        Dict of index parameters for Milvus.
    """
    return {
        "metric_type": "COSINE",
        "index_type": "HNSW",
        "params": {"M": 16, "efConstruction": 200},
    }


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    schema = get_code_schema()
    logger.info("code_collection schema:")
    for field in schema.fields:
        logger.info(
            " %s: %s (max_length=%s, dim=%s, primary=%s)",
            field.name,
            field.dtype.name,
            getattr(field, "max_length", "-"),
            getattr(field, "dim", "-"),
            field.is_primary,
        )
    logger.info("Index params: %s", get_code_index_params())


# === backend/schemas/docs_collection_schema.py ===
"""
Milvus schema definition for the docs_collection.

Stores chunked and embedded Kubeflow documentation from kubeflow.org.
Uses HNSW index with COSINE metric for fast ANN retrieval.

Dimension defaults to 384 (all-MiniLM-L6-v2). Override via EMBEDDING_MODEL env var.
"""

import logging
import os
import sys

from pymilvus import CollectionSchema, DataType, FieldSchema

# Allow imports from project root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pipelines.shared.embedding_utils import get_embedding_dimension

logger = logging.getLogger(__name__)

COLLECTION_NAME = "docs_collection"
def get_docs_fields(dim: "int | None" = None) -> "list[FieldSchema]":
    """Define the field schema for docs_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        List of FieldSchema objects.
    """
    if dim is None:
        # Dimension depends on the configured embedding model (default 384).
        dim = get_embedding_dimension()

    fields = [
        FieldSchema(
            name="chunk_id",
            dtype=DataType.VARCHAR,
            max_length=128,
            is_primary=True,
            description="Unique chunk identifier (hash of url + chunk_index)",
        ),
        FieldSchema(
            name="source_url",
            dtype=DataType.VARCHAR,
            max_length=512,
            description="Original page URL from kubeflow.org",
        ),
        FieldSchema(
            name="page_title",
            dtype=DataType.VARCHAR,
            max_length=256,
            description="Page title extracted from content",
        ),
        FieldSchema(
            name="heading",
            dtype=DataType.VARCHAR,
            max_length=256,
            description="H2/H3 heading this chunk belongs to",
        ),
        FieldSchema(
            name="section",
            dtype=DataType.VARCHAR,
            max_length=128,
            description="Top-level docs section (e.g., components, started)",
        ),
        FieldSchema(
            name="chunk_text",
            dtype=DataType.VARCHAR,
            max_length=16384,
            description="The actual chunk text content",
        ),
        FieldSchema(
            name="token_count",
            dtype=DataType.INT64,
            description="Number of tokens in this chunk",
        ),
        FieldSchema(
            name="chunk_index",
            dtype=DataType.INT64,
            description="Sequential index of this chunk within its page",
        ),
        FieldSchema(
            name="crawled_at",
            dtype=DataType.VARCHAR,
            max_length=64,
            description="ISO timestamp when the page was crawled",
        ),
        FieldSchema(
            name="embedding",
            dtype=DataType.FLOAT_VECTOR,
            dim=dim,
            description=f"Dense embedding vector ({dim} dimensions)",
        ),
    ]
    return fields


def get_docs_schema(dim: "int | None" = None) -> CollectionSchema:
    """Create the full CollectionSchema for docs_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        CollectionSchema object.
    """
    fields = get_docs_fields(dim)
    schema = CollectionSchema(
        fields=fields,
        description="Kubeflow documentation chunks with embeddings for RAG retrieval",
    )
    return schema


def get_docs_index_params() -> dict:
    """Get the HNSW index parameters for the embedding field.

    Returns:
        Dict of index parameters for Milvus.
    """
    return {
        "metric_type": "COSINE",
        "index_type": "HNSW",
        "params": {"M": 16, "efConstruction": 200},
    }


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    schema = get_docs_schema()
    logger.info("docs_collection schema:")
    for field in schema.fields:
        logger.info(
            " %s: %s (max_length=%s, dim=%s, primary=%s)",
            field.name,
            field.dtype.name,
            getattr(field, "max_length", "-"),
            getattr(field, "dim", "-"),
            field.is_primary,
        )
    logger.info("Index params: %s", get_docs_index_params())


# === pipelines/code_ingestion/__init__.py ===
# Code ingestion pipeline components

# === pipelines/code_ingestion/components/__init__.py ===
# Code ingestion components
# === pipelines/code_ingestion/components/ast_parser.py ===
"""
Code Ingestion — AST Parser Component

Multi-language parser that extracts logical code units:
  - Python: AST-based extraction of functions and classes with docstrings
  - Go: Regex-based splitting on func/struct boundaries
  - YAML/YML: Split by top-level Kubernetes resource kind
  - Markdown: Split by H2/H3 headings

Each extracted unit becomes a chunk with rich metadata for retrieval.
"""

import ast
import hashlib
import logging
import os
import re
from typing import Any, Dict, Iterable, List, Optional

import yaml

logger = logging.getLogger(__name__)

# Prefix -> semantic alias terms for well-known Kubeflow manifest areas.
PATH_ALIAS_HINTS = {
    "common/istio": [
        "istio",
        "service mesh",
        "gateway",
        "authorization policy",
        "peer authentication",
        "virtual service",
        "sidecar",
        "envoy",
        "mtls",
        "ingress",
    ],
    "common/knative": [
        "knative",
        "serving",
        "eventing",
        "serverless",
        "scale to zero",
        "activator",
        "revision",
        "service",
        "net istio",
        "webhook",
    ],
    "common/dex": [
        "dex",
        "oidc",
        "oauth2",
        "authentication",
        "identity provider",
        "connector",
        "login",
    ],
    "common/cert-manager": [
        "cert manager",
        "certificate",
        "issuer",
        "clusterissuer",
        "cainjector",
        "tls",
        "webhook",
    ],
    "applications/pipeline": [
        "kubeflow pipelines",
        "kfp",
        "pipeline api server",
        "deployment",
        "service",
        "configmap",
        "role",
        "rolebinding",
        "serviceaccount",
        "crd",
        "webhook",
        "scheduled workflow",
    ],
    "applications/profiles": [
        "profiles",
        "namespaces",
        "rbac",
        "rolebinding",
        "serviceaccount",
        "user profile",
    ],
    "tests": [
        "tests",
        "e2e",
        "integration",
        "validation",
        "presubmit",
    ],
}


def split_search_terms(value: str) -> List[str]:
    """Split identifiers and paths into normalized search terms.

    camelCase boundaries become spaces, every non-alphanumeric run becomes
    a separator, and the resulting tokens are lower-cased.
    """
    camel_spaced = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", value)
    alnum_only = re.sub(r"[^A-Za-z0-9]+", " ", camel_spaced)
    return [token.lower() for token in alnum_only.split() if token]


def unique_terms(values: Iterable[str], limit: int = 24) -> List[str]:
    """Return unique normalized search terms while preserving order.

    Stops as soon as *limit* distinct terms have been collected.
    """
    seen: set = set()
    ordered: List[str] = []
    for value in values:
        for term in split_search_terms(value):
            if term in seen:
                continue
            seen.add(term)
            ordered.append(term)
            if len(ordered) >= limit:
                return ordered
    return ordered
def get_path_aliases(file_path: str) -> List[str]:
    """Return path-aware semantic aliases for common Kubeflow manifest areas.

    Matches the normalized, lower-cased path against the PATH_ALIAS_HINTS
    prefix table and collects every hint list whose prefix applies.
    """
    normalized = file_path.replace("\\", "/").lower()
    aliases: List[str] = []
    for prefix, hints in PATH_ALIAS_HINTS.items():
        if normalized.startswith(prefix):
            aliases.extend(hints)
    return aliases


def summarize_list(values: Any, limit: int = 8) -> str:
    """Summarize a list-like value for retrieval context lines.

    Non-list inputs yield an empty string; falsy items are dropped.
    """
    if not isinstance(values, list):
        return ""
    flattened = [str(item) for item in values if item]
    return ", ".join(flattened[:limit])


def _containers_from_pod_spec(pod_spec: Any) -> Optional[List[str]]:
    """Extract container names from a pod-spec dict.

    Returns None when *pod_spec* (or its "containers" entry) does not have
    the expected shape so callers can fall through to other locations.
    """
    if not isinstance(pod_spec, dict):
        return None
    containers = pod_spec.get("containers", [])
    if not isinstance(containers, list):
        return None
    return [
        str(container.get("name"))
        for container in containers
        if isinstance(container, dict) and container.get("name")
    ]


def extract_container_names(parsed: Dict[str, Any]) -> List[str]:
    """Extract workload container names when present.

    Checks spec.template.spec.containers (Deployments, StatefulSets, ...)
    first; when spec has no "template" it falls through to the CronJob
    location spec.jobTemplate.spec.template.spec.containers.

    NOTE: previously a missing "template" defaulted to {} and returned []
    early, making the jobTemplate branch unreachable for CronJobs; using
    .get() without a default fixes that.
    """
    spec = parsed.get("spec")
    if not isinstance(spec, dict):
        return []

    template = spec.get("template")
    if isinstance(template, dict):
        names = _containers_from_pod_spec(template.get("spec", {}))
        if names is not None:
            return names

    job_template = spec.get("jobTemplate")
    if isinstance(job_template, dict):
        nested_spec = job_template.get("spec", {})
        if isinstance(nested_spec, dict):
            nested_template = nested_spec.get("template", {})
            if isinstance(nested_template, dict):
                names = _containers_from_pod_spec(nested_template.get("spec", {}))
                if names is not None:
                    return names
    return []


def build_manifest_context(
    parsed: Dict[str, Any],
    file_path: str,
    folder_context: str,
) -> str:
    """Build retrieval-oriented context text for a Kubernetes manifest.

    Produces one "# ..." comment line per fact so the context can be
    prepended to the raw YAML before embedding.

    Args:
        parsed: The YAML document parsed into a dict.
        file_path: Relative file path within the repository.
        folder_context: Top-level folder name.

    Returns:
        Newline-joined "#"-prefixed summary lines.
    """
    metadata = parsed.get("metadata", {})
    metadata = metadata if isinstance(metadata, dict) else {}

    kind = str(parsed.get("kind", "Unknown"))
    api_version = str(parsed.get("apiVersion", "unknown"))
    name = str(metadata.get("name", "unknown"))
    namespace = str(metadata.get("namespace", "cluster-scoped"))

    path_terms = unique_terms([file_path, os.path.basename(file_path), folder_context], limit=18)
    alias_terms = unique_terms(get_path_aliases(file_path), limit=18)
    label_keys = summarize_list(list((metadata.get("labels") or {}).keys()))
    annotation_keys = summarize_list(list((metadata.get("annotations") or {}).keys()))
    top_level_keys = summarize_list(list(parsed.keys()))

    summary_lines = [
        f"Manifest file path: {file_path}",
        f"Folder context: {folder_context}",
        f"Resource kind: {kind}",
        f"API version: {api_version}",
        f"Metadata name: {name}",
        f"Namespace: {namespace}",
    ]

    if path_terms:
        summary_lines.append(f"Path hints: {' '.join(path_terms)}")
    if alias_terms:
        summary_lines.append(f"Domain hints: {' '.join(alias_terms)}")
    if top_level_keys:
        summary_lines.append(f"Top-level keys: {top_level_keys}")
    if label_keys:
        summary_lines.append(f"Label keys: {label_keys}")
    if annotation_keys:
        summary_lines.append(f"Annotation keys: {annotation_keys}")

    spec = parsed.get("spec")
    spec = spec if isinstance(spec, dict) else {}

    # Kustomize overlays: surface the composition lists.
    if kind.lower() == "kustomization" or os.path.basename(file_path).lower() == "kustomization.yaml":
        resources = summarize_list(parsed.get("resources"))
        components = summarize_list(parsed.get("components"))
        bases = summarize_list(parsed.get("bases"))
        patches = summarize_list(parsed.get("patchesStrategicMerge"))
        if resources:
            summary_lines.append(f"Kustomize resources: {resources}")
        if components:
            summary_lines.append(f"Kustomize components: {components}")
        if bases:
            summary_lines.append(f"Kustomize bases: {bases}")
        if patches:
            summary_lines.append(f"Kustomize patches: {patches}")

    # Workloads: container names and the effective service account.
    if kind in {"Deployment", "StatefulSet", "DaemonSet", "Job", "CronJob"}:
        container_names = summarize_list(extract_container_names(parsed))
        service_account = spec.get("serviceAccountName") or (
            spec.get("template", {}).get("spec", {}).get("serviceAccountName")
            if isinstance(spec.get("template"), dict)
            else None
        )
        if container_names:
            summary_lines.append(f"Workload containers: {container_names}")
        if service_account:
            summary_lines.append(f"Service account: {service_account}")

    if kind == "Service":
        service_type = spec.get("type")
        ports = spec.get("ports")
        selector = spec.get("selector")
        if service_type:
            summary_lines.append(f"Service type: {service_type}")
        if isinstance(selector, dict) and selector:
            summary_lines.append(
                f"Service selector keys: {', '.join(list(selector.keys())[:8])}"
            )
        if isinstance(ports, list) and ports:
            port_values = [str(port.get("port")) for port in ports if isinstance(port, dict) and port.get("port")]
            if port_values:
                summary_lines.append(f"Service ports: {', '.join(port_values[:8])}")

    if kind == "CustomResourceDefinition":
        crd_spec = spec
        names = crd_spec.get("names", {}) if isinstance(crd_spec.get("names"), dict) else {}
        versions = crd_spec.get("versions", [])
        if crd_spec.get("group"):
            summary_lines.append(f"CRD group: {crd_spec.get('group')}")
        if names.get("kind"):
            summary_lines.append(f"CRD served kind: {names.get('kind')}")
        if isinstance(versions, list) and versions:
            version_names = [str(version.get("name")) for version in versions if isinstance(version, dict) and version.get("name")]
            if version_names:
                summary_lines.append(f"CRD versions: {', '.join(version_names[:8])}")

    if kind in {"Role", "ClusterRole"}:
        # Rules live at the top level for RBAC kinds; spec is the fallback.
        rules = spec.get("rules", parsed.get("rules"))
        if isinstance(rules, list) and rules:
            resource_names = []
            verbs = []
            for rule in rules[:4]:
                if isinstance(rule, dict):
                    resource_names.extend(str(item) for item in rule.get("resources", [])[:4])
                    verbs.extend(str(item) for item in rule.get("verbs", [])[:4])
            if resource_names:
                summary_lines.append(f"RBAC resources: {', '.join(resource_names[:10])}")
            if verbs:
                summary_lines.append(f"RBAC verbs: {', '.join(verbs[:10])}")

    if kind in {"RoleBinding", "ClusterRoleBinding"}:
        role_ref = parsed.get("roleRef", {})
        subjects = parsed.get("subjects", [])
        if isinstance(role_ref, dict) and role_ref.get("name"):
            summary_lines.append(f"Binding roleRef: {role_ref.get('name')}")
        if isinstance(subjects, list) and subjects:
            subject_names = [
                str(subject.get("name"))
                for subject in subjects
                if isinstance(subject, dict) and subject.get("name")
            ]
            if subject_names:
                summary_lines.append(f"Binding subjects: {', '.join(subject_names[:10])}")

    if kind in {"AuthorizationPolicy", "PeerAuthentication", "VirtualService", "Gateway", "DestinationRule"}:
        selector = spec.get("selector", {})
        if isinstance(selector, dict):
            match_labels = selector.get("matchLabels", {})
            if isinstance(match_labels, dict) and match_labels:
                summary_lines.append(
                    f"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}"
                )
        gateways = spec.get("gateways")
        hosts = spec.get("hosts")
        if isinstance(gateways, list) and gateways:
            summary_lines.append(f"Istio gateways: {', '.join(str(g) for g in gateways[:8])}")
        if isinstance(hosts, list) and hosts:
            summary_lines.append(f"Istio hosts: {', '.join(str(h) for h in hosts[:8])}")

    return "\n".join(f"# {line}" for line in summary_lines if line)


def generate_chunk_id(file_path: str, symbol_name: str, index: int) -> str:
    """Generate a deterministic chunk ID.

    Args:
        file_path: Relative file path.
        symbol_name: Function/class/resource name.
        index: Sequential index.

    Returns:
        SHA256 hash string (first 32 chars).
    """
    raw = f"{file_path}::{symbol_name}::{index}"
    return hashlib.sha256(raw.encode()).hexdigest()[:32]
+ """ + raw = f"{file_path}::{symbol_name}::{index}" + return hashlib.sha256(raw.encode()).hexdigest()[:32] + + +# ─── Python Parser ────────────────────────────────────────────────────────── + +def parse_python(content: str, file_path: str, commit_sha: str, + folder_context: str) -> List[Dict[str, Any]]: + """Parse Python source into function and class chunks via AST. + + Args: + content: Python source code. + file_path: Relative file path. + commit_sha: Git commit SHA. + folder_context: Top-level folder name. + + Returns: + List of chunk dicts. + """ + chunks = [] + lines = content.split("\n") + + try: + tree = ast.parse(content) + except SyntaxError as e: + logger.warning("Syntax error in %s: %s", file_path, e) + # Fall back to whole-file chunk + return [{ + "chunk_id": generate_chunk_id(file_path, "module", 0), + "file_path": file_path, + "extension": ".py", + "language": "python", + "symbol_name": os.path.basename(file_path), + "chunk_text": content, + "start_line": 1, + "end_line": len(lines), + "commit_sha": commit_sha, + "folder_context": folder_context, + }] + + idx = 0 + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + symbol_name = node.name + start_line = node.lineno + end_line = node.end_lineno or start_line + + # Extract the source lines + chunk_lines = lines[start_line - 1 : end_line] + chunk_text = "\n".join(chunk_lines) + + # Extract docstring if present + docstring = ast.get_docstring(node) or "" + symbol_type = "class" if isinstance(node, ast.ClassDef) else "function" + + chunks.append({ + "chunk_id": generate_chunk_id(file_path, symbol_name, idx), + "file_path": file_path, + "extension": ".py", + "language": "python", + "symbol_name": f"{symbol_type}:{symbol_name}", + "chunk_text": chunk_text, + "start_line": start_line, + "end_line": end_line, + "commit_sha": commit_sha, + "folder_context": folder_context, + }) + idx += 1 + + # If no functions/classes found, treat whole file as one 
chunk + if not chunks: + chunks.append({ + "chunk_id": generate_chunk_id(file_path, "module", 0), + "file_path": file_path, + "extension": ".py", + "language": "python", + "symbol_name": f"module:{os.path.basename(file_path)}", + "chunk_text": content, + "start_line": 1, + "end_line": len(lines), + "commit_sha": commit_sha, + "folder_context": folder_context, + }) + + return chunks + + +# ─── Go Parser ────────────────────────────────────────────────────────────── + +def parse_go(content: str, file_path: str, commit_sha: str, + folder_context: str) -> List[Dict[str, Any]]: + """Parse Go source by splitting on func and type struct boundaries. + + Args: + content: Go source code. + file_path: Relative file path. + commit_sha: Git commit SHA. + folder_context: Top-level folder name. + + Returns: + List of chunk dicts. + """ + chunks = [] + lines = content.split("\n") + + # Match func declarations and type struct declarations + pattern = re.compile( + r"^(?:func\s+(?:\([^)]+\)\s+)?(\w+)|type\s+(\w+)\s+struct)\b", + re.MULTILINE, + ) + + matches = list(pattern.finditer(content)) + + if not matches: + # Whole file as one chunk + return [{ + "chunk_id": generate_chunk_id(file_path, "file", 0), + "file_path": file_path, + "extension": ".go", + "language": "go", + "symbol_name": f"file:{os.path.basename(file_path)}", + "chunk_text": content, + "start_line": 1, + "end_line": len(lines), + "commit_sha": commit_sha, + "folder_context": folder_context, + }] + + for i, match in enumerate(matches): + symbol = match.group(1) or match.group(2) + start_pos = match.start() + end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content) + + chunk_text = content[start_pos:end_pos].rstrip() + start_line = content[:start_pos].count("\n") + 1 + end_line = start_line + chunk_text.count("\n") + + is_struct = match.group(2) is not None + symbol_type = "struct" if is_struct else "func" + + chunks.append({ + "chunk_id": generate_chunk_id(file_path, symbol, i), + "file_path": 
# ─── YAML Parser ────────────────────────────────────────────────────────────

def parse_yaml(content: str, file_path: str, commit_sha: str,
               folder_context: str) -> List[Dict[str, Any]]:
    """Parse YAML by splitting on Kubernetes resource kind boundaries.

    Args:
        content: YAML content (may contain multiple documents).
        file_path: Relative file path.
        commit_sha: Git commit SHA.
        folder_context: Top-level folder name.

    Returns:
        List of chunk dicts.
    """
    documents = content.split("\n---")
    ext = os.path.splitext(file_path)[1].lower()

    def _chunk(symbol: str, text: str, first: int, last: int, seq: int) -> Dict[str, Any]:
        return {
            "chunk_id": generate_chunk_id(file_path, symbol, seq),
            "file_path": file_path,
            "extension": ext,
            "language": "yaml",
            "symbol_name": symbol,
            "chunk_text": text,
            "start_line": first,
            "end_line": last,
            "commit_sha": commit_sha,
            "folder_context": folder_context,
        }

    results: List[Dict[str, Any]] = []
    for seq, raw_doc in enumerate(documents):
        body = raw_doc.strip()
        if not body:
            continue

        try:
            data = yaml.safe_load(body)
        except yaml.YAMLError:
            data = None

        if isinstance(data, dict):
            res_kind = data.get("kind", "Unknown")
            meta = data.get("metadata", {})
            res_name = meta.get("name", "unknown") if isinstance(meta, dict) else "unknown"
            symbol = f"{res_kind}:{res_name}"
            context = build_manifest_context(data, file_path, folder_context)
            text = f"{context}\n\n{body}" if context else body
        else:
            symbol = f"fragment:{seq}"
            text = body

        # Approximate line numbers from the raw text preceding this doc.
        prior = "\n---".join(documents[:seq])
        first = prior.count("\n") + 1 if prior else 1
        last = first + body.count("\n")
        results.append(_chunk(symbol, text, first, last, seq))

    if not results:
        results.append(_chunk(f"file:{os.path.basename(file_path)}",
                              content, 1, content.count("\n") + 1, 0))
        # Keep the original id scheme: the id symbol for the fallback is "file".
        results[-1]["chunk_id"] = generate_chunk_id(file_path, "file", 0)

    return results


# ─── Markdown Parser ────────────────────────────────────────────────────────

def parse_markdown(content: str, file_path: str, commit_sha: str,
                   folder_context: str) -> List[Dict[str, Any]]:
    """Parse Markdown by H2/H3 headings.

    Args:
        content: Markdown content.
        file_path: Relative file path.
        commit_sha: Git commit SHA.
        folder_context: Top-level folder name.

    Returns:
        List of chunk dicts.
    """
    heading_re = re.compile(r"^(#{2,3})\s+(.+)$", re.MULTILINE)
    hits = list(heading_re.finditer(content))

    def _chunk(id_symbol: str, display: str, text: str,
               first: int, last: int, seq: int) -> Dict[str, Any]:
        return {
            "chunk_id": generate_chunk_id(file_path, id_symbol, seq),
            "file_path": file_path,
            "extension": ".md",
            "language": "markdown",
            "symbol_name": display,
            "chunk_text": text,
            "start_line": first,
            "end_line": last,
            "commit_sha": commit_sha,
            "folder_context": folder_context,
        }

    if not hits:
        # No headings: whole document as one chunk.
        return [_chunk("doc", f"doc:{os.path.basename(file_path)}",
                       content, 1, content.count("\n") + 1, 0)]

    results: List[Dict[str, Any]] = []
    for seq, hit in enumerate(hits):
        heading = hit.group(2).strip()
        begin = hit.start()
        finish = hits[seq + 1].start() if seq + 1 < len(hits) else len(content)
        text = content[begin:finish].strip()
        first = content[:begin].count("\n") + 1
        last = first + text.count("\n")
        # Full heading feeds the id; the display name is capped at 100 chars.
        results.append(_chunk(heading, f"heading:{heading[:100]}",
                              text, first, last, seq))

    return results
"folder_context": folder_context, + }) + + return chunks + + +# ─── Main Dispatcher ──────────────────────────────────────────────────────── + +PARSERS = { + ".py": parse_python, + ".go": parse_go, + ".yaml": parse_yaml, + ".yml": parse_yaml, + ".md": parse_markdown, +} + + +def parse_file( + content: str, + file_path: str, + extension: str, + commit_sha: str, + folder_context: str, +) -> List[Dict[str, Any]]: + """Parse a file into chunks using the appropriate language parser. + + Args: + content: File content string. + file_path: Relative file path. + extension: File extension (e.g., '.py'). + commit_sha: Git commit SHA. + folder_context: Top-level folder name. + + Returns: + List of chunk dicts. + """ + parser = PARSERS.get(extension.lower()) + if parser is None: + logger.warning("No parser for extension: %s (%s)", extension, file_path) + return [] + + try: + return parser(content, file_path, commit_sha, folder_context) + except Exception as e: + logger.error("Parser error for %s: %s", file_path, e) + return [] + + +def parse_all_files( + repo_dir: str, + file_list: List[Dict[str, Any]], + commit_sha: str, +) -> List[Dict[str, Any]]: + """Parse all files in the file list. + + Args: + repo_dir: Repository root directory. + file_list: List of file info dicts from repo_cloner. + commit_sha: Git commit SHA. + + Returns: + List of all chunk dicts across all files. 
+ """ + all_chunks = [] + + for i, file_info in enumerate(file_list): + file_path = file_info["path"] + extension = file_info["extension"] + folder_context = file_info.get("folder_context", "root") + + full_path = os.path.join(repo_dir, file_path) + try: + with open(full_path, "r", encoding="utf-8", errors="replace") as f: + content = f.read() + except Exception as e: + logger.warning("Cannot read %s: %s", file_path, e) + continue + + chunks = parse_file(content, file_path, extension, commit_sha, folder_context) + all_chunks.extend(chunks) + + if (i + 1) % 50 == 0: + logger.info("Parsed %d/%d files (%d chunks so far)", i + 1, len(file_list), len(all_chunks)) + + logger.info("AST parsing complete: %d chunks from %d files.", len(all_chunks), len(file_list)) + return all_chunks + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + # Test Python parser + py_code = ''' +def hello_world(): + """Say hello.""" + print("Hello, World!") + +class MyClass: + """A test class.""" + def method(self): + pass +''' + chunks = parse_python(py_code, "test.py", "abc123", "tests") + logger.info("=== Python Parser Test ===") + for c in chunks: + logger.info(" %s [L%d-%d]", c["symbol_name"], c["start_line"], c["end_line"]) + + # Test YAML parser + yaml_content = '''apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app +spec: + replicas: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: my-service +''' + chunks = parse_yaml(yaml_content, "deploy.yaml", "abc123", "apps") + logger.info("=== YAML Parser Test ===") + for c in chunks: + logger.info(" %s [L%d-%d]", c["symbol_name"], c["start_line"], c["end_line"]) diff --git a/pipelines/code_ingestion/components/chunker.py b/pipelines/code_ingestion/components/chunker.py new file mode 100644 index 0000000..872a606 --- /dev/null +++ b/pipelines/code_ingestion/components/chunker.py @@ -0,0 +1,237 @@ +""" +Code Ingestion — Chunker Component + +Post-processes AST parser 
# === pipelines/code_ingestion/components/chunker.py ===
"""
Code Ingestion — Chunker Component

Post-processes AST parser output to enforce token limits and add
context headers to each chunk.

Features:
  - Enforces 50-512 token limits
  - Prepends context header: # File: ... | Symbol: ... | Lang: ...
  - Splits oversized chunks at logical boundaries (blank lines)
"""

import hashlib
import json
import logging
import os
from typing import Any, Dict, List

logger = logging.getLogger(__name__)

# Prefer tiktoken's cl100k_base tokenizer; fall back to a cheap
# word-count heuristic when tiktoken is not installed.
try:
    import tiktoken
    _ENCODER = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text: str) -> int:
        return len(_ENCODER.encode(text))
except ImportError:
    def count_tokens(text: str) -> int:
        return int(len(text.split()) * 1.3)

MIN_TOKENS = 50
MAX_TOKENS = 512


def build_path_hints(chunk: Dict[str, Any]) -> str:
    """Build a normalized path-hint string for retrieval context."""
    pieces = [
        str(chunk.get(key, ""))
        for key in ("file_path", "folder_context", "symbol_name")
    ]
    text = " ".join(pieces)
    # Path and identifier separators become spaces.
    for separator in ("/", "_", "-"):
        text = text.replace(separator, " ")
    # Break camelCase: insert a space before an uppercase letter that
    # directly follows a lowercase one (positions refer to `text`).
    spaced_chars: List[str] = []
    for position, char in enumerate(text):
        if position > 0 and char.isupper() and text[position - 1].islower():
            spaced_chars.append(" ")
        spaced_chars.append(char)
    return " ".join("".join(spaced_chars).split()).lower()


def make_context_header(chunk: Dict[str, Any]) -> str:
    """Create a context header string for a code chunk.

    This header is prepended to the chunk text before embedding to help
    the model understand the code's origin and purpose.

    Args:
        chunk: Chunk dict with file_path, symbol_name, language.

    Returns:
        Context header string.
    """
    header = (
        f"# File: {chunk.get('file_path', 'unknown')} "
        f"| Symbol: {chunk.get('symbol_name', 'unknown')} "
        f"| Lang: {chunk.get('language', 'unknown')} "
        f"| Folder: {chunk.get('folder_context', 'unknown')}"
    )
    hints = build_path_hints(chunk)
    return f"{header}\n# Path Hints: {hints}" if hints else header


def split_oversized_chunk(text: str, max_tokens: int) -> List[str]:
    """Split an oversized chunk at logical boundaries.

    Tries to split at blank lines first, then single newlines,
    then falls back to word splitting.

    Args:
        text: Text to split.
        max_tokens: Maximum tokens per sub-chunk.

    Returns:
        List of sub-chunk strings.
    """
    if count_tokens(text) <= max_tokens:
        return [text]

    # Greedy accumulation at blank-line, then single-line boundaries.
    for separator in ("\n\n", "\n"):
        parts = text.split(separator)
        if len(parts) <= 1:
            continue

        pieces: List[str] = []
        buffer = ""
        for part in parts:
            tentative = buffer + separator + part if buffer else part
            if count_tokens(tentative) > max_tokens:
                if buffer.strip():
                    pieces.append(buffer.strip())
                buffer = part
            else:
                buffer = tentative
        if buffer.strip():
            pieces.append(buffer.strip())

        if len(pieces) > 1:
            return pieces

    # Last resort: split on words.
    pieces = []
    held: List[str] = []
    for word in text.split():
        held.append(word)
        if count_tokens(" ".join(held)) > max_tokens:
            if len(held) > 1:
                pieces.append(" ".join(held[:-1]))
                held = [word]
    if held:
        pieces.append(" ".join(held))

    return pieces
+ """ + processed = [] + skipped_short = 0 + split_count = 0 + # Track index per file_path + file_indices = {} + + for chunk in raw_chunks: + # Add context header + header = make_context_header(chunk) + full_text = f"{header}\n\n{chunk['chunk_text']}" + token_count = count_tokens(full_text) + + if token_count < MIN_TOKENS: + skipped_short += 1 + continue + + if token_count <= MAX_TOKENS: + processed_chunk = chunk.copy() + fp = chunk.get("file_path", "unknown") + ci = file_indices.get(fp, 0) + processed_chunk["chunk_text"] = full_text[:8192] + processed_chunk["token_count"] = token_count + processed_chunk["chunk_index"] = ci + processed.append(processed_chunk) + file_indices[fp] = ci + 1 + else: + # Split oversized chunk + sub_chunks = split_oversized_chunk(full_text, MAX_TOKENS) + split_count += 1 + + for idx, sub_text in enumerate(sub_chunks): + sub_tokens = count_tokens(sub_text) + if sub_tokens < MIN_TOKENS: + continue + + sub_chunk = chunk.copy() + fp = chunk.get("file_path", "unknown") + ci = file_indices.get(fp, 0) + sub_chunk["chunk_id"] = hashlib.sha256( + f"{chunk['chunk_id']}::{idx}".encode() + ).hexdigest()[:32] + sub_chunk["chunk_text"] = sub_text[:8192] + sub_chunk["token_count"] = sub_tokens + sub_chunk["chunk_index"] = ci + processed.append(sub_chunk) + file_indices[fp] = ci + 1 + + logger.info( + "Chunker: %d input -> %d output (%d short skipped, %d split)", + len(raw_chunks), len(processed), skipped_short, split_count, + ) + return processed + + +def save_chunks(chunks: List[Dict[str, Any]], output_path: str) -> None: + """Save chunks to a JSONL file. + + Args: + chunks: List of chunk dicts. + output_path: Path to write the file. 
+ """ + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + for chunk in chunks: + f.write(json.dumps(chunk, ensure_ascii=False) + "\n") + logger.info("Saved %d chunks to %s", len(chunks), output_path) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + test_chunks = [ + { + "chunk_id": "abc123", + "file_path": "apps/pipeline/upstream/kfp/v2/compiler.py", + "extension": ".py", + "language": "python", + "symbol_name": "function:compile_pipeline", + "chunk_text": "def compile_pipeline(pipeline_func):\n \"\"\"Compile a pipeline function.\"\"\"\n return compiled", + "start_line": 10, + "end_line": 12, + "commit_sha": "deadbeef", + "folder_context": "apps", + }, + ] + + result = process_chunks(test_chunks) + logger.info("=== Code Chunker Test ===") + for c in result: + logger.info(" %s (tokens=%d)", c["symbol_name"], c.get("token_count", 0)) + logger.info(" Text preview: %s...", c["chunk_text"][:100]) diff --git a/pipelines/code_ingestion/components/embedder.py b/pipelines/code_ingestion/components/embedder.py new file mode 100644 index 0000000..de49f62 --- /dev/null +++ b/pipelines/code_ingestion/components/embedder.py @@ -0,0 +1,85 @@ +""" +Code Ingestion — Embedder Component + +Embeds code chunks using configurable embedding model. +Identical to docs embedder but imports from shared utilities. + +The context header prepended by the chunker is included in the +embedding input so vectors capture both code semantics and file location. 
"""
Code Ingestion — Embedder Component

Embeds code chunks using configurable embedding model.
Identical to docs embedder but imports from shared utilities.

The context header prepended by the chunker is included in the
embedding input so vectors capture both code semantics and file location.
"""

import json
import logging
import os
import sys
from typing import Any, Dict, List

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from pipelines.shared.embedding_utils import EmbeddingClient

logger = logging.getLogger(__name__)


def embed_code_chunks(
    chunks: List[Dict[str, Any]],
    batch_size: int = 32,
) -> List[Dict[str, Any]]:
    """Embed all code chunks and add embeddings to each chunk dict.

    Args:
        chunks: List of chunk dicts (must have 'chunk_text' key).
        batch_size: Batch size for embedding.

    Returns:
        Same chunk dicts with added 'embedding' key.
    """
    if not chunks:
        logger.warning("No chunks to embed.")
        return []

    client = EmbeddingClient(batch_size=batch_size)
    payload = [item["chunk_text"] for item in chunks]

    logger.info("Embedding %d code chunks with model: %s", len(payload), client.model_name)
    vectors = client.embed_texts(payload)

    # Attach each vector to its originating chunk, in order.
    for item, vector in zip(chunks, vectors):
        item["embedding"] = vector

    logger.info("Embedding complete. %d code chunks embedded.", len(chunks))
    return chunks


def load_chunks(input_path: str) -> List[Dict[str, Any]]:
    """Load chunks from a JSONL file."""
    with open(input_path, "r", encoding="utf-8") as handle:
        return [json.loads(row) for row in handle if row.strip()]


def save_embedded_chunks(chunks: List[Dict[str, Any]], output_path: str) -> None:
    """Save embedded chunks to a JSONL file."""
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in chunks
        )
    logger.info("Saved %d embedded chunks to %s", len(chunks), output_path)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    test_chunks = [
        {
            "chunk_id": "test001",
            "chunk_text": "# File: apps/kfp/compiler.py | Symbol: func:compile | Lang: python\n\ndef compile(): pass",
            "file_path": "apps/kfp/compiler.py",
            "language": "python",
        },
    ]
    result = embed_code_chunks(test_chunks)
    for c in result:
        logger.info("  chunk_id=%s dim=%d", c["chunk_id"], len(c.get("embedding", [])))
"""
Code Ingestion — Loader Component

Loads embedded code chunks into the Milvus code_collection.
Uses upsert pattern with chunk_id as primary key.
"""

import json
import logging
import os
import sys
from typing import Any, Dict, List

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                      connections, utility)

from pipelines.shared.milvus_utils import connect, create_collection_if_not_exists, upsert_batch
from backend.schemas.code_collection_schema import (
    COLLECTION_NAME,
    get_code_fields,
    get_code_index_params,
)

logger = logging.getLogger(__name__)


def should_recreate_collection() -> bool:
    """Return whether the loader should drop and recreate the collection.

    This is disabled by default so local re-runs preserve previously indexed
    data and rely on primary-key upserts instead of destructive reloads.
    """
    flag = os.environ.get("MILVUS_DROP_EXISTING", "false")
    return flag.lower() == "true"


def load_to_milvus(
    chunks: List[Dict[str, Any]],
    collection_name: str = None,
) -> Dict[str, int]:
    """Load embedded code chunks into Milvus code_collection.

    Args:
        chunks: List of chunk dicts with embeddings.
        collection_name: Override collection name.

    Returns:
        Ingestion summary with inserted, failed, total counts.
    """
    target = collection_name or COLLECTION_NAME

    connect()

    # Drop-and-recreate only when explicitly requested via env flag.
    if should_recreate_collection() and utility.has_collection(target):
        utility.drop_collection(target)
        logger.info("Dropped existing collection %s for schema refresh", target)

    fields = get_code_fields()
    index_params = get_code_index_params()
    collection = create_collection_if_not_exists(
        collection_name=target,
        fields=fields,
        description="Kubeflow manifests code chunks for RAG retrieval",
        index_field="embedding",
        index_params=index_params,
    )

    def as_row(item: Dict[str, Any]) -> Dict[str, Any]:
        # Clamp every string field to its schema VARCHAR limit and coerce
        # line numbers / indices to int so Milvus accepts the row.
        return {
            "chunk_id": str(item["chunk_id"])[:128],
            "file_path": str(item.get("file_path", ""))[:512],
            "extension": str(item.get("extension", ""))[:16],
            "language": str(item.get("language", ""))[:32],
            "symbol_name": str(item.get("symbol_name", ""))[:256],
            "folder_context": str(item.get("folder_context", ""))[:128],
            "chunk_text": str(item.get("chunk_text", ""))[:8192],
            "start_line": int(item.get("start_line", 0)),
            "end_line": int(item.get("end_line", 0)),
            "commit_sha": str(item.get("commit_sha", ""))[:64],
            "chunk_index": int(item.get("chunk_index", 0)),
            "embedding": item["embedding"],
        }

    # Chunks without an embedding are counted as skipped, not inserted.
    rows = [as_row(item) for item in chunks if "embedding" in item]

    if not rows:
        return {"inserted": 0, "failed": 0, "total": 0, "skipped": len(chunks)}

    summary = upsert_batch(collection, rows, batch_size=100)
    summary["skipped"] = len(chunks) - len(rows)

    logger.info(
        "Code ingestion: %d inserted, %d failed, %d skipped",
        summary["inserted"], summary["failed"], summary["skipped"],
    )
    return summary


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logger.info("=== Code Loader Smoke Test ===")
    logger.info("Requires Milvus at localhost:19530")
"""
Code Ingestion — Repo Cloner Component

Clones the kubeflow/manifests repository and collects file metadata.
Records commit SHA for provenance tracking.

Features:
  - Clones via subprocess (git) or GitPython
  - Skips hidden dirs, __pycache__, node_modules
  - Size filter: skip files < 200 bytes or > 100KB
  - Groups files by extension

Environment variables:
    KUBEFLOW_MANIFESTS_REPO: Repo URL (default: https://github.com/kubeflow/manifests)
"""

import json
import logging
import os
import shutil
import subprocess
import tempfile
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Directory basenames that never contain indexable source.
# NOTE: "*.egg-info" is kept for backward compatibility, but globs are not
# expanded by the membership test; egg-info directories are matched by
# suffix in should_skip_dir() instead.
SKIP_DIRS = {
    ".git", "__pycache__", "node_modules", ".tox", ".mypy_cache",
    ".pytest_cache", ".venv", "venv", ".eggs", "*.egg-info",
}

SUPPORTED_EXTENSIONS = {".py", ".go", ".yaml", ".yml", ".md"}

MIN_FILE_SIZE = 200       # bytes; smaller files carry too little signal
MAX_FILE_SIZE = 100_000   # 100KB; larger files are usually generated


def get_repo_url() -> str:
    """Get the repository URL from environment.

    Returns:
        Repository URL string.
    """
    return os.environ.get(
        "KUBEFLOW_MANIFESTS_REPO",
        "https://github.com/kubeflow/manifests",
    )


def clone_repo(
    repo_url: Optional[str] = None,
    target_dir: Optional[str] = None,
    branch: str = "master",
) -> Dict[str, Any]:
    """Clone a git repository and collect file metadata.

    Args:
        repo_url: Repository URL to clone.
        target_dir: Directory to clone into (temp dir if None).
        branch: Git branch to clone.

    Returns:
        Dict with commit_sha, repo_dir, and file_list.

    Raises:
        RuntimeError: If the git clone fails.
    """
    url = repo_url or get_repo_url()
    clone_dir = target_dir or tempfile.mkdtemp(prefix="docs-agent-code-")

    logger.info("Cloning %s (branch: %s) to %s", url, branch, clone_dir)

    try:
        # Shallow single-branch clone: we only need the tip for indexing.
        subprocess.run(
            ["git", "clone", "--depth", "1", "--branch", branch, url, clone_dir],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        logger.error("Git clone failed: %s", e.stderr)
        if target_dir is None:
            # We created this temp dir ourselves; don't leak it on failure.
            shutil.rmtree(clone_dir, ignore_errors=True)
        raise RuntimeError(f"Failed to clone {url}: {e.stderr}") from e

    # Record the commit SHA for provenance; "unknown" on failure rather
    # than aborting the whole ingestion.
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True, text=True, cwd=clone_dir, check=True,
        )
        commit_sha = result.stdout.strip()
    except subprocess.CalledProcessError:
        commit_sha = "unknown"

    logger.info("Cloned at commit: %s", commit_sha[:12])

    # Walk and collect files
    file_list = collect_files(clone_dir)

    return {
        "commit_sha": commit_sha,
        "repo_dir": clone_dir,
        "file_list": file_list,
    }


def should_skip_dir(dir_name: str) -> bool:
    """Check if a directory should be skipped.

    Args:
        dir_name: Directory basename.

    Returns:
        True if the directory should be skipped.
    """
    if dir_name.startswith("."):
        return True
    # "*.egg-info" in SKIP_DIRS is a glob that a plain membership test can
    # never match; check the suffix explicitly so setuptools metadata
    # directories (e.g. "foo.egg-info") are skipped too.
    if dir_name.endswith(".egg-info"):
        return True
    return dir_name in SKIP_DIRS


def collect_files(repo_dir: str) -> List[Dict[str, Any]]:
    """Walk a directory and collect file metadata.

    Filters by extension, size, and skips hidden/utility directories.

    Args:
        repo_dir: Root directory to walk.

    Returns:
        List of file info dicts: {path, extension, size_bytes, folder_context}.
    """
    files = []

    for root, dirs, filenames in os.walk(repo_dir):
        # Filter out directories to skip (modifies in-place so os.walk
        # never descends into them)
        dirs[:] = [d for d in dirs if not should_skip_dir(d)]

        for filename in filenames:
            filepath = os.path.join(root, filename)
            rel_path = os.path.relpath(filepath, repo_dir)

            # Check extension
            _, ext = os.path.splitext(filename)
            if ext.lower() not in SUPPORTED_EXTENSIONS:
                continue

            # Check size; unreadable entries (broken symlinks etc.) are skipped
            try:
                size = os.path.getsize(filepath)
            except OSError:
                continue

            if size < MIN_FILE_SIZE or size > MAX_FILE_SIZE:
                continue

            # Determine folder context (top-level directory)
            parts = rel_path.split(os.sep)
            folder_context = parts[0] if len(parts) > 1 else "root"

            files.append({
                "path": rel_path,
                "extension": ext.lower(),
                "size_bytes": size,
                "folder_context": folder_context,
            })

    # Log summary by extension
    ext_counts: Dict[str, int] = {}
    for f in files:
        ext_counts[f["extension"]] = ext_counts.get(f["extension"], 0) + 1

    logger.info("Collected %d files: %s", len(files), ext_counts)
    return files


def read_file_content(repo_dir: str, file_path: str) -> Optional[str]:
    """Read file content safely.

    Args:
        repo_dir: Repository root directory.
        file_path: Relative file path.

    Returns:
        File content string, or None if unreadable.
    """
    full_path = os.path.join(repo_dir, file_path)
    try:
        with open(full_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        logger.warning("Cannot read %s: %s", file_path, e)
        return None


def save_clone_results(
    result: Dict[str, Any], output_path: str
) -> None:
    """Save clone results to a JSON file.

    Args:
        result: Clone result dict.
        output_path: Path to write the file.
    """
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # Don't include repo_dir in the saved output (it's a temp path)
    save_data = {
        "commit_sha": result["commit_sha"],
        "file_count": len(result["file_list"]),
        "file_list": result["file_list"],
    }
    with open(output_path, "w") as f:
        json.dump(save_data, f, indent=2)
    logger.info("Saved clone results to %s", output_path)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logger.info("=== Repo Cloner Smoke Test ===")
    result = clone_repo()
    logger.info("Commit: %s", result["commit_sha"][:12])
    logger.info("Files: %d", len(result["file_list"]))
    for f in result["file_list"][:10]:
        logger.info("  %s (%s, %d bytes)", f["path"], f["extension"], f["size_bytes"])
    # Cleanup
    shutil.rmtree(result["repo_dir"], ignore_errors=True)
+ outputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-chunk-docs: + executorLabel: exec-chunk-docs + inputDefinitions: + artifacts: + crawled_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of crawled pages. + parameters: + chunk_overlap: + description: Token overlap between chunks. + parameterType: NUMBER_INTEGER + chunk_size: + description: Maximum tokens per chunk. + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-clone-repo: + executorLabel: exec-clone-repo + inputDefinitions: + parameters: + branch: + description: Branch name to clone. + parameterType: STRING + repo_url: + description: Repository URL to clone. + parameterType: STRING + outputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-code-ingestion-pipeline: + dag: + tasks: + chunk-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-code + dependentTasks: + - parse-code + inputs: + artifacts: + parsed_data: + taskOutputArtifact: + outputArtifactKey: parsed_data + producerTask: parse-code + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: chunk-code + clone-repo: + cachingOptions: + enableCache: true + componentRef: + name: comp-clone-repo + inputs: + parameters: + branch: + componentInputParameter: branch + repo_url: + componentInputParameter: repo_url + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: clone-repo + embed-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-embed-code + dependentTasks: + - chunk-code + inputs: + artifacts: + chunked_data: + taskOutputArtifact: + outputArtifactKey: chunked_data + 
producerTask: chunk-code + parameters: + embedding_model: + componentInputParameter: embedding_model + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: embed-code + load-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-load-code + dependentTasks: + - embed-code + inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: embed-code + parameters: + collection_name: + componentInputParameter: collection_name + embedding_dim: + componentInputParameter: embedding_dim + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: load-code + parse-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-parse-code + dependentTasks: + - clone-repo + inputs: + artifacts: + clone_data: + taskOutputArtifact: + outputArtifactKey: clone_data + producerTask: clone-repo + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: parse-code + inputDefinitions: + parameters: + branch: + defaultValue: master + isOptional: true + parameterType: STRING + collection_name: + defaultValue: code_collection + isOptional: true + parameterType: STRING + embedding_dim: + defaultValue: 384.0 + isOptional: true + parameterType: NUMBER_INTEGER + embedding_model: + defaultValue: sentence-transformers/all-MiniLM-L6-v2 + isOptional: true + parameterType: STRING + milvus_host: + defaultValue: localhost + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + repo_url: + defaultValue: https://github.com/kubeflow/manifests + isOptional: true + parameterType: STRING + comp-crawl-docs: + executorLabel: exec-crawl-docs + 
inputDefinitions: + parameters: + base_url: + description: Base URL for kubeflow docs (e.g., https://www.kubeflow.org). + parameterType: STRING + crawl_delay: + description: Delay in seconds between requests. + parameterType: NUMBER_DOUBLE + max_pages: + description: Max pages to crawl (0 = unlimited). + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + crawled_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-docs-ingestion-pipeline: + dag: + tasks: + chunk-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-docs + dependentTasks: + - crawl-docs + inputs: + artifacts: + crawled_data: + taskOutputArtifact: + outputArtifactKey: crawled_data + producerTask: crawl-docs + parameters: + chunk_overlap: + componentInputParameter: chunk_overlap + chunk_size: + componentInputParameter: chunk_size + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: chunk-docs + crawl-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-crawl-docs + inputs: + parameters: + base_url: + componentInputParameter: base_url + crawl_delay: + componentInputParameter: crawl_delay + max_pages: + componentInputParameter: max_pages + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: crawl-docs + embed-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-embed-docs + dependentTasks: + - chunk-docs + inputs: + artifacts: + chunked_data: + taskOutputArtifact: + outputArtifactKey: chunked_data + producerTask: chunk-docs + parameters: + embedding_model: + componentInputParameter: embedding_model + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: embed-docs + load-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-load-docs + dependentTasks: + - embed-docs 
+ inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: embed-docs + parameters: + collection_name: + componentInputParameter: collection_name + embedding_dim: + componentInputParameter: embedding_dim + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: load-docs + inputDefinitions: + parameters: + base_url: + defaultValue: https://www.kubeflow.org + isOptional: true + parameterType: STRING + chunk_overlap: + defaultValue: 50.0 + isOptional: true + parameterType: NUMBER_INTEGER + chunk_size: + defaultValue: 500.0 + isOptional: true + parameterType: NUMBER_INTEGER + collection_name: + defaultValue: docs_collection + isOptional: true + parameterType: STRING + crawl_delay: + defaultValue: 1.0 + isOptional: true + parameterType: NUMBER_DOUBLE + embedding_dim: + defaultValue: 384.0 + isOptional: true + parameterType: NUMBER_INTEGER + embedding_model: + defaultValue: sentence-transformers/all-MiniLM-L6-v2 + isOptional: true + parameterType: STRING + max_pages: + defaultValue: 0.0 + isOptional: true + parameterType: NUMBER_INTEGER + milvus_host: + defaultValue: localhost + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + comp-embed-code: + executorLabel: exec-embed-code + inputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of chunked code. + parameters: + embedding_model: + description: Model name for embeddings. 
+ parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-embed-docs: + executorLabel: exec-embed-docs + inputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of chunks. + parameters: + embedding_model: + description: Model name for embeddings. + parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-load-code: + executorLabel: exec-load-code + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset with embedded chunks. + parameters: + collection_name: + description: Target collection name. + parameterType: STRING + embedding_dim: + description: Vector dimension. + parameterType: NUMBER_INTEGER + milvus_host: + description: Milvus server host. + parameterType: STRING + milvus_port: + description: Milvus server port. + parameterType: STRING + comp-load-docs: + executorLabel: exec-load-docs + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset with embedded chunks. + parameters: + collection_name: + description: Target collection name. + parameterType: STRING + embedding_dim: + description: Vector dimension. + parameterType: NUMBER_INTEGER + milvus_host: + description: Milvus server host. + parameterType: STRING + milvus_port: + description: Milvus server port. + parameterType: STRING + comp-parse-code: + executorLabel: exec-parse-code + inputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset from repo cloner. 
+ outputDefinitions: + artifacts: + parsed_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-chunk-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'tiktoken==0.7.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_code(\n parsed_data: Input[Dataset],\n chunked_data:\ + \ Output[Dataset],\n):\n \"\"\"Post-process parsed chunks with token\ + \ limits and context headers.\n\n Args:\n parsed_data: Input dataset\ + \ of parsed chunks.\n chunked_data: Output dataset of token-bounded\ + \ chunks.\n \"\"\"\n import hashlib\n import json\n import logging\n\ + \n import tiktoken\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"chunker\")\n\n enc = tiktoken.get_encoding(\"\ + cl100k_base\")\n count = lambda t: len(enc.encode(t))\n\n MIN_T, MAX_T\ + \ = 50, 512\n\n def build_path_hints(chunk):\n raw = \" \".join(str(chunk.get(key,\ + \ \"\")) for key in (\"file_path\", \"folder_context\", \"symbol_name\"\ + ))\n expanded = raw.replace(\"/\", \" \").replace(\"_\", \" \").replace(\"\ + -\", \" \")\n expanded = \"\".join(\n (\n \ + \ f\" {char}\" if index > 0 and char.isupper() and expanded[index - 1].islower()\ + \ else 
char\n )\n for index, char in enumerate(expanded)\n\ + \ )\n return \" \".join(expanded.split()).lower()\n\n raw\ + \ = []\n with open(parsed_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n raw.append(json.loads(line))\n\ + \n processed = []\n for chunk in raw:\n header = (\n \ + \ f\"# File: {chunk.get('file_path', '?')} | Symbol: {chunk.get('symbol_name',\ + \ '?')} \"\n f\"| Lang: {chunk.get('language', '?')} | Folder:\ + \ {chunk.get('folder_context', '?')}\"\n )\n path_hints =\ + \ build_path_hints(chunk)\n if path_hints:\n header =\ + \ f\"{header}\\n# Path Hints: {path_hints}\"\n full = f\"{header}\\\ + n\\n{chunk['chunk_text']}\"\n tc = count(full)\n\n if tc <\ + \ MIN_T:\n continue\n\n if tc <= MAX_T:\n chunk[\"\ + chunk_text\"] = full[:8192]\n chunk[\"token_count\"] = tc\n \ + \ processed.append(chunk)\n else:\n # Split\ + \ oversized\n parts = full.split(\"\\n\\n\")\n cur,\ + \ subs = \"\", []\n for p in parts:\n cand = cur\ + \ + \"\\n\\n\" + p if cur else p\n if count(cand) > MAX_T:\n\ + \ if cur.strip():\n subs.append(cur.strip())\n\ + \ cur = p\n else:\n \ + \ cur = cand\n if cur.strip():\n subs.append(cur.strip())\n\ + \n for si, sub in enumerate(subs):\n st = count(sub)\n\ + \ if st < MIN_T:\n continue\n \ + \ sc = chunk.copy()\n sc[\"chunk_id\"] = hashlib.sha256(f\"\ + {chunk['chunk_id']}::{si}\".encode()).hexdigest()[:32]\n \ + \ sc[\"chunk_text\"] = sub[:8192]\n sc[\"token_count\"] =\ + \ st\n processed.append(sc)\n\n logger.info(\"Chunked\ + \ %d -> %d chunks\", len(raw), len(processed))\n\n with open(chunked_data.path,\ + \ \"w\") as f:\n for c in processed:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim + exec-chunk-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_docs + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'tiktoken==0.7.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_docs(\n crawled_data: Input[Dataset],\n chunk_size:\ + \ int,\n chunk_overlap: int,\n chunked_data: Output[Dataset],\n):\n\ + \ \"\"\"Chunk crawled documentation by headings with token-aware limits.\n\ + \n Args:\n crawled_data: Input dataset of crawled pages.\n \ + \ chunk_size: Maximum tokens per chunk.\n chunk_overlap: Token\ + \ overlap between chunks.\n chunked_data: Output dataset of chunks.\n\ + \ \"\"\"\n import hashlib\n import json\n import logging\n \ + \ import re\n\n import tiktoken\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"chunker\")\n\n enc = tiktoken.get_encoding(\"\ + cl100k_base\")\n count_tokens = lambda t: len(enc.encode(t))\n\n def\ + \ gen_id(url, idx):\n return hashlib.sha256(f\"{url}::{idx}\".encode()).hexdigest()[:32]\n\ + \n def split_headings(content):\n pat = re.compile(r\"^(#{2,4})\\\ + s+(.+)$\", re.MULTILINE)\n matches = list(pat.finditer(content))\n\ + \ if not matches:\n return [{\"heading\": \"Overview\"\ + , \"text\": content.strip()}]\n sections = []\n pre = content[:matches[0].start()].strip()\n\ + \ if pre and len(pre) > 50:\n sections.append({\"heading\"\ + : \"Introduction\", \"text\": pre})\n for i, m in enumerate(matches):\n\ + \ end = 
matches[i + 1].start() if i + 1 < len(matches) else len(content)\n\ + \ text = content[m.end():end].strip()\n if text:\n\ + \ sections.append({\"heading\": m.group(2).strip(), \"text\"\ + : text})\n return sections\n\n def recursive_split(text, max_t,\ + \ overlap):\n if count_tokens(text) <= max_t:\n return\ + \ [text]\n for sep in [\"\\n\\n\", \"\\n\", \". \", \" \"]:\n \ + \ parts = text.split(sep)\n if len(parts) <= 1:\n \ + \ continue\n chunks, cur = [], \"\"\n \ + \ for p in parts:\n cand = cur + sep + p if cur else p\n\ + \ if count_tokens(cand) > max_t:\n if\ + \ cur:\n chunks.append(cur.strip())\n \ + \ words = cur.split()\n ow = max(1, int(overlap\ + \ / 1.3))\n cur = \" \".join(words[-ow:]) + sep +\ + \ p\n else:\n cur = p\n \ + \ else:\n cur = cand\n if cur.strip():\n\ + \ chunks.append(cur.strip())\n if chunks:\n \ + \ return chunks\n words = text.split()\n chunks,\ + \ cw = [], []\n for w in words:\n cw.append(w)\n \ + \ if count_tokens(\" \".join(cw)) > max_t:\n chunks.append(\"\ + \ \".join(cw[:-1]))\n ow = max(1, int(overlap / 1.3))\n \ + \ cw = cw[-ow:]\n if cw:\n chunks.append(\"\ + \ \".join(cw))\n return chunks\n\n pages = []\n with open(crawled_data.path)\ + \ as f:\n for line in f:\n if line.strip():\n \ + \ pages.append(json.loads(line))\n\n all_chunks = []\n for\ + \ page in pages:\n sections = split_headings(page[\"content\"])\n\ + \ ci = 0\n context_prefix = f\"Page: {page['title']} | Source:\ + \ {page['url']}\"\n for sec in sections:\n prefixed =\ + \ f\"{context_prefix}\\n{sec['heading']}\\n\\n{sec['text']}\"\n \ + \ tc = count_tokens(prefixed)\n if tc <= chunk_size and tc\ + \ >= 30:\n all_chunks.append({\n \"chunk_id\"\ + : gen_id(page[\"url\"], ci),\n \"source_url\": page[\"\ + url\"], \"page_title\": page[\"title\"],\n \"heading\"\ + : sec[\"heading\"][:256], \"section\": page.get(\"section\", \"\")[:128],\n\ + \ \"chunk_text\": prefixed[:16384], \"token_count\":\ + \ tc,\n \"chunk_index\": ci, \"crawled_at\": page.get(\"\ + 
crawled_at\", \"\"),\n })\n ci += 1\n \ + \ elif tc > chunk_size:\n for sub in recursive_split(prefixed,\ + \ chunk_size, chunk_overlap):\n st = count_tokens(sub)\n\ + \ if st >= 30:\n all_chunks.append({\n\ + \ \"chunk_id\": gen_id(page[\"url\"], ci),\n\ + \ \"source_url\": page[\"url\"], \"page_title\"\ + : page[\"title\"],\n \"heading\": sec[\"heading\"\ + ][:256], \"section\": page.get(\"section\", \"\")[:128],\n \ + \ \"chunk_text\": sub[:16384], \"token_count\": st,\n \ + \ \"chunk_index\": ci, \"crawled_at\": page.get(\"\ + crawled_at\", \"\"),\n })\n \ + \ ci += 1\n\n logger.info(\"Created %d chunks from %d pages.\", len(all_chunks),\ + \ len(pages))\n\n with open(chunked_data.path, \"w\") as f:\n \ + \ for c in all_chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-clone-repo: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - clone_repo + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'gitpython==3.1.43'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef clone_repo(\n repo_url: str,\n branch: str,\n clone_data:\ + \ Output[Dataset],\n):\n \"\"\"Clone a git repository and collect file\ + \ metadata.\n\n Args:\n repo_url: Repository URL to clone.\n \ + \ branch: Branch name to clone.\n clone_data: Output 
dataset\ + \ artifact.\n \"\"\"\n import json\n import logging\n import\ + \ os\n import subprocess\n import tempfile\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"repo_cloner\")\n\n SKIP_DIRS = {\"\ + .git\", \"__pycache__\", \"node_modules\", \".tox\", \".mypy_cache\"}\n\ + \ EXTENSIONS = {\".py\", \".go\", \".yaml\", \".yml\", \".md\"}\n \ + \ MIN_SIZE, MAX_SIZE = 200, 100_000\n\n clone_dir = tempfile.mkdtemp(prefix=\"\ + code-ingest-\")\n logger.info(\"Cloning %s -> %s\", repo_url, clone_dir)\n\ + \n subprocess.run(\n [\"git\", \"clone\", \"--depth\", \"1\",\ + \ \"--branch\", branch, repo_url, clone_dir],\n check=True, capture_output=True,\ + \ text=True,\n )\n\n result = subprocess.run(\n [\"git\", \"\ + rev-parse\", \"HEAD\"],\n capture_output=True, text=True, cwd=clone_dir,\ + \ check=True,\n )\n commit_sha = result.stdout.strip()\n logger.info(\"\ + Commit: %s\", commit_sha[:12])\n\n files = []\n for root, dirs, fnames\ + \ in os.walk(clone_dir):\n dirs[:] = [d for d in dirs if d not in\ + \ SKIP_DIRS and not d.startswith(\".\")]\n for fn in fnames:\n \ + \ fp = os.path.join(root, fn)\n rel = os.path.relpath(fp,\ + \ clone_dir)\n _, ext = os.path.splitext(fn)\n if\ + \ ext.lower() not in EXTENSIONS:\n continue\n \ + \ try:\n sz = os.path.getsize(fp)\n except OSError:\n\ + \ continue\n if sz < MIN_SIZE or sz > MAX_SIZE:\n\ + \ continue\n parts = rel.split(os.sep)\n \ + \ folder = parts[0] if len(parts) > 1 else \"root\"\n files.append({\"\ + path\": rel, \"extension\": ext.lower(),\n \"size_bytes\"\ + : sz, \"folder_context\": folder})\n\n logger.info(\"Collected %d files\"\ + , len(files))\n\n # Save file list + contents\n output = []\n for\ + \ f in files:\n full = os.path.join(clone_dir, f[\"path\"])\n \ + \ try:\n with open(full, \"r\", encoding=\"utf-8\", errors=\"\ + replace\") as fh:\n content = fh.read()\n except Exception:\n\ + \ continue\n output.append({**f, \"content\": content,\ + \ \"commit_sha\": 
commit_sha})\n\n with open(clone_data.path, \"w\")\ + \ as fh:\n for item in output:\n fh.write(json.dumps(item,\ + \ ensure_ascii=False) + \"\\n\")\n\n # Cleanup\n import shutil\n \ + \ shutil.rmtree(clone_dir, ignore_errors=True)\n\n" + image: python:3.11-slim + exec-crawl-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - crawl_docs + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'requests==2.31.0'\ + \ 'beautifulsoup4==4.12.3' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef crawl_docs(\n base_url: str,\n crawl_delay: float,\n \ + \ max_pages: int,\n crawled_data: Output[Dataset],\n):\n \"\"\"Crawl\ + \ kubeflow.org documentation pages via sitemap.xml.\n\n Args:\n \ + \ base_url: Base URL for kubeflow docs (e.g., https://www.kubeflow.org).\n\ + \ crawl_delay: Delay in seconds between requests.\n max_pages:\ + \ Max pages to crawl (0 = unlimited).\n crawled_data: Output dataset\ + \ artifact for crawled pages.\n \"\"\"\n import json\n import logging\n\ + \ import re\n import time\n from datetime import datetime, timezone\n\ + \ from xml.etree import ElementTree\n\n import requests\n from\ + \ bs4 import BeautifulSoup\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"crawler\")\n\n SITEMAP_NS = {\"ns\"\ + : 
\"http://www.sitemaps.org/schemas/sitemap/0.9\"}\n\n def fetch(url,\ + \ retries=3):\n for attempt in range(retries):\n try:\n\ + \ resp = requests.get(url, timeout=30, headers={\n \ + \ \"User-Agent\": \"Kubeflow-DocsAgent-Crawler/1.0\"\n \ + \ })\n resp.raise_for_status()\n \ + \ return resp\n except Exception as e:\n if\ + \ attempt < retries - 1:\n time.sleep(2 ** attempt)\n\ + \ else:\n logger.error(\"Failed: %s \u2014\ + \ %s\", url, e)\n return None\n\n def fix_url(raw):\n prefix\ + \ = \"https://www.kubeflow.org\"\n if raw.startswith(prefix + prefix):\n\ + \ return raw[len(prefix):]\n if raw.startswith(prefix\ + \ + \"https://\"):\n return raw[len(prefix):]\n return\ + \ raw\n\n # Parse sitemap\n resp = fetch(f\"{base_url}/sitemap.xml\"\ + )\n if not resp:\n raise RuntimeError(\"Cannot fetch sitemap\"\ + )\n\n root = ElementTree.fromstring(resp.content)\n urls = []\n \ + \ for elem in root.findall(\"ns:url\", SITEMAP_NS):\n loc = elem.find(\"\ + ns:loc\", SITEMAP_NS)\n if loc is not None and loc.text:\n \ + \ url = fix_url(loc.text.strip())\n if \"/docs/\" in url:\n\ + \ urls.append(url)\n\n if max_pages > 0:\n urls\ + \ = urls[:max_pages]\n\n logger.info(\"Found %d docs URLs from sitemap.\"\ + , len(urls))\n\n results = []\n crawled_at = datetime.now(timezone.utc).isoformat()\n\ + \n for i, url in enumerate(urls):\n logger.info(\"[%d/%d] %s\"\ + , i + 1, len(urls), url)\n resp = fetch(url)\n if not resp:\n\ + \ continue\n\n soup = BeautifulSoup(resp.text, \"html.parser\"\ + )\n title_tag = soup.find(\"title\")\n title = title_tag.get_text(strip=True)\ + \ if title_tag else \"Untitled\"\n title = re.sub(r\"\\s*[|\u2013\ + -]\\s*Kubeflow\\s*$\", \"\", title)\n\n for sel in [\"nav\", \"footer\"\ + , \"header\", \"aside\", \"script\", \"style\",\n \"\ + .navbar\", \".sidebar\", \".toc\", \".breadcrumb\",\n \ + \ \".td-sidebar\", \".td-toc\", \".td-navbar\",\n \ + \ \".edit-page\", \".page-meta\"]:\n for el in soup.select(sel):\n\ + \ el.decompose()\n\n main = 
(\n soup.find(\"\ + main\")\n or soup.find(\"article\")\n or soup.find(\"\ + div\", class_=re.compile(r\"content|td-content|docs-content\"))\n \ + \ or soup.body\n or soup\n )\n\n # Preserve\ + \ heading structure so downstream chunking can split correctly.\n \ + \ for tag_name, md_prefix in [(\"h1\", \"#\"), (\"h2\", \"##\"), (\"h3\"\ + , \"###\"), (\"h4\", \"####\")]:\n for heading_tag in main.find_all(tag_name):\n\ + \ heading_text = heading_tag.get_text(strip=True)\n \ + \ if heading_text:\n heading_tag.replace_with(f\"\ + \\n\\n{md_prefix} {heading_text}\\n\\n\")\n\n # Preserve code blocks\ + \ as inline fenced text markers instead of flattening them away.\n \ + \ for code_tag in main.find_all([\"code\", \"pre\"]):\n code_text\ + \ = code_tag.get_text()\n code_tag.replace_with(f\"`{code_text}`\"\ + )\n\n content = main.get_text(separator=\"\\n\", strip=False)\n \ + \ content = re.sub(r\"\\n{3,}\", \"\\n\\n\", content)\n content\ + \ = re.sub(r\"[ \\t]+\", \" \", content).strip()\n\n if len(content)\ + \ < 200:\n continue\n\n section_match = re.search(r\"\ + /docs/([^/]+)\", url)\n section = section_match.group(1) if section_match\ + \ else \"root\"\n\n results.append({\n \"url\": url, \"\ + title\": title.strip(), \"content\": content.strip(),\n \"section\"\ + : section, \"crawled_at\": crawled_at,\n })\n\n if crawl_delay\ + \ > 0 and i < len(urls) - 1:\n time.sleep(crawl_delay)\n\n \ + \ logger.info(\"Crawled %d pages.\", len(results))\n\n with open(crawled_data.path,\ + \ \"w\") as f:\n for r in results:\n f.write(json.dumps(r,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim + exec-embed-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - embed_code + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers==2.7.0'\ + \ 'torch==2.3.0' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef embed_code(\n chunked_data: Input[Dataset],\n embedding_model:\ + \ str,\n embedded_data: Output[Dataset],\n):\n \"\"\"Embed code chunks\ + \ using configurable model.\n\n Args:\n chunked_data: Input dataset\ + \ of chunked code.\n embedding_model: Model name for embeddings.\n\ + \ embedded_data: Output dataset with embeddings.\n \"\"\"\n \ + \ import json\n import logging\n\n from sentence_transformers import\ + \ SentenceTransformer\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"embedder\")\n\n chunks = []\n with\ + \ open(chunked_data.path) as f:\n for line in f:\n if\ + \ line.strip():\n chunks.append(json.loads(line))\n\n \ + \ logger.info(\"Embedding %d code chunks with %s\", len(chunks), embedding_model)\n\ + \ model = SentenceTransformer(embedding_model)\n\n texts = [c[\"chunk_text\"\ + ] for c in chunks]\n bs = 32\n all_embs = []\n for i in range(0,\ + \ len(texts), bs):\n batch = texts[i:i + bs]\n embs = model.encode(batch,\ + \ show_progress_bar=False)\n all_embs.extend([e.tolist() for e in\ + \ embs])\n logger.info(\"Batch %d/%d\", i // bs + 1, (len(texts)\ + \ + bs - 1) // bs)\n\n for c, e in zip(chunks, all_embs):\n c[\"\ + embedding\"] = e\n\n with 
open(embedded_data.path, \"w\") as f:\n \ + \ for c in chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-embed-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - embed_docs + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers==2.7.0'\ + \ 'torch==2.3.0' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef embed_docs(\n chunked_data: Input[Dataset],\n embedding_model:\ + \ str,\n embedded_data: Output[Dataset],\n):\n \"\"\"Embed documentation\ + \ chunks using configurable model.\n\n Args:\n chunked_data: Input\ + \ dataset of chunks.\n embedding_model: Model name for embeddings.\n\ + \ embedded_data: Output dataset with embeddings.\n \"\"\"\n \ + \ import json\n import logging\n\n from sentence_transformers import\ + \ SentenceTransformer\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"embedder\")\n\n chunks = []\n with\ + \ open(chunked_data.path) as f:\n for line in f:\n if\ + \ line.strip():\n chunks.append(json.loads(line))\n\n \ + \ logger.info(\"Embedding %d chunks with %s\", len(chunks), embedding_model)\n\ + \ model = SentenceTransformer(embedding_model)\n\n texts = [c[\"chunk_text\"\ + ] for c in chunks]\n batch_size = 32\n all_embeddings = []\n\n \ + \ for i 
in range(0, len(texts), batch_size):\n batch = texts[i:i\ + \ + batch_size]\n embs = model.encode(batch, show_progress_bar=False)\n\ + \ all_embeddings.extend([e.tolist() for e in embs])\n logger.info(\"\ + Batch %d/%d done.\", i // batch_size + 1,\n (len(texts)\ + \ + batch_size - 1) // batch_size)\n\n for chunk, emb in zip(chunks,\ + \ all_embeddings):\n chunk[\"embedding\"] = emb\n\n logger.info(\"\ + Embedding complete.\")\n\n with open(embedded_data.path, \"w\") as f:\n\ + \ for c in chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-load-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - load_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus==2.4.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef load_code(\n embedded_data: Input[Dataset],\n milvus_host:\ + \ str,\n milvus_port: str,\n collection_name: str,\n embedding_dim:\ + \ int,\n):\n \"\"\"Load embedded code chunks into Milvus code_collection.\n\ + \n Args:\n embedded_data: Input dataset with embedded chunks.\n\ + \ milvus_host: Milvus server host.\n milvus_port: Milvus server\ + \ port.\n collection_name: Target collection name.\n embedding_dim:\ + \ Vector dimension.\n \"\"\"\n import json\n import logging\n\n\ + \ from pymilvus 
import (Collection, CollectionSchema, DataType, FieldSchema,\n\ + \ connections, utility)\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"loader\")\n\n connections.connect(\"\ + default\", host=milvus_host, port=milvus_port)\n\n if not utility.has_collection(collection_name):\n\ + \ fields = [\n FieldSchema(\"chunk_id\", DataType.VARCHAR,\ + \ max_length=128, is_primary=True),\n FieldSchema(\"file_path\"\ + , DataType.VARCHAR, max_length=512),\n FieldSchema(\"extension\"\ + , DataType.VARCHAR, max_length=16),\n FieldSchema(\"language\"\ + , DataType.VARCHAR, max_length=32),\n FieldSchema(\"symbol_name\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"folder_context\"\ + , DataType.VARCHAR, max_length=128),\n FieldSchema(\"chunk_text\"\ + , DataType.VARCHAR, max_length=8192),\n FieldSchema(\"start_line\"\ + , DataType.INT64),\n FieldSchema(\"end_line\", DataType.INT64),\n\ + \ FieldSchema(\"commit_sha\", DataType.VARCHAR, max_length=64),\n\ + \ FieldSchema(\"embedding\", DataType.FLOAT_VECTOR, dim=embedding_dim),\n\ + \ ]\n schema = CollectionSchema(fields, \"Kubeflow manifests\ + \ code chunks\")\n collection = Collection(collection_name, schema)\n\ + \ collection.create_index(\"embedding\", {\n \"metric_type\"\ + : \"COSINE\", \"index_type\": \"HNSW\",\n \"params\": {\"M\"\ + : 16, \"efConstruction\": 200},\n })\n else:\n collection\ + \ = Collection(collection_name)\n\n collection.load()\n\n chunks =\ + \ []\n with open(embedded_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n chunks.append(json.loads(line))\n\ + \n rows = []\n for c in chunks:\n rows.append({\n \ + \ \"chunk_id\": str(c[\"chunk_id\"])[:128],\n \"file_path\"\ + : str(c.get(\"file_path\", \"\"))[:512],\n \"extension\": str(c.get(\"\ + extension\", \"\"))[:16],\n \"language\": str(c.get(\"language\"\ + , \"\"))[:32],\n \"symbol_name\": str(c.get(\"symbol_name\",\ + \ \"\"))[:256],\n \"folder_context\": str(c.get(\"folder_context\"\ + , 
\"\"))[:128],\n \"chunk_text\": str(c.get(\"chunk_text\", \"\ + \"))[:8192],\n \"start_line\": int(c.get(\"start_line\", 0)),\n\ + \ \"end_line\": int(c.get(\"end_line\", 0)),\n \"\ + commit_sha\": str(c.get(\"commit_sha\", \"\"))[:64],\n \"embedding\"\ + : c[\"embedding\"],\n })\n\n bs = 100\n inserted = 0\n for\ + \ i in range(0, len(rows), bs):\n batch = rows[i:i + bs]\n \ + \ collection.upsert(batch)\n inserted += len(batch)\n\n collection.flush()\n\ + \ logger.info(\"Loaded %d chunks into %s. Total: %d\",\n \ + \ inserted, collection_name, collection.num_entities)\n\n" + image: python:3.11-slim + exec-load-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - load_docs + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus==2.4.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef load_docs(\n embedded_data: Input[Dataset],\n milvus_host:\ + \ str,\n milvus_port: str,\n collection_name: str,\n embedding_dim:\ + \ int,\n):\n \"\"\"Load embedded chunks into Milvus docs_collection.\n\ + \n Args:\n embedded_data: Input dataset with embedded chunks.\n\ + \ milvus_host: Milvus server host.\n milvus_port: Milvus server\ + \ port.\n collection_name: Target collection name.\n embedding_dim:\ + \ Vector dimension.\n \"\"\"\n import json\n import logging\n\n\ + \ from 
pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,\n\ + \ connections, utility)\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"loader\")\n\n connections.connect(\"\ + default\", host=milvus_host, port=milvus_port)\n logger.info(\"Connected\ + \ to Milvus at %s:%s\", milvus_host, milvus_port)\n\n if not utility.has_collection(collection_name):\n\ + \ fields = [\n FieldSchema(\"chunk_id\", DataType.VARCHAR,\ + \ max_length=128, is_primary=True),\n FieldSchema(\"source_url\"\ + , DataType.VARCHAR, max_length=512),\n FieldSchema(\"page_title\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"heading\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"section\"\ + , DataType.VARCHAR, max_length=128),\n FieldSchema(\"chunk_text\"\ + , DataType.VARCHAR, max_length=16384),\n FieldSchema(\"token_count\"\ + , DataType.INT64),\n FieldSchema(\"chunk_index\", DataType.INT64),\n\ + \ FieldSchema(\"crawled_at\", DataType.VARCHAR, max_length=64),\n\ + \ FieldSchema(\"embedding\", DataType.FLOAT_VECTOR, dim=embedding_dim),\n\ + \ ]\n schema = CollectionSchema(fields, \"Kubeflow docs chunks\"\ + )\n collection = Collection(collection_name, schema)\n collection.create_index(\"\ + embedding\", {\n \"metric_type\": \"COSINE\", \"index_type\"\ + : \"HNSW\",\n \"params\": {\"M\": 16, \"efConstruction\": 200},\n\ + \ })\n logger.info(\"Created collection: %s\", collection_name)\n\ + \ else:\n collection = Collection(collection_name)\n\n collection.load()\n\ + \n chunks = []\n with open(embedded_data.path) as f:\n for\ + \ line in f:\n if line.strip():\n chunks.append(json.loads(line))\n\ + \n rows = []\n for c in chunks:\n rows.append({\n \ + \ \"chunk_id\": str(c[\"chunk_id\"])[:128],\n \"source_url\"\ + : str(c.get(\"source_url\", \"\"))[:512],\n \"page_title\": str(c.get(\"\ + page_title\", \"\"))[:256],\n \"heading\": str(c.get(\"heading\"\ + , \"\"))[:256],\n \"section\": str(c.get(\"section\", \"\"))[:128],\n\ + \ 
\"chunk_text\": str(c.get(\"chunk_text\", \"\"))[:16384],\n\ + \ \"token_count\": int(c.get(\"token_count\", 0)),\n \ + \ \"chunk_index\": int(c.get(\"chunk_index\", 0)),\n \"crawled_at\"\ + : str(c.get(\"crawled_at\", \"\"))[:64],\n \"embedding\": c[\"\ + embedding\"],\n })\n\n batch_size = 100\n inserted = 0\n \ + \ for i in range(0, len(rows), batch_size):\n batch = rows[i:i +\ + \ batch_size]\n collection.upsert(batch)\n inserted += len(batch)\n\ + \ logger.info(\"Upserted batch %d/%d\", i // batch_size + 1,\n \ + \ (len(rows) + batch_size - 1) // batch_size)\n\n collection.flush()\n\ + \ logger.info(\"Loaded %d chunks into %s. Total: %d\",\n \ + \ inserted, collection_name, collection.num_entities)\n\n" + image: python:3.11-slim + exec-parse-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - parse_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'PyYAML==6.0.1'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef parse_code(\n clone_data: Input[Dataset],\n parsed_data:\ + \ Output[Dataset],\n):\n \"\"\"Parse files into logical code chunks using\ + \ language-specific parsers.\n\n Args:\n clone_data: Input dataset\ + \ from repo cloner.\n parsed_data: Output dataset of parsed chunks.\n\ + \ \"\"\"\n import ast as pyast\n import hashlib\n import json\n\ + \ 
import logging\n import os\n import re\n\n import yaml\n\n\ + \ logging.basicConfig(level=logging.INFO)\n logger = logging.getLogger(\"\ + ast_parser\")\n\n PATH_ALIAS_HINTS = {\n \"common/istio\": [\n\ + \ \"istio\", \"service mesh\", \"gateway\", \"authorization policy\"\ + ,\n \"peer authentication\", \"virtual service\", \"sidecar\"\ + , \"envoy\", \"mtls\", \"ingress\",\n ],\n \"common/knative\"\ + : [\n \"knative\", \"serving\", \"eventing\", \"serverless\"\ + , \"scale to zero\",\n \"activator\", \"revision\", \"service\"\ + , \"net istio\", \"webhook\",\n ],\n \"common/dex\": [\n \ + \ \"dex\", \"oidc\", \"oauth2\", \"authentication\", \"identity\ + \ provider\",\n \"connector\", \"login\",\n ],\n \ + \ \"common/cert-manager\": [\n \"cert manager\", \"certificate\"\ + , \"issuer\", \"clusterissuer\",\n \"cainjector\", \"tls\", \"\ + webhook\",\n ],\n \"applications/pipeline\": [\n \ + \ \"kubeflow pipelines\", \"kfp\", \"pipeline api server\", \"deployment\"\ + ,\n \"service\", \"configmap\", \"role\", \"rolebinding\", \"\ + serviceaccount\",\n \"crd\", \"webhook\", \"scheduled workflow\"\ + ,\n ],\n \"applications/profiles\": [\n \"profiles\"\ + , \"namespaces\", \"rbac\", \"rolebinding\", \"serviceaccount\", \"user\ + \ profile\",\n ],\n \"tests\": [\"tests\", \"e2e\", \"integration\"\ + , \"validation\", \"presubmit\"],\n }\n\n def gen_id(fp, sym, idx):\n\ + \ return hashlib.sha256(f\"{fp}::{sym}::{idx}\".encode()).hexdigest()[:32]\n\ + \n def split_terms(value):\n expanded = re.sub(r\"([a-z0-9])([A-Z])\"\ + , r\"\\1 \\2\", value)\n normalized = re.sub(r\"[^A-Za-z0-9]+\",\ + \ \" \", expanded)\n return [token.lower() for token in normalized.split()\ + \ if token]\n\n def unique_terms(values, limit=24):\n seen = set()\n\ + \ ordered = []\n for value in values:\n for token\ + \ in split_terms(str(value)):\n if token not in seen:\n \ + \ seen.add(token)\n ordered.append(token)\n\ + \ if len(ordered) >= limit:\n \ + \ return ordered\n return ordered\n\n def 
summarize_list(values,\ + \ limit=8):\n if not isinstance(values, list):\n return\ + \ \"\"\n flattened = [str(item) for item in values if item]\n \ + \ return \", \".join(flattened[:limit])\n\n def get_path_aliases(fp):\n\ + \ normalized = fp.replace(\"\\\\\", \"/\").lower()\n aliases\ + \ = []\n for prefix, hints in PATH_ALIAS_HINTS.items():\n \ + \ if normalized.startswith(prefix):\n aliases.extend(hints)\n\ + \ return aliases\n\n def extract_container_names(parsed):\n \ + \ spec = parsed.get(\"spec\")\n if not isinstance(spec, dict):\n\ + \ return []\n template = spec.get(\"template\", {})\n\ + \ if isinstance(template, dict):\n template_spec = template.get(\"\ + spec\", {})\n if isinstance(template_spec, dict):\n \ + \ containers = template_spec.get(\"containers\", [])\n \ + \ if isinstance(containers, list):\n return [\n \ + \ str(container.get(\"name\"))\n \ + \ for container in containers\n if isinstance(container,\ + \ dict) and container.get(\"name\")\n ]\n job_template\ + \ = spec.get(\"jobTemplate\", {})\n if isinstance(job_template, dict):\n\ + \ nested_spec = job_template.get(\"spec\", {})\n if\ + \ isinstance(nested_spec, dict):\n nested_template = nested_spec.get(\"\ + template\", {})\n if isinstance(nested_template, dict):\n\ + \ nested_template_spec = nested_template.get(\"spec\"\ + , {})\n if isinstance(nested_template_spec, dict):\n\ + \ containers = nested_template_spec.get(\"containers\"\ + , [])\n if isinstance(containers, list):\n \ + \ return [\n str(container.get(\"\ + name\"))\n for container in containers\n\ + \ if isinstance(container, dict) and container.get(\"\ + name\")\n ]\n return []\n\n def build_manifest_context(parsed,\ + \ fp, ctx):\n metadata = parsed.get(\"metadata\", {})\n metadata\ + \ = metadata if isinstance(metadata, dict) else {}\n kind = str(parsed.get(\"\ + kind\", \"Unknown\"))\n api_version = str(parsed.get(\"apiVersion\"\ + , \"unknown\"))\n name = str(metadata.get(\"name\", \"unknown\"))\n\ + \ namespace = 
str(metadata.get(\"namespace\", \"cluster-scoped\"\ + ))\n path_terms = unique_terms([fp, os.path.basename(fp), ctx], limit=18)\n\ + \ alias_terms = unique_terms(get_path_aliases(fp), limit=18)\n \ + \ top_level_keys = summarize_list(list(parsed.keys()))\n label_keys\ + \ = summarize_list(list((metadata.get(\"labels\") or {}).keys()))\n \ + \ annotation_keys = summarize_list(list((metadata.get(\"annotations\"\ + ) or {}).keys()))\n\n lines = [\n f\"Manifest file path:\ + \ {fp}\",\n f\"Folder context: {ctx}\",\n f\"Resource\ + \ kind: {kind}\",\n f\"API version: {api_version}\",\n \ + \ f\"Metadata name: {name}\",\n f\"Namespace: {namespace}\"\ + ,\n ]\n if path_terms:\n lines.append(f\"Path hints:\ + \ {' '.join(path_terms)}\")\n if alias_terms:\n lines.append(f\"\ + Domain hints: {' '.join(alias_terms)}\")\n if top_level_keys:\n \ + \ lines.append(f\"Top-level keys: {top_level_keys}\")\n \ + \ if label_keys:\n lines.append(f\"Label keys: {label_keys}\"\ + )\n if annotation_keys:\n lines.append(f\"Annotation keys:\ + \ {annotation_keys}\")\n\n spec = parsed.get(\"spec\")\n spec\ + \ = spec if isinstance(spec, dict) else {}\n\n if kind.lower() ==\ + \ \"kustomization\" or os.path.basename(fp).lower() == \"kustomization.yaml\"\ + :\n resources = summarize_list(parsed.get(\"resources\"))\n \ + \ components = summarize_list(parsed.get(\"components\"))\n \ + \ bases = summarize_list(parsed.get(\"bases\"))\n patches\ + \ = summarize_list(parsed.get(\"patchesStrategicMerge\"))\n if\ + \ resources:\n lines.append(f\"Kustomize resources: {resources}\"\ + )\n if components:\n lines.append(f\"Kustomize\ + \ components: {components}\")\n if bases:\n lines.append(f\"\ + Kustomize bases: {bases}\")\n if patches:\n lines.append(f\"\ + Kustomize patches: {patches}\")\n\n if kind in {\"Deployment\", \"\ + StatefulSet\", \"DaemonSet\", \"Job\", \"CronJob\"}:\n container_names\ + \ = summarize_list(extract_container_names(parsed))\n service_account\ + \ = 
spec.get(\"serviceAccountName\")\n if not service_account\ + \ and isinstance(spec.get(\"template\"), dict):\n template_spec\ + \ = spec.get(\"template\", {}).get(\"spec\", {})\n if isinstance(template_spec,\ + \ dict):\n service_account = template_spec.get(\"serviceAccountName\"\ + )\n if container_names:\n lines.append(f\"Workload\ + \ containers: {container_names}\")\n if service_account:\n \ + \ lines.append(f\"Service account: {service_account}\")\n\n\ + \ if kind == \"Service\":\n service_type = spec.get(\"\ + type\")\n selector = spec.get(\"selector\")\n ports\ + \ = spec.get(\"ports\")\n if service_type:\n lines.append(f\"\ + Service type: {service_type}\")\n if isinstance(selector, dict)\ + \ and selector:\n lines.append(f\"Service selector keys:\ + \ {', '.join(list(selector.keys())[:8])}\")\n if isinstance(ports,\ + \ list) and ports:\n port_values = [str(port.get('port'))\ + \ for port in ports if isinstance(port, dict) and port.get('port')]\n \ + \ if port_values:\n lines.append(f\"Service\ + \ ports: {', '.join(port_values[:8])}\")\n\n if kind == \"CustomResourceDefinition\"\ + :\n names = spec.get(\"names\", {}) if isinstance(spec.get(\"\ + names\"), dict) else {}\n versions = spec.get(\"versions\", [])\n\ + \ if spec.get(\"group\"):\n lines.append(f\"CRD\ + \ group: {spec.get('group')}\")\n if names.get(\"kind\"):\n \ + \ lines.append(f\"CRD served kind: {names.get('kind')}\")\n\ + \ if isinstance(versions, list) and versions:\n \ + \ version_names = [str(version.get(\"name\")) for version in versions\ + \ if isinstance(version, dict) and version.get(\"name\")]\n \ + \ if version_names:\n lines.append(f\"CRD versions:\ + \ {', '.join(version_names[:8])}\")\n\n if kind in {\"Role\", \"\ + ClusterRole\"}:\n rules = spec.get(\"rules\", parsed.get(\"rules\"\ + ))\n if isinstance(rules, list) and rules:\n resource_names\ + \ = []\n verbs = []\n for rule in rules[:4]:\n\ + \ if isinstance(rule, dict):\n \ + \ resource_names.extend(str(item) for item in 
rule.get(\"resources\", [])[:4])\n\ + \ verbs.extend(str(item) for item in rule.get(\"\ + verbs\", [])[:4])\n if resource_names:\n \ + \ lines.append(f\"RBAC resources: {', '.join(resource_names[:10])}\"\ + )\n if verbs:\n lines.append(f\"RBAC verbs:\ + \ {', '.join(verbs[:10])}\")\n\n if kind in {\"RoleBinding\", \"\ + ClusterRoleBinding\"}:\n role_ref = parsed.get(\"roleRef\", {})\n\ + \ subjects = parsed.get(\"subjects\", [])\n if isinstance(role_ref,\ + \ dict) and role_ref.get(\"name\"):\n lines.append(f\"Binding\ + \ roleRef: {role_ref.get('name')}\")\n if isinstance(subjects,\ + \ list) and subjects:\n subject_names = [str(subject.get(\"\ + name\")) for subject in subjects if isinstance(subject, dict) and subject.get(\"\ + name\")]\n if subject_names:\n lines.append(f\"\ + Binding subjects: {', '.join(subject_names[:10])}\")\n\n if kind\ + \ in {\"AuthorizationPolicy\", \"PeerAuthentication\", \"VirtualService\"\ + , \"Gateway\", \"DestinationRule\"}:\n selector = spec.get(\"\ + selector\", {})\n if isinstance(selector, dict):\n \ + \ match_labels = selector.get(\"matchLabels\", {})\n \ + \ if isinstance(match_labels, dict) and match_labels:\n \ + \ lines.append(f\"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}\"\ + )\n gateways = spec.get(\"gateways\")\n hosts = spec.get(\"\ + hosts\")\n if isinstance(gateways, list) and gateways:\n \ + \ lines.append(f\"Istio gateways: {', '.join(str(g) for g in\ + \ gateways[:8])}\")\n if isinstance(hosts, list) and hosts:\n\ + \ lines.append(f\"Istio hosts: {', '.join(str(h) for h in\ + \ hosts[:8])}\")\n\n return \"\\n\".join(f\"# {line}\" for line in\ + \ lines if line)\n\n def parse_python(content, fp, sha, ctx):\n \ + \ chunks, lines = [], content.split(\"\\n\")\n try:\n \ + \ tree = pyast.parse(content)\n except SyntaxError:\n \ + \ return [{\"chunk_id\": gen_id(fp, \"module\", 0), \"file_path\": fp,\n\ + \ \"extension\": \".py\", \"language\": \"python\",\n\ + \ \"symbol_name\": 
os.path.basename(fp), \"chunk_text\"\ + : content,\n \"start_line\": 1, \"end_line\": len(lines),\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ idx = 0\n for node in pyast.walk(tree):\n if isinstance(node,\ + \ (pyast.FunctionDef, pyast.AsyncFunctionDef, pyast.ClassDef)):\n \ + \ sl, el = node.lineno, node.end_lineno or node.lineno\n \ + \ ct = \"\\n\".join(lines[sl - 1:el])\n tp = \"\ + class\" if isinstance(node, pyast.ClassDef) else \"function\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, node.name, idx), \"file_path\"\ + : fp,\n \"extension\": \".py\", \"language\"\ + : \"python\",\n \"symbol_name\": f\"{tp}:{node.name}\"\ + , \"chunk_text\": ct,\n \"start_line\": sl,\ + \ \"end_line\": el,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx})\n idx += 1\n if not chunks:\n\ + \ chunks.append({\"chunk_id\": gen_id(fp, \"module\", 0), \"\ + file_path\": fp,\n \"extension\": \".py\", \"\ + language\": \"python\",\n \"symbol_name\": f\"\ + module:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": len(lines),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n def parse_go(content, fp, sha, ctx):\n \ + \ pat = re.compile(r\"^(?:func\\s+(?:\\([^)]+\\)\\s+)?(\\w+)|type\\s+(\\\ + w+)\\s+struct)\\b\", re.MULTILINE)\n matches = list(pat.finditer(content))\n\ + \ if not matches:\n return [{\"chunk_id\": gen_id(fp,\ + \ \"file\", 0), \"file_path\": fp,\n \"extension\":\ + \ \".go\", \"language\": \"go\",\n \"symbol_name\":\ + \ f\"file:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": content.count(\"\\n\") + 1,\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ chunks = []\n for i, m in enumerate(matches):\n \ + \ sym = m.group(1) or m.group(2)\n s, e = m.start(), matches[i\ + \ + 1].start() if i + 1 < len(matches) else len(content)\n ct\ + \ = content[s:e].rstrip()\n sl = content[:s].count(\"\\n\") +\ + \ 1\n tp = \"struct\" if 
m.group(2) else \"func\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, sym, i), \"file_path\": fp,\n\ + \ \"extension\": \".go\", \"language\": \"go\"\ + ,\n \"symbol_name\": f\"{tp}:{sym}\", \"chunk_text\"\ + : ct,\n \"start_line\": sl, \"end_line\": sl +\ + \ ct.count(\"\\n\"),\n \"commit_sha\": sha, \"\ + folder_context\": ctx})\n return chunks\n\n def parse_yaml_file(content,\ + \ fp, sha, ctx):\n ext = os.path.splitext(fp)[1].lower()\n \ + \ docs = content.split(\"\\n---\")\n chunks = []\n for idx,\ + \ doc in enumerate(docs):\n doc = doc.strip()\n if\ + \ not doc:\n continue\n try:\n \ + \ parsed = yaml.safe_load(doc)\n except yaml.YAMLError:\n \ + \ parsed = None\n if isinstance(parsed, dict):\n\ + \ kind = parsed.get(\"kind\", \"Unknown\")\n \ + \ md = parsed.get(\"metadata\", {})\n name = md.get(\"\ + name\", \"unknown\") if isinstance(md, dict) else \"unknown\"\n \ + \ sym = f\"{kind}:{name}\"\n manifest_context = build_manifest_context(parsed,\ + \ fp, ctx)\n chunk_body = f\"{manifest_context}\\n\\n{doc}\"\ + \ if manifest_context else doc\n else:\n sym =\ + \ f\"fragment:{idx}\"\n chunk_body = doc\n pre\ + \ = \"\\n---\".join(docs[:idx])\n sl = pre.count(\"\\n\") + 1\ + \ if pre else 1\n chunks.append({\"chunk_id\": gen_id(fp, sym,\ + \ idx), \"file_path\": fp,\n \"extension\": ext,\ + \ \"language\": \"yaml\",\n \"symbol_name\": sym,\ + \ \"chunk_text\": chunk_body,\n \"start_line\"\ + : sl, \"end_line\": sl + doc.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n return chunks\ + \ or [{\"chunk_id\": gen_id(fp, \"file\", 0), \"file_path\": fp,\n \ + \ \"extension\": ext, \"language\": \"yaml\",\n \ + \ \"symbol_name\": f\"file:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1,\ + \ \"end_line\": content.count(\"\\n\") + 1,\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n\n def parse_md(content,\ + \ fp, sha, ctx):\n pat = re.compile(r\"^(#{2,3})\\s+(.+)$\", re.MULTILINE)\n\ + \ matches = 
list(pat.finditer(content))\n if not matches:\n\ + \ return [{\"chunk_id\": gen_id(fp, \"doc\", 0), \"file_path\"\ + : fp,\n \"extension\": \".md\", \"language\": \"markdown\"\ + ,\n \"symbol_name\": f\"doc:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1, \"end_line\"\ + : content.count(\"\\n\") + 1,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx}]\n chunks = []\n for i, m in enumerate(matches):\n\ + \ h = m.group(2).strip()\n s = m.start()\n \ + \ e = matches[i + 1].start() if i + 1 < len(matches) else len(content)\n\ + \ text = content[s:e].strip()\n sl = content[:s].count(\"\ + \\n\") + 1\n chunks.append({\"chunk_id\": gen_id(fp, h, i), \"\ + file_path\": fp,\n \"extension\": \".md\", \"\ + language\": \"markdown\",\n \"symbol_name\": f\"\ + heading:{h[:100]}\", \"chunk_text\": text,\n \"\ + start_line\": sl, \"end_line\": sl + text.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n PARSERS = {\".py\": parse_python, \".go\": parse_go,\n\ + \ \".yaml\": parse_yaml_file, \".yml\": parse_yaml_file, \"\ + .md\": parse_md}\n\n files = []\n with open(clone_data.path) as f:\n\ + \ for line in f:\n if line.strip():\n files.append(json.loads(line))\n\ + \n all_chunks = []\n for fi in files:\n parser = PARSERS.get(fi[\"\ + extension\"])\n if not parser:\n continue\n try:\n\ + \ chunks = parser(fi[\"content\"], fi[\"path\"], fi[\"commit_sha\"\ + ], fi[\"folder_context\"])\n all_chunks.extend(chunks)\n \ + \ except Exception as ex:\n logger.warning(\"Error parsing\ + \ %s: %s\", fi[\"path\"], ex)\n\n logger.info(\"Parsed %d chunks from\ + \ %d files\", len(all_chunks), len(files))\n\n with open(parsed_data.path,\ + \ \"w\") as f:\n for c in all_chunks:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim +pipelineInfo: + description: Run both docs and code ingestion pipelines in parallel + name: full-ingestion-pipeline +root: + dag: + tasks: + 
code-ingestion-pipeline: + cachingOptions: + enableCache: true + componentRef: + name: comp-code-ingestion-pipeline + inputs: + parameters: + branch: + componentInputParameter: code_branch + collection_name: + runtimeValue: + constant: code_collection + embedding_dim: + componentInputParameter: embedding_dim + embedding_model: + componentInputParameter: embedding_model + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + repo_url: + componentInputParameter: code_repo_url + taskInfo: + name: code-ingestion-pipeline + docs-ingestion-pipeline: + cachingOptions: + enableCache: true + componentRef: + name: comp-docs-ingestion-pipeline + inputs: + parameters: + base_url: + componentInputParameter: docs_base_url + chunk_overlap: + componentInputParameter: chunk_overlap + chunk_size: + componentInputParameter: chunk_size + collection_name: + runtimeValue: + constant: docs_collection + crawl_delay: + componentInputParameter: docs_crawl_delay + embedding_dim: + componentInputParameter: embedding_dim + embedding_model: + componentInputParameter: embedding_model + max_pages: + componentInputParameter: docs_max_pages + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + taskInfo: + name: docs-ingestion-pipeline + inputDefinitions: + parameters: + chunk_overlap: + defaultValue: 50.0 + isOptional: true + parameterType: NUMBER_INTEGER + chunk_size: + defaultValue: 500.0 + isOptional: true + parameterType: NUMBER_INTEGER + code_branch: + defaultValue: master + isOptional: true + parameterType: STRING + code_repo_url: + defaultValue: https://github.com/kubeflow/manifests + isOptional: true + parameterType: STRING + docs_base_url: + defaultValue: https://www.kubeflow.org + isOptional: true + parameterType: STRING + docs_crawl_delay: + defaultValue: 1.0 + isOptional: true + parameterType: NUMBER_DOUBLE + docs_max_pages: + defaultValue: 0.0 + isOptional: true + 
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["gitpython==3.1.43"],
)
def clone_repo(
    repo_url: str,
    branch: str,
    clone_data: Output[Dataset],
):
    """Clone a git repository and collect file metadata.

    Shallow-clones ``repo_url`` at ``branch``, walks the tree for source
    files (.py/.go/.yaml/.yml/.md within a size window), and writes one
    JSON record per file (path, extension, size, top-level folder, file
    content, commit SHA) to ``clone_data`` as JSONL.

    Args:
        repo_url: Repository URL to clone.
        branch: Branch name to clone.
        clone_data: Output dataset artifact (JSONL, one file per line).
    """
    import json
    import logging
    import os
    import shutil
    import subprocess
    import tempfile

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("repo_cloner")

    SKIP_DIRS = {".git", "__pycache__", "node_modules", ".tox", ".mypy_cache"}
    EXTENSIONS = {".py", ".go", ".yaml", ".yml", ".md"}
    MIN_SIZE, MAX_SIZE = 200, 100_000  # bytes; skip trivial and huge files

    clone_dir = tempfile.mkdtemp(prefix="code-ingest-")
    logger.info("Cloning %s -> %s", repo_url, clone_dir)

    # Fix: run all work under try/finally so the temporary clone directory
    # is removed even when the clone or a read raises (previously the
    # cleanup sat at the bottom of the function and leaked on any error).
    try:
        subprocess.run(
            ["git", "clone", "--depth", "1", "--branch", branch, repo_url, clone_dir],
            check=True, capture_output=True, text=True,
        )

        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True, text=True, cwd=clone_dir, check=True,
        )
        commit_sha = result.stdout.strip()
        logger.info("Commit: %s", commit_sha[:12])

        files = []
        for root, dirs, fnames in os.walk(clone_dir):
            # Prune hidden and tooling dirs in place so os.walk never descends.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.startswith(".")]
            for fn in fnames:
                fp = os.path.join(root, fn)
                rel = os.path.relpath(fp, clone_dir)
                _, ext = os.path.splitext(fn)
                if ext.lower() not in EXTENSIONS:
                    continue
                try:
                    sz = os.path.getsize(fp)
                except OSError:
                    continue
                if sz < MIN_SIZE or sz > MAX_SIZE:
                    continue
                parts = rel.split(os.sep)
                folder = parts[0] if len(parts) > 1 else "root"
                files.append({"path": rel, "extension": ext.lower(),
                              "size_bytes": sz, "folder_context": folder})

        logger.info("Collected %d files", len(files))

        # Save file list + contents; unreadable files are skipped silently
        # (best-effort ingestion, same as before).
        output = []
        for f in files:
            full = os.path.join(clone_dir, f["path"])
            try:
                with open(full, "r", encoding="utf-8", errors="replace") as fh:
                    content = fh.read()
            except Exception:
                continue
            output.append({**f, "content": content, "commit_sha": commit_sha})

        with open(clone_data.path, "w") as fh:
            for item in output:
                fh.write(json.dumps(item, ensure_ascii=False) + "\n")
    finally:
        shutil.rmtree(clone_dir, ignore_errors=True)
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["PyYAML==6.0.1"],
)
def parse_code(
    clone_data: Input[Dataset],
    parsed_data: Output[Dataset],
):
    """Parse files into logical code chunks using language-specific parsers.

    Dispatches on file extension: Python (AST defs/classes), Go (regex over
    func/struct declarations), YAML (per-document, with a synthesized
    manifest-context comment header), Markdown (##/### headings). Files
    that yield no symbols fall back to a single whole-file chunk.

    Args:
        clone_data: Input dataset from the repo cloner (JSONL, one file per line).
        parsed_data: Output dataset of parsed chunks (JSONL, one chunk per line).
    """
    import ast as pyast
    import hashlib
    import json
    import logging
    import os
    import re

    import yaml

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("ast_parser")

    # Domain vocabulary injected for manifests under well-known paths, to
    # improve recall for natural-language queries against embedded chunks.
    PATH_ALIAS_HINTS = {
        "common/istio": [
            "istio", "service mesh", "gateway", "authorization policy",
            "peer authentication", "virtual service", "sidecar", "envoy", "mtls", "ingress",
        ],
        "common/knative": [
            "knative", "serving", "eventing", "serverless", "scale to zero",
            "activator", "revision", "service", "net istio", "webhook",
        ],
        "common/dex": [
            "dex", "oidc", "oauth2", "authentication", "identity provider",
            "connector", "login",
        ],
        "common/cert-manager": [
            "cert manager", "certificate", "issuer", "clusterissuer",
            "cainjector", "tls", "webhook",
        ],
        "applications/pipeline": [
            "kubeflow pipelines", "kfp", "pipeline api server", "deployment",
            "service", "configmap", "role", "rolebinding", "serviceaccount",
            "crd", "webhook", "scheduled workflow",
        ],
        "applications/profiles": [
            "profiles", "namespaces", "rbac", "rolebinding", "serviceaccount", "user profile",
        ],
        "tests": ["tests", "e2e", "integration", "validation", "presubmit"],
    }

    def gen_id(fp, sym, idx):
        """Stable 32-hex chunk id derived from path, symbol, and index."""
        return hashlib.sha256(f"{fp}::{sym}::{idx}".encode()).hexdigest()[:32]

    def whole_file_chunk(content, fp, sha, ctx, ext, language, id_seed, symbol):
        """Single fallback chunk covering the whole file.

        Fix: this dict was previously copy-pasted in all four parsers;
        one helper keeps the field set consistent.
        """
        return {"chunk_id": gen_id(fp, id_seed, 0), "file_path": fp,
                "extension": ext, "language": language,
                "symbol_name": symbol, "chunk_text": content,
                "start_line": 1, "end_line": content.count("\n") + 1,
                "commit_sha": sha, "folder_context": ctx}

    def split_terms(value):
        """Split camelCase / punctuated text into lower-case word tokens."""
        expanded = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", value)
        normalized = re.sub(r"[^A-Za-z0-9]+", " ", expanded)
        return [token.lower() for token in normalized.split() if token]

    def unique_terms(values, limit=24):
        """First-seen-order unique tokens across all values, capped at limit."""
        seen = set()
        ordered = []
        for value in values:
            for token in split_terms(str(value)):
                if token not in seen:
                    seen.add(token)
                    ordered.append(token)
                    if len(ordered) >= limit:
                        return ordered
        return ordered

    def summarize_list(values, limit=8):
        """Comma-joined preview of up to `limit` truthy items; '' if not a list."""
        if not isinstance(values, list):
            return ""
        flattened = [str(item) for item in values if item]
        return ", ".join(flattened[:limit])

    def get_path_aliases(fp):
        """Domain hint terms for files under PATH_ALIAS_HINTS prefixes."""
        normalized = fp.replace("\\", "/").lower()
        aliases = []
        for prefix, hints in PATH_ALIAS_HINTS.items():
            if normalized.startswith(prefix):
                aliases.extend(hints)
        return aliases

    def extract_container_names(parsed):
        """Container names from spec.template or spec.jobTemplate (CronJob)."""
        spec = parsed.get("spec")
        if not isinstance(spec, dict):
            return []
        template = spec.get("template", {})
        if isinstance(template, dict):
            template_spec = template.get("spec", {})
            if isinstance(template_spec, dict):
                containers = template_spec.get("containers", [])
                if isinstance(containers, list):
                    return [
                        str(container.get("name"))
                        for container in containers
                        if isinstance(container, dict) and container.get("name")
                    ]
        job_template = spec.get("jobTemplate", {})
        if isinstance(job_template, dict):
            nested_spec = job_template.get("spec", {})
            if isinstance(nested_spec, dict):
                nested_template = nested_spec.get("template", {})
                if isinstance(nested_template, dict):
                    nested_template_spec = nested_template.get("spec", {})
                    if isinstance(nested_template_spec, dict):
                        containers = nested_template_spec.get("containers", [])
                        if isinstance(containers, list):
                            return [
                                str(container.get("name"))
                                for container in containers
                                if isinstance(container, dict) and container.get("name")
                            ]
        return []

    def build_manifest_context(parsed, fp, ctx):
        """Render a '# key: value' comment header describing a K8s manifest.

        Every line is prefixed with '# ' so the header can be prepended to
        the YAML document without breaking later re-parsing.
        """
        metadata = parsed.get("metadata", {})
        metadata = metadata if isinstance(metadata, dict) else {}
        kind = str(parsed.get("kind", "Unknown"))
        api_version = str(parsed.get("apiVersion", "unknown"))
        name = str(metadata.get("name", "unknown"))
        namespace = str(metadata.get("namespace", "cluster-scoped"))
        path_terms = unique_terms([fp, os.path.basename(fp), ctx], limit=18)
        alias_terms = unique_terms(get_path_aliases(fp), limit=18)
        top_level_keys = summarize_list(list(parsed.keys()))
        label_keys = summarize_list(list((metadata.get("labels") or {}).keys()))
        annotation_keys = summarize_list(list((metadata.get("annotations") or {}).keys()))

        lines = [
            f"Manifest file path: {fp}",
            f"Folder context: {ctx}",
            f"Resource kind: {kind}",
            f"API version: {api_version}",
            f"Metadata name: {name}",
            f"Namespace: {namespace}",
        ]
        if path_terms:
            lines.append(f"Path hints: {' '.join(path_terms)}")
        if alias_terms:
            lines.append(f"Domain hints: {' '.join(alias_terms)}")
        if top_level_keys:
            lines.append(f"Top-level keys: {top_level_keys}")
        if label_keys:
            lines.append(f"Label keys: {label_keys}")
        if annotation_keys:
            lines.append(f"Annotation keys: {annotation_keys}")

        spec = parsed.get("spec")
        spec = spec if isinstance(spec, dict) else {}

        if kind.lower() == "kustomization" or os.path.basename(fp).lower() == "kustomization.yaml":
            resources = summarize_list(parsed.get("resources"))
            components = summarize_list(parsed.get("components"))
            bases = summarize_list(parsed.get("bases"))
            patches = summarize_list(parsed.get("patchesStrategicMerge"))
            if resources:
                lines.append(f"Kustomize resources: {resources}")
            if components:
                lines.append(f"Kustomize components: {components}")
            if bases:
                lines.append(f"Kustomize bases: {bases}")
            if patches:
                lines.append(f"Kustomize patches: {patches}")

        if kind in {"Deployment", "StatefulSet", "DaemonSet", "Job", "CronJob"}:
            container_names = summarize_list(extract_container_names(parsed))
            service_account = spec.get("serviceAccountName")
            if not service_account and isinstance(spec.get("template"), dict):
                template_spec = spec.get("template", {}).get("spec", {})
                if isinstance(template_spec, dict):
                    service_account = template_spec.get("serviceAccountName")
            if container_names:
                lines.append(f"Workload containers: {container_names}")
            if service_account:
                lines.append(f"Service account: {service_account}")

        if kind == "Service":
            service_type = spec.get("type")
            selector = spec.get("selector")
            ports = spec.get("ports")
            if service_type:
                lines.append(f"Service type: {service_type}")
            if isinstance(selector, dict) and selector:
                lines.append(f"Service selector keys: {', '.join(list(selector.keys())[:8])}")
            if isinstance(ports, list) and ports:
                port_values = [str(port.get('port')) for port in ports if isinstance(port, dict) and port.get('port')]
                if port_values:
                    lines.append(f"Service ports: {', '.join(port_values[:8])}")

        if kind == "CustomResourceDefinition":
            names = spec.get("names", {}) if isinstance(spec.get("names"), dict) else {}
            versions = spec.get("versions", [])
            if spec.get("group"):
                lines.append(f"CRD group: {spec.get('group')}")
            if names.get("kind"):
                lines.append(f"CRD served kind: {names.get('kind')}")
            if isinstance(versions, list) and versions:
                version_names = [str(version.get("name")) for version in versions if isinstance(version, dict) and version.get("name")]
                if version_names:
                    lines.append(f"CRD versions: {', '.join(version_names[:8])}")

        if kind in {"Role", "ClusterRole"}:
            # Rules live at top level for RBAC kinds; fall back accordingly.
            rules = spec.get("rules", parsed.get("rules"))
            if isinstance(rules, list) and rules:
                resource_names = []
                verbs = []
                for rule in rules[:4]:
                    if isinstance(rule, dict):
                        resource_names.extend(str(item) for item in rule.get("resources", [])[:4])
                        verbs.extend(str(item) for item in rule.get("verbs", [])[:4])
                if resource_names:
                    lines.append(f"RBAC resources: {', '.join(resource_names[:10])}")
                if verbs:
                    lines.append(f"RBAC verbs: {', '.join(verbs[:10])}")

        if kind in {"RoleBinding", "ClusterRoleBinding"}:
            role_ref = parsed.get("roleRef", {})
            subjects = parsed.get("subjects", [])
            if isinstance(role_ref, dict) and role_ref.get("name"):
                lines.append(f"Binding roleRef: {role_ref.get('name')}")
            if isinstance(subjects, list) and subjects:
                subject_names = [str(subject.get("name")) for subject in subjects if isinstance(subject, dict) and subject.get("name")]
                if subject_names:
                    lines.append(f"Binding subjects: {', '.join(subject_names[:10])}")

        if kind in {"AuthorizationPolicy", "PeerAuthentication", "VirtualService", "Gateway", "DestinationRule"}:
            selector = spec.get("selector", {})
            if isinstance(selector, dict):
                match_labels = selector.get("matchLabels", {})
                if isinstance(match_labels, dict) and match_labels:
                    lines.append(f"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}")
            gateways = spec.get("gateways")
            hosts = spec.get("hosts")
            if isinstance(gateways, list) and gateways:
                lines.append(f"Istio gateways: {', '.join(str(g) for g in gateways[:8])}")
            if isinstance(hosts, list) and hosts:
                lines.append(f"Istio hosts: {', '.join(str(h) for h in hosts[:8])}")

        return "\n".join(f"# {line}" for line in lines if line)

    def parse_python(content, fp, sha, ctx):
        """One chunk per top-level/nested def, async def, or class."""
        chunks, lines = [], content.split("\n")
        try:
            tree = pyast.parse(content)
        except SyntaxError:
            # Fix: use the same "module:" symbol prefix as the no-symbols
            # fallback below (previously the bare basename, inconsistent).
            return [whole_file_chunk(content, fp, sha, ctx, ".py", "python",
                                     "module", f"module:{os.path.basename(fp)}")]
        idx = 0
        for node in pyast.walk(tree):
            if isinstance(node, (pyast.FunctionDef, pyast.AsyncFunctionDef, pyast.ClassDef)):
                sl, el = node.lineno, node.end_lineno or node.lineno
                ct = "\n".join(lines[sl - 1:el])
                tp = "class" if isinstance(node, pyast.ClassDef) else "function"
                chunks.append({"chunk_id": gen_id(fp, node.name, idx), "file_path": fp,
                               "extension": ".py", "language": "python",
                               "symbol_name": f"{tp}:{node.name}", "chunk_text": ct,
                               "start_line": sl, "end_line": el,
                               "commit_sha": sha, "folder_context": ctx})
                idx += 1
        if not chunks:
            chunks.append(whole_file_chunk(content, fp, sha, ctx, ".py", "python",
                                           "module", f"module:{os.path.basename(fp)}"))
        return chunks

    def parse_go(content, fp, sha, ctx):
        """One chunk per top-level func or struct type declaration (regex)."""
        pat = re.compile(r"^(?:func\s+(?:\([^)]+\)\s+)?(\w+)|type\s+(\w+)\s+struct)\b", re.MULTILINE)
        matches = list(pat.finditer(content))
        if not matches:
            return [whole_file_chunk(content, fp, sha, ctx, ".go", "go",
                                     "file", f"file:{os.path.basename(fp)}")]
        chunks = []
        for i, m in enumerate(matches):
            sym = m.group(1) or m.group(2)
            # Each chunk runs from its declaration to the next declaration.
            s, e = m.start(), matches[i + 1].start() if i + 1 < len(matches) else len(content)
            ct = content[s:e].rstrip()
            sl = content[:s].count("\n") + 1
            tp = "struct" if m.group(2) else "func"
            chunks.append({"chunk_id": gen_id(fp, sym, i), "file_path": fp,
                           "extension": ".go", "language": "go",
                           "symbol_name": f"{tp}:{sym}", "chunk_text": ct,
                           "start_line": sl, "end_line": sl + ct.count("\n"),
                           "commit_sha": sha, "folder_context": ctx})
        return chunks

    def parse_yaml_file(content, fp, sha, ctx):
        """One chunk per YAML document, with a manifest-context header."""
        ext = os.path.splitext(fp)[1].lower()
        docs = content.split("\n---")
        chunks = []
        for idx, doc in enumerate(docs):
            doc = doc.strip()
            if not doc:
                continue
            try:
                parsed = yaml.safe_load(doc)
            except yaml.YAMLError:
                parsed = None
            if isinstance(parsed, dict):
                kind = parsed.get("kind", "Unknown")
                md = parsed.get("metadata", {})
                name = md.get("name", "unknown") if isinstance(md, dict) else "unknown"
                sym = f"{kind}:{name}"
                manifest_context = build_manifest_context(parsed, fp, ctx)
                chunk_body = f"{manifest_context}\n\n{doc}" if manifest_context else doc
            else:
                sym = f"fragment:{idx}"
                chunk_body = doc
            if idx == 0:
                sl = 1
            else:
                pre = "\n---".join(docs[:idx])
                # Fix: +1 for the lines in `pre` plus +1 for the `---`
                # separator line itself (previously off by one for every
                # document after the first).
                sl = pre.count("\n") + 2
            chunks.append({"chunk_id": gen_id(fp, sym, idx), "file_path": fp,
                           "extension": ext, "language": "yaml",
                           "symbol_name": sym, "chunk_text": chunk_body,
                           "start_line": sl, "end_line": sl + doc.count("\n"),
                           "commit_sha": sha, "folder_context": ctx})
        return chunks or [whole_file_chunk(content, fp, sha, ctx, ext, "yaml",
                                           "file", f"file:{os.path.basename(fp)}")]

    def parse_md(content, fp, sha, ctx):
        """One chunk per ##/### heading section."""
        pat = re.compile(r"^(#{2,3})\s+(.+)$", re.MULTILINE)
        matches = list(pat.finditer(content))
        if not matches:
            return [whole_file_chunk(content, fp, sha, ctx, ".md", "markdown",
                                     "doc", f"doc:{os.path.basename(fp)}")]
        chunks = []
        for i, m in enumerate(matches):
            h = m.group(2).strip()
            s = m.start()
            e = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            text = content[s:e].strip()
            sl = content[:s].count("\n") + 1
            chunks.append({"chunk_id": gen_id(fp, h, i), "file_path": fp,
                           "extension": ".md", "language": "markdown",
                           "symbol_name": f"heading:{h[:100]}", "chunk_text": text,
                           "start_line": sl, "end_line": sl + text.count("\n"),
                           "commit_sha": sha, "folder_context": ctx})
        return chunks

    PARSERS = {".py": parse_python, ".go": parse_go,
               ".yaml": parse_yaml_file, ".yml": parse_yaml_file, ".md": parse_md}

    files = []
    with open(clone_data.path) as f:
        for line in f:
            if line.strip():
                files.append(json.loads(line))

    all_chunks = []
    for fi in files:
        parser = PARSERS.get(fi["extension"])
        if not parser:
            continue
        try:
            chunks = parser(fi["content"], fi["path"], fi["commit_sha"], fi["folder_context"])
            all_chunks.extend(chunks)
        except Exception as ex:
            # Best-effort: a single unparseable file must not kill the run.
            logger.warning("Error parsing %s: %s", fi["path"], ex)

    logger.info("Parsed %d chunks from %d files", len(all_chunks), len(files))

    with open(parsed_data.path, "w") as f:
        for c in all_chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["tiktoken==0.7.0"],
)
def chunk_code(
    parsed_data: Input[Dataset],
    chunked_data: Output[Dataset],
):
    """Post-process parsed chunks with token limits and context headers.

    Prepends a context header (file / symbol / language / folder plus
    lower-cased path hints) to each chunk, drops chunks under MIN_T tokens,
    and splits chunks over MAX_T tokens on blank lines. Also populates
    ``chunk_index`` (per-file ordinal), which the loader stores.

    Args:
        parsed_data: Input dataset of parsed chunks (JSONL).
        chunked_data: Output dataset of token-bounded chunks (JSONL).
    """
    import hashlib
    import json
    import logging
    import re

    import tiktoken

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("chunker")

    enc = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text):
        """Token count under the cl100k_base encoding."""
        return len(enc.encode(text))

    MIN_T, MAX_T = 50, 512  # token bounds per emitted chunk

    def build_path_hints(chunk):
        """Lower-cased searchable terms from path/folder/symbol."""
        raw = " ".join(str(chunk.get(key, "")) for key in ("file_path", "folder_context", "symbol_name"))
        expanded = raw.replace("/", " ").replace("_", " ").replace("-", " ")
        # Insert a space at every lower->upper boundary (camelCase -> camel Case).
        expanded = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", expanded)
        return " ".join(expanded.split()).lower()

    raw = []
    with open(parsed_data.path) as f:
        for line in f:
            if line.strip():
                raw.append(json.loads(line))

    per_file_index = {}  # file_path -> next chunk ordinal within that file

    def assign_index(chunk):
        """Fix: populate chunk_index — it was never set, so the loader and
        Milvus schema always stored 0 and intra-file ordering was lost."""
        fp = chunk.get("file_path", "")
        chunk["chunk_index"] = per_file_index.get(fp, 0)
        per_file_index[fp] = chunk["chunk_index"] + 1

    processed = []
    for chunk in raw:
        header = (
            f"# File: {chunk.get('file_path', '?')} | Symbol: {chunk.get('symbol_name', '?')} "
            f"| Lang: {chunk.get('language', '?')} | Folder: {chunk.get('folder_context', '?')}"
        )
        path_hints = build_path_hints(chunk)
        if path_hints:
            header = f"{header}\n# Path Hints: {path_hints}"
        full = f"{header}\n\n{chunk['chunk_text']}"
        tc = count_tokens(full)

        if tc < MIN_T:
            continue

        if tc <= MAX_T:
            chunk["chunk_text"] = full[:8192]  # hard cap to the Milvus VARCHAR length
            chunk["token_count"] = tc
            assign_index(chunk)
            processed.append(chunk)
        else:
            # Split oversized chunks on blank lines. NOTE: a single paragraph
            # longer than MAX_T is kept whole (best effort, not re-split).
            parts = full.split("\n\n")
            cur, subs = "", []
            for p in parts:
                cand = cur + "\n\n" + p if cur else p
                if count_tokens(cand) > MAX_T:
                    if cur.strip():
                        subs.append(cur.strip())
                    cur = p
                else:
                    cur = cand
            if cur.strip():
                subs.append(cur.strip())

            for si, sub in enumerate(subs):
                st = count_tokens(sub)
                if st < MIN_T:
                    continue
                sc = chunk.copy()
                # Derive a new stable id from the parent id + sub index.
                sc["chunk_id"] = hashlib.sha256(f"{chunk['chunk_id']}::{si}".encode()).hexdigest()[:32]
                sc["chunk_text"] = sub[:8192]
                sc["token_count"] = st
                assign_index(sc)
                processed.append(sc)

    logger.info("Chunked %d -> %d chunks", len(raw), len(processed))

    with open(chunked_data.path, "w") as f:
        for c in processed:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["sentence-transformers==2.7.0", "torch==2.3.0"],
)
def embed_code(
    chunked_data: Input[Dataset],
    embedding_model: str,
    embedded_data: Output[Dataset],
):
    """Embed code chunks using a configurable SentenceTransformer model.

    Args:
        chunked_data: Input dataset of chunked code (JSONL).
        embedding_model: Model name for embeddings.
        embedded_data: Output dataset with an "embedding" list added per chunk.
    """
    import json
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("embedder")

    chunks = []
    with open(chunked_data.path) as f:
        for line in f:
            if line.strip():
                chunks.append(json.loads(line))

    # Fix: short-circuit on empty input so we never download/load the
    # (heavy) model just to embed nothing; still emit a valid empty output.
    if not chunks:
        logger.warning("No chunks to embed; writing empty output")
        open(embedded_data.path, "w").close()
        return

    from sentence_transformers import SentenceTransformer

    logger.info("Embedding %d code chunks with %s", len(chunks), embedding_model)
    model = SentenceTransformer(embedding_model)

    texts = [c["chunk_text"] for c in chunks]
    bs = 32  # batch size; bounds peak memory during encoding
    all_embs = []
    for i in range(0, len(texts), bs):
        batch = texts[i:i + bs]
        embs = model.encode(batch, show_progress_bar=False)
        all_embs.extend([e.tolist() for e in embs])
        logger.info("Batch %d/%d", i // bs + 1, (len(texts) + bs - 1) // bs)

    for c, e in zip(chunks, all_embs):
        c["embedding"] = e

    with open(embedded_data.path, "w") as f:
        for c in chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["pymilvus==2.4.0"],
)
def load_code(
    embedded_data: Input[Dataset],
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    embedding_dim: int,
):
    """Load embedded code chunks into the Milvus code collection.

    Creates the collection (with an HNSW/COSINE index) on first use, then
    upserts all chunks in batches. Field values are truncated to the schema's
    VARCHAR limits before insertion.

    Args:
        embedded_data: Input dataset with embedded chunks (JSONL).
        milvus_host: Milvus server host.
        milvus_port: Milvus server port.
        collection_name: Target collection name.
        embedding_dim: Vector dimension (used only when creating the collection).
    """
    import json
    import logging

    from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                          connections, utility)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("loader")

    connections.connect("default", host=milvus_host, port=milvus_port)

    if not utility.has_collection(collection_name):
        # Field lengths mirror the truncation applied when building rows below.
        fields = [
            FieldSchema("chunk_id", DataType.VARCHAR, max_length=128, is_primary=True),
            FieldSchema("file_path", DataType.VARCHAR, max_length=512),
            FieldSchema("extension", DataType.VARCHAR, max_length=16),
            FieldSchema("language", DataType.VARCHAR, max_length=32),
            FieldSchema("symbol_name", DataType.VARCHAR, max_length=256),
            FieldSchema("folder_context", DataType.VARCHAR, max_length=128),
            FieldSchema("chunk_text", DataType.VARCHAR, max_length=8192),
            FieldSchema("start_line", DataType.INT64),
            FieldSchema("end_line", DataType.INT64),
            FieldSchema("commit_sha", DataType.VARCHAR, max_length=64),
            FieldSchema("chunk_index", DataType.INT64),
            FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=embedding_dim),
        ]
        schema = CollectionSchema(fields, "Kubeflow manifests code chunks")
        collection = Collection(collection_name, schema)
        collection.create_index("embedding", {
            "metric_type": "COSINE", "index_type": "HNSW",
            "params": {"M": 16, "efConstruction": 200},
        })
    else:
        collection = Collection(collection_name)

    collection.load()

    chunks = []
    with open(embedded_data.path) as f:
        for line in f:
            if line.strip():
                chunks.append(json.loads(line))

    # Fix: make the empty-input case explicit instead of falling through
    # to a zero-iteration upsert loop.
    if not chunks:
        logger.warning("No chunks to load into %s", collection_name)
        return

    rows = []
    for c in chunks:
        rows.append({
            "chunk_id": str(c["chunk_id"])[:128],
            "file_path": str(c.get("file_path", ""))[:512],
            "extension": str(c.get("extension", ""))[:16],
            "language": str(c.get("language", ""))[:32],
            "symbol_name": str(c.get("symbol_name", ""))[:256],
            "folder_context": str(c.get("folder_context", ""))[:128],
            "chunk_text": str(c.get("chunk_text", ""))[:8192],
            "start_line": int(c.get("start_line", 0)),
            "end_line": int(c.get("end_line", 0)),
            "commit_sha": str(c.get("commit_sha", ""))[:64],
            "chunk_index": int(c.get("chunk_index", 0)),
            "embedding": c["embedding"],
        })

    bs = 100  # upsert batch size
    inserted = 0
    for i in range(0, len(rows), bs):
        batch = rows[i:i + bs]
        # Upsert (not insert) so re-runs with the same chunk_ids are idempotent.
        collection.upsert(batch)
        inserted += len(batch)

    collection.flush()
    logger.info("Loaded %d chunks into %s. Total: %d",
                inserted, collection_name, collection.num_entities)
"end_line": int(c.get("end_line", 0)), + "commit_sha": str(c.get("commit_sha", ""))[:64], + "chunk_index": int(c.get("chunk_index", 0)), + "embedding": c["embedding"], + }) + + bs = 100 + inserted = 0 + for i in range(0, len(rows), bs): + batch = rows[i:i + bs] + collection.upsert(batch) + inserted += len(batch) + + collection.flush() + logger.info("Loaded %d chunks into %s. Total: %d", + inserted, collection_name, collection.num_entities) + + +# ─── Pipeline Definition ──────────────────────────────────────────────────── + +@dsl.pipeline( + name="code-ingestion-pipeline", + description="Clone kubeflow/manifests, parse code by language, embed, and load into Milvus", +) +def code_ingestion_pipeline( + repo_url: str = "https://github.com/kubeflow/manifests", + branch: str = "master", + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + milvus_host: str = "localhost", + milvus_port: str = "19530", + collection_name: str = "code_collection", + embedding_dim: int = 384, +): + """Full code ingestion pipeline: clone -> parse -> chunk -> embed -> load.""" + + clone_task = clone_repo(repo_url=repo_url, branch=branch) + clone_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + parse_task = parse_code(clone_data=clone_task.outputs["clone_data"]) + parse_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + chunk_task = chunk_code(parsed_data=parse_task.outputs["parsed_data"]) + chunk_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + embed_task = embed_code( + chunked_data=chunk_task.outputs["chunked_data"], + embedding_model=embedding_model, + ) + embed_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + load_task = load_code( + embedded_data=embed_task.outputs["embedded_data"], + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name=collection_name, + embedding_dim=embedding_dim, + ) + load_task.set_retry(num_retries=3, 
backoff_duration="30s", backoff_factor=2.0) + + +# ─── Parent Pipeline (Composes Both) ──────────────────────────────────────── + +if docs_ingestion_pipeline is not None: + @dsl.pipeline( + name="full-ingestion-pipeline", + description="Run both docs and code ingestion pipelines in parallel", + ) + def full_ingestion_pipeline( + # Docs params + docs_base_url: str = "https://www.kubeflow.org", + docs_crawl_delay: float = 1.0, + docs_max_pages: int = 0, + chunk_size: int = 500, + chunk_overlap: int = 50, + # Code params + code_repo_url: str = "https://github.com/kubeflow/manifests", + code_branch: str = "master", + # Shared params + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + milvus_host: str = "localhost", + milvus_port: str = "19530", + embedding_dim: int = 384, + ): + """Parent pipeline that runs docs + code ingestion in parallel.""" + docs_ingestion_pipeline( + base_url=docs_base_url, + crawl_delay=docs_crawl_delay, + max_pages=docs_max_pages, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + embedding_model=embedding_model, + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name="docs_collection", + embedding_dim=embedding_dim, + ) + code_ingestion_pipeline( + repo_url=code_repo_url, + branch=code_branch, + embedding_model=embedding_model, + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name="code_collection", + embedding_dim=embedding_dim, + ) +else: + full_ingestion_pipeline = None + + + + +# ─── Main ──────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + if "--local" in sys.argv: + print("Running code ingestion pipeline locally...") + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) + from pipelines.code_ingestion.components.repo_cloner import clone_repo as do_clone + from pipelines.code_ingestion.components.ast_parser import parse_all_files + from pipelines.code_ingestion.components.chunker import process_chunks + from 
pipelines.code_ingestion.components.embedder import embed_code_chunks + from pipelines.code_ingestion.components.loader import load_to_milvus + import logging, shutil + logging.basicConfig(level=logging.INFO) + + result = do_clone() + chunks = parse_all_files(result["repo_dir"], result["file_list"], result["commit_sha"]) + processed = process_chunks(chunks) + embedded = embed_code_chunks(processed) + summary = load_to_milvus(embedded) + print(f"Pipeline complete: {summary}") + shutil.rmtree(result["repo_dir"], ignore_errors=True) + else: + output_path = os.path.join(os.path.dirname(__file__), "pipeline.yaml") + kfp.compiler.Compiler().compile( + pipeline_func=code_ingestion_pipeline, + package_path=output_path, + ) + print(f"Compiled code ingestion pipeline to: {output_path}") + + if full_ingestion_pipeline is not None: + full_output_path = os.path.join( + os.path.dirname(__file__), + "full_pipeline.yaml", + ) + kfp.compiler.Compiler().compile( + pipeline_func=full_ingestion_pipeline, + package_path=full_output_path, + ) + print(f"Compiled full ingestion pipeline to: {full_output_path}") + else: + print( + "Skipped full ingestion pipeline compilation because the docs " + "pipeline import was unavailable." 
+ ) diff --git a/pipelines/code_ingestion/pipeline.yaml b/pipelines/code_ingestion/pipeline.yaml new file mode 100644 index 0000000..97e4a7e --- /dev/null +++ b/pipelines/code_ingestion/pipeline.yaml @@ -0,0 +1,790 @@ +# PIPELINE DEFINITION +# Name: code-ingestion-pipeline +# Description: Clone kubeflow/manifests, parse code by language, embed, and load into Milvus +# Inputs: +# branch: str [Default: 'master'] +# collection_name: str [Default: 'code_collection'] +# embedding_dim: int [Default: 384.0] +# embedding_model: str [Default: 'sentence-transformers/all-MiniLM-L6-v2'] +# milvus_host: str [Default: 'localhost'] +# milvus_port: str [Default: '19530'] +# repo_url: str [Default: 'https://github.com/kubeflow/manifests'] +components: + comp-chunk-code: + executorLabel: exec-chunk-code + inputDefinitions: + artifacts: + parsed_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of parsed chunks. + outputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-clone-repo: + executorLabel: exec-clone-repo + inputDefinitions: + parameters: + branch: + description: Branch name to clone. + parameterType: STRING + repo_url: + description: Repository URL to clone. + parameterType: STRING + outputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-embed-code: + executorLabel: exec-embed-code + inputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of chunked code. + parameters: + embedding_model: + description: Model name for embeddings. 
+ parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-load-code: + executorLabel: exec-load-code + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset with embedded chunks. + parameters: + collection_name: + description: Target collection name. + parameterType: STRING + embedding_dim: + description: Vector dimension. + parameterType: NUMBER_INTEGER + milvus_host: + description: Milvus server host. + parameterType: STRING + milvus_port: + description: Milvus server port. + parameterType: STRING + comp-parse-code: + executorLabel: exec-parse-code + inputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset from repo cloner. + outputDefinitions: + artifacts: + parsed_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-chunk-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_code + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'tiktoken==0.7.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_code(\n parsed_data: Input[Dataset],\n chunked_data:\ + \ Output[Dataset],\n):\n \"\"\"Post-process parsed chunks with token\ + \ limits and context headers.\n\n Args:\n parsed_data: Input dataset\ + \ of parsed chunks.\n chunked_data: Output dataset of token-bounded\ + \ chunks.\n \"\"\"\n import hashlib\n import json\n import logging\n\ + \n import tiktoken\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"chunker\")\n\n enc = tiktoken.get_encoding(\"\ + cl100k_base\")\n count = lambda t: len(enc.encode(t))\n\n MIN_T, MAX_T\ + \ = 50, 512\n\n def build_path_hints(chunk):\n raw = \" \".join(str(chunk.get(key,\ + \ \"\")) for key in (\"file_path\", \"folder_context\", \"symbol_name\"\ + ))\n expanded = raw.replace(\"/\", \" \").replace(\"_\", \" \").replace(\"\ + -\", \" \")\n expanded = \"\".join(\n (\n \ + \ f\" {char}\" if index > 0 and char.isupper() and expanded[index - 1].islower()\ + \ else char\n )\n for index, char in enumerate(expanded)\n\ + \ )\n return \" \".join(expanded.split()).lower()\n\n raw\ + \ = []\n with open(parsed_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n raw.append(json.loads(line))\n\ + \n processed = []\n for chunk in raw:\n header = (\n \ + \ 
f\"# File: {chunk.get('file_path', '?')} | Symbol: {chunk.get('symbol_name',\ + \ '?')} \"\n f\"| Lang: {chunk.get('language', '?')} | Folder:\ + \ {chunk.get('folder_context', '?')}\"\n )\n path_hints =\ + \ build_path_hints(chunk)\n if path_hints:\n header =\ + \ f\"{header}\\n# Path Hints: {path_hints}\"\n full = f\"{header}\\\ + n\\n{chunk['chunk_text']}\"\n tc = count(full)\n\n if tc <\ + \ MIN_T:\n continue\n\n if tc <= MAX_T:\n chunk[\"\ + chunk_text\"] = full[:8192]\n chunk[\"token_count\"] = tc\n \ + \ processed.append(chunk)\n else:\n # Split\ + \ oversized\n parts = full.split(\"\\n\\n\")\n cur,\ + \ subs = \"\", []\n for p in parts:\n cand = cur\ + \ + \"\\n\\n\" + p if cur else p\n if count(cand) > MAX_T:\n\ + \ if cur.strip():\n subs.append(cur.strip())\n\ + \ cur = p\n else:\n \ + \ cur = cand\n if cur.strip():\n subs.append(cur.strip())\n\ + \n for si, sub in enumerate(subs):\n st = count(sub)\n\ + \ if st < MIN_T:\n continue\n \ + \ sc = chunk.copy()\n sc[\"chunk_id\"] = hashlib.sha256(f\"\ + {chunk['chunk_id']}::{si}\".encode()).hexdigest()[:32]\n \ + \ sc[\"chunk_text\"] = sub[:8192]\n sc[\"token_count\"] =\ + \ st\n processed.append(sc)\n\n logger.info(\"Chunked\ + \ %d -> %d chunks\", len(raw), len(processed))\n\n with open(chunked_data.path,\ + \ \"w\") as f:\n for c in processed:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim + exec-clone-repo: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - clone_repo + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'gitpython==3.1.43'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef clone_repo(\n repo_url: str,\n branch: str,\n clone_data:\ + \ Output[Dataset],\n):\n \"\"\"Clone a git repository and collect file\ + \ metadata.\n\n Args:\n repo_url: Repository URL to clone.\n \ + \ branch: Branch name to clone.\n clone_data: Output dataset\ + \ artifact.\n \"\"\"\n import json\n import logging\n import\ + \ os\n import subprocess\n import tempfile\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"repo_cloner\")\n\n SKIP_DIRS = {\"\ + .git\", \"__pycache__\", \"node_modules\", \".tox\", \".mypy_cache\"}\n\ + \ EXTENSIONS = {\".py\", \".go\", \".yaml\", \".yml\", \".md\"}\n \ + \ MIN_SIZE, MAX_SIZE = 200, 100_000\n\n clone_dir = tempfile.mkdtemp(prefix=\"\ + code-ingest-\")\n logger.info(\"Cloning %s -> %s\", repo_url, clone_dir)\n\ + \n subprocess.run(\n [\"git\", \"clone\", \"--depth\", \"1\",\ + \ \"--branch\", branch, repo_url, clone_dir],\n check=True, capture_output=True,\ + \ text=True,\n )\n\n result = subprocess.run(\n [\"git\", \"\ + rev-parse\", \"HEAD\"],\n capture_output=True, text=True, cwd=clone_dir,\ + \ check=True,\n )\n commit_sha = result.stdout.strip()\n logger.info(\"\ + Commit: %s\", commit_sha[:12])\n\n files = []\n for root, dirs, fnames\ + \ in 
os.walk(clone_dir):\n dirs[:] = [d for d in dirs if d not in\ + \ SKIP_DIRS and not d.startswith(\".\")]\n for fn in fnames:\n \ + \ fp = os.path.join(root, fn)\n rel = os.path.relpath(fp,\ + \ clone_dir)\n _, ext = os.path.splitext(fn)\n if\ + \ ext.lower() not in EXTENSIONS:\n continue\n \ + \ try:\n sz = os.path.getsize(fp)\n except OSError:\n\ + \ continue\n if sz < MIN_SIZE or sz > MAX_SIZE:\n\ + \ continue\n parts = rel.split(os.sep)\n \ + \ folder = parts[0] if len(parts) > 1 else \"root\"\n files.append({\"\ + path\": rel, \"extension\": ext.lower(),\n \"size_bytes\"\ + : sz, \"folder_context\": folder})\n\n logger.info(\"Collected %d files\"\ + , len(files))\n\n # Save file list + contents\n output = []\n for\ + \ f in files:\n full = os.path.join(clone_dir, f[\"path\"])\n \ + \ try:\n with open(full, \"r\", encoding=\"utf-8\", errors=\"\ + replace\") as fh:\n content = fh.read()\n except Exception:\n\ + \ continue\n output.append({**f, \"content\": content,\ + \ \"commit_sha\": commit_sha})\n\n with open(clone_data.path, \"w\")\ + \ as fh:\n for item in output:\n fh.write(json.dumps(item,\ + \ ensure_ascii=False) + \"\\n\")\n\n # Cleanup\n import shutil\n \ + \ shutil.rmtree(clone_dir, ignore_errors=True)\n\n" + image: python:3.11-slim + exec-embed-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - embed_code + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers==2.7.0'\ + \ 'torch==2.3.0' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef embed_code(\n chunked_data: Input[Dataset],\n embedding_model:\ + \ str,\n embedded_data: Output[Dataset],\n):\n \"\"\"Embed code chunks\ + \ using configurable model.\n\n Args:\n chunked_data: Input dataset\ + \ of chunked code.\n embedding_model: Model name for embeddings.\n\ + \ embedded_data: Output dataset with embeddings.\n \"\"\"\n \ + \ import json\n import logging\n\n from sentence_transformers import\ + \ SentenceTransformer\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"embedder\")\n\n chunks = []\n with\ + \ open(chunked_data.path) as f:\n for line in f:\n if\ + \ line.strip():\n chunks.append(json.loads(line))\n\n \ + \ logger.info(\"Embedding %d code chunks with %s\", len(chunks), embedding_model)\n\ + \ model = SentenceTransformer(embedding_model)\n\n texts = [c[\"chunk_text\"\ + ] for c in chunks]\n bs = 32\n all_embs = []\n for i in range(0,\ + \ len(texts), bs):\n batch = texts[i:i + bs]\n embs = model.encode(batch,\ + \ show_progress_bar=False)\n all_embs.extend([e.tolist() for e in\ + \ embs])\n logger.info(\"Batch %d/%d\", i // bs + 1, (len(texts)\ + \ + bs - 1) // bs)\n\n for c, e in zip(chunks, all_embs):\n c[\"\ + embedding\"] = e\n\n with 
open(embedded_data.path, \"w\") as f:\n \ + \ for c in chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-load-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - load_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus==2.4.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef load_code(\n embedded_data: Input[Dataset],\n milvus_host:\ + \ str,\n milvus_port: str,\n collection_name: str,\n embedding_dim:\ + \ int,\n):\n \"\"\"Load embedded code chunks into Milvus code_collection.\n\ + \n Args:\n embedded_data: Input dataset with embedded chunks.\n\ + \ milvus_host: Milvus server host.\n milvus_port: Milvus server\ + \ port.\n collection_name: Target collection name.\n embedding_dim:\ + \ Vector dimension.\n \"\"\"\n import json\n import logging\n\n\ + \ from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,\n\ + \ connections, utility)\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"loader\")\n\n connections.connect(\"\ + default\", host=milvus_host, port=milvus_port)\n\n if not utility.has_collection(collection_name):\n\ + \ fields = [\n FieldSchema(\"chunk_id\", DataType.VARCHAR,\ + \ max_length=128, is_primary=True),\n FieldSchema(\"file_path\"\ + 
, DataType.VARCHAR, max_length=512),\n FieldSchema(\"extension\"\ + , DataType.VARCHAR, max_length=16),\n FieldSchema(\"language\"\ + , DataType.VARCHAR, max_length=32),\n FieldSchema(\"symbol_name\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"folder_context\"\ + , DataType.VARCHAR, max_length=128),\n FieldSchema(\"chunk_text\"\ + , DataType.VARCHAR, max_length=8192),\n FieldSchema(\"start_line\"\ + , DataType.INT64),\n FieldSchema(\"end_line\", DataType.INT64),\n\ + \ FieldSchema(\"commit_sha\", DataType.VARCHAR, max_length=64),\n\ + \ FieldSchema(\"embedding\", DataType.FLOAT_VECTOR, dim=embedding_dim),\n\ + \ ]\n schema = CollectionSchema(fields, \"Kubeflow manifests\ + \ code chunks\")\n collection = Collection(collection_name, schema)\n\ + \ collection.create_index(\"embedding\", {\n \"metric_type\"\ + : \"COSINE\", \"index_type\": \"HNSW\",\n \"params\": {\"M\"\ + : 16, \"efConstruction\": 200},\n })\n else:\n collection\ + \ = Collection(collection_name)\n\n collection.load()\n\n chunks =\ + \ []\n with open(embedded_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n chunks.append(json.loads(line))\n\ + \n rows = []\n for c in chunks:\n rows.append({\n \ + \ \"chunk_id\": str(c[\"chunk_id\"])[:128],\n \"file_path\"\ + : str(c.get(\"file_path\", \"\"))[:512],\n \"extension\": str(c.get(\"\ + extension\", \"\"))[:16],\n \"language\": str(c.get(\"language\"\ + , \"\"))[:32],\n \"symbol_name\": str(c.get(\"symbol_name\",\ + \ \"\"))[:256],\n \"folder_context\": str(c.get(\"folder_context\"\ + , \"\"))[:128],\n \"chunk_text\": str(c.get(\"chunk_text\", \"\ + \"))[:8192],\n \"start_line\": int(c.get(\"start_line\", 0)),\n\ + \ \"end_line\": int(c.get(\"end_line\", 0)),\n \"\ + commit_sha\": str(c.get(\"commit_sha\", \"\"))[:64],\n \"embedding\"\ + : c[\"embedding\"],\n })\n\n bs = 100\n inserted = 0\n for\ + \ i in range(0, len(rows), bs):\n batch = rows[i:i + bs]\n \ + \ collection.upsert(batch)\n inserted += len(batch)\n\n 
collection.flush()\n\ + \ logger.info(\"Loaded %d chunks into %s. Total: %d\",\n \ + \ inserted, collection_name, collection.num_entities)\n\n" + image: python:3.11-slim + exec-parse-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - parse_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'PyYAML==6.0.1'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef parse_code(\n clone_data: Input[Dataset],\n parsed_data:\ + \ Output[Dataset],\n):\n \"\"\"Parse files into logical code chunks using\ + \ language-specific parsers.\n\n Args:\n clone_data: Input dataset\ + \ from repo cloner.\n parsed_data: Output dataset of parsed chunks.\n\ + \ \"\"\"\n import ast as pyast\n import hashlib\n import json\n\ + \ import logging\n import os\n import re\n\n import yaml\n\n\ + \ logging.basicConfig(level=logging.INFO)\n logger = logging.getLogger(\"\ + ast_parser\")\n\n PATH_ALIAS_HINTS = {\n \"common/istio\": [\n\ + \ \"istio\", \"service mesh\", \"gateway\", \"authorization policy\"\ + ,\n \"peer authentication\", \"virtual service\", \"sidecar\"\ + , \"envoy\", \"mtls\", \"ingress\",\n ],\n \"common/knative\"\ + : [\n \"knative\", \"serving\", \"eventing\", \"serverless\"\ + , \"scale to zero\",\n \"activator\", \"revision\", \"service\"\ + , \"net istio\", \"webhook\",\n ],\n 
\"common/dex\": [\n \ + \ \"dex\", \"oidc\", \"oauth2\", \"authentication\", \"identity\ + \ provider\",\n \"connector\", \"login\",\n ],\n \ + \ \"common/cert-manager\": [\n \"cert manager\", \"certificate\"\ + , \"issuer\", \"clusterissuer\",\n \"cainjector\", \"tls\", \"\ + webhook\",\n ],\n \"applications/pipeline\": [\n \ + \ \"kubeflow pipelines\", \"kfp\", \"pipeline api server\", \"deployment\"\ + ,\n \"service\", \"configmap\", \"role\", \"rolebinding\", \"\ + serviceaccount\",\n \"crd\", \"webhook\", \"scheduled workflow\"\ + ,\n ],\n \"applications/profiles\": [\n \"profiles\"\ + , \"namespaces\", \"rbac\", \"rolebinding\", \"serviceaccount\", \"user\ + \ profile\",\n ],\n \"tests\": [\"tests\", \"e2e\", \"integration\"\ + , \"validation\", \"presubmit\"],\n }\n\n def gen_id(fp, sym, idx):\n\ + \ return hashlib.sha256(f\"{fp}::{sym}::{idx}\".encode()).hexdigest()[:32]\n\ + \n def split_terms(value):\n expanded = re.sub(r\"([a-z0-9])([A-Z])\"\ + , r\"\\1 \\2\", value)\n normalized = re.sub(r\"[^A-Za-z0-9]+\",\ + \ \" \", expanded)\n return [token.lower() for token in normalized.split()\ + \ if token]\n\n def unique_terms(values, limit=24):\n seen = set()\n\ + \ ordered = []\n for value in values:\n for token\ + \ in split_terms(str(value)):\n if token not in seen:\n \ + \ seen.add(token)\n ordered.append(token)\n\ + \ if len(ordered) >= limit:\n \ + \ return ordered\n return ordered\n\n def summarize_list(values,\ + \ limit=8):\n if not isinstance(values, list):\n return\ + \ \"\"\n flattened = [str(item) for item in values if item]\n \ + \ return \", \".join(flattened[:limit])\n\n def get_path_aliases(fp):\n\ + \ normalized = fp.replace(\"\\\\\", \"/\").lower()\n aliases\ + \ = []\n for prefix, hints in PATH_ALIAS_HINTS.items():\n \ + \ if normalized.startswith(prefix):\n aliases.extend(hints)\n\ + \ return aliases\n\n def extract_container_names(parsed):\n \ + \ spec = parsed.get(\"spec\")\n if not isinstance(spec, dict):\n\ + \ return []\n template = 
spec.get(\"template\", {})\n\ + \ if isinstance(template, dict):\n template_spec = template.get(\"\ + spec\", {})\n if isinstance(template_spec, dict):\n \ + \ containers = template_spec.get(\"containers\", [])\n \ + \ if isinstance(containers, list):\n return [\n \ + \ str(container.get(\"name\"))\n \ + \ for container in containers\n if isinstance(container,\ + \ dict) and container.get(\"name\")\n ]\n job_template\ + \ = spec.get(\"jobTemplate\", {})\n if isinstance(job_template, dict):\n\ + \ nested_spec = job_template.get(\"spec\", {})\n if\ + \ isinstance(nested_spec, dict):\n nested_template = nested_spec.get(\"\ + template\", {})\n if isinstance(nested_template, dict):\n\ + \ nested_template_spec = nested_template.get(\"spec\"\ + , {})\n if isinstance(nested_template_spec, dict):\n\ + \ containers = nested_template_spec.get(\"containers\"\ + , [])\n if isinstance(containers, list):\n \ + \ return [\n str(container.get(\"\ + name\"))\n for container in containers\n\ + \ if isinstance(container, dict) and container.get(\"\ + name\")\n ]\n return []\n\n def build_manifest_context(parsed,\ + \ fp, ctx):\n metadata = parsed.get(\"metadata\", {})\n metadata\ + \ = metadata if isinstance(metadata, dict) else {}\n kind = str(parsed.get(\"\ + kind\", \"Unknown\"))\n api_version = str(parsed.get(\"apiVersion\"\ + , \"unknown\"))\n name = str(metadata.get(\"name\", \"unknown\"))\n\ + \ namespace = str(metadata.get(\"namespace\", \"cluster-scoped\"\ + ))\n path_terms = unique_terms([fp, os.path.basename(fp), ctx], limit=18)\n\ + \ alias_terms = unique_terms(get_path_aliases(fp), limit=18)\n \ + \ top_level_keys = summarize_list(list(parsed.keys()))\n label_keys\ + \ = summarize_list(list((metadata.get(\"labels\") or {}).keys()))\n \ + \ annotation_keys = summarize_list(list((metadata.get(\"annotations\"\ + ) or {}).keys()))\n\n lines = [\n f\"Manifest file path:\ + \ {fp}\",\n f\"Folder context: {ctx}\",\n f\"Resource\ + \ kind: {kind}\",\n f\"API version: 
{api_version}\",\n \ + \ f\"Metadata name: {name}\",\n f\"Namespace: {namespace}\"\ + ,\n ]\n if path_terms:\n lines.append(f\"Path hints:\ + \ {' '.join(path_terms)}\")\n if alias_terms:\n lines.append(f\"\ + Domain hints: {' '.join(alias_terms)}\")\n if top_level_keys:\n \ + \ lines.append(f\"Top-level keys: {top_level_keys}\")\n \ + \ if label_keys:\n lines.append(f\"Label keys: {label_keys}\"\ + )\n if annotation_keys:\n lines.append(f\"Annotation keys:\ + \ {annotation_keys}\")\n\n spec = parsed.get(\"spec\")\n spec\ + \ = spec if isinstance(spec, dict) else {}\n\n if kind.lower() ==\ + \ \"kustomization\" or os.path.basename(fp).lower() == \"kustomization.yaml\"\ + :\n resources = summarize_list(parsed.get(\"resources\"))\n \ + \ components = summarize_list(parsed.get(\"components\"))\n \ + \ bases = summarize_list(parsed.get(\"bases\"))\n patches\ + \ = summarize_list(parsed.get(\"patchesStrategicMerge\"))\n if\ + \ resources:\n lines.append(f\"Kustomize resources: {resources}\"\ + )\n if components:\n lines.append(f\"Kustomize\ + \ components: {components}\")\n if bases:\n lines.append(f\"\ + Kustomize bases: {bases}\")\n if patches:\n lines.append(f\"\ + Kustomize patches: {patches}\")\n\n if kind in {\"Deployment\", \"\ + StatefulSet\", \"DaemonSet\", \"Job\", \"CronJob\"}:\n container_names\ + \ = summarize_list(extract_container_names(parsed))\n service_account\ + \ = spec.get(\"serviceAccountName\")\n if not service_account\ + \ and isinstance(spec.get(\"template\"), dict):\n template_spec\ + \ = spec.get(\"template\", {}).get(\"spec\", {})\n if isinstance(template_spec,\ + \ dict):\n service_account = template_spec.get(\"serviceAccountName\"\ + )\n if container_names:\n lines.append(f\"Workload\ + \ containers: {container_names}\")\n if service_account:\n \ + \ lines.append(f\"Service account: {service_account}\")\n\n\ + \ if kind == \"Service\":\n service_type = spec.get(\"\ + type\")\n selector = spec.get(\"selector\")\n ports\ + \ = 
spec.get(\"ports\")\n if service_type:\n lines.append(f\"\ + Service type: {service_type}\")\n if isinstance(selector, dict)\ + \ and selector:\n lines.append(f\"Service selector keys:\ + \ {', '.join(list(selector.keys())[:8])}\")\n if isinstance(ports,\ + \ list) and ports:\n port_values = [str(port.get('port'))\ + \ for port in ports if isinstance(port, dict) and port.get('port')]\n \ + \ if port_values:\n lines.append(f\"Service\ + \ ports: {', '.join(port_values[:8])}\")\n\n if kind == \"CustomResourceDefinition\"\ + :\n names = spec.get(\"names\", {}) if isinstance(spec.get(\"\ + names\"), dict) else {}\n versions = spec.get(\"versions\", [])\n\ + \ if spec.get(\"group\"):\n lines.append(f\"CRD\ + \ group: {spec.get('group')}\")\n if names.get(\"kind\"):\n \ + \ lines.append(f\"CRD served kind: {names.get('kind')}\")\n\ + \ if isinstance(versions, list) and versions:\n \ + \ version_names = [str(version.get(\"name\")) for version in versions\ + \ if isinstance(version, dict) and version.get(\"name\")]\n \ + \ if version_names:\n lines.append(f\"CRD versions:\ + \ {', '.join(version_names[:8])}\")\n\n if kind in {\"Role\", \"\ + ClusterRole\"}:\n rules = spec.get(\"rules\", parsed.get(\"rules\"\ + ))\n if isinstance(rules, list) and rules:\n resource_names\ + \ = []\n verbs = []\n for rule in rules[:4]:\n\ + \ if isinstance(rule, dict):\n \ + \ resource_names.extend(str(item) for item in rule.get(\"resources\", [])[:4])\n\ + \ verbs.extend(str(item) for item in rule.get(\"\ + verbs\", [])[:4])\n if resource_names:\n \ + \ lines.append(f\"RBAC resources: {', '.join(resource_names[:10])}\"\ + )\n if verbs:\n lines.append(f\"RBAC verbs:\ + \ {', '.join(verbs[:10])}\")\n\n if kind in {\"RoleBinding\", \"\ + ClusterRoleBinding\"}:\n role_ref = parsed.get(\"roleRef\", {})\n\ + \ subjects = parsed.get(\"subjects\", [])\n if isinstance(role_ref,\ + \ dict) and role_ref.get(\"name\"):\n lines.append(f\"Binding\ + \ roleRef: {role_ref.get('name')}\")\n if 
isinstance(subjects,\ + \ list) and subjects:\n subject_names = [str(subject.get(\"\ + name\")) for subject in subjects if isinstance(subject, dict) and subject.get(\"\ + name\")]\n if subject_names:\n lines.append(f\"\ + Binding subjects: {', '.join(subject_names[:10])}\")\n\n if kind\ + \ in {\"AuthorizationPolicy\", \"PeerAuthentication\", \"VirtualService\"\ + , \"Gateway\", \"DestinationRule\"}:\n selector = spec.get(\"\ + selector\", {})\n if isinstance(selector, dict):\n \ + \ match_labels = selector.get(\"matchLabels\", {})\n \ + \ if isinstance(match_labels, dict) and match_labels:\n \ + \ lines.append(f\"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}\"\ + )\n gateways = spec.get(\"gateways\")\n hosts = spec.get(\"\ + hosts\")\n if isinstance(gateways, list) and gateways:\n \ + \ lines.append(f\"Istio gateways: {', '.join(str(g) for g in\ + \ gateways[:8])}\")\n if isinstance(hosts, list) and hosts:\n\ + \ lines.append(f\"Istio hosts: {', '.join(str(h) for h in\ + \ hosts[:8])}\")\n\n return \"\\n\".join(f\"# {line}\" for line in\ + \ lines if line)\n\n def parse_python(content, fp, sha, ctx):\n \ + \ chunks, lines = [], content.split(\"\\n\")\n try:\n \ + \ tree = pyast.parse(content)\n except SyntaxError:\n \ + \ return [{\"chunk_id\": gen_id(fp, \"module\", 0), \"file_path\": fp,\n\ + \ \"extension\": \".py\", \"language\": \"python\",\n\ + \ \"symbol_name\": os.path.basename(fp), \"chunk_text\"\ + : content,\n \"start_line\": 1, \"end_line\": len(lines),\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ idx = 0\n for node in pyast.walk(tree):\n if isinstance(node,\ + \ (pyast.FunctionDef, pyast.AsyncFunctionDef, pyast.ClassDef)):\n \ + \ sl, el = node.lineno, node.end_lineno or node.lineno\n \ + \ ct = \"\\n\".join(lines[sl - 1:el])\n tp = \"\ + class\" if isinstance(node, pyast.ClassDef) else \"function\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, node.name, idx), \"file_path\"\ + : fp,\n \"extension\": \".py\", 
\"language\"\ + : \"python\",\n \"symbol_name\": f\"{tp}:{node.name}\"\ + , \"chunk_text\": ct,\n \"start_line\": sl,\ + \ \"end_line\": el,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx})\n idx += 1\n if not chunks:\n\ + \ chunks.append({\"chunk_id\": gen_id(fp, \"module\", 0), \"\ + file_path\": fp,\n \"extension\": \".py\", \"\ + language\": \"python\",\n \"symbol_name\": f\"\ + module:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": len(lines),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n def parse_go(content, fp, sha, ctx):\n \ + \ pat = re.compile(r\"^(?:func\\s+(?:\\([^)]+\\)\\s+)?(\\w+)|type\\s+(\\\ + w+)\\s+struct)\\b\", re.MULTILINE)\n matches = list(pat.finditer(content))\n\ + \ if not matches:\n return [{\"chunk_id\": gen_id(fp,\ + \ \"file\", 0), \"file_path\": fp,\n \"extension\":\ + \ \".go\", \"language\": \"go\",\n \"symbol_name\":\ + \ f\"file:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": content.count(\"\\n\") + 1,\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ chunks = []\n for i, m in enumerate(matches):\n \ + \ sym = m.group(1) or m.group(2)\n s, e = m.start(), matches[i\ + \ + 1].start() if i + 1 < len(matches) else len(content)\n ct\ + \ = content[s:e].rstrip()\n sl = content[:s].count(\"\\n\") +\ + \ 1\n tp = \"struct\" if m.group(2) else \"func\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, sym, i), \"file_path\": fp,\n\ + \ \"extension\": \".go\", \"language\": \"go\"\ + ,\n \"symbol_name\": f\"{tp}:{sym}\", \"chunk_text\"\ + : ct,\n \"start_line\": sl, \"end_line\": sl +\ + \ ct.count(\"\\n\"),\n \"commit_sha\": sha, \"\ + folder_context\": ctx})\n return chunks\n\n def parse_yaml_file(content,\ + \ fp, sha, ctx):\n ext = os.path.splitext(fp)[1].lower()\n \ + \ docs = content.split(\"\\n---\")\n chunks = []\n for idx,\ + \ doc in enumerate(docs):\n doc = doc.strip()\n if\ + \ not doc:\n 
continue\n try:\n \ + \ parsed = yaml.safe_load(doc)\n except yaml.YAMLError:\n \ + \ parsed = None\n if isinstance(parsed, dict):\n\ + \ kind = parsed.get(\"kind\", \"Unknown\")\n \ + \ md = parsed.get(\"metadata\", {})\n name = md.get(\"\ + name\", \"unknown\") if isinstance(md, dict) else \"unknown\"\n \ + \ sym = f\"{kind}:{name}\"\n manifest_context = build_manifest_context(parsed,\ + \ fp, ctx)\n chunk_body = f\"{manifest_context}\\n\\n{doc}\"\ + \ if manifest_context else doc\n else:\n sym =\ + \ f\"fragment:{idx}\"\n chunk_body = doc\n pre\ + \ = \"\\n---\".join(docs[:idx])\n sl = pre.count(\"\\n\") + 1\ + \ if pre else 1\n chunks.append({\"chunk_id\": gen_id(fp, sym,\ + \ idx), \"file_path\": fp,\n \"extension\": ext,\ + \ \"language\": \"yaml\",\n \"symbol_name\": sym,\ + \ \"chunk_text\": chunk_body,\n \"start_line\"\ + : sl, \"end_line\": sl + doc.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n return chunks\ + \ or [{\"chunk_id\": gen_id(fp, \"file\", 0), \"file_path\": fp,\n \ + \ \"extension\": ext, \"language\": \"yaml\",\n \ + \ \"symbol_name\": f\"file:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1,\ + \ \"end_line\": content.count(\"\\n\") + 1,\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n\n def parse_md(content,\ + \ fp, sha, ctx):\n pat = re.compile(r\"^(#{2,3})\\s+(.+)$\", re.MULTILINE)\n\ + \ matches = list(pat.finditer(content))\n if not matches:\n\ + \ return [{\"chunk_id\": gen_id(fp, \"doc\", 0), \"file_path\"\ + : fp,\n \"extension\": \".md\", \"language\": \"markdown\"\ + ,\n \"symbol_name\": f\"doc:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1, \"end_line\"\ + : content.count(\"\\n\") + 1,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx}]\n chunks = []\n for i, m in enumerate(matches):\n\ + \ h = m.group(2).strip()\n s = m.start()\n \ + \ e = matches[i + 1].start() if i + 1 < len(matches) else len(content)\n\ + \ text = 
content[s:e].strip()\n sl = content[:s].count(\"\ + \\n\") + 1\n chunks.append({\"chunk_id\": gen_id(fp, h, i), \"\ + file_path\": fp,\n \"extension\": \".md\", \"\ + language\": \"markdown\",\n \"symbol_name\": f\"\ + heading:{h[:100]}\", \"chunk_text\": text,\n \"\ + start_line\": sl, \"end_line\": sl + text.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n PARSERS = {\".py\": parse_python, \".go\": parse_go,\n\ + \ \".yaml\": parse_yaml_file, \".yml\": parse_yaml_file, \"\ + .md\": parse_md}\n\n files = []\n with open(clone_data.path) as f:\n\ + \ for line in f:\n if line.strip():\n files.append(json.loads(line))\n\ + \n all_chunks = []\n for fi in files:\n parser = PARSERS.get(fi[\"\ + extension\"])\n if not parser:\n continue\n try:\n\ + \ chunks = parser(fi[\"content\"], fi[\"path\"], fi[\"commit_sha\"\ + ], fi[\"folder_context\"])\n all_chunks.extend(chunks)\n \ + \ except Exception as ex:\n logger.warning(\"Error parsing\ + \ %s: %s\", fi[\"path\"], ex)\n\n logger.info(\"Parsed %d chunks from\ + \ %d files\", len(all_chunks), len(files))\n\n with open(parsed_data.path,\ + \ \"w\") as f:\n for c in all_chunks:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim +pipelineInfo: + description: Clone kubeflow/manifests, parse code by language, embed, and load into + Milvus + name: code-ingestion-pipeline +root: + dag: + tasks: + chunk-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-code + dependentTasks: + - parse-code + inputs: + artifacts: + parsed_data: + taskOutputArtifact: + outputArtifactKey: parsed_data + producerTask: parse-code + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: chunk-code + clone-repo: + cachingOptions: + enableCache: true + componentRef: + name: comp-clone-repo + inputs: + parameters: + branch: + componentInputParameter: branch + repo_url: + 
componentInputParameter: repo_url + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: clone-repo + embed-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-embed-code + dependentTasks: + - chunk-code + inputs: + artifacts: + chunked_data: + taskOutputArtifact: + outputArtifactKey: chunked_data + producerTask: chunk-code + parameters: + embedding_model: + componentInputParameter: embedding_model + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: embed-code + load-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-load-code + dependentTasks: + - embed-code + inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: embed-code + parameters: + collection_name: + componentInputParameter: collection_name + embedding_dim: + componentInputParameter: embedding_dim + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: load-code + parse-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-parse-code + dependentTasks: + - clone-repo + inputs: + artifacts: + clone_data: + taskOutputArtifact: + outputArtifactKey: clone_data + producerTask: clone-repo + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: parse-code + inputDefinitions: + parameters: + branch: + defaultValue: master + isOptional: true + parameterType: STRING + collection_name: + defaultValue: code_collection + isOptional: true + parameterType: STRING + embedding_dim: + defaultValue: 384.0 + isOptional: true + parameterType: NUMBER_INTEGER + embedding_model: + defaultValue: 
sentence-transformers/all-MiniLM-L6-v2 + isOptional: true + parameterType: STRING + milvus_host: + defaultValue: localhost + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + repo_url: + defaultValue: https://github.com/kubeflow/manifests + isOptional: true + parameterType: STRING +schemaVersion: 2.1.0 +sdkVersion: kfp-2.16.0 diff --git a/pipelines/shared/__init__.py b/pipelines/shared/__init__.py new file mode 100644 index 0000000..002a1f6 --- /dev/null +++ b/pipelines/shared/__init__.py @@ -0,0 +1 @@ +# Shared utilities for docs-agent ingestion pipelines diff --git a/pipelines/shared/embedding_utils.py b/pipelines/shared/embedding_utils.py new file mode 100644 index 0000000..25771a7 --- /dev/null +++ b/pipelines/shared/embedding_utils.py @@ -0,0 +1,236 @@ +""" +Shared embedding utilities for docs-agent ingestion pipelines. + +Supports multiple embedding backends: + - sentence-transformers (local, default for development) + - openai (API-based, for production) + +Configure via environment variables: + EMBEDDING_MODEL: Model name/path (default: sentence-transformers/all-MiniLM-L6-v2) + OPENAI_API_KEY: Required only when EMBEDDING_MODEL=openai +""" + +import logging +import os +import time +from typing import List, Optional + +logger = logging.getLogger(__name__) + + +def get_embedding_model_name() -> str: + """Get the configured embedding model name from environment.""" + return os.environ.get( + "EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2" + ) + + +def get_embedding_dimension() -> int: + """Return the embedding dimension for the configured model. + + Returns: + int: Vector dimension size. 
+ """ + model_name = get_embedding_model_name() + dimension_map = { + "sentence-transformers/all-MiniLM-L6-v2": 384, + "sentence-transformers/all-mpnet-base-v2": 768, + "nomic-embed-text": 768, + "openai": 1536, + "text-embedding-3-small": 1536, + } + for key, dim in dimension_map.items(): + if key in model_name: + return dim + # Default fallback + logger.warning( + "Unknown model '%s', defaulting to 384 dimensions.", model_name + ) + return 384 + + +class EmbeddingClient: + """Unified embedding client supporting local and API-based models. + + Usage: + client = EmbeddingClient() + vectors = client.embed_batch(["hello world", "kubeflow pipelines"]) + """ + + def __init__(self, model_name: Optional[str] = None, batch_size: int = 32): + """Initialize the embedding client. + + Args: + model_name: Override for EMBEDDING_MODEL env var. + batch_size: Number of texts to embed per batch. + """ + self.model_name = model_name or get_embedding_model_name() + self.batch_size = batch_size + self._model = None + self._client = None + + logger.info("Embedding client initialized with model: %s", self.model_name) + + def _is_openai(self) -> bool: + """Check if using OpenAI API backend.""" + return "openai" in self.model_name or "text-embedding" in self.model_name + + def _load_local_model(self): + """Lazy-load the sentence-transformers model.""" + if self._model is None: + from sentence_transformers import SentenceTransformer + + model_path = self.model_name + # Strip the prefix if it's a sentence-transformers model + if "/" in model_path and not model_path.startswith("/"): + pass # Use full HuggingFace path + logger.info("Loading local model: %s", model_path) + self._model = SentenceTransformer(model_path) + logger.info("Model loaded successfully.") + return self._model + + def _get_openai_client(self): + """Lazy-initialize the OpenAI client.""" + if self._client is None: + import openai + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise ValueError( + 
"OPENAI_API_KEY environment variable is required " + "when using OpenAI embeddings." + ) + self._client = openai.OpenAI(api_key=api_key) + logger.info("OpenAI client initialized.") + return self._client + + def embed_texts(self, texts: List[str]) -> List[List[float]]: + """Embed a list of texts with automatic batching and retry. + + Args: + texts: List of text strings to embed. + + Returns: + List of embedding vectors (list of floats). + """ + if not texts: + return [] + + all_embeddings: List[List[float]] = [] + + for i in range(0, len(texts), self.batch_size): + batch = texts[i : i + self.batch_size] + batch_num = i // self.batch_size + 1 + total_batches = (len(texts) + self.batch_size - 1) // self.batch_size + + embeddings = self._embed_batch_with_retry(batch) + all_embeddings.extend(embeddings) + + logger.info( + "Embedded batch %d/%d (%d texts)", + batch_num, + total_batches, + len(batch), + ) + + return all_embeddings + + def _embed_batch_with_retry( + self, texts: List[str], max_retries: int = 3 + ) -> List[List[float]]: + """Embed a single batch with exponential backoff retry. + + Args: + texts: Batch of texts to embed. + max_retries: Maximum number of retry attempts. + + Returns: + List of embedding vectors. + + Raises: + RuntimeError: If all retries are exhausted. + """ + for attempt in range(max_retries): + try: + if self._is_openai(): + return self._embed_openai(texts) + else: + return self._embed_local(texts) + except Exception as e: + wait_time = (2 ** attempt) + (0.1 * attempt) + logger.warning( + "Embedding failed (attempt %d/%d): %s. Retrying in %.1fs...", + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + else: + raise RuntimeError( + f"Embedding failed after {max_retries} attempts: {e}" + ) from e + return [] # unreachable, but satisfies type checker + + def _embed_local(self, texts: List[str]) -> List[List[float]]: + """Embed using local sentence-transformers model. 
+ + Args: + texts: Batch of texts to embed. + + Returns: + List of embedding vectors. + """ + model = self._load_local_model() + embeddings = model.encode(texts, show_progress_bar=False) + return [emb.tolist() for emb in embeddings] + + def _embed_openai(self, texts: List[str]) -> List[List[float]]: + """Embed using OpenAI API. + + Args: + texts: Batch of texts to embed. + + Returns: + List of embedding vectors. + """ + client = self._get_openai_client() + model_name = self.model_name + if "openai" in model_name and "text-embedding" not in model_name: + model_name = "text-embedding-3-small" + + response = client.embeddings.create(input=texts, model=model_name) + return [item.embedding for item in response.data] + + +# Convenience function +def embed_texts(texts: List[str], model_name: Optional[str] = None) -> List[List[float]]: + """Convenience function to embed texts with default settings. + + Args: + texts: List of texts to embed. + model_name: Optional model override. + + Returns: + List of embedding vectors. + """ + client = EmbeddingClient(model_name=model_name) + return client.embed_texts(texts) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + # Quick smoke test + test_texts = [ + "How to install Kubeflow on Kubernetes", + "KFP pipeline component decorator", + "Milvus vector database schema design", + ] + logger.info("Testing embedding with model: %s", get_embedding_model_name()) + logger.info("Expected dimensions: %d", get_embedding_dimension()) + + client = EmbeddingClient() + vectors = client.embed_texts(test_texts) + for i, (text, vec) in enumerate(zip(test_texts, vectors)): + logger.info("Text %d: '%s...' -> dim=%d", i, text[:40], len(vec)) diff --git a/pipelines/shared/milvus_utils.py b/pipelines/shared/milvus_utils.py new file mode 100644 index 0000000..7570ef3 --- /dev/null +++ b/pipelines/shared/milvus_utils.py @@ -0,0 +1,297 @@ +""" +Shared Milvus utilities for docs-agent ingestion pipelines. 
+ +Provides connection management, collection creation, upsert, and search +operations with retry logic and exponential backoff. + +Configure via environment variables: + MILVUS_HOST: Milvus server host (default: localhost) + MILVUS_PORT: Milvus server port (default: 19530) + MILVUS_TOKEN: Authentication token (default: empty, no auth) +""" + +import logging +import os +import time +from typing import Any, Dict, List, Optional + +from pymilvus import ( + Collection, + CollectionSchema, + DataType, + FieldSchema, + connections, + utility, +) + +logger = logging.getLogger(__name__) + + +def get_milvus_config() -> Dict[str, str]: + """Get Milvus connection configuration from environment. + + Returns: + Dict with host, port, and token. + """ + return { + "host": os.environ.get("MILVUS_HOST", "localhost"), + "port": os.environ.get("MILVUS_PORT", "19530"), + "token": os.environ.get("MILVUS_TOKEN", ""), + } + + +def connect( + alias: str = "default", + host: Optional[str] = None, + port: Optional[str] = None, + token: Optional[str] = None, + max_retries: int = 3, +) -> None: + """Connect to Milvus with retry logic. + + Args: + alias: Connection alias. + host: Override for MILVUS_HOST env var. + port: Override for MILVUS_PORT env var. + token: Override for MILVUS_TOKEN env var. + max_retries: Maximum retry attempts. + + Raises: + ConnectionError: If all retries are exhausted. + """ + config = get_milvus_config() + host = host or config["host"] + port = port or config["port"] + token = token or config["token"] + + for attempt in range(max_retries): + try: + connect_params = {"alias": alias, "host": host, "port": port} + if token: + connect_params["token"] = token + + connections.connect(**connect_params) + logger.info("Connected to Milvus at %s:%s", host, port) + return + except Exception as e: + wait_time = (2 ** attempt) + 1 + logger.warning( + "Milvus connection failed (attempt %d/%d): %s. 
Retrying in %ds...", + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + else: + raise ConnectionError( + f"Failed to connect to Milvus after {max_retries} attempts: {e}" + ) from e + + +def create_collection_if_not_exists( + collection_name: str, + fields: List[FieldSchema], + description: str = "", + index_field: str = "embedding", + index_params: Optional[Dict[str, Any]] = None, +) -> Collection: + """Create a Milvus collection if it doesn't already exist. + + Args: + collection_name: Name of the collection. + fields: List of FieldSchema objects defining the schema. + description: Collection description. + index_field: Name of the vector field to index. + index_params: Custom index parameters. Defaults to HNSW + COSINE. + + Returns: + The Milvus Collection object. + """ + if utility.has_collection(collection_name): + logger.info("Collection '%s' already exists. Loading.", collection_name) + collection = Collection(collection_name) + collection.load() + return collection + + schema = CollectionSchema(fields, description=description) + collection = Collection(name=collection_name, schema=schema) + logger.info("Created collection: %s", collection_name) + + # Default HNSW index params + if index_params is None: + index_params = { + "metric_type": "COSINE", + "index_type": "HNSW", + "params": {"M": 16, "efConstruction": 200}, + } + + collection.create_index(field_name=index_field, index_params=index_params) + logger.info( + "Created HNSW index on '%s' for collection '%s'", + index_field, + collection_name, + ) + + collection.load() + return collection + + +def upsert_batch( + collection: Collection, + rows: List[Dict[str, Any]], + batch_size: int = 100, + max_retries: int = 3, +) -> Dict[str, int]: + """Upsert rows into a Milvus collection in batches. + + Uses the primary key to handle duplicates (Milvus upsert semantics). + + Args: + collection: The target Milvus collection. 
+ rows: List of row dicts matching the collection schema. + batch_size: Number of rows per insert batch. + max_retries: Retry attempts per batch. + + Returns: + Dict with counts: inserted, failed, total. + """ + total = len(rows) + inserted = 0 + failed = 0 + + for i in range(0, total, batch_size): + batch = rows[i : i + batch_size] + batch_num = i // batch_size + 1 + total_batches = (total + batch_size - 1) // batch_size + + success = False + for attempt in range(max_retries): + try: + collection.upsert(batch) + inserted += len(batch) + success = True + logger.info( + "Upserted batch %d/%d (%d rows)", + batch_num, + total_batches, + len(batch), + ) + break + except Exception as e: + wait_time = (2 ** attempt) + 1 + logger.warning( + "Upsert failed (batch %d, attempt %d/%d): %s. " + "Retrying in %ds...", + batch_num, + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + + if not success: + failed += len(batch) + logger.error("Batch %d permanently failed after %d retries.", batch_num, max_retries) + + collection.flush() + summary = {"inserted": inserted, "failed": failed, "total": total} + logger.info("Upsert complete: %s", summary) + return summary + + +def search( + collection: Collection, + query_vector: List[float], + top_k: int = 3, + output_fields: Optional[List[str]] = None, + search_params: Optional[Dict[str, Any]] = None, + max_retries: int = 3, +) -> List[Dict[str, Any]]: + """Search a Milvus collection by vector similarity. + + Args: + collection: The Milvus collection to search. + query_vector: The query embedding vector. + top_k: Number of results to return. + output_fields: Fields to include in results. + search_params: Custom search parameters. + max_retries: Retry attempts. + + Returns: + List of result dicts with fields and distance score. 
+ """ + if search_params is None: + search_params = {"metric_type": "COSINE", "params": {"ef": 64}} + + if output_fields is None: + output_fields = ["chunk_text"] + + for attempt in range(max_retries): + try: + results = collection.search( + data=[query_vector], + anns_field="embedding", + param=search_params, + limit=top_k, + output_fields=output_fields, + ) + + hits = [] + for hit in results[0]: + hit_dict = {"id": hit.id, "distance": hit.distance} + for field in output_fields: + hit_dict[field] = hit.entity.get(field) + hits.append(hit_dict) + + logger.info("Search returned %d results.", len(hits)) + return hits + + except Exception as e: + wait_time = (2 ** attempt) + 1 + logger.warning( + "Search failed (attempt %d/%d): %s. Retrying in %ds...", + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + else: + logger.error("Search failed after %d retries: %s", max_retries, e) + return [] + + return [] + + +def drop_collection(collection_name: str) -> bool: + """Drop a collection if it exists. + + Args: + collection_name: Name of the collection to drop. + + Returns: + True if dropped, False if it didn't exist. + """ + if utility.has_collection(collection_name): + utility.drop_collection(collection_name) + logger.info("Dropped collection: %s", collection_name) + return True + logger.info("Collection '%s' does not exist. 
Nothing to drop.", collection_name) + return False + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + # Quick connection test + try: + connect() + logger.info("Milvus connection test: SUCCESS") + collections = utility.list_collections() + logger.info("Existing collections: %s", collections) + except ConnectionError as e: + logger.error("Milvus connection test: FAILED — %s", e) diff --git a/pipelines/shared/retrieval_strategy.py b/pipelines/shared/retrieval_strategy.py new file mode 100644 index 0000000..898fe2b --- /dev/null +++ b/pipelines/shared/retrieval_strategy.py @@ -0,0 +1,359 @@ +""" +Shared retrieval strategy helpers for docs-agent search and validation. + +This module adds lightweight hybrid-retrieval behavior on top of vector search: + - query expansion for manifest-heavy questions + - collection preference inference (docs vs code) + - path/domain-aware reranking of candidate hits +""" + +from __future__ import annotations + +import re +from typing import Dict, Iterable, List + +PATH_ALIAS_HINTS = { + "common/istio": [ + "istio", + "service mesh", + "gateway", + "authorization policy", + "peer authentication", + "virtual service", + "sidecar", + "envoy", + "mtls", + "ingress", + ], + "common/knative": [ + "knative", + "serving", + "eventing", + "serverless", + "scale to zero", + "activator", + "revision", + "service", + "net istio", + "webhook", + ], + "common/dex": [ + "dex", + "oidc", + "oauth2", + "authentication", + "identity provider", + "connector", + "login", + ], + "common/cert-manager": [ + "cert manager", + "certificate", + "issuer", + "clusterissuer", + "cainjector", + "tls", + "webhook", + ], + "applications/pipeline": [ + "kubeflow pipelines", + "kfp", + "pipeline api server", + "deployment", + "service", + "configmap", + "role", + "rolebinding", + "serviceaccount", + "crd", + "webhook", + "scheduled workflow", + ], + "applications/profiles": [ + "profiles", + "namespaces", + "rbac", + "rolebinding", + 
"serviceaccount", + "user profile", + ], + "tests": [ + "tests", + "e2e", + "integration", + "validation", + "presubmit", + ], +} + +QUERY_EXPANSIONS = { + "istio": [ + "istio", + "service mesh", + "gateway", + "authorization policy", + "peer authentication", + "virtual service", + "mtls", + ], + "knative": [ + "knative", + "serving", + "eventing", + "serverless", + "scale to zero", + "activator", + "revision", + ], + "dex": [ + "dex", + "oidc", + "oauth2", + "authentication", + "identity provider", + "connector", + ], + "cert-manager": [ + "cert manager", + "certificate", + "issuer", + "clusterissuer", + "cainjector", + "tls", + ], + "component": [ + "dsl component", + "lightweight python component", + "lightweight python components", + "containerized python component", + "base image", + "@dsl.component", + ], + "compile": [ + "compile pipeline", + "pipeline compiler", + "kfp compiler", + "pipeline yaml", + "compiler compile", + ], + "resources": [ + "deployment", + "service", + "configmap", + "role", + "rolebinding", + "serviceaccount", + "custom resource definition", + ], + "testing": [ + "tests", + "e2e", + "integration", + "validation", + "presubmit", + ], +} + +CODE_INTENT_TERMS = { + "yaml", + "manifest", + "manifests", + "deployment", + "deployments", + "service", + "services", + "configmap", + "configmaps", + "rolebinding", + "clusterrolebinding", + "clusterrole", + "serviceaccount", + "crd", + "resources", + "rbac", + "istio", + "knative", + "dex", + "cert", + "cert-manager", + "namespace", + "namespaces", + "authorizationpolicy", + "authorizationpolicies", + "clustertrainingruntime", + "clusterservingruntimes", + "pvcviewer", + "networkpolicy", + "horizontalpodautoscaler", + "webhook", + "kustomization", + "dockerfile", + "helm", +} + +# Stronger signal terms that definitively mean the user wants code/manifest +# results rather than documentation pages. 
+STRONG_CODE_TERMS = { + "authorizationpolicy", "authorizationpolicies", + "clusterrolebinding", "clusterrole", + "clustertrainingruntime", "clusterservingruntimes", + "clusterservingruntime", + "pvcviewer", "networkpolicy", + "kustomization", "dockerfile", + "helm", "cache server", + "metadata service", "metadata-grpc", +} + +DOCS_INTENT_TERMS = { + "how", + "what", + "overview", + "introduction", + "guide", + "concept", + "architecture", + "tutorial", +} + + +def split_terms(value: str) -> List[str]: + """Split free text, paths, and identifiers into normalized terms.""" + expanded = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", value) + normalized = re.sub(r"[^A-Za-z0-9]+", " ", expanded) + return [token.lower() for token in normalized.split() if token] + + +def unique_terms(values: Iterable[str], limit: int = 32) -> List[str]: + """Return unique normalized terms while preserving order.""" + seen = set() + ordered: List[str] = [] + for value in values: + for token in split_terms(str(value)): + if token not in seen: + seen.add(token) + ordered.append(token) + if len(ordered) >= limit: + return ordered + return ordered + + +def source_alias_terms(source: str) -> List[str]: + """Return semantic alias terms for a source path or URL.""" + normalized = source.replace("\\", "/").lower() + aliases: List[str] = [] + for prefix, hints in PATH_ALIAS_HINTS.items(): + if prefix in normalized: + aliases.extend(hints) + return unique_terms(aliases, limit=20) + + +def analyze_query(question: str) -> Dict[str, object]: + """Analyze a user question and produce retrieval hints.""" + lowered = question.lower() + expanded_terms = [question] + + for trigger, additions in QUERY_EXPANSIONS.items(): + if trigger in lowered: + expanded_terms.extend(additions) + + question_terms = set(split_terms(question)) + prefer_code = bool(question_terms & CODE_INTENT_TERMS) + # If any strong code term is present, strongly prefer code. 
+ strongly_prefer_code = bool( + question_terms & STRONG_CODE_TERMS + ) or any(term in lowered for term in STRONG_CODE_TERMS) + prefer_docs = not prefer_code and bool(question_terms & DOCS_INTENT_TERMS) + + priority_terms = unique_terms(expanded_terms, limit=28) + enhanced_query = question + if len(priority_terms) > len(split_terms(question)): + enhanced_query = ( + f"{question}\n" + f"Relevant retrieval hints: {' '.join(priority_terms)}" + ) + + return { + "question": question, + "enhanced_query": enhanced_query, + "priority_terms": priority_terms, + "prefer_code": prefer_code, + "strongly_prefer_code": strongly_prefer_code, + "prefer_docs": prefer_docs, + } + + +def rerank_hits( + hits: List[Dict[str, object]], + query_analysis: Dict[str, object], + top_k: int, +) -> List[Dict[str, object]]: + """Rerank candidate hits with lightweight hybrid-retrieval heuristics.""" + priority_terms = set(query_analysis.get("priority_terms", [])) + prefer_code = bool(query_analysis.get("prefer_code")) + strongly_prefer_code = bool(query_analysis.get("strongly_prefer_code")) + prefer_docs = bool(query_analysis.get("prefer_docs")) + question_lower = str(query_analysis.get("question", "")).lower() + + reranked: List[Dict[str, object]] = [] + + for hit in hits: + score = float(hit.get("distance", 0.0)) + collection = str(hit.get("collection", "")) + source = str(hit.get("source_url") or hit.get("file_path") or "") + symbol_name = str(hit.get("symbol_name", "")) + heading = str(hit.get("heading", "")) + text = str(hit.get("chunk_text", "")) + + haystack = " ".join([source, symbol_name, heading, text]).lower() + haystack_terms = set(split_terms(haystack)) + path_aliases = set(source_alias_terms(source)) + + # --- Collection preference --- + if strongly_prefer_code: + # Strongly boost code results when query mentions specific K8s resources + if collection == "code_collection": + score += 0.15 + elif collection == "docs_collection": + score -= 0.06 + elif prefer_code: + if collection == 
"code_collection": + score += 0.08 + elif collection == "docs_collection": + score -= 0.03 + + if prefer_docs: + if collection == "docs_collection": + score += 0.09 + elif collection == "code_collection": + score -= 0.04 + + # --- Term-overlap scoring --- + term_overlap = len(priority_terms & haystack_terms) + alias_overlap = len(priority_terms & path_aliases) + score += min(0.16, 0.014 * term_overlap) + score += min(0.10, 0.025 * alias_overlap) + + # --- Path-keyword boosting --- + # Extract meaningful keywords from the query and boost hits whose + # file_path or source_url directly contain those keywords. + source_lower = source.lower() + path_keywords = [ + "cache", "metadata", "rbac", "authorization", "runtimes", + "catalog", "pvcviewer", "release", "webhook", "training-operator", + "trainer", "kserve", "pipeline", "model-registry", + ] + for kw in path_keywords: + if kw in question_lower and kw in source_lower: + score += 0.06 + + if prefer_code and source.endswith((".yaml", ".yml")): + score += 0.02 + if "kustomization.yaml" in source: + score += 0.02 + + reranked_hit = dict(hit) + reranked_hit["rerank_score"] = score + reranked.append(reranked_hit) + + reranked.sort(key=lambda item: item.get("rerank_score", item.get("distance", 0.0)), reverse=True) + return reranked[:top_k]