# === backend/schemas/__init__.py ===
# Backend schemas package

# === backend/schemas/code_collection_schema.py ===
"""
Milvus schema definition for the code_collection.

Stores chunked and embedded code from kubeflow/manifests repository.
Supports Python, Go, YAML, and Markdown file types.
Uses HNSW index with COSINE metric for fast ANN retrieval.

Dimension defaults to 384 (all-MiniLM-L6-v2). Override via EMBEDDING_MODEL env var.
"""

import logging
import os
import sys

from pymilvus import CollectionSchema, DataType, FieldSchema

# Allow imports from project root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pipelines.shared.embedding_utils import get_embedding_dimension

logger = logging.getLogger(__name__)

COLLECTION_NAME = "code_collection"


def get_code_fields(dim: "int | None" = None) -> "list[FieldSchema]":
    """Define the field schema for code_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        List of FieldSchema objects.
    """
    if dim is None:
        # Dimension depends on the configured embedding model (default 384).
        dim = get_embedding_dimension()

    fields = [
        FieldSchema(
            name="chunk_id",
            dtype=DataType.VARCHAR,
            max_length=128,
            is_primary=True,
            description="Unique chunk identifier (hash of file_path + symbol + index)",
        ),
        FieldSchema(
            name="file_path",
            dtype=DataType.VARCHAR,
            max_length=512,
            description="Relative file path within the repository",
        ),
        FieldSchema(
            name="extension",
            dtype=DataType.VARCHAR,
            max_length=16,
            description="File extension (e.g., .py, .go, .yaml)",
        ),
        FieldSchema(
            name="language",
            dtype=DataType.VARCHAR,
            max_length=32,
            description="Programming language (python, go, yaml, markdown)",
        ),
        FieldSchema(
            name="symbol_name",
            dtype=DataType.VARCHAR,
            max_length=256,
            description="Function/class/struct/resource name",
        ),
        FieldSchema(
            name="folder_context",
            dtype=DataType.VARCHAR,
            max_length=128,
            description="Top-level folder for domain context (e.g., apps, common)",
        ),
        FieldSchema(
            name="chunk_text",
            dtype=DataType.VARCHAR,
            max_length=8192,
            description="The actual code/content chunk text",
        ),
        FieldSchema(
            name="start_line",
            dtype=DataType.INT64,
            description="Starting line number in the source file",
        ),
        FieldSchema(
            name="end_line",
            dtype=DataType.INT64,
            description="Ending line number in the source file",
        ),
        FieldSchema(
            name="commit_sha",
            dtype=DataType.VARCHAR,
            max_length=64,
            description="Git commit SHA for provenance tracking",
        ),
        FieldSchema(
            name="chunk_index",
            dtype=DataType.INT64,
            description="Index of this chunk within the file (for compatibility)",
        ),
        FieldSchema(
            name="embedding",
            dtype=DataType.FLOAT_VECTOR,
            dim=dim,
            description=f"Dense embedding vector ({dim} dimensions)",
        ),
    ]
    return fields


def get_code_schema(dim: "int | None" = None) -> CollectionSchema:
    """Create the full CollectionSchema for code_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        CollectionSchema object.
    """
    fields = get_code_fields(dim)
    schema = CollectionSchema(
        fields=fields,
        description="Kubeflow manifests code chunks with embeddings for RAG retrieval",
    )
    return schema


def get_code_index_params() -> dict:
    """Get the HNSW index parameters for the embedding field.

    Returns:
        Dict of index parameters for Milvus.
    """
    return {
        "metric_type": "COSINE",
        "index_type": "HNSW",
        "params": {"M": 16, "efConstruction": 200},
    }


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    schema = get_code_schema()
    logger.info("code_collection schema:")
    for field in schema.fields:
        logger.info(
            " %s: %s (max_length=%s, dim=%s, primary=%s)",
            field.name,
            field.dtype.name,
            getattr(field, "max_length", "-"),
            getattr(field, "dim", "-"),
            field.is_primary,
        )
    logger.info("Index params: %s", get_code_index_params())


# === backend/schemas/docs_collection_schema.py ===
"""
Milvus schema definition for the docs_collection.

Stores chunked and embedded Kubeflow documentation from kubeflow.org.
Uses HNSW index with COSINE metric for fast ANN retrieval.

Dimension defaults to 384 (all-MiniLM-L6-v2). Override via EMBEDDING_MODEL env var.
"""

import logging
import os
import sys

from pymilvus import CollectionSchema, DataType, FieldSchema

# Allow imports from project root
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pipelines.shared.embedding_utils import get_embedding_dimension

logger = logging.getLogger(__name__)

COLLECTION_NAME = "docs_collection"
def get_docs_fields(dim: "int | None" = None) -> "list[FieldSchema]":
    """Define the field schema for docs_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        List of FieldSchema objects.
    """
    if dim is None:
        # Dimension depends on the configured embedding model (default 384).
        dim = get_embedding_dimension()

    fields = [
        FieldSchema(
            name="chunk_id",
            dtype=DataType.VARCHAR,
            max_length=128,
            is_primary=True,
            description="Unique chunk identifier (hash of url + chunk_index)",
        ),
        FieldSchema(
            name="source_url",
            dtype=DataType.VARCHAR,
            max_length=512,
            description="Original page URL from kubeflow.org",
        ),
        FieldSchema(
            name="page_title",
            dtype=DataType.VARCHAR,
            max_length=256,
            description="Page title extracted from content",
        ),
        FieldSchema(
            name="heading",
            dtype=DataType.VARCHAR,
            max_length=256,
            description="H2/H3 heading this chunk belongs to",
        ),
        FieldSchema(
            name="section",
            dtype=DataType.VARCHAR,
            max_length=128,
            description="Top-level docs section (e.g., components, started)",
        ),
        FieldSchema(
            name="chunk_text",
            dtype=DataType.VARCHAR,
            max_length=16384,
            description="The actual chunk text content",
        ),
        FieldSchema(
            name="token_count",
            dtype=DataType.INT64,
            description="Number of tokens in this chunk",
        ),
        FieldSchema(
            name="chunk_index",
            dtype=DataType.INT64,
            description="Sequential index of this chunk within its page",
        ),
        FieldSchema(
            name="crawled_at",
            dtype=DataType.VARCHAR,
            max_length=64,
            description="ISO timestamp when the page was crawled",
        ),
        FieldSchema(
            name="embedding",
            dtype=DataType.FLOAT_VECTOR,
            dim=dim,
            description=f"Dense embedding vector ({dim} dimensions)",
        ),
    ]
    return fields


def get_docs_schema(dim: "int | None" = None) -> CollectionSchema:
    """Create the full CollectionSchema for docs_collection.

    Args:
        dim: Embedding vector dimension. Auto-detected from EMBEDDING_MODEL if None.

    Returns:
        CollectionSchema object.
    """
    fields = get_docs_fields(dim)
    schema = CollectionSchema(
        fields=fields,
        description="Kubeflow documentation chunks with embeddings for RAG retrieval",
    )
    return schema


def get_docs_index_params() -> dict:
    """Get the HNSW index parameters for the embedding field.

    Returns:
        Dict of index parameters for Milvus.
    """
    return {
        "metric_type": "COSINE",
        "index_type": "HNSW",
        "params": {"M": 16, "efConstruction": 200},
    }


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    schema = get_docs_schema()
    logger.info("docs_collection schema:")
    for field in schema.fields:
        logger.info(
            " %s: %s (max_length=%s, dim=%s, primary=%s)",
            field.name,
            field.dtype.name,
            getattr(field, "max_length", "-"),
            getattr(field, "dim", "-"),
            field.is_primary,
        )
    logger.info("Index params: %s", get_docs_index_params())


# === pipelines/code_ingestion/__init__.py ===
# Code ingestion pipeline components

# === pipelines/code_ingestion/components/__init__.py ===
# Code ingestion components
# === pipelines/code_ingestion/components/ast_parser.py ===
"""
Code Ingestion — AST Parser Component

Multi-language parser that extracts logical code units:
  - Python: AST-based extraction of functions and classes with docstrings
  - Go: Regex-based splitting on func/struct boundaries
  - YAML/YML: Split by top-level Kubernetes resource kind
  - Markdown: Split by H2/H3 headings

Each extracted unit becomes a chunk with rich metadata for retrieval.
"""

import ast
import hashlib
import logging
import os
import re
from typing import Any, Dict, Iterable, List, Optional

import yaml

logger = logging.getLogger(__name__)

# Prefix -> semantic alias terms for well-known Kubeflow manifest areas.
PATH_ALIAS_HINTS = {
    "common/istio": [
        "istio",
        "service mesh",
        "gateway",
        "authorization policy",
        "peer authentication",
        "virtual service",
        "sidecar",
        "envoy",
        "mtls",
        "ingress",
    ],
    "common/knative": [
        "knative",
        "serving",
        "eventing",
        "serverless",
        "scale to zero",
        "activator",
        "revision",
        "service",
        "net istio",
        "webhook",
    ],
    "common/dex": [
        "dex",
        "oidc",
        "oauth2",
        "authentication",
        "identity provider",
        "connector",
        "login",
    ],
    "common/cert-manager": [
        "cert manager",
        "certificate",
        "issuer",
        "clusterissuer",
        "cainjector",
        "tls",
        "webhook",
    ],
    "applications/pipeline": [
        "kubeflow pipelines",
        "kfp",
        "pipeline api server",
        "deployment",
        "service",
        "configmap",
        "role",
        "rolebinding",
        "serviceaccount",
        "crd",
        "webhook",
        "scheduled workflow",
    ],
    "applications/profiles": [
        "profiles",
        "namespaces",
        "rbac",
        "rolebinding",
        "serviceaccount",
        "user profile",
    ],
    "tests": [
        "tests",
        "e2e",
        "integration",
        "validation",
        "presubmit",
    ],
}


def split_search_terms(value: str) -> List[str]:
    """Split identifiers and paths into normalized search terms.

    camelCase boundaries become spaces, every non-alphanumeric run becomes
    a separator, and the resulting tokens are lower-cased.
    """
    camel_spaced = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", value)
    alnum_only = re.sub(r"[^A-Za-z0-9]+", " ", camel_spaced)
    return [token.lower() for token in alnum_only.split() if token]


def unique_terms(values: Iterable[str], limit: int = 24) -> List[str]:
    """Return unique normalized search terms while preserving order.

    Stops as soon as *limit* distinct terms have been collected.
    """
    seen: set = set()
    ordered: List[str] = []
    for value in values:
        for term in split_search_terms(value):
            if term in seen:
                continue
            seen.add(term)
            ordered.append(term)
            if len(ordered) >= limit:
                return ordered
    return ordered
def get_path_aliases(file_path: str) -> List[str]:
    """Return path-aware semantic aliases for common Kubeflow manifest areas.

    Matches the normalized, lower-cased path against the PATH_ALIAS_HINTS
    prefix table and collects every hint list whose prefix applies.
    """
    normalized = file_path.replace("\\", "/").lower()
    aliases: List[str] = []
    for prefix, hints in PATH_ALIAS_HINTS.items():
        if normalized.startswith(prefix):
            aliases.extend(hints)
    return aliases


def summarize_list(values: Any, limit: int = 8) -> str:
    """Summarize a list-like value for retrieval context lines.

    Non-list inputs yield an empty string; falsy items are dropped.
    """
    if not isinstance(values, list):
        return ""
    flattened = [str(item) for item in values if item]
    return ", ".join(flattened[:limit])


def _containers_from_pod_spec(pod_spec: Any) -> Optional[List[str]]:
    """Extract container names from a pod-spec dict.

    Returns None when *pod_spec* (or its "containers" entry) does not have
    the expected shape so callers can fall through to other locations.
    """
    if not isinstance(pod_spec, dict):
        return None
    containers = pod_spec.get("containers", [])
    if not isinstance(containers, list):
        return None
    return [
        str(container.get("name"))
        for container in containers
        if isinstance(container, dict) and container.get("name")
    ]


def extract_container_names(parsed: Dict[str, Any]) -> List[str]:
    """Extract workload container names when present.

    Checks spec.template.spec.containers (Deployments, StatefulSets, ...)
    first; when spec has no "template" it falls through to the CronJob
    location spec.jobTemplate.spec.template.spec.containers.

    NOTE: previously a missing "template" defaulted to {} and returned []
    early, making the jobTemplate branch unreachable for CronJobs; using
    .get() without a default fixes that.
    """
    spec = parsed.get("spec")
    if not isinstance(spec, dict):
        return []

    template = spec.get("template")
    if isinstance(template, dict):
        names = _containers_from_pod_spec(template.get("spec", {}))
        if names is not None:
            return names

    job_template = spec.get("jobTemplate")
    if isinstance(job_template, dict):
        nested_spec = job_template.get("spec", {})
        if isinstance(nested_spec, dict):
            nested_template = nested_spec.get("template", {})
            if isinstance(nested_template, dict):
                names = _containers_from_pod_spec(nested_template.get("spec", {}))
                if names is not None:
                    return names
    return []


def build_manifest_context(
    parsed: Dict[str, Any],
    file_path: str,
    folder_context: str,
) -> str:
    """Build retrieval-oriented context text for a Kubernetes manifest.

    Produces one "# ..." comment line per fact so the context can be
    prepended to the raw YAML before embedding.

    Args:
        parsed: The YAML document parsed into a dict.
        file_path: Relative file path within the repository.
        folder_context: Top-level folder name.

    Returns:
        Newline-joined "#"-prefixed summary lines.
    """
    metadata = parsed.get("metadata", {})
    metadata = metadata if isinstance(metadata, dict) else {}

    kind = str(parsed.get("kind", "Unknown"))
    api_version = str(parsed.get("apiVersion", "unknown"))
    name = str(metadata.get("name", "unknown"))
    namespace = str(metadata.get("namespace", "cluster-scoped"))

    path_terms = unique_terms([file_path, os.path.basename(file_path), folder_context], limit=18)
    alias_terms = unique_terms(get_path_aliases(file_path), limit=18)
    label_keys = summarize_list(list((metadata.get("labels") or {}).keys()))
    annotation_keys = summarize_list(list((metadata.get("annotations") or {}).keys()))
    top_level_keys = summarize_list(list(parsed.keys()))

    summary_lines = [
        f"Manifest file path: {file_path}",
        f"Folder context: {folder_context}",
        f"Resource kind: {kind}",
        f"API version: {api_version}",
        f"Metadata name: {name}",
        f"Namespace: {namespace}",
    ]

    if path_terms:
        summary_lines.append(f"Path hints: {' '.join(path_terms)}")
    if alias_terms:
        summary_lines.append(f"Domain hints: {' '.join(alias_terms)}")
    if top_level_keys:
        summary_lines.append(f"Top-level keys: {top_level_keys}")
    if label_keys:
        summary_lines.append(f"Label keys: {label_keys}")
    if annotation_keys:
        summary_lines.append(f"Annotation keys: {annotation_keys}")

    spec = parsed.get("spec")
    spec = spec if isinstance(spec, dict) else {}

    # Kustomize overlays: surface the composition lists.
    if kind.lower() == "kustomization" or os.path.basename(file_path).lower() == "kustomization.yaml":
        resources = summarize_list(parsed.get("resources"))
        components = summarize_list(parsed.get("components"))
        bases = summarize_list(parsed.get("bases"))
        patches = summarize_list(parsed.get("patchesStrategicMerge"))
        if resources:
            summary_lines.append(f"Kustomize resources: {resources}")
        if components:
            summary_lines.append(f"Kustomize components: {components}")
        if bases:
            summary_lines.append(f"Kustomize bases: {bases}")
        if patches:
            summary_lines.append(f"Kustomize patches: {patches}")

    # Workloads: container names and the effective service account.
    if kind in {"Deployment", "StatefulSet", "DaemonSet", "Job", "CronJob"}:
        container_names = summarize_list(extract_container_names(parsed))
        service_account = spec.get("serviceAccountName") or (
            spec.get("template", {}).get("spec", {}).get("serviceAccountName")
            if isinstance(spec.get("template"), dict)
            else None
        )
        if container_names:
            summary_lines.append(f"Workload containers: {container_names}")
        if service_account:
            summary_lines.append(f"Service account: {service_account}")

    if kind == "Service":
        service_type = spec.get("type")
        ports = spec.get("ports")
        selector = spec.get("selector")
        if service_type:
            summary_lines.append(f"Service type: {service_type}")
        if isinstance(selector, dict) and selector:
            summary_lines.append(
                f"Service selector keys: {', '.join(list(selector.keys())[:8])}"
            )
        if isinstance(ports, list) and ports:
            port_values = [str(port.get("port")) for port in ports if isinstance(port, dict) and port.get("port")]
            if port_values:
                summary_lines.append(f"Service ports: {', '.join(port_values[:8])}")

    if kind == "CustomResourceDefinition":
        crd_spec = spec
        names = crd_spec.get("names", {}) if isinstance(crd_spec.get("names"), dict) else {}
        versions = crd_spec.get("versions", [])
        if crd_spec.get("group"):
            summary_lines.append(f"CRD group: {crd_spec.get('group')}")
        if names.get("kind"):
            summary_lines.append(f"CRD served kind: {names.get('kind')}")
        if isinstance(versions, list) and versions:
            version_names = [str(version.get("name")) for version in versions if isinstance(version, dict) and version.get("name")]
            if version_names:
                summary_lines.append(f"CRD versions: {', '.join(version_names[:8])}")

    if kind in {"Role", "ClusterRole"}:
        # Rules live at the top level for RBAC kinds; spec is the fallback.
        rules = spec.get("rules", parsed.get("rules"))
        if isinstance(rules, list) and rules:
            resource_names = []
            verbs = []
            for rule in rules[:4]:
                if isinstance(rule, dict):
                    resource_names.extend(str(item) for item in rule.get("resources", [])[:4])
                    verbs.extend(str(item) for item in rule.get("verbs", [])[:4])
            if resource_names:
                summary_lines.append(f"RBAC resources: {', '.join(resource_names[:10])}")
            if verbs:
                summary_lines.append(f"RBAC verbs: {', '.join(verbs[:10])}")

    if kind in {"RoleBinding", "ClusterRoleBinding"}:
        role_ref = parsed.get("roleRef", {})
        subjects = parsed.get("subjects", [])
        if isinstance(role_ref, dict) and role_ref.get("name"):
            summary_lines.append(f"Binding roleRef: {role_ref.get('name')}")
        if isinstance(subjects, list) and subjects:
            subject_names = [
                str(subject.get("name"))
                for subject in subjects
                if isinstance(subject, dict) and subject.get("name")
            ]
            if subject_names:
                summary_lines.append(f"Binding subjects: {', '.join(subject_names[:10])}")

    if kind in {"AuthorizationPolicy", "PeerAuthentication", "VirtualService", "Gateway", "DestinationRule"}:
        selector = spec.get("selector", {})
        if isinstance(selector, dict):
            match_labels = selector.get("matchLabels", {})
            if isinstance(match_labels, dict) and match_labels:
                summary_lines.append(
                    f"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}"
                )
        gateways = spec.get("gateways")
        hosts = spec.get("hosts")
        if isinstance(gateways, list) and gateways:
            summary_lines.append(f"Istio gateways: {', '.join(str(g) for g in gateways[:8])}")
        if isinstance(hosts, list) and hosts:
            summary_lines.append(f"Istio hosts: {', '.join(str(h) for h in hosts[:8])}")

    return "\n".join(f"# {line}" for line in summary_lines if line)


def generate_chunk_id(file_path: str, symbol_name: str, index: int) -> str:
    """Generate a deterministic chunk ID.

    Args:
        file_path: Relative file path.
        symbol_name: Function/class/resource name.
        index: Sequential index.

    Returns:
        SHA256 hash string (first 32 chars).
    """
    raw = f"{file_path}::{symbol_name}::{index}"
    return hashlib.sha256(raw.encode()).hexdigest()[:32]
+ """ + raw = f"{file_path}::{symbol_name}::{index}" + return hashlib.sha256(raw.encode()).hexdigest()[:32] + + +# ─── Python Parser ────────────────────────────────────────────────────────── + +def parse_python(content: str, file_path: str, commit_sha: str, + folder_context: str) -> List[Dict[str, Any]]: + """Parse Python source into function and class chunks via AST. + + Args: + content: Python source code. + file_path: Relative file path. + commit_sha: Git commit SHA. + folder_context: Top-level folder name. + + Returns: + List of chunk dicts. + """ + chunks = [] + lines = content.split("\n") + + try: + tree = ast.parse(content) + except SyntaxError as e: + logger.warning("Syntax error in %s: %s", file_path, e) + # Fall back to whole-file chunk + return [{ + "chunk_id": generate_chunk_id(file_path, "module", 0), + "file_path": file_path, + "extension": ".py", + "language": "python", + "symbol_name": os.path.basename(file_path), + "chunk_text": content, + "start_line": 1, + "end_line": len(lines), + "commit_sha": commit_sha, + "folder_context": folder_context, + }] + + idx = 0 + for node in ast.walk(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + symbol_name = node.name + start_line = node.lineno + end_line = node.end_lineno or start_line + + # Extract the source lines + chunk_lines = lines[start_line - 1 : end_line] + chunk_text = "\n".join(chunk_lines) + + # Extract docstring if present + docstring = ast.get_docstring(node) or "" + symbol_type = "class" if isinstance(node, ast.ClassDef) else "function" + + chunks.append({ + "chunk_id": generate_chunk_id(file_path, symbol_name, idx), + "file_path": file_path, + "extension": ".py", + "language": "python", + "symbol_name": f"{symbol_type}:{symbol_name}", + "chunk_text": chunk_text, + "start_line": start_line, + "end_line": end_line, + "commit_sha": commit_sha, + "folder_context": folder_context, + }) + idx += 1 + + # If no functions/classes found, treat whole file as one 
chunk + if not chunks: + chunks.append({ + "chunk_id": generate_chunk_id(file_path, "module", 0), + "file_path": file_path, + "extension": ".py", + "language": "python", + "symbol_name": f"module:{os.path.basename(file_path)}", + "chunk_text": content, + "start_line": 1, + "end_line": len(lines), + "commit_sha": commit_sha, + "folder_context": folder_context, + }) + + return chunks + + +# ─── Go Parser ────────────────────────────────────────────────────────────── + +def parse_go(content: str, file_path: str, commit_sha: str, + folder_context: str) -> List[Dict[str, Any]]: + """Parse Go source by splitting on func and type struct boundaries. + + Args: + content: Go source code. + file_path: Relative file path. + commit_sha: Git commit SHA. + folder_context: Top-level folder name. + + Returns: + List of chunk dicts. + """ + chunks = [] + lines = content.split("\n") + + # Match func declarations and type struct declarations + pattern = re.compile( + r"^(?:func\s+(?:\([^)]+\)\s+)?(\w+)|type\s+(\w+)\s+struct)\b", + re.MULTILINE, + ) + + matches = list(pattern.finditer(content)) + + if not matches: + # Whole file as one chunk + return [{ + "chunk_id": generate_chunk_id(file_path, "file", 0), + "file_path": file_path, + "extension": ".go", + "language": "go", + "symbol_name": f"file:{os.path.basename(file_path)}", + "chunk_text": content, + "start_line": 1, + "end_line": len(lines), + "commit_sha": commit_sha, + "folder_context": folder_context, + }] + + for i, match in enumerate(matches): + symbol = match.group(1) or match.group(2) + start_pos = match.start() + end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content) + + chunk_text = content[start_pos:end_pos].rstrip() + start_line = content[:start_pos].count("\n") + 1 + end_line = start_line + chunk_text.count("\n") + + is_struct = match.group(2) is not None + symbol_type = "struct" if is_struct else "func" + + chunks.append({ + "chunk_id": generate_chunk_id(file_path, symbol, i), + "file_path": 
# ─── YAML Parser ────────────────────────────────────────────────────────────

def parse_yaml(content: str, file_path: str, commit_sha: str,
               folder_context: str) -> List[Dict[str, Any]]:
    """Parse YAML by splitting on Kubernetes resource kind boundaries.

    Args:
        content: YAML content (may contain multiple documents).
        file_path: Relative file path.
        commit_sha: Git commit SHA.
        folder_context: Top-level folder name.

    Returns:
        List of chunk dicts.
    """
    documents = content.split("\n---")
    ext = os.path.splitext(file_path)[1].lower()

    def _chunk(symbol: str, text: str, first: int, last: int, seq: int) -> Dict[str, Any]:
        return {
            "chunk_id": generate_chunk_id(file_path, symbol, seq),
            "file_path": file_path,
            "extension": ext,
            "language": "yaml",
            "symbol_name": symbol,
            "chunk_text": text,
            "start_line": first,
            "end_line": last,
            "commit_sha": commit_sha,
            "folder_context": folder_context,
        }

    results: List[Dict[str, Any]] = []
    for seq, raw_doc in enumerate(documents):
        body = raw_doc.strip()
        if not body:
            continue

        try:
            data = yaml.safe_load(body)
        except yaml.YAMLError:
            data = None

        if isinstance(data, dict):
            res_kind = data.get("kind", "Unknown")
            meta = data.get("metadata", {})
            res_name = meta.get("name", "unknown") if isinstance(meta, dict) else "unknown"
            symbol = f"{res_kind}:{res_name}"
            context = build_manifest_context(data, file_path, folder_context)
            text = f"{context}\n\n{body}" if context else body
        else:
            symbol = f"fragment:{seq}"
            text = body

        # Approximate line numbers from the raw text preceding this doc.
        prior = "\n---".join(documents[:seq])
        first = prior.count("\n") + 1 if prior else 1
        last = first + body.count("\n")
        results.append(_chunk(symbol, text, first, last, seq))

    if not results:
        results.append(_chunk(f"file:{os.path.basename(file_path)}",
                              content, 1, content.count("\n") + 1, 0))
        # Keep the original id scheme: the id symbol for the fallback is "file".
        results[-1]["chunk_id"] = generate_chunk_id(file_path, "file", 0)

    return results


# ─── Markdown Parser ────────────────────────────────────────────────────────

def parse_markdown(content: str, file_path: str, commit_sha: str,
                   folder_context: str) -> List[Dict[str, Any]]:
    """Parse Markdown by H2/H3 headings.

    Args:
        content: Markdown content.
        file_path: Relative file path.
        commit_sha: Git commit SHA.
        folder_context: Top-level folder name.

    Returns:
        List of chunk dicts.
    """
    heading_re = re.compile(r"^(#{2,3})\s+(.+)$", re.MULTILINE)
    hits = list(heading_re.finditer(content))

    def _chunk(id_symbol: str, display: str, text: str,
               first: int, last: int, seq: int) -> Dict[str, Any]:
        return {
            "chunk_id": generate_chunk_id(file_path, id_symbol, seq),
            "file_path": file_path,
            "extension": ".md",
            "language": "markdown",
            "symbol_name": display,
            "chunk_text": text,
            "start_line": first,
            "end_line": last,
            "commit_sha": commit_sha,
            "folder_context": folder_context,
        }

    if not hits:
        # No headings: whole document as one chunk.
        return [_chunk("doc", f"doc:{os.path.basename(file_path)}",
                       content, 1, content.count("\n") + 1, 0)]

    results: List[Dict[str, Any]] = []
    for seq, hit in enumerate(hits):
        heading = hit.group(2).strip()
        begin = hit.start()
        finish = hits[seq + 1].start() if seq + 1 < len(hits) else len(content)
        text = content[begin:finish].strip()
        first = content[:begin].count("\n") + 1
        last = first + text.count("\n")
        # Full heading feeds the id; the display name is capped at 100 chars.
        results.append(_chunk(heading, f"heading:{heading[:100]}",
                              text, first, last, seq))

    return results
"folder_context": folder_context, + }) + + return chunks + + +# ─── Main Dispatcher ──────────────────────────────────────────────────────── + +PARSERS = { + ".py": parse_python, + ".go": parse_go, + ".yaml": parse_yaml, + ".yml": parse_yaml, + ".md": parse_markdown, +} + + +def parse_file( + content: str, + file_path: str, + extension: str, + commit_sha: str, + folder_context: str, +) -> List[Dict[str, Any]]: + """Parse a file into chunks using the appropriate language parser. + + Args: + content: File content string. + file_path: Relative file path. + extension: File extension (e.g., '.py'). + commit_sha: Git commit SHA. + folder_context: Top-level folder name. + + Returns: + List of chunk dicts. + """ + parser = PARSERS.get(extension.lower()) + if parser is None: + logger.warning("No parser for extension: %s (%s)", extension, file_path) + return [] + + try: + return parser(content, file_path, commit_sha, folder_context) + except Exception as e: + logger.error("Parser error for %s: %s", file_path, e) + return [] + + +def parse_all_files( + repo_dir: str, + file_list: List[Dict[str, Any]], + commit_sha: str, +) -> List[Dict[str, Any]]: + """Parse all files in the file list. + + Args: + repo_dir: Repository root directory. + file_list: List of file info dicts from repo_cloner. + commit_sha: Git commit SHA. + + Returns: + List of all chunk dicts across all files. 
+ """ + all_chunks = [] + + for i, file_info in enumerate(file_list): + file_path = file_info["path"] + extension = file_info["extension"] + folder_context = file_info.get("folder_context", "root") + + full_path = os.path.join(repo_dir, file_path) + try: + with open(full_path, "r", encoding="utf-8", errors="replace") as f: + content = f.read() + except Exception as e: + logger.warning("Cannot read %s: %s", file_path, e) + continue + + chunks = parse_file(content, file_path, extension, commit_sha, folder_context) + all_chunks.extend(chunks) + + if (i + 1) % 50 == 0: + logger.info("Parsed %d/%d files (%d chunks so far)", i + 1, len(file_list), len(all_chunks)) + + logger.info("AST parsing complete: %d chunks from %d files.", len(all_chunks), len(file_list)) + return all_chunks + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + # Test Python parser + py_code = ''' +def hello_world(): + """Say hello.""" + print("Hello, World!") + +class MyClass: + """A test class.""" + def method(self): + pass +''' + chunks = parse_python(py_code, "test.py", "abc123", "tests") + logger.info("=== Python Parser Test ===") + for c in chunks: + logger.info(" %s [L%d-%d]", c["symbol_name"], c["start_line"], c["end_line"]) + + # Test YAML parser + yaml_content = '''apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app +spec: + replicas: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: my-service +''' + chunks = parse_yaml(yaml_content, "deploy.yaml", "abc123", "apps") + logger.info("=== YAML Parser Test ===") + for c in chunks: + logger.info(" %s [L%d-%d]", c["symbol_name"], c["start_line"], c["end_line"]) diff --git a/pipelines/code_ingestion/components/chunker.py b/pipelines/code_ingestion/components/chunker.py new file mode 100644 index 0000000..872a606 --- /dev/null +++ b/pipelines/code_ingestion/components/chunker.py @@ -0,0 +1,237 @@ +""" +Code Ingestion — Chunker Component + +Post-processes AST parser 
# === pipelines/code_ingestion/components/chunker.py ===
"""
Code Ingestion — Chunker Component

Post-processes AST parser output to enforce token limits and add
context headers to each chunk.

Features:
  - Enforces 50-512 token limits
  - Prepends context header: # File: ... | Symbol: ... | Lang: ...
  - Splits oversized chunks at logical boundaries (blank lines)
"""

import hashlib
import json
import logging
import os
from typing import Any, Dict, List

logger = logging.getLogger(__name__)

# Prefer tiktoken's cl100k_base tokenizer; fall back to a cheap
# word-count heuristic when tiktoken is not installed.
try:
    import tiktoken
    _ENCODER = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text: str) -> int:
        return len(_ENCODER.encode(text))
except ImportError:
    def count_tokens(text: str) -> int:
        return int(len(text.split()) * 1.3)

MIN_TOKENS = 50
MAX_TOKENS = 512


def build_path_hints(chunk: Dict[str, Any]) -> str:
    """Build a normalized path-hint string for retrieval context."""
    pieces = [
        str(chunk.get(key, ""))
        for key in ("file_path", "folder_context", "symbol_name")
    ]
    text = " ".join(pieces)
    # Path and identifier separators become spaces.
    for separator in ("/", "_", "-"):
        text = text.replace(separator, " ")
    # Break camelCase: insert a space before an uppercase letter that
    # directly follows a lowercase one (positions refer to `text`).
    spaced_chars: List[str] = []
    for position, char in enumerate(text):
        if position > 0 and char.isupper() and text[position - 1].islower():
            spaced_chars.append(" ")
        spaced_chars.append(char)
    return " ".join("".join(spaced_chars).split()).lower()


def make_context_header(chunk: Dict[str, Any]) -> str:
    """Create a context header string for a code chunk.

    This header is prepended to the chunk text before embedding to help
    the model understand the code's origin and purpose.

    Args:
        chunk: Chunk dict with file_path, symbol_name, language.

    Returns:
        Context header string.
    """
    header = (
        f"# File: {chunk.get('file_path', 'unknown')} "
        f"| Symbol: {chunk.get('symbol_name', 'unknown')} "
        f"| Lang: {chunk.get('language', 'unknown')} "
        f"| Folder: {chunk.get('folder_context', 'unknown')}"
    )
    hints = build_path_hints(chunk)
    return f"{header}\n# Path Hints: {hints}" if hints else header


def split_oversized_chunk(text: str, max_tokens: int) -> List[str]:
    """Split an oversized chunk at logical boundaries.

    Tries to split at blank lines first, then single newlines,
    then falls back to word splitting.

    Args:
        text: Text to split.
        max_tokens: Maximum tokens per sub-chunk.

    Returns:
        List of sub-chunk strings.
    """
    if count_tokens(text) <= max_tokens:
        return [text]

    # Greedy accumulation at blank-line, then single-line boundaries.
    for separator in ("\n\n", "\n"):
        parts = text.split(separator)
        if len(parts) <= 1:
            continue

        pieces: List[str] = []
        buffer = ""
        for part in parts:
            tentative = buffer + separator + part if buffer else part
            if count_tokens(tentative) > max_tokens:
                if buffer.strip():
                    pieces.append(buffer.strip())
                buffer = part
            else:
                buffer = tentative
        if buffer.strip():
            pieces.append(buffer.strip())

        if len(pieces) > 1:
            return pieces

    # Last resort: split on words.
    pieces = []
    held: List[str] = []
    for word in text.split():
        held.append(word)
        if count_tokens(" ".join(held)) > max_tokens:
            if len(held) > 1:
                pieces.append(" ".join(held[:-1]))
                held = [word]
    if held:
        pieces.append(" ".join(held))

    return pieces
+ """ + processed = [] + skipped_short = 0 + split_count = 0 + # Track index per file_path + file_indices = {} + + for chunk in raw_chunks: + # Add context header + header = make_context_header(chunk) + full_text = f"{header}\n\n{chunk['chunk_text']}" + token_count = count_tokens(full_text) + + if token_count < MIN_TOKENS: + skipped_short += 1 + continue + + if token_count <= MAX_TOKENS: + processed_chunk = chunk.copy() + fp = chunk.get("file_path", "unknown") + ci = file_indices.get(fp, 0) + processed_chunk["chunk_text"] = full_text[:8192] + processed_chunk["token_count"] = token_count + processed_chunk["chunk_index"] = ci + processed.append(processed_chunk) + file_indices[fp] = ci + 1 + else: + # Split oversized chunk + sub_chunks = split_oversized_chunk(full_text, MAX_TOKENS) + split_count += 1 + + for idx, sub_text in enumerate(sub_chunks): + sub_tokens = count_tokens(sub_text) + if sub_tokens < MIN_TOKENS: + continue + + sub_chunk = chunk.copy() + fp = chunk.get("file_path", "unknown") + ci = file_indices.get(fp, 0) + sub_chunk["chunk_id"] = hashlib.sha256( + f"{chunk['chunk_id']}::{idx}".encode() + ).hexdigest()[:32] + sub_chunk["chunk_text"] = sub_text[:8192] + sub_chunk["token_count"] = sub_tokens + sub_chunk["chunk_index"] = ci + processed.append(sub_chunk) + file_indices[fp] = ci + 1 + + logger.info( + "Chunker: %d input -> %d output (%d short skipped, %d split)", + len(raw_chunks), len(processed), skipped_short, split_count, + ) + return processed + + +def save_chunks(chunks: List[Dict[str, Any]], output_path: str) -> None: + """Save chunks to a JSONL file. + + Args: + chunks: List of chunk dicts. + output_path: Path to write the file. 
+ """ + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + for chunk in chunks: + f.write(json.dumps(chunk, ensure_ascii=False) + "\n") + logger.info("Saved %d chunks to %s", len(chunks), output_path) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + test_chunks = [ + { + "chunk_id": "abc123", + "file_path": "apps/pipeline/upstream/kfp/v2/compiler.py", + "extension": ".py", + "language": "python", + "symbol_name": "function:compile_pipeline", + "chunk_text": "def compile_pipeline(pipeline_func):\n \"\"\"Compile a pipeline function.\"\"\"\n return compiled", + "start_line": 10, + "end_line": 12, + "commit_sha": "deadbeef", + "folder_context": "apps", + }, + ] + + result = process_chunks(test_chunks) + logger.info("=== Code Chunker Test ===") + for c in result: + logger.info(" %s (tokens=%d)", c["symbol_name"], c.get("token_count", 0)) + logger.info(" Text preview: %s...", c["chunk_text"][:100]) diff --git a/pipelines/code_ingestion/components/embedder.py b/pipelines/code_ingestion/components/embedder.py new file mode 100644 index 0000000..de49f62 --- /dev/null +++ b/pipelines/code_ingestion/components/embedder.py @@ -0,0 +1,85 @@ +""" +Code Ingestion — Embedder Component + +Embeds code chunks using configurable embedding model. +Identical to docs embedder but imports from shared utilities. + +The context header prepended by the chunker is included in the +embedding input so vectors capture both code semantics and file location. 
"""
Code Ingestion — Embedder Component

Embeds code chunks using configurable embedding model.
Identical to docs embedder but imports from shared utilities.

The context header prepended by the chunker is included in the
embedding input so vectors capture both code semantics and file location.
"""

import json
import logging
import os
import sys
from typing import Any, Dict, List

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from pipelines.shared.embedding_utils import EmbeddingClient

logger = logging.getLogger(__name__)


def embed_code_chunks(
    chunks: List[Dict[str, Any]],
    batch_size: int = 32,
) -> List[Dict[str, Any]]:
    """Embed all code chunks and add embeddings to each chunk dict.

    Args:
        chunks: List of chunk dicts (must have 'chunk_text' key).
        batch_size: Batch size for embedding.

    Returns:
        Same chunk dicts with added 'embedding' key.
    """
    if not chunks:
        logger.warning("No chunks to embed.")
        return []

    client = EmbeddingClient(batch_size=batch_size)
    payload = [item["chunk_text"] for item in chunks]

    logger.info("Embedding %d code chunks with model: %s", len(payload), client.model_name)
    vectors = client.embed_texts(payload)

    # Attach each vector to its originating chunk, in order.
    for item, vector in zip(chunks, vectors):
        item["embedding"] = vector

    logger.info("Embedding complete. %d code chunks embedded.", len(chunks))
    return chunks


def load_chunks(input_path: str) -> List[Dict[str, Any]]:
    """Load chunks from a JSONL file."""
    with open(input_path, "r", encoding="utf-8") as handle:
        return [json.loads(row) for row in handle if row.strip()]


def save_embedded_chunks(chunks: List[Dict[str, Any]], output_path: str) -> None:
    """Save embedded chunks to a JSONL file."""
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in chunks
        )
    logger.info("Saved %d embedded chunks to %s", len(chunks), output_path)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    test_chunks = [
        {
            "chunk_id": "test001",
            "chunk_text": "# File: apps/kfp/compiler.py | Symbol: func:compile | Lang: python\n\ndef compile(): pass",
            "file_path": "apps/kfp/compiler.py",
            "language": "python",
        },
    ]
    result = embed_code_chunks(test_chunks)
    for c in result:
        logger.info("  chunk_id=%s dim=%d", c["chunk_id"], len(c.get("embedding", [])))
"""
Code Ingestion — Loader Component

Loads embedded code chunks into the Milvus code_collection.
Uses upsert pattern with chunk_id as primary key.
"""

import json
import logging
import os
import sys
from typing import Any, Dict, List

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))

from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                      connections, utility)

from pipelines.shared.milvus_utils import connect, create_collection_if_not_exists, upsert_batch
from backend.schemas.code_collection_schema import (
    COLLECTION_NAME,
    get_code_fields,
    get_code_index_params,
)

logger = logging.getLogger(__name__)


def should_recreate_collection() -> bool:
    """Return whether the loader should drop and recreate the collection.

    This is disabled by default so local re-runs preserve previously indexed
    data and rely on primary-key upserts instead of destructive reloads.
    """
    flag = os.environ.get("MILVUS_DROP_EXISTING", "false")
    return flag.lower() == "true"


def load_to_milvus(
    chunks: List[Dict[str, Any]],
    collection_name: str = None,
) -> Dict[str, int]:
    """Load embedded code chunks into Milvus code_collection.

    Args:
        chunks: List of chunk dicts with embeddings.
        collection_name: Override collection name.

    Returns:
        Ingestion summary with inserted, failed, total counts.
    """
    target = collection_name or COLLECTION_NAME

    connect()

    # Drop-and-recreate only when explicitly requested via env flag.
    if should_recreate_collection() and utility.has_collection(target):
        utility.drop_collection(target)
        logger.info("Dropped existing collection %s for schema refresh", target)

    fields = get_code_fields()
    index_params = get_code_index_params()
    collection = create_collection_if_not_exists(
        collection_name=target,
        fields=fields,
        description="Kubeflow manifests code chunks for RAG retrieval",
        index_field="embedding",
        index_params=index_params,
    )

    def as_row(item: Dict[str, Any]) -> Dict[str, Any]:
        # Clamp every string field to its schema VARCHAR limit and coerce
        # line numbers / indices to int so Milvus accepts the row.
        return {
            "chunk_id": str(item["chunk_id"])[:128],
            "file_path": str(item.get("file_path", ""))[:512],
            "extension": str(item.get("extension", ""))[:16],
            "language": str(item.get("language", ""))[:32],
            "symbol_name": str(item.get("symbol_name", ""))[:256],
            "folder_context": str(item.get("folder_context", ""))[:128],
            "chunk_text": str(item.get("chunk_text", ""))[:8192],
            "start_line": int(item.get("start_line", 0)),
            "end_line": int(item.get("end_line", 0)),
            "commit_sha": str(item.get("commit_sha", ""))[:64],
            "chunk_index": int(item.get("chunk_index", 0)),
            "embedding": item["embedding"],
        }

    # Chunks without an embedding are counted as skipped, not inserted.
    rows = [as_row(item) for item in chunks if "embedding" in item]

    if not rows:
        return {"inserted": 0, "failed": 0, "total": 0, "skipped": len(chunks)}

    summary = upsert_batch(collection, rows, batch_size=100)
    summary["skipped"] = len(chunks) - len(rows)

    logger.info(
        "Code ingestion: %d inserted, %d failed, %d skipped",
        summary["inserted"], summary["failed"], summary["skipped"],
    )
    return summary


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logger.info("=== Code Loader Smoke Test ===")
    logger.info("Requires Milvus at localhost:19530")
"""
Code Ingestion — Repo Cloner Component

Clones the kubeflow/manifests repository and collects file metadata.
Records commit SHA for provenance tracking.

Features:
  - Clones via subprocess (git) or GitPython
  - Skips hidden dirs, __pycache__, node_modules
  - Size filter: skip files < 200 bytes or > 100KB
  - Groups files by extension

Environment variables:
    KUBEFLOW_MANIFESTS_REPO: Repo URL (default: https://github.com/kubeflow/manifests)
"""

import json
import logging
import os
import shutil
import subprocess
import tempfile
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Directory basenames that never contain indexable source.
# NOTE: "*.egg-info" is kept for backward compatibility, but globs are not
# expanded by the membership test; egg-info directories are matched by
# suffix in should_skip_dir() instead.
SKIP_DIRS = {
    ".git", "__pycache__", "node_modules", ".tox", ".mypy_cache",
    ".pytest_cache", ".venv", "venv", ".eggs", "*.egg-info",
}

SUPPORTED_EXTENSIONS = {".py", ".go", ".yaml", ".yml", ".md"}

MIN_FILE_SIZE = 200       # bytes; smaller files carry too little signal
MAX_FILE_SIZE = 100_000   # 100KB; larger files are usually generated


def get_repo_url() -> str:
    """Get the repository URL from environment.

    Returns:
        Repository URL string.
    """
    return os.environ.get(
        "KUBEFLOW_MANIFESTS_REPO",
        "https://github.com/kubeflow/manifests",
    )


def clone_repo(
    repo_url: Optional[str] = None,
    target_dir: Optional[str] = None,
    branch: str = "master",
) -> Dict[str, Any]:
    """Clone a git repository and collect file metadata.

    Args:
        repo_url: Repository URL to clone.
        target_dir: Directory to clone into (temp dir if None).
        branch: Git branch to clone.

    Returns:
        Dict with commit_sha, repo_dir, and file_list.

    Raises:
        RuntimeError: If the git clone fails.
    """
    url = repo_url or get_repo_url()
    clone_dir = target_dir or tempfile.mkdtemp(prefix="docs-agent-code-")

    logger.info("Cloning %s (branch: %s) to %s", url, branch, clone_dir)

    try:
        # Shallow single-branch clone: we only need the tip for indexing.
        subprocess.run(
            ["git", "clone", "--depth", "1", "--branch", branch, url, clone_dir],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        logger.error("Git clone failed: %s", e.stderr)
        if target_dir is None:
            # We created this temp dir ourselves; don't leak it on failure.
            shutil.rmtree(clone_dir, ignore_errors=True)
        raise RuntimeError(f"Failed to clone {url}: {e.stderr}") from e

    # Record the commit SHA for provenance; "unknown" on failure rather
    # than aborting the whole ingestion.
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True, text=True, cwd=clone_dir, check=True,
        )
        commit_sha = result.stdout.strip()
    except subprocess.CalledProcessError:
        commit_sha = "unknown"

    logger.info("Cloned at commit: %s", commit_sha[:12])

    # Walk and collect files
    file_list = collect_files(clone_dir)

    return {
        "commit_sha": commit_sha,
        "repo_dir": clone_dir,
        "file_list": file_list,
    }


def should_skip_dir(dir_name: str) -> bool:
    """Check if a directory should be skipped.

    Args:
        dir_name: Directory basename.

    Returns:
        True if the directory should be skipped.
    """
    if dir_name.startswith("."):
        return True
    # "*.egg-info" in SKIP_DIRS is a glob that a plain membership test can
    # never match; check the suffix explicitly so setuptools metadata
    # directories (e.g. "foo.egg-info") are skipped too.
    if dir_name.endswith(".egg-info"):
        return True
    return dir_name in SKIP_DIRS


def collect_files(repo_dir: str) -> List[Dict[str, Any]]:
    """Walk a directory and collect file metadata.

    Filters by extension, size, and skips hidden/utility directories.

    Args:
        repo_dir: Root directory to walk.

    Returns:
        List of file info dicts: {path, extension, size_bytes, folder_context}.
    """
    files = []

    for root, dirs, filenames in os.walk(repo_dir):
        # Filter out directories to skip (modifies in-place so os.walk
        # never descends into them)
        dirs[:] = [d for d in dirs if not should_skip_dir(d)]

        for filename in filenames:
            filepath = os.path.join(root, filename)
            rel_path = os.path.relpath(filepath, repo_dir)

            # Check extension
            _, ext = os.path.splitext(filename)
            if ext.lower() not in SUPPORTED_EXTENSIONS:
                continue

            # Check size; unreadable entries (broken symlinks etc.) are skipped
            try:
                size = os.path.getsize(filepath)
            except OSError:
                continue

            if size < MIN_FILE_SIZE or size > MAX_FILE_SIZE:
                continue

            # Determine folder context (top-level directory)
            parts = rel_path.split(os.sep)
            folder_context = parts[0] if len(parts) > 1 else "root"

            files.append({
                "path": rel_path,
                "extension": ext.lower(),
                "size_bytes": size,
                "folder_context": folder_context,
            })

    # Log summary by extension
    ext_counts: Dict[str, int] = {}
    for f in files:
        ext_counts[f["extension"]] = ext_counts.get(f["extension"], 0) + 1

    logger.info("Collected %d files: %s", len(files), ext_counts)
    return files


def read_file_content(repo_dir: str, file_path: str) -> Optional[str]:
    """Read file content safely.

    Args:
        repo_dir: Repository root directory.
        file_path: Relative file path.

    Returns:
        File content string, or None if unreadable.
    """
    full_path = os.path.join(repo_dir, file_path)
    try:
        with open(full_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        logger.warning("Cannot read %s: %s", file_path, e)
        return None


def save_clone_results(
    result: Dict[str, Any], output_path: str
) -> None:
    """Save clone results to a JSON file.

    Args:
        result: Clone result dict.
        output_path: Path to write the file.
    """
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # Don't include repo_dir in the saved output (it's a temp path)
    save_data = {
        "commit_sha": result["commit_sha"],
        "file_count": len(result["file_list"]),
        "file_list": result["file_list"],
    }
    with open(output_path, "w") as f:
        json.dump(save_data, f, indent=2)
    logger.info("Saved clone results to %s", output_path)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logger.info("=== Repo Cloner Smoke Test ===")
    result = clone_repo()
    logger.info("Commit: %s", result["commit_sha"][:12])
    logger.info("Files: %d", len(result["file_list"]))
    for f in result["file_list"][:10]:
        logger.info("  %s (%s, %d bytes)", f["path"], f["extension"], f["size_bytes"])
    # Cleanup
    shutil.rmtree(result["repo_dir"], ignore_errors=True)
+ outputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-chunk-docs: + executorLabel: exec-chunk-docs + inputDefinitions: + artifacts: + crawled_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of crawled pages. + parameters: + chunk_overlap: + description: Token overlap between chunks. + parameterType: NUMBER_INTEGER + chunk_size: + description: Maximum tokens per chunk. + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-clone-repo: + executorLabel: exec-clone-repo + inputDefinitions: + parameters: + branch: + description: Branch name to clone. + parameterType: STRING + repo_url: + description: Repository URL to clone. + parameterType: STRING + outputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-code-ingestion-pipeline: + dag: + tasks: + chunk-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-code + dependentTasks: + - parse-code + inputs: + artifacts: + parsed_data: + taskOutputArtifact: + outputArtifactKey: parsed_data + producerTask: parse-code + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: chunk-code + clone-repo: + cachingOptions: + enableCache: true + componentRef: + name: comp-clone-repo + inputs: + parameters: + branch: + componentInputParameter: branch + repo_url: + componentInputParameter: repo_url + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: clone-repo + embed-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-embed-code + dependentTasks: + - chunk-code + inputs: + artifacts: + chunked_data: + taskOutputArtifact: + outputArtifactKey: chunked_data + 
producerTask: chunk-code + parameters: + embedding_model: + componentInputParameter: embedding_model + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: embed-code + load-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-load-code + dependentTasks: + - embed-code + inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: embed-code + parameters: + collection_name: + componentInputParameter: collection_name + embedding_dim: + componentInputParameter: embedding_dim + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: load-code + parse-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-parse-code + dependentTasks: + - clone-repo + inputs: + artifacts: + clone_data: + taskOutputArtifact: + outputArtifactKey: clone_data + producerTask: clone-repo + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: parse-code + inputDefinitions: + parameters: + branch: + defaultValue: master + isOptional: true + parameterType: STRING + collection_name: + defaultValue: code_collection + isOptional: true + parameterType: STRING + embedding_dim: + defaultValue: 384.0 + isOptional: true + parameterType: NUMBER_INTEGER + embedding_model: + defaultValue: sentence-transformers/all-MiniLM-L6-v2 + isOptional: true + parameterType: STRING + milvus_host: + defaultValue: localhost + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + repo_url: + defaultValue: https://github.com/kubeflow/manifests + isOptional: true + parameterType: STRING + comp-crawl-docs: + executorLabel: exec-crawl-docs + 
inputDefinitions: + parameters: + base_url: + description: Base URL for kubeflow docs (e.g., https://www.kubeflow.org). + parameterType: STRING + crawl_delay: + description: Delay in seconds between requests. + parameterType: NUMBER_DOUBLE + max_pages: + description: Max pages to crawl (0 = unlimited). + parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + crawled_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-docs-ingestion-pipeline: + dag: + tasks: + chunk-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-docs + dependentTasks: + - crawl-docs + inputs: + artifacts: + crawled_data: + taskOutputArtifact: + outputArtifactKey: crawled_data + producerTask: crawl-docs + parameters: + chunk_overlap: + componentInputParameter: chunk_overlap + chunk_size: + componentInputParameter: chunk_size + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: chunk-docs + crawl-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-crawl-docs + inputs: + parameters: + base_url: + componentInputParameter: base_url + crawl_delay: + componentInputParameter: crawl_delay + max_pages: + componentInputParameter: max_pages + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: crawl-docs + embed-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-embed-docs + dependentTasks: + - chunk-docs + inputs: + artifacts: + chunked_data: + taskOutputArtifact: + outputArtifactKey: chunked_data + producerTask: chunk-docs + parameters: + embedding_model: + componentInputParameter: embedding_model + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: embed-docs + load-docs: + cachingOptions: + enableCache: true + componentRef: + name: comp-load-docs + dependentTasks: + - embed-docs 
+ inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: embed-docs + parameters: + collection_name: + componentInputParameter: collection_name + embedding_dim: + componentInputParameter: embedding_dim + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: load-docs + inputDefinitions: + parameters: + base_url: + defaultValue: https://www.kubeflow.org + isOptional: true + parameterType: STRING + chunk_overlap: + defaultValue: 50.0 + isOptional: true + parameterType: NUMBER_INTEGER + chunk_size: + defaultValue: 500.0 + isOptional: true + parameterType: NUMBER_INTEGER + collection_name: + defaultValue: docs_collection + isOptional: true + parameterType: STRING + crawl_delay: + defaultValue: 1.0 + isOptional: true + parameterType: NUMBER_DOUBLE + embedding_dim: + defaultValue: 384.0 + isOptional: true + parameterType: NUMBER_INTEGER + embedding_model: + defaultValue: sentence-transformers/all-MiniLM-L6-v2 + isOptional: true + parameterType: STRING + max_pages: + defaultValue: 0.0 + isOptional: true + parameterType: NUMBER_INTEGER + milvus_host: + defaultValue: localhost + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + comp-embed-code: + executorLabel: exec-embed-code + inputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of chunked code. + parameters: + embedding_model: + description: Model name for embeddings. 
+ parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-embed-docs: + executorLabel: exec-embed-docs + inputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of chunks. + parameters: + embedding_model: + description: Model name for embeddings. + parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-load-code: + executorLabel: exec-load-code + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset with embedded chunks. + parameters: + collection_name: + description: Target collection name. + parameterType: STRING + embedding_dim: + description: Vector dimension. + parameterType: NUMBER_INTEGER + milvus_host: + description: Milvus server host. + parameterType: STRING + milvus_port: + description: Milvus server port. + parameterType: STRING + comp-load-docs: + executorLabel: exec-load-docs + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset with embedded chunks. + parameters: + collection_name: + description: Target collection name. + parameterType: STRING + embedding_dim: + description: Vector dimension. + parameterType: NUMBER_INTEGER + milvus_host: + description: Milvus server host. + parameterType: STRING + milvus_port: + description: Milvus server port. + parameterType: STRING + comp-parse-code: + executorLabel: exec-parse-code + inputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset from repo cloner. 
+ outputDefinitions: + artifacts: + parsed_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-chunk-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'tiktoken==0.7.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_code(\n parsed_data: Input[Dataset],\n chunked_data:\ + \ Output[Dataset],\n):\n \"\"\"Post-process parsed chunks with token\ + \ limits and context headers.\n\n Args:\n parsed_data: Input dataset\ + \ of parsed chunks.\n chunked_data: Output dataset of token-bounded\ + \ chunks.\n \"\"\"\n import hashlib\n import json\n import logging\n\ + \n import tiktoken\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"chunker\")\n\n enc = tiktoken.get_encoding(\"\ + cl100k_base\")\n count = lambda t: len(enc.encode(t))\n\n MIN_T, MAX_T\ + \ = 50, 512\n\n def build_path_hints(chunk):\n raw = \" \".join(str(chunk.get(key,\ + \ \"\")) for key in (\"file_path\", \"folder_context\", \"symbol_name\"\ + ))\n expanded = raw.replace(\"/\", \" \").replace(\"_\", \" \").replace(\"\ + -\", \" \")\n expanded = \"\".join(\n (\n \ + \ f\" {char}\" if index > 0 and char.isupper() and expanded[index - 1].islower()\ + \ else 
char\n )\n for index, char in enumerate(expanded)\n\ + \ )\n return \" \".join(expanded.split()).lower()\n\n raw\ + \ = []\n with open(parsed_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n raw.append(json.loads(line))\n\ + \n processed = []\n for chunk in raw:\n header = (\n \ + \ f\"# File: {chunk.get('file_path', '?')} | Symbol: {chunk.get('symbol_name',\ + \ '?')} \"\n f\"| Lang: {chunk.get('language', '?')} | Folder:\ + \ {chunk.get('folder_context', '?')}\"\n )\n path_hints =\ + \ build_path_hints(chunk)\n if path_hints:\n header =\ + \ f\"{header}\\n# Path Hints: {path_hints}\"\n full = f\"{header}\\\ + n\\n{chunk['chunk_text']}\"\n tc = count(full)\n\n if tc <\ + \ MIN_T:\n continue\n\n if tc <= MAX_T:\n chunk[\"\ + chunk_text\"] = full[:8192]\n chunk[\"token_count\"] = tc\n \ + \ processed.append(chunk)\n else:\n # Split\ + \ oversized\n parts = full.split(\"\\n\\n\")\n cur,\ + \ subs = \"\", []\n for p in parts:\n cand = cur\ + \ + \"\\n\\n\" + p if cur else p\n if count(cand) > MAX_T:\n\ + \ if cur.strip():\n subs.append(cur.strip())\n\ + \ cur = p\n else:\n \ + \ cur = cand\n if cur.strip():\n subs.append(cur.strip())\n\ + \n for si, sub in enumerate(subs):\n st = count(sub)\n\ + \ if st < MIN_T:\n continue\n \ + \ sc = chunk.copy()\n sc[\"chunk_id\"] = hashlib.sha256(f\"\ + {chunk['chunk_id']}::{si}\".encode()).hexdigest()[:32]\n \ + \ sc[\"chunk_text\"] = sub[:8192]\n sc[\"token_count\"] =\ + \ st\n processed.append(sc)\n\n logger.info(\"Chunked\ + \ %d -> %d chunks\", len(raw), len(processed))\n\n with open(chunked_data.path,\ + \ \"w\") as f:\n for c in processed:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim + exec-chunk-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_docs + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'tiktoken==0.7.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_docs(\n crawled_data: Input[Dataset],\n chunk_size:\ + \ int,\n chunk_overlap: int,\n chunked_data: Output[Dataset],\n):\n\ + \ \"\"\"Chunk crawled documentation by headings with token-aware limits.\n\ + \n Args:\n crawled_data: Input dataset of crawled pages.\n \ + \ chunk_size: Maximum tokens per chunk.\n chunk_overlap: Token\ + \ overlap between chunks.\n chunked_data: Output dataset of chunks.\n\ + \ \"\"\"\n import hashlib\n import json\n import logging\n \ + \ import re\n\n import tiktoken\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"chunker\")\n\n enc = tiktoken.get_encoding(\"\ + cl100k_base\")\n count_tokens = lambda t: len(enc.encode(t))\n\n def\ + \ gen_id(url, idx):\n return hashlib.sha256(f\"{url}::{idx}\".encode()).hexdigest()[:32]\n\ + \n def split_headings(content):\n pat = re.compile(r\"^(#{2,4})\\\ + s+(.+)$\", re.MULTILINE)\n matches = list(pat.finditer(content))\n\ + \ if not matches:\n return [{\"heading\": \"Overview\"\ + , \"text\": content.strip()}]\n sections = []\n pre = content[:matches[0].start()].strip()\n\ + \ if pre and len(pre) > 50:\n sections.append({\"heading\"\ + : \"Introduction\", \"text\": pre})\n for i, m in enumerate(matches):\n\ + \ end = 
matches[i + 1].start() if i + 1 < len(matches) else len(content)\n\ + \ text = content[m.end():end].strip()\n if text:\n\ + \ sections.append({\"heading\": m.group(2).strip(), \"text\"\ + : text})\n return sections\n\n def recursive_split(text, max_t,\ + \ overlap):\n if count_tokens(text) <= max_t:\n return\ + \ [text]\n for sep in [\"\\n\\n\", \"\\n\", \". \", \" \"]:\n \ + \ parts = text.split(sep)\n if len(parts) <= 1:\n \ + \ continue\n chunks, cur = [], \"\"\n \ + \ for p in parts:\n cand = cur + sep + p if cur else p\n\ + \ if count_tokens(cand) > max_t:\n if\ + \ cur:\n chunks.append(cur.strip())\n \ + \ words = cur.split()\n ow = max(1, int(overlap\ + \ / 1.3))\n cur = \" \".join(words[-ow:]) + sep +\ + \ p\n else:\n cur = p\n \ + \ else:\n cur = cand\n if cur.strip():\n\ + \ chunks.append(cur.strip())\n if chunks:\n \ + \ return chunks\n words = text.split()\n chunks,\ + \ cw = [], []\n for w in words:\n cw.append(w)\n \ + \ if count_tokens(\" \".join(cw)) > max_t:\n chunks.append(\"\ + \ \".join(cw[:-1]))\n ow = max(1, int(overlap / 1.3))\n \ + \ cw = cw[-ow:]\n if cw:\n chunks.append(\"\ + \ \".join(cw))\n return chunks\n\n pages = []\n with open(crawled_data.path)\ + \ as f:\n for line in f:\n if line.strip():\n \ + \ pages.append(json.loads(line))\n\n all_chunks = []\n for\ + \ page in pages:\n sections = split_headings(page[\"content\"])\n\ + \ ci = 0\n context_prefix = f\"Page: {page['title']} | Source:\ + \ {page['url']}\"\n for sec in sections:\n prefixed =\ + \ f\"{context_prefix}\\n{sec['heading']}\\n\\n{sec['text']}\"\n \ + \ tc = count_tokens(prefixed)\n if tc <= chunk_size and tc\ + \ >= 30:\n all_chunks.append({\n \"chunk_id\"\ + : gen_id(page[\"url\"], ci),\n \"source_url\": page[\"\ + url\"], \"page_title\": page[\"title\"],\n \"heading\"\ + : sec[\"heading\"][:256], \"section\": page.get(\"section\", \"\")[:128],\n\ + \ \"chunk_text\": prefixed[:16384], \"token_count\":\ + \ tc,\n \"chunk_index\": ci, \"crawled_at\": page.get(\"\ + 
crawled_at\", \"\"),\n })\n ci += 1\n \ + \ elif tc > chunk_size:\n for sub in recursive_split(prefixed,\ + \ chunk_size, chunk_overlap):\n st = count_tokens(sub)\n\ + \ if st >= 30:\n all_chunks.append({\n\ + \ \"chunk_id\": gen_id(page[\"url\"], ci),\n\ + \ \"source_url\": page[\"url\"], \"page_title\"\ + : page[\"title\"],\n \"heading\": sec[\"heading\"\ + ][:256], \"section\": page.get(\"section\", \"\")[:128],\n \ + \ \"chunk_text\": sub[:16384], \"token_count\": st,\n \ + \ \"chunk_index\": ci, \"crawled_at\": page.get(\"\ + crawled_at\", \"\"),\n })\n \ + \ ci += 1\n\n logger.info(\"Created %d chunks from %d pages.\", len(all_chunks),\ + \ len(pages))\n\n with open(chunked_data.path, \"w\") as f:\n \ + \ for c in all_chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-clone-repo: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - clone_repo + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'gitpython==3.1.43'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef clone_repo(\n repo_url: str,\n branch: str,\n clone_data:\ + \ Output[Dataset],\n):\n \"\"\"Clone a git repository and collect file\ + \ metadata.\n\n Args:\n repo_url: Repository URL to clone.\n \ + \ branch: Branch name to clone.\n clone_data: Output 
dataset\ + \ artifact.\n \"\"\"\n import json\n import logging\n import\ + \ os\n import subprocess\n import tempfile\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"repo_cloner\")\n\n SKIP_DIRS = {\"\ + .git\", \"__pycache__\", \"node_modules\", \".tox\", \".mypy_cache\"}\n\ + \ EXTENSIONS = {\".py\", \".go\", \".yaml\", \".yml\", \".md\"}\n \ + \ MIN_SIZE, MAX_SIZE = 200, 100_000\n\n clone_dir = tempfile.mkdtemp(prefix=\"\ + code-ingest-\")\n logger.info(\"Cloning %s -> %s\", repo_url, clone_dir)\n\ + \n subprocess.run(\n [\"git\", \"clone\", \"--depth\", \"1\",\ + \ \"--branch\", branch, repo_url, clone_dir],\n check=True, capture_output=True,\ + \ text=True,\n )\n\n result = subprocess.run(\n [\"git\", \"\ + rev-parse\", \"HEAD\"],\n capture_output=True, text=True, cwd=clone_dir,\ + \ check=True,\n )\n commit_sha = result.stdout.strip()\n logger.info(\"\ + Commit: %s\", commit_sha[:12])\n\n files = []\n for root, dirs, fnames\ + \ in os.walk(clone_dir):\n dirs[:] = [d for d in dirs if d not in\ + \ SKIP_DIRS and not d.startswith(\".\")]\n for fn in fnames:\n \ + \ fp = os.path.join(root, fn)\n rel = os.path.relpath(fp,\ + \ clone_dir)\n _, ext = os.path.splitext(fn)\n if\ + \ ext.lower() not in EXTENSIONS:\n continue\n \ + \ try:\n sz = os.path.getsize(fp)\n except OSError:\n\ + \ continue\n if sz < MIN_SIZE or sz > MAX_SIZE:\n\ + \ continue\n parts = rel.split(os.sep)\n \ + \ folder = parts[0] if len(parts) > 1 else \"root\"\n files.append({\"\ + path\": rel, \"extension\": ext.lower(),\n \"size_bytes\"\ + : sz, \"folder_context\": folder})\n\n logger.info(\"Collected %d files\"\ + , len(files))\n\n # Save file list + contents\n output = []\n for\ + \ f in files:\n full = os.path.join(clone_dir, f[\"path\"])\n \ + \ try:\n with open(full, \"r\", encoding=\"utf-8\", errors=\"\ + replace\") as fh:\n content = fh.read()\n except Exception:\n\ + \ continue\n output.append({**f, \"content\": content,\ + \ \"commit_sha\": 
commit_sha})\n\n with open(clone_data.path, \"w\")\ + \ as fh:\n for item in output:\n fh.write(json.dumps(item,\ + \ ensure_ascii=False) + \"\\n\")\n\n # Cleanup\n import shutil\n \ + \ shutil.rmtree(clone_dir, ignore_errors=True)\n\n" + image: python:3.11-slim + exec-crawl-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - crawl_docs + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'requests==2.31.0'\ + \ 'beautifulsoup4==4.12.3' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef crawl_docs(\n base_url: str,\n crawl_delay: float,\n \ + \ max_pages: int,\n crawled_data: Output[Dataset],\n):\n \"\"\"Crawl\ + \ kubeflow.org documentation pages via sitemap.xml.\n\n Args:\n \ + \ base_url: Base URL for kubeflow docs (e.g., https://www.kubeflow.org).\n\ + \ crawl_delay: Delay in seconds between requests.\n max_pages:\ + \ Max pages to crawl (0 = unlimited).\n crawled_data: Output dataset\ + \ artifact for crawled pages.\n \"\"\"\n import json\n import logging\n\ + \ import re\n import time\n from datetime import datetime, timezone\n\ + \ from xml.etree import ElementTree\n\n import requests\n from\ + \ bs4 import BeautifulSoup\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"crawler\")\n\n SITEMAP_NS = {\"ns\"\ + : 
\"http://www.sitemaps.org/schemas/sitemap/0.9\"}\n\n def fetch(url,\ + \ retries=3):\n for attempt in range(retries):\n try:\n\ + \ resp = requests.get(url, timeout=30, headers={\n \ + \ \"User-Agent\": \"Kubeflow-DocsAgent-Crawler/1.0\"\n \ + \ })\n resp.raise_for_status()\n \ + \ return resp\n except Exception as e:\n if\ + \ attempt < retries - 1:\n time.sleep(2 ** attempt)\n\ + \ else:\n logger.error(\"Failed: %s \u2014\ + \ %s\", url, e)\n return None\n\n def fix_url(raw):\n prefix\ + \ = \"https://www.kubeflow.org\"\n if raw.startswith(prefix + prefix):\n\ + \ return raw[len(prefix):]\n if raw.startswith(prefix\ + \ + \"https://\"):\n return raw[len(prefix):]\n return\ + \ raw\n\n # Parse sitemap\n resp = fetch(f\"{base_url}/sitemap.xml\"\ + )\n if not resp:\n raise RuntimeError(\"Cannot fetch sitemap\"\ + )\n\n root = ElementTree.fromstring(resp.content)\n urls = []\n \ + \ for elem in root.findall(\"ns:url\", SITEMAP_NS):\n loc = elem.find(\"\ + ns:loc\", SITEMAP_NS)\n if loc is not None and loc.text:\n \ + \ url = fix_url(loc.text.strip())\n if \"/docs/\" in url:\n\ + \ urls.append(url)\n\n if max_pages > 0:\n urls\ + \ = urls[:max_pages]\n\n logger.info(\"Found %d docs URLs from sitemap.\"\ + , len(urls))\n\n results = []\n crawled_at = datetime.now(timezone.utc).isoformat()\n\ + \n for i, url in enumerate(urls):\n logger.info(\"[%d/%d] %s\"\ + , i + 1, len(urls), url)\n resp = fetch(url)\n if not resp:\n\ + \ continue\n\n soup = BeautifulSoup(resp.text, \"html.parser\"\ + )\n title_tag = soup.find(\"title\")\n title = title_tag.get_text(strip=True)\ + \ if title_tag else \"Untitled\"\n title = re.sub(r\"\\s*[|\u2013\ + -]\\s*Kubeflow\\s*$\", \"\", title)\n\n for sel in [\"nav\", \"footer\"\ + , \"header\", \"aside\", \"script\", \"style\",\n \"\ + .navbar\", \".sidebar\", \".toc\", \".breadcrumb\",\n \ + \ \".td-sidebar\", \".td-toc\", \".td-navbar\",\n \ + \ \".edit-page\", \".page-meta\"]:\n for el in soup.select(sel):\n\ + \ el.decompose()\n\n main = 
(\n soup.find(\"\ + main\")\n or soup.find(\"article\")\n or soup.find(\"\ + div\", class_=re.compile(r\"content|td-content|docs-content\"))\n \ + \ or soup.body\n or soup\n )\n\n # Preserve\ + \ heading structure so downstream chunking can split correctly.\n \ + \ for tag_name, md_prefix in [(\"h1\", \"#\"), (\"h2\", \"##\"), (\"h3\"\ + , \"###\"), (\"h4\", \"####\")]:\n for heading_tag in main.find_all(tag_name):\n\ + \ heading_text = heading_tag.get_text(strip=True)\n \ + \ if heading_text:\n heading_tag.replace_with(f\"\ + \\n\\n{md_prefix} {heading_text}\\n\\n\")\n\n # Preserve code blocks\ + \ as inline fenced text markers instead of flattening them away.\n \ + \ for code_tag in main.find_all([\"code\", \"pre\"]):\n code_text\ + \ = code_tag.get_text()\n code_tag.replace_with(f\"`{code_text}`\"\ + )\n\n content = main.get_text(separator=\"\\n\", strip=False)\n \ + \ content = re.sub(r\"\\n{3,}\", \"\\n\\n\", content)\n content\ + \ = re.sub(r\"[ \\t]+\", \" \", content).strip()\n\n if len(content)\ + \ < 200:\n continue\n\n section_match = re.search(r\"\ + /docs/([^/]+)\", url)\n section = section_match.group(1) if section_match\ + \ else \"root\"\n\n results.append({\n \"url\": url, \"\ + title\": title.strip(), \"content\": content.strip(),\n \"section\"\ + : section, \"crawled_at\": crawled_at,\n })\n\n if crawl_delay\ + \ > 0 and i < len(urls) - 1:\n time.sleep(crawl_delay)\n\n \ + \ logger.info(\"Crawled %d pages.\", len(results))\n\n with open(crawled_data.path,\ + \ \"w\") as f:\n for r in results:\n f.write(json.dumps(r,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim + exec-embed-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - embed_code + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers==2.7.0'\ + \ 'torch==2.3.0' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef embed_code(\n chunked_data: Input[Dataset],\n embedding_model:\ + \ str,\n embedded_data: Output[Dataset],\n):\n \"\"\"Embed code chunks\ + \ using configurable model.\n\n Args:\n chunked_data: Input dataset\ + \ of chunked code.\n embedding_model: Model name for embeddings.\n\ + \ embedded_data: Output dataset with embeddings.\n \"\"\"\n \ + \ import json\n import logging\n\n from sentence_transformers import\ + \ SentenceTransformer\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"embedder\")\n\n chunks = []\n with\ + \ open(chunked_data.path) as f:\n for line in f:\n if\ + \ line.strip():\n chunks.append(json.loads(line))\n\n \ + \ logger.info(\"Embedding %d code chunks with %s\", len(chunks), embedding_model)\n\ + \ model = SentenceTransformer(embedding_model)\n\n texts = [c[\"chunk_text\"\ + ] for c in chunks]\n bs = 32\n all_embs = []\n for i in range(0,\ + \ len(texts), bs):\n batch = texts[i:i + bs]\n embs = model.encode(batch,\ + \ show_progress_bar=False)\n all_embs.extend([e.tolist() for e in\ + \ embs])\n logger.info(\"Batch %d/%d\", i // bs + 1, (len(texts)\ + \ + bs - 1) // bs)\n\n for c, e in zip(chunks, all_embs):\n c[\"\ + embedding\"] = e\n\n with 
open(embedded_data.path, \"w\") as f:\n \ + \ for c in chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-embed-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - embed_docs + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers==2.7.0'\ + \ 'torch==2.3.0' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef embed_docs(\n chunked_data: Input[Dataset],\n embedding_model:\ + \ str,\n embedded_data: Output[Dataset],\n):\n \"\"\"Embed documentation\ + \ chunks using configurable model.\n\n Args:\n chunked_data: Input\ + \ dataset of chunks.\n embedding_model: Model name for embeddings.\n\ + \ embedded_data: Output dataset with embeddings.\n \"\"\"\n \ + \ import json\n import logging\n\n from sentence_transformers import\ + \ SentenceTransformer\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"embedder\")\n\n chunks = []\n with\ + \ open(chunked_data.path) as f:\n for line in f:\n if\ + \ line.strip():\n chunks.append(json.loads(line))\n\n \ + \ logger.info(\"Embedding %d chunks with %s\", len(chunks), embedding_model)\n\ + \ model = SentenceTransformer(embedding_model)\n\n texts = [c[\"chunk_text\"\ + ] for c in chunks]\n batch_size = 32\n all_embeddings = []\n\n \ + \ for i 
in range(0, len(texts), batch_size):\n batch = texts[i:i\ + \ + batch_size]\n embs = model.encode(batch, show_progress_bar=False)\n\ + \ all_embeddings.extend([e.tolist() for e in embs])\n logger.info(\"\ + Batch %d/%d done.\", i // batch_size + 1,\n (len(texts)\ + \ + batch_size - 1) // batch_size)\n\n for chunk, emb in zip(chunks,\ + \ all_embeddings):\n chunk[\"embedding\"] = emb\n\n logger.info(\"\ + Embedding complete.\")\n\n with open(embedded_data.path, \"w\") as f:\n\ + \ for c in chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-load-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - load_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus==2.4.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef load_code(\n embedded_data: Input[Dataset],\n milvus_host:\ + \ str,\n milvus_port: str,\n collection_name: str,\n embedding_dim:\ + \ int,\n):\n \"\"\"Load embedded code chunks into Milvus code_collection.\n\ + \n Args:\n embedded_data: Input dataset with embedded chunks.\n\ + \ milvus_host: Milvus server host.\n milvus_port: Milvus server\ + \ port.\n collection_name: Target collection name.\n embedding_dim:\ + \ Vector dimension.\n \"\"\"\n import json\n import logging\n\n\ + \ from pymilvus 
import (Collection, CollectionSchema, DataType, FieldSchema,\n\ + \ connections, utility)\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"loader\")\n\n connections.connect(\"\ + default\", host=milvus_host, port=milvus_port)\n\n if not utility.has_collection(collection_name):\n\ + \ fields = [\n FieldSchema(\"chunk_id\", DataType.VARCHAR,\ + \ max_length=128, is_primary=True),\n FieldSchema(\"file_path\"\ + , DataType.VARCHAR, max_length=512),\n FieldSchema(\"extension\"\ + , DataType.VARCHAR, max_length=16),\n FieldSchema(\"language\"\ + , DataType.VARCHAR, max_length=32),\n FieldSchema(\"symbol_name\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"folder_context\"\ + , DataType.VARCHAR, max_length=128),\n FieldSchema(\"chunk_text\"\ + , DataType.VARCHAR, max_length=8192),\n FieldSchema(\"start_line\"\ + , DataType.INT64),\n FieldSchema(\"end_line\", DataType.INT64),\n\ + \ FieldSchema(\"commit_sha\", DataType.VARCHAR, max_length=64),\n\ + \ FieldSchema(\"embedding\", DataType.FLOAT_VECTOR, dim=embedding_dim),\n\ + \ ]\n schema = CollectionSchema(fields, \"Kubeflow manifests\ + \ code chunks\")\n collection = Collection(collection_name, schema)\n\ + \ collection.create_index(\"embedding\", {\n \"metric_type\"\ + : \"COSINE\", \"index_type\": \"HNSW\",\n \"params\": {\"M\"\ + : 16, \"efConstruction\": 200},\n })\n else:\n collection\ + \ = Collection(collection_name)\n\n collection.load()\n\n chunks =\ + \ []\n with open(embedded_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n chunks.append(json.loads(line))\n\ + \n rows = []\n for c in chunks:\n rows.append({\n \ + \ \"chunk_id\": str(c[\"chunk_id\"])[:128],\n \"file_path\"\ + : str(c.get(\"file_path\", \"\"))[:512],\n \"extension\": str(c.get(\"\ + extension\", \"\"))[:16],\n \"language\": str(c.get(\"language\"\ + , \"\"))[:32],\n \"symbol_name\": str(c.get(\"symbol_name\",\ + \ \"\"))[:256],\n \"folder_context\": str(c.get(\"folder_context\"\ + , 
\"\"))[:128],\n \"chunk_text\": str(c.get(\"chunk_text\", \"\ + \"))[:8192],\n \"start_line\": int(c.get(\"start_line\", 0)),\n\ + \ \"end_line\": int(c.get(\"end_line\", 0)),\n \"\ + commit_sha\": str(c.get(\"commit_sha\", \"\"))[:64],\n \"embedding\"\ + : c[\"embedding\"],\n })\n\n bs = 100\n inserted = 0\n for\ + \ i in range(0, len(rows), bs):\n batch = rows[i:i + bs]\n \ + \ collection.upsert(batch)\n inserted += len(batch)\n\n collection.flush()\n\ + \ logger.info(\"Loaded %d chunks into %s. Total: %d\",\n \ + \ inserted, collection_name, collection.num_entities)\n\n" + image: python:3.11-slim + exec-load-docs: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - load_docs + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus==2.4.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef load_docs(\n embedded_data: Input[Dataset],\n milvus_host:\ + \ str,\n milvus_port: str,\n collection_name: str,\n embedding_dim:\ + \ int,\n):\n \"\"\"Load embedded chunks into Milvus docs_collection.\n\ + \n Args:\n embedded_data: Input dataset with embedded chunks.\n\ + \ milvus_host: Milvus server host.\n milvus_port: Milvus server\ + \ port.\n collection_name: Target collection name.\n embedding_dim:\ + \ Vector dimension.\n \"\"\"\n import json\n import logging\n\n\ + \ from 
pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,\n\ + \ connections, utility)\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"loader\")\n\n connections.connect(\"\ + default\", host=milvus_host, port=milvus_port)\n logger.info(\"Connected\ + \ to Milvus at %s:%s\", milvus_host, milvus_port)\n\n if not utility.has_collection(collection_name):\n\ + \ fields = [\n FieldSchema(\"chunk_id\", DataType.VARCHAR,\ + \ max_length=128, is_primary=True),\n FieldSchema(\"source_url\"\ + , DataType.VARCHAR, max_length=512),\n FieldSchema(\"page_title\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"heading\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"section\"\ + , DataType.VARCHAR, max_length=128),\n FieldSchema(\"chunk_text\"\ + , DataType.VARCHAR, max_length=16384),\n FieldSchema(\"token_count\"\ + , DataType.INT64),\n FieldSchema(\"chunk_index\", DataType.INT64),\n\ + \ FieldSchema(\"crawled_at\", DataType.VARCHAR, max_length=64),\n\ + \ FieldSchema(\"embedding\", DataType.FLOAT_VECTOR, dim=embedding_dim),\n\ + \ ]\n schema = CollectionSchema(fields, \"Kubeflow docs chunks\"\ + )\n collection = Collection(collection_name, schema)\n collection.create_index(\"\ + embedding\", {\n \"metric_type\": \"COSINE\", \"index_type\"\ + : \"HNSW\",\n \"params\": {\"M\": 16, \"efConstruction\": 200},\n\ + \ })\n logger.info(\"Created collection: %s\", collection_name)\n\ + \ else:\n collection = Collection(collection_name)\n\n collection.load()\n\ + \n chunks = []\n with open(embedded_data.path) as f:\n for\ + \ line in f:\n if line.strip():\n chunks.append(json.loads(line))\n\ + \n rows = []\n for c in chunks:\n rows.append({\n \ + \ \"chunk_id\": str(c[\"chunk_id\"])[:128],\n \"source_url\"\ + : str(c.get(\"source_url\", \"\"))[:512],\n \"page_title\": str(c.get(\"\ + page_title\", \"\"))[:256],\n \"heading\": str(c.get(\"heading\"\ + , \"\"))[:256],\n \"section\": str(c.get(\"section\", \"\"))[:128],\n\ + \ 
\"chunk_text\": str(c.get(\"chunk_text\", \"\"))[:16384],\n\ + \ \"token_count\": int(c.get(\"token_count\", 0)),\n \ + \ \"chunk_index\": int(c.get(\"chunk_index\", 0)),\n \"crawled_at\"\ + : str(c.get(\"crawled_at\", \"\"))[:64],\n \"embedding\": c[\"\ + embedding\"],\n })\n\n batch_size = 100\n inserted = 0\n \ + \ for i in range(0, len(rows), batch_size):\n batch = rows[i:i +\ + \ batch_size]\n collection.upsert(batch)\n inserted += len(batch)\n\ + \ logger.info(\"Upserted batch %d/%d\", i // batch_size + 1,\n \ + \ (len(rows) + batch_size - 1) // batch_size)\n\n collection.flush()\n\ + \ logger.info(\"Loaded %d chunks into %s. Total: %d\",\n \ + \ inserted, collection_name, collection.num_entities)\n\n" + image: python:3.11-slim + exec-parse-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - parse_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'PyYAML==6.0.1'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef parse_code(\n clone_data: Input[Dataset],\n parsed_data:\ + \ Output[Dataset],\n):\n \"\"\"Parse files into logical code chunks using\ + \ language-specific parsers.\n\n Args:\n clone_data: Input dataset\ + \ from repo cloner.\n parsed_data: Output dataset of parsed chunks.\n\ + \ \"\"\"\n import ast as pyast\n import hashlib\n import json\n\ + \ 
import logging\n import os\n import re\n\n import yaml\n\n\ + \ logging.basicConfig(level=logging.INFO)\n logger = logging.getLogger(\"\ + ast_parser\")\n\n PATH_ALIAS_HINTS = {\n \"common/istio\": [\n\ + \ \"istio\", \"service mesh\", \"gateway\", \"authorization policy\"\ + ,\n \"peer authentication\", \"virtual service\", \"sidecar\"\ + , \"envoy\", \"mtls\", \"ingress\",\n ],\n \"common/knative\"\ + : [\n \"knative\", \"serving\", \"eventing\", \"serverless\"\ + , \"scale to zero\",\n \"activator\", \"revision\", \"service\"\ + , \"net istio\", \"webhook\",\n ],\n \"common/dex\": [\n \ + \ \"dex\", \"oidc\", \"oauth2\", \"authentication\", \"identity\ + \ provider\",\n \"connector\", \"login\",\n ],\n \ + \ \"common/cert-manager\": [\n \"cert manager\", \"certificate\"\ + , \"issuer\", \"clusterissuer\",\n \"cainjector\", \"tls\", \"\ + webhook\",\n ],\n \"applications/pipeline\": [\n \ + \ \"kubeflow pipelines\", \"kfp\", \"pipeline api server\", \"deployment\"\ + ,\n \"service\", \"configmap\", \"role\", \"rolebinding\", \"\ + serviceaccount\",\n \"crd\", \"webhook\", \"scheduled workflow\"\ + ,\n ],\n \"applications/profiles\": [\n \"profiles\"\ + , \"namespaces\", \"rbac\", \"rolebinding\", \"serviceaccount\", \"user\ + \ profile\",\n ],\n \"tests\": [\"tests\", \"e2e\", \"integration\"\ + , \"validation\", \"presubmit\"],\n }\n\n def gen_id(fp, sym, idx):\n\ + \ return hashlib.sha256(f\"{fp}::{sym}::{idx}\".encode()).hexdigest()[:32]\n\ + \n def split_terms(value):\n expanded = re.sub(r\"([a-z0-9])([A-Z])\"\ + , r\"\\1 \\2\", value)\n normalized = re.sub(r\"[^A-Za-z0-9]+\",\ + \ \" \", expanded)\n return [token.lower() for token in normalized.split()\ + \ if token]\n\n def unique_terms(values, limit=24):\n seen = set()\n\ + \ ordered = []\n for value in values:\n for token\ + \ in split_terms(str(value)):\n if token not in seen:\n \ + \ seen.add(token)\n ordered.append(token)\n\ + \ if len(ordered) >= limit:\n \ + \ return ordered\n return ordered\n\n def 
summarize_list(values,\ + \ limit=8):\n if not isinstance(values, list):\n return\ + \ \"\"\n flattened = [str(item) for item in values if item]\n \ + \ return \", \".join(flattened[:limit])\n\n def get_path_aliases(fp):\n\ + \ normalized = fp.replace(\"\\\\\", \"/\").lower()\n aliases\ + \ = []\n for prefix, hints in PATH_ALIAS_HINTS.items():\n \ + \ if normalized.startswith(prefix):\n aliases.extend(hints)\n\ + \ return aliases\n\n def extract_container_names(parsed):\n \ + \ spec = parsed.get(\"spec\")\n if not isinstance(spec, dict):\n\ + \ return []\n template = spec.get(\"template\", {})\n\ + \ if isinstance(template, dict):\n template_spec = template.get(\"\ + spec\", {})\n if isinstance(template_spec, dict):\n \ + \ containers = template_spec.get(\"containers\", [])\n \ + \ if isinstance(containers, list):\n return [\n \ + \ str(container.get(\"name\"))\n \ + \ for container in containers\n if isinstance(container,\ + \ dict) and container.get(\"name\")\n ]\n job_template\ + \ = spec.get(\"jobTemplate\", {})\n if isinstance(job_template, dict):\n\ + \ nested_spec = job_template.get(\"spec\", {})\n if\ + \ isinstance(nested_spec, dict):\n nested_template = nested_spec.get(\"\ + template\", {})\n if isinstance(nested_template, dict):\n\ + \ nested_template_spec = nested_template.get(\"spec\"\ + , {})\n if isinstance(nested_template_spec, dict):\n\ + \ containers = nested_template_spec.get(\"containers\"\ + , [])\n if isinstance(containers, list):\n \ + \ return [\n str(container.get(\"\ + name\"))\n for container in containers\n\ + \ if isinstance(container, dict) and container.get(\"\ + name\")\n ]\n return []\n\n def build_manifest_context(parsed,\ + \ fp, ctx):\n metadata = parsed.get(\"metadata\", {})\n metadata\ + \ = metadata if isinstance(metadata, dict) else {}\n kind = str(parsed.get(\"\ + kind\", \"Unknown\"))\n api_version = str(parsed.get(\"apiVersion\"\ + , \"unknown\"))\n name = str(metadata.get(\"name\", \"unknown\"))\n\ + \ namespace = 
str(metadata.get(\"namespace\", \"cluster-scoped\"\ + ))\n path_terms = unique_terms([fp, os.path.basename(fp), ctx], limit=18)\n\ + \ alias_terms = unique_terms(get_path_aliases(fp), limit=18)\n \ + \ top_level_keys = summarize_list(list(parsed.keys()))\n label_keys\ + \ = summarize_list(list((metadata.get(\"labels\") or {}).keys()))\n \ + \ annotation_keys = summarize_list(list((metadata.get(\"annotations\"\ + ) or {}).keys()))\n\n lines = [\n f\"Manifest file path:\ + \ {fp}\",\n f\"Folder context: {ctx}\",\n f\"Resource\ + \ kind: {kind}\",\n f\"API version: {api_version}\",\n \ + \ f\"Metadata name: {name}\",\n f\"Namespace: {namespace}\"\ + ,\n ]\n if path_terms:\n lines.append(f\"Path hints:\ + \ {' '.join(path_terms)}\")\n if alias_terms:\n lines.append(f\"\ + Domain hints: {' '.join(alias_terms)}\")\n if top_level_keys:\n \ + \ lines.append(f\"Top-level keys: {top_level_keys}\")\n \ + \ if label_keys:\n lines.append(f\"Label keys: {label_keys}\"\ + )\n if annotation_keys:\n lines.append(f\"Annotation keys:\ + \ {annotation_keys}\")\n\n spec = parsed.get(\"spec\")\n spec\ + \ = spec if isinstance(spec, dict) else {}\n\n if kind.lower() ==\ + \ \"kustomization\" or os.path.basename(fp).lower() == \"kustomization.yaml\"\ + :\n resources = summarize_list(parsed.get(\"resources\"))\n \ + \ components = summarize_list(parsed.get(\"components\"))\n \ + \ bases = summarize_list(parsed.get(\"bases\"))\n patches\ + \ = summarize_list(parsed.get(\"patchesStrategicMerge\"))\n if\ + \ resources:\n lines.append(f\"Kustomize resources: {resources}\"\ + )\n if components:\n lines.append(f\"Kustomize\ + \ components: {components}\")\n if bases:\n lines.append(f\"\ + Kustomize bases: {bases}\")\n if patches:\n lines.append(f\"\ + Kustomize patches: {patches}\")\n\n if kind in {\"Deployment\", \"\ + StatefulSet\", \"DaemonSet\", \"Job\", \"CronJob\"}:\n container_names\ + \ = summarize_list(extract_container_names(parsed))\n service_account\ + \ = 
spec.get(\"serviceAccountName\")\n if not service_account\ + \ and isinstance(spec.get(\"template\"), dict):\n template_spec\ + \ = spec.get(\"template\", {}).get(\"spec\", {})\n if isinstance(template_spec,\ + \ dict):\n service_account = template_spec.get(\"serviceAccountName\"\ + )\n if container_names:\n lines.append(f\"Workload\ + \ containers: {container_names}\")\n if service_account:\n \ + \ lines.append(f\"Service account: {service_account}\")\n\n\ + \ if kind == \"Service\":\n service_type = spec.get(\"\ + type\")\n selector = spec.get(\"selector\")\n ports\ + \ = spec.get(\"ports\")\n if service_type:\n lines.append(f\"\ + Service type: {service_type}\")\n if isinstance(selector, dict)\ + \ and selector:\n lines.append(f\"Service selector keys:\ + \ {', '.join(list(selector.keys())[:8])}\")\n if isinstance(ports,\ + \ list) and ports:\n port_values = [str(port.get('port'))\ + \ for port in ports if isinstance(port, dict) and port.get('port')]\n \ + \ if port_values:\n lines.append(f\"Service\ + \ ports: {', '.join(port_values[:8])}\")\n\n if kind == \"CustomResourceDefinition\"\ + :\n names = spec.get(\"names\", {}) if isinstance(spec.get(\"\ + names\"), dict) else {}\n versions = spec.get(\"versions\", [])\n\ + \ if spec.get(\"group\"):\n lines.append(f\"CRD\ + \ group: {spec.get('group')}\")\n if names.get(\"kind\"):\n \ + \ lines.append(f\"CRD served kind: {names.get('kind')}\")\n\ + \ if isinstance(versions, list) and versions:\n \ + \ version_names = [str(version.get(\"name\")) for version in versions\ + \ if isinstance(version, dict) and version.get(\"name\")]\n \ + \ if version_names:\n lines.append(f\"CRD versions:\ + \ {', '.join(version_names[:8])}\")\n\n if kind in {\"Role\", \"\ + ClusterRole\"}:\n rules = spec.get(\"rules\", parsed.get(\"rules\"\ + ))\n if isinstance(rules, list) and rules:\n resource_names\ + \ = []\n verbs = []\n for rule in rules[:4]:\n\ + \ if isinstance(rule, dict):\n \ + \ resource_names.extend(str(item) for item in 
rule.get(\"resources\", [])[:4])\n\ + \ verbs.extend(str(item) for item in rule.get(\"\ + verbs\", [])[:4])\n if resource_names:\n \ + \ lines.append(f\"RBAC resources: {', '.join(resource_names[:10])}\"\ + )\n if verbs:\n lines.append(f\"RBAC verbs:\ + \ {', '.join(verbs[:10])}\")\n\n if kind in {\"RoleBinding\", \"\ + ClusterRoleBinding\"}:\n role_ref = parsed.get(\"roleRef\", {})\n\ + \ subjects = parsed.get(\"subjects\", [])\n if isinstance(role_ref,\ + \ dict) and role_ref.get(\"name\"):\n lines.append(f\"Binding\ + \ roleRef: {role_ref.get('name')}\")\n if isinstance(subjects,\ + \ list) and subjects:\n subject_names = [str(subject.get(\"\ + name\")) for subject in subjects if isinstance(subject, dict) and subject.get(\"\ + name\")]\n if subject_names:\n lines.append(f\"\ + Binding subjects: {', '.join(subject_names[:10])}\")\n\n if kind\ + \ in {\"AuthorizationPolicy\", \"PeerAuthentication\", \"VirtualService\"\ + , \"Gateway\", \"DestinationRule\"}:\n selector = spec.get(\"\ + selector\", {})\n if isinstance(selector, dict):\n \ + \ match_labels = selector.get(\"matchLabels\", {})\n \ + \ if isinstance(match_labels, dict) and match_labels:\n \ + \ lines.append(f\"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}\"\ + )\n gateways = spec.get(\"gateways\")\n hosts = spec.get(\"\ + hosts\")\n if isinstance(gateways, list) and gateways:\n \ + \ lines.append(f\"Istio gateways: {', '.join(str(g) for g in\ + \ gateways[:8])}\")\n if isinstance(hosts, list) and hosts:\n\ + \ lines.append(f\"Istio hosts: {', '.join(str(h) for h in\ + \ hosts[:8])}\")\n\n return \"\\n\".join(f\"# {line}\" for line in\ + \ lines if line)\n\n def parse_python(content, fp, sha, ctx):\n \ + \ chunks, lines = [], content.split(\"\\n\")\n try:\n \ + \ tree = pyast.parse(content)\n except SyntaxError:\n \ + \ return [{\"chunk_id\": gen_id(fp, \"module\", 0), \"file_path\": fp,\n\ + \ \"extension\": \".py\", \"language\": \"python\",\n\ + \ \"symbol_name\": 
os.path.basename(fp), \"chunk_text\"\ + : content,\n \"start_line\": 1, \"end_line\": len(lines),\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ idx = 0\n for node in pyast.walk(tree):\n if isinstance(node,\ + \ (pyast.FunctionDef, pyast.AsyncFunctionDef, pyast.ClassDef)):\n \ + \ sl, el = node.lineno, node.end_lineno or node.lineno\n \ + \ ct = \"\\n\".join(lines[sl - 1:el])\n tp = \"\ + class\" if isinstance(node, pyast.ClassDef) else \"function\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, node.name, idx), \"file_path\"\ + : fp,\n \"extension\": \".py\", \"language\"\ + : \"python\",\n \"symbol_name\": f\"{tp}:{node.name}\"\ + , \"chunk_text\": ct,\n \"start_line\": sl,\ + \ \"end_line\": el,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx})\n idx += 1\n if not chunks:\n\ + \ chunks.append({\"chunk_id\": gen_id(fp, \"module\", 0), \"\ + file_path\": fp,\n \"extension\": \".py\", \"\ + language\": \"python\",\n \"symbol_name\": f\"\ + module:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": len(lines),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n def parse_go(content, fp, sha, ctx):\n \ + \ pat = re.compile(r\"^(?:func\\s+(?:\\([^)]+\\)\\s+)?(\\w+)|type\\s+(\\\ + w+)\\s+struct)\\b\", re.MULTILINE)\n matches = list(pat.finditer(content))\n\ + \ if not matches:\n return [{\"chunk_id\": gen_id(fp,\ + \ \"file\", 0), \"file_path\": fp,\n \"extension\":\ + \ \".go\", \"language\": \"go\",\n \"symbol_name\":\ + \ f\"file:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": content.count(\"\\n\") + 1,\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ chunks = []\n for i, m in enumerate(matches):\n \ + \ sym = m.group(1) or m.group(2)\n s, e = m.start(), matches[i\ + \ + 1].start() if i + 1 < len(matches) else len(content)\n ct\ + \ = content[s:e].rstrip()\n sl = content[:s].count(\"\\n\") +\ + \ 1\n tp = \"struct\" if 
m.group(2) else \"func\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, sym, i), \"file_path\": fp,\n\ + \ \"extension\": \".go\", \"language\": \"go\"\ + ,\n \"symbol_name\": f\"{tp}:{sym}\", \"chunk_text\"\ + : ct,\n \"start_line\": sl, \"end_line\": sl +\ + \ ct.count(\"\\n\"),\n \"commit_sha\": sha, \"\ + folder_context\": ctx})\n return chunks\n\n def parse_yaml_file(content,\ + \ fp, sha, ctx):\n ext = os.path.splitext(fp)[1].lower()\n \ + \ docs = content.split(\"\\n---\")\n chunks = []\n for idx,\ + \ doc in enumerate(docs):\n doc = doc.strip()\n if\ + \ not doc:\n continue\n try:\n \ + \ parsed = yaml.safe_load(doc)\n except yaml.YAMLError:\n \ + \ parsed = None\n if isinstance(parsed, dict):\n\ + \ kind = parsed.get(\"kind\", \"Unknown\")\n \ + \ md = parsed.get(\"metadata\", {})\n name = md.get(\"\ + name\", \"unknown\") if isinstance(md, dict) else \"unknown\"\n \ + \ sym = f\"{kind}:{name}\"\n manifest_context = build_manifest_context(parsed,\ + \ fp, ctx)\n chunk_body = f\"{manifest_context}\\n\\n{doc}\"\ + \ if manifest_context else doc\n else:\n sym =\ + \ f\"fragment:{idx}\"\n chunk_body = doc\n pre\ + \ = \"\\n---\".join(docs[:idx])\n sl = pre.count(\"\\n\") + 1\ + \ if pre else 1\n chunks.append({\"chunk_id\": gen_id(fp, sym,\ + \ idx), \"file_path\": fp,\n \"extension\": ext,\ + \ \"language\": \"yaml\",\n \"symbol_name\": sym,\ + \ \"chunk_text\": chunk_body,\n \"start_line\"\ + : sl, \"end_line\": sl + doc.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n return chunks\ + \ or [{\"chunk_id\": gen_id(fp, \"file\", 0), \"file_path\": fp,\n \ + \ \"extension\": ext, \"language\": \"yaml\",\n \ + \ \"symbol_name\": f\"file:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1,\ + \ \"end_line\": content.count(\"\\n\") + 1,\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n\n def parse_md(content,\ + \ fp, sha, ctx):\n pat = re.compile(r\"^(#{2,3})\\s+(.+)$\", re.MULTILINE)\n\ + \ matches = 
list(pat.finditer(content))\n if not matches:\n\ + \ return [{\"chunk_id\": gen_id(fp, \"doc\", 0), \"file_path\"\ + : fp,\n \"extension\": \".md\", \"language\": \"markdown\"\ + ,\n \"symbol_name\": f\"doc:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1, \"end_line\"\ + : content.count(\"\\n\") + 1,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx}]\n chunks = []\n for i, m in enumerate(matches):\n\ + \ h = m.group(2).strip()\n s = m.start()\n \ + \ e = matches[i + 1].start() if i + 1 < len(matches) else len(content)\n\ + \ text = content[s:e].strip()\n sl = content[:s].count(\"\ + \\n\") + 1\n chunks.append({\"chunk_id\": gen_id(fp, h, i), \"\ + file_path\": fp,\n \"extension\": \".md\", \"\ + language\": \"markdown\",\n \"symbol_name\": f\"\ + heading:{h[:100]}\", \"chunk_text\": text,\n \"\ + start_line\": sl, \"end_line\": sl + text.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n PARSERS = {\".py\": parse_python, \".go\": parse_go,\n\ + \ \".yaml\": parse_yaml_file, \".yml\": parse_yaml_file, \"\ + .md\": parse_md}\n\n files = []\n with open(clone_data.path) as f:\n\ + \ for line in f:\n if line.strip():\n files.append(json.loads(line))\n\ + \n all_chunks = []\n for fi in files:\n parser = PARSERS.get(fi[\"\ + extension\"])\n if not parser:\n continue\n try:\n\ + \ chunks = parser(fi[\"content\"], fi[\"path\"], fi[\"commit_sha\"\ + ], fi[\"folder_context\"])\n all_chunks.extend(chunks)\n \ + \ except Exception as ex:\n logger.warning(\"Error parsing\ + \ %s: %s\", fi[\"path\"], ex)\n\n logger.info(\"Parsed %d chunks from\ + \ %d files\", len(all_chunks), len(files))\n\n with open(parsed_data.path,\ + \ \"w\") as f:\n for c in all_chunks:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim +pipelineInfo: + description: Run both docs and code ingestion pipelines in parallel + name: full-ingestion-pipeline +root: + dag: + tasks: + 
code-ingestion-pipeline: + cachingOptions: + enableCache: true + componentRef: + name: comp-code-ingestion-pipeline + inputs: + parameters: + branch: + componentInputParameter: code_branch + collection_name: + runtimeValue: + constant: code_collection + embedding_dim: + componentInputParameter: embedding_dim + embedding_model: + componentInputParameter: embedding_model + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + repo_url: + componentInputParameter: code_repo_url + taskInfo: + name: code-ingestion-pipeline + docs-ingestion-pipeline: + cachingOptions: + enableCache: true + componentRef: + name: comp-docs-ingestion-pipeline + inputs: + parameters: + base_url: + componentInputParameter: docs_base_url + chunk_overlap: + componentInputParameter: chunk_overlap + chunk_size: + componentInputParameter: chunk_size + collection_name: + runtimeValue: + constant: docs_collection + crawl_delay: + componentInputParameter: docs_crawl_delay + embedding_dim: + componentInputParameter: embedding_dim + embedding_model: + componentInputParameter: embedding_model + max_pages: + componentInputParameter: docs_max_pages + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + taskInfo: + name: docs-ingestion-pipeline + inputDefinitions: + parameters: + chunk_overlap: + defaultValue: 50.0 + isOptional: true + parameterType: NUMBER_INTEGER + chunk_size: + defaultValue: 500.0 + isOptional: true + parameterType: NUMBER_INTEGER + code_branch: + defaultValue: master + isOptional: true + parameterType: STRING + code_repo_url: + defaultValue: https://github.com/kubeflow/manifests + isOptional: true + parameterType: STRING + docs_base_url: + defaultValue: https://www.kubeflow.org + isOptional: true + parameterType: STRING + docs_crawl_delay: + defaultValue: 1.0 + isOptional: true + parameterType: NUMBER_DOUBLE + docs_max_pages: + defaultValue: 0.0 + isOptional: true + 
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["gitpython==3.1.43"],
)
def clone_repo(
    repo_url: str,
    branch: str,
    clone_data: Output[Dataset],
):
    """Clone a git repository and collect file metadata.

    Shallow-clones ``repo_url`` at ``branch``, walks the tree for source
    files (.py/.go/.yaml/.yml/.md within a size window), and writes one
    JSON record per file (path, extension, size, top-level folder, file
    content, commit SHA) to ``clone_data`` as JSONL.

    Args:
        repo_url: Repository URL to clone.
        branch: Branch name to clone.
        clone_data: Output dataset artifact (JSONL, one file per line).
    """
    import json
    import logging
    import os
    import shutil
    import subprocess
    import tempfile

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("repo_cloner")

    SKIP_DIRS = {".git", "__pycache__", "node_modules", ".tox", ".mypy_cache"}
    EXTENSIONS = {".py", ".go", ".yaml", ".yml", ".md"}
    MIN_SIZE, MAX_SIZE = 200, 100_000  # bytes; skip trivial and huge files

    clone_dir = tempfile.mkdtemp(prefix="code-ingest-")
    logger.info("Cloning %s -> %s", repo_url, clone_dir)

    # Fix: run all work under try/finally so the temporary clone directory
    # is removed even when the clone or a read raises (previously the
    # cleanup sat at the bottom of the function and leaked on any error).
    try:
        subprocess.run(
            ["git", "clone", "--depth", "1", "--branch", branch, repo_url, clone_dir],
            check=True, capture_output=True, text=True,
        )

        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True, text=True, cwd=clone_dir, check=True,
        )
        commit_sha = result.stdout.strip()
        logger.info("Commit: %s", commit_sha[:12])

        files = []
        for root, dirs, fnames in os.walk(clone_dir):
            # Prune hidden and tooling dirs in place so os.walk never descends.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS and not d.startswith(".")]
            for fn in fnames:
                fp = os.path.join(root, fn)
                rel = os.path.relpath(fp, clone_dir)
                _, ext = os.path.splitext(fn)
                if ext.lower() not in EXTENSIONS:
                    continue
                try:
                    sz = os.path.getsize(fp)
                except OSError:
                    continue
                if sz < MIN_SIZE or sz > MAX_SIZE:
                    continue
                parts = rel.split(os.sep)
                folder = parts[0] if len(parts) > 1 else "root"
                files.append({"path": rel, "extension": ext.lower(),
                              "size_bytes": sz, "folder_context": folder})

        logger.info("Collected %d files", len(files))

        # Save file list + contents; unreadable files are skipped silently
        # (best-effort ingestion, same as before).
        output = []
        for f in files:
            full = os.path.join(clone_dir, f["path"])
            try:
                with open(full, "r", encoding="utf-8", errors="replace") as fh:
                    content = fh.read()
            except Exception:
                continue
            output.append({**f, "content": content, "commit_sha": commit_sha})

        with open(clone_data.path, "w") as fh:
            for item in output:
                fh.write(json.dumps(item, ensure_ascii=False) + "\n")
    finally:
        shutil.rmtree(clone_dir, ignore_errors=True)
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["PyYAML==6.0.1"],
)
def parse_code(
    clone_data: Input[Dataset],
    parsed_data: Output[Dataset],
):
    """Parse files into logical code chunks using language-specific parsers.

    Dispatches on file extension: Python (AST defs/classes), Go (regex over
    func/struct declarations), YAML (per-document, with a synthesized
    manifest-context comment header), Markdown (##/### headings). Files
    that yield no symbols fall back to a single whole-file chunk.

    Args:
        clone_data: Input dataset from the repo cloner (JSONL, one file per line).
        parsed_data: Output dataset of parsed chunks (JSONL, one chunk per line).
    """
    import ast as pyast
    import hashlib
    import json
    import logging
    import os
    import re

    import yaml

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("ast_parser")

    # Domain vocabulary injected for manifests under well-known paths, to
    # improve recall for natural-language queries against embedded chunks.
    PATH_ALIAS_HINTS = {
        "common/istio": [
            "istio", "service mesh", "gateway", "authorization policy",
            "peer authentication", "virtual service", "sidecar", "envoy", "mtls", "ingress",
        ],
        "common/knative": [
            "knative", "serving", "eventing", "serverless", "scale to zero",
            "activator", "revision", "service", "net istio", "webhook",
        ],
        "common/dex": [
            "dex", "oidc", "oauth2", "authentication", "identity provider",
            "connector", "login",
        ],
        "common/cert-manager": [
            "cert manager", "certificate", "issuer", "clusterissuer",
            "cainjector", "tls", "webhook",
        ],
        "applications/pipeline": [
            "kubeflow pipelines", "kfp", "pipeline api server", "deployment",
            "service", "configmap", "role", "rolebinding", "serviceaccount",
            "crd", "webhook", "scheduled workflow",
        ],
        "applications/profiles": [
            "profiles", "namespaces", "rbac", "rolebinding", "serviceaccount", "user profile",
        ],
        "tests": ["tests", "e2e", "integration", "validation", "presubmit"],
    }

    def gen_id(fp, sym, idx):
        """Stable 32-hex chunk id derived from path, symbol, and index."""
        return hashlib.sha256(f"{fp}::{sym}::{idx}".encode()).hexdigest()[:32]

    def whole_file_chunk(content, fp, sha, ctx, ext, language, id_seed, symbol):
        """Single fallback chunk covering the whole file.

        Fix: this dict was previously copy-pasted in all four parsers;
        one helper keeps the field set consistent.
        """
        return {"chunk_id": gen_id(fp, id_seed, 0), "file_path": fp,
                "extension": ext, "language": language,
                "symbol_name": symbol, "chunk_text": content,
                "start_line": 1, "end_line": content.count("\n") + 1,
                "commit_sha": sha, "folder_context": ctx}

    def split_terms(value):
        """Split camelCase / punctuated text into lower-case word tokens."""
        expanded = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", value)
        normalized = re.sub(r"[^A-Za-z0-9]+", " ", expanded)
        return [token.lower() for token in normalized.split() if token]

    def unique_terms(values, limit=24):
        """First-seen-order unique tokens across all values, capped at limit."""
        seen = set()
        ordered = []
        for value in values:
            for token in split_terms(str(value)):
                if token not in seen:
                    seen.add(token)
                    ordered.append(token)
                    if len(ordered) >= limit:
                        return ordered
        return ordered

    def summarize_list(values, limit=8):
        """Comma-joined preview of up to `limit` truthy items; '' if not a list."""
        if not isinstance(values, list):
            return ""
        flattened = [str(item) for item in values if item]
        return ", ".join(flattened[:limit])

    def get_path_aliases(fp):
        """Domain hint terms for files under PATH_ALIAS_HINTS prefixes."""
        normalized = fp.replace("\\", "/").lower()
        aliases = []
        for prefix, hints in PATH_ALIAS_HINTS.items():
            if normalized.startswith(prefix):
                aliases.extend(hints)
        return aliases

    def extract_container_names(parsed):
        """Container names from spec.template or spec.jobTemplate (CronJob)."""
        spec = parsed.get("spec")
        if not isinstance(spec, dict):
            return []
        template = spec.get("template", {})
        if isinstance(template, dict):
            template_spec = template.get("spec", {})
            if isinstance(template_spec, dict):
                containers = template_spec.get("containers", [])
                if isinstance(containers, list):
                    return [
                        str(container.get("name"))
                        for container in containers
                        if isinstance(container, dict) and container.get("name")
                    ]
        job_template = spec.get("jobTemplate", {})
        if isinstance(job_template, dict):
            nested_spec = job_template.get("spec", {})
            if isinstance(nested_spec, dict):
                nested_template = nested_spec.get("template", {})
                if isinstance(nested_template, dict):
                    nested_template_spec = nested_template.get("spec", {})
                    if isinstance(nested_template_spec, dict):
                        containers = nested_template_spec.get("containers", [])
                        if isinstance(containers, list):
                            return [
                                str(container.get("name"))
                                for container in containers
                                if isinstance(container, dict) and container.get("name")
                            ]
        return []

    def build_manifest_context(parsed, fp, ctx):
        """Render a '# key: value' comment header describing a K8s manifest.

        Every line is prefixed with '# ' so the header can be prepended to
        the YAML document without breaking later re-parsing.
        """
        metadata = parsed.get("metadata", {})
        metadata = metadata if isinstance(metadata, dict) else {}
        kind = str(parsed.get("kind", "Unknown"))
        api_version = str(parsed.get("apiVersion", "unknown"))
        name = str(metadata.get("name", "unknown"))
        namespace = str(metadata.get("namespace", "cluster-scoped"))
        path_terms = unique_terms([fp, os.path.basename(fp), ctx], limit=18)
        alias_terms = unique_terms(get_path_aliases(fp), limit=18)
        top_level_keys = summarize_list(list(parsed.keys()))
        label_keys = summarize_list(list((metadata.get("labels") or {}).keys()))
        annotation_keys = summarize_list(list((metadata.get("annotations") or {}).keys()))

        lines = [
            f"Manifest file path: {fp}",
            f"Folder context: {ctx}",
            f"Resource kind: {kind}",
            f"API version: {api_version}",
            f"Metadata name: {name}",
            f"Namespace: {namespace}",
        ]
        if path_terms:
            lines.append(f"Path hints: {' '.join(path_terms)}")
        if alias_terms:
            lines.append(f"Domain hints: {' '.join(alias_terms)}")
        if top_level_keys:
            lines.append(f"Top-level keys: {top_level_keys}")
        if label_keys:
            lines.append(f"Label keys: {label_keys}")
        if annotation_keys:
            lines.append(f"Annotation keys: {annotation_keys}")

        spec = parsed.get("spec")
        spec = spec if isinstance(spec, dict) else {}

        if kind.lower() == "kustomization" or os.path.basename(fp).lower() == "kustomization.yaml":
            resources = summarize_list(parsed.get("resources"))
            components = summarize_list(parsed.get("components"))
            bases = summarize_list(parsed.get("bases"))
            patches = summarize_list(parsed.get("patchesStrategicMerge"))
            if resources:
                lines.append(f"Kustomize resources: {resources}")
            if components:
                lines.append(f"Kustomize components: {components}")
            if bases:
                lines.append(f"Kustomize bases: {bases}")
            if patches:
                lines.append(f"Kustomize patches: {patches}")

        if kind in {"Deployment", "StatefulSet", "DaemonSet", "Job", "CronJob"}:
            container_names = summarize_list(extract_container_names(parsed))
            service_account = spec.get("serviceAccountName")
            if not service_account and isinstance(spec.get("template"), dict):
                template_spec = spec.get("template", {}).get("spec", {})
                if isinstance(template_spec, dict):
                    service_account = template_spec.get("serviceAccountName")
            if container_names:
                lines.append(f"Workload containers: {container_names}")
            if service_account:
                lines.append(f"Service account: {service_account}")

        if kind == "Service":
            service_type = spec.get("type")
            selector = spec.get("selector")
            ports = spec.get("ports")
            if service_type:
                lines.append(f"Service type: {service_type}")
            if isinstance(selector, dict) and selector:
                lines.append(f"Service selector keys: {', '.join(list(selector.keys())[:8])}")
            if isinstance(ports, list) and ports:
                port_values = [str(port.get('port')) for port in ports if isinstance(port, dict) and port.get('port')]
                if port_values:
                    lines.append(f"Service ports: {', '.join(port_values[:8])}")

        if kind == "CustomResourceDefinition":
            names = spec.get("names", {}) if isinstance(spec.get("names"), dict) else {}
            versions = spec.get("versions", [])
            if spec.get("group"):
                lines.append(f"CRD group: {spec.get('group')}")
            if names.get("kind"):
                lines.append(f"CRD served kind: {names.get('kind')}")
            if isinstance(versions, list) and versions:
                version_names = [str(version.get("name")) for version in versions if isinstance(version, dict) and version.get("name")]
                if version_names:
                    lines.append(f"CRD versions: {', '.join(version_names[:8])}")

        if kind in {"Role", "ClusterRole"}:
            # Rules live at top level for RBAC kinds; fall back accordingly.
            rules = spec.get("rules", parsed.get("rules"))
            if isinstance(rules, list) and rules:
                resource_names = []
                verbs = []
                for rule in rules[:4]:
                    if isinstance(rule, dict):
                        resource_names.extend(str(item) for item in rule.get("resources", [])[:4])
                        verbs.extend(str(item) for item in rule.get("verbs", [])[:4])
                if resource_names:
                    lines.append(f"RBAC resources: {', '.join(resource_names[:10])}")
                if verbs:
                    lines.append(f"RBAC verbs: {', '.join(verbs[:10])}")

        if kind in {"RoleBinding", "ClusterRoleBinding"}:
            role_ref = parsed.get("roleRef", {})
            subjects = parsed.get("subjects", [])
            if isinstance(role_ref, dict) and role_ref.get("name"):
                lines.append(f"Binding roleRef: {role_ref.get('name')}")
            if isinstance(subjects, list) and subjects:
                subject_names = [str(subject.get("name")) for subject in subjects if isinstance(subject, dict) and subject.get("name")]
                if subject_names:
                    lines.append(f"Binding subjects: {', '.join(subject_names[:10])}")

        if kind in {"AuthorizationPolicy", "PeerAuthentication", "VirtualService", "Gateway", "DestinationRule"}:
            selector = spec.get("selector", {})
            if isinstance(selector, dict):
                match_labels = selector.get("matchLabels", {})
                if isinstance(match_labels, dict) and match_labels:
                    lines.append(f"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}")
            gateways = spec.get("gateways")
            hosts = spec.get("hosts")
            if isinstance(gateways, list) and gateways:
                lines.append(f"Istio gateways: {', '.join(str(g) for g in gateways[:8])}")
            if isinstance(hosts, list) and hosts:
                lines.append(f"Istio hosts: {', '.join(str(h) for h in hosts[:8])}")

        return "\n".join(f"# {line}" for line in lines if line)

    def parse_python(content, fp, sha, ctx):
        """One chunk per top-level/nested def, async def, or class."""
        chunks, lines = [], content.split("\n")
        try:
            tree = pyast.parse(content)
        except SyntaxError:
            # Fix: use the same "module:" symbol prefix as the no-symbols
            # fallback below (previously the bare basename, inconsistent).
            return [whole_file_chunk(content, fp, sha, ctx, ".py", "python",
                                     "module", f"module:{os.path.basename(fp)}")]
        idx = 0
        for node in pyast.walk(tree):
            if isinstance(node, (pyast.FunctionDef, pyast.AsyncFunctionDef, pyast.ClassDef)):
                sl, el = node.lineno, node.end_lineno or node.lineno
                ct = "\n".join(lines[sl - 1:el])
                tp = "class" if isinstance(node, pyast.ClassDef) else "function"
                chunks.append({"chunk_id": gen_id(fp, node.name, idx), "file_path": fp,
                               "extension": ".py", "language": "python",
                               "symbol_name": f"{tp}:{node.name}", "chunk_text": ct,
                               "start_line": sl, "end_line": el,
                               "commit_sha": sha, "folder_context": ctx})
                idx += 1
        if not chunks:
            chunks.append(whole_file_chunk(content, fp, sha, ctx, ".py", "python",
                                           "module", f"module:{os.path.basename(fp)}"))
        return chunks

    def parse_go(content, fp, sha, ctx):
        """One chunk per top-level func or struct type declaration (regex)."""
        pat = re.compile(r"^(?:func\s+(?:\([^)]+\)\s+)?(\w+)|type\s+(\w+)\s+struct)\b", re.MULTILINE)
        matches = list(pat.finditer(content))
        if not matches:
            return [whole_file_chunk(content, fp, sha, ctx, ".go", "go",
                                     "file", f"file:{os.path.basename(fp)}")]
        chunks = []
        for i, m in enumerate(matches):
            sym = m.group(1) or m.group(2)
            # Each chunk runs from its declaration to the next declaration.
            s, e = m.start(), matches[i + 1].start() if i + 1 < len(matches) else len(content)
            ct = content[s:e].rstrip()
            sl = content[:s].count("\n") + 1
            tp = "struct" if m.group(2) else "func"
            chunks.append({"chunk_id": gen_id(fp, sym, i), "file_path": fp,
                           "extension": ".go", "language": "go",
                           "symbol_name": f"{tp}:{sym}", "chunk_text": ct,
                           "start_line": sl, "end_line": sl + ct.count("\n"),
                           "commit_sha": sha, "folder_context": ctx})
        return chunks

    def parse_yaml_file(content, fp, sha, ctx):
        """One chunk per YAML document, with a manifest-context header."""
        ext = os.path.splitext(fp)[1].lower()
        docs = content.split("\n---")
        chunks = []
        for idx, doc in enumerate(docs):
            doc = doc.strip()
            if not doc:
                continue
            try:
                parsed = yaml.safe_load(doc)
            except yaml.YAMLError:
                parsed = None
            if isinstance(parsed, dict):
                kind = parsed.get("kind", "Unknown")
                md = parsed.get("metadata", {})
                name = md.get("name", "unknown") if isinstance(md, dict) else "unknown"
                sym = f"{kind}:{name}"
                manifest_context = build_manifest_context(parsed, fp, ctx)
                chunk_body = f"{manifest_context}\n\n{doc}" if manifest_context else doc
            else:
                sym = f"fragment:{idx}"
                chunk_body = doc
            if idx == 0:
                sl = 1
            else:
                pre = "\n---".join(docs[:idx])
                # Fix: +1 for the lines in `pre` plus +1 for the `---`
                # separator line itself (previously off by one for every
                # document after the first).
                sl = pre.count("\n") + 2
            chunks.append({"chunk_id": gen_id(fp, sym, idx), "file_path": fp,
                           "extension": ext, "language": "yaml",
                           "symbol_name": sym, "chunk_text": chunk_body,
                           "start_line": sl, "end_line": sl + doc.count("\n"),
                           "commit_sha": sha, "folder_context": ctx})
        return chunks or [whole_file_chunk(content, fp, sha, ctx, ext, "yaml",
                                           "file", f"file:{os.path.basename(fp)}")]

    def parse_md(content, fp, sha, ctx):
        """One chunk per ##/### heading section."""
        pat = re.compile(r"^(#{2,3})\s+(.+)$", re.MULTILINE)
        matches = list(pat.finditer(content))
        if not matches:
            return [whole_file_chunk(content, fp, sha, ctx, ".md", "markdown",
                                     "doc", f"doc:{os.path.basename(fp)}")]
        chunks = []
        for i, m in enumerate(matches):
            h = m.group(2).strip()
            s = m.start()
            e = matches[i + 1].start() if i + 1 < len(matches) else len(content)
            text = content[s:e].strip()
            sl = content[:s].count("\n") + 1
            chunks.append({"chunk_id": gen_id(fp, h, i), "file_path": fp,
                           "extension": ".md", "language": "markdown",
                           "symbol_name": f"heading:{h[:100]}", "chunk_text": text,
                           "start_line": sl, "end_line": sl + text.count("\n"),
                           "commit_sha": sha, "folder_context": ctx})
        return chunks

    PARSERS = {".py": parse_python, ".go": parse_go,
               ".yaml": parse_yaml_file, ".yml": parse_yaml_file, ".md": parse_md}

    files = []
    with open(clone_data.path) as f:
        for line in f:
            if line.strip():
                files.append(json.loads(line))

    all_chunks = []
    for fi in files:
        parser = PARSERS.get(fi["extension"])
        if not parser:
            continue
        try:
            chunks = parser(fi["content"], fi["path"], fi["commit_sha"], fi["folder_context"])
            all_chunks.extend(chunks)
        except Exception as ex:
            # Best-effort: a single unparseable file must not kill the run.
            logger.warning("Error parsing %s: %s", fi["path"], ex)

    logger.info("Parsed %d chunks from %d files", len(all_chunks), len(files))

    with open(parsed_data.path, "w") as f:
        for c in all_chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["tiktoken==0.7.0"],
)
def chunk_code(
    parsed_data: Input[Dataset],
    chunked_data: Output[Dataset],
):
    """Post-process parsed chunks with token limits and context headers.

    Prepends a context header (file / symbol / language / folder plus
    lower-cased path hints) to each chunk, drops chunks under MIN_T tokens,
    and splits chunks over MAX_T tokens on blank lines. Also populates
    ``chunk_index`` (per-file ordinal), which the loader stores.

    Args:
        parsed_data: Input dataset of parsed chunks (JSONL).
        chunked_data: Output dataset of token-bounded chunks (JSONL).
    """
    import hashlib
    import json
    import logging
    import re

    import tiktoken

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("chunker")

    enc = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text):
        """Token count under the cl100k_base encoding."""
        return len(enc.encode(text))

    MIN_T, MAX_T = 50, 512  # token bounds per emitted chunk

    def build_path_hints(chunk):
        """Lower-cased searchable terms from path/folder/symbol."""
        raw = " ".join(str(chunk.get(key, "")) for key in ("file_path", "folder_context", "symbol_name"))
        expanded = raw.replace("/", " ").replace("_", " ").replace("-", " ")
        # Insert a space at every lower->upper boundary (camelCase -> camel Case).
        expanded = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", expanded)
        return " ".join(expanded.split()).lower()

    raw = []
    with open(parsed_data.path) as f:
        for line in f:
            if line.strip():
                raw.append(json.loads(line))

    per_file_index = {}  # file_path -> next chunk ordinal within that file

    def assign_index(chunk):
        """Fix: populate chunk_index — it was never set, so the loader and
        Milvus schema always stored 0 and intra-file ordering was lost."""
        fp = chunk.get("file_path", "")
        chunk["chunk_index"] = per_file_index.get(fp, 0)
        per_file_index[fp] = chunk["chunk_index"] + 1

    processed = []
    for chunk in raw:
        header = (
            f"# File: {chunk.get('file_path', '?')} | Symbol: {chunk.get('symbol_name', '?')} "
            f"| Lang: {chunk.get('language', '?')} | Folder: {chunk.get('folder_context', '?')}"
        )
        path_hints = build_path_hints(chunk)
        if path_hints:
            header = f"{header}\n# Path Hints: {path_hints}"
        full = f"{header}\n\n{chunk['chunk_text']}"
        tc = count_tokens(full)

        if tc < MIN_T:
            continue

        if tc <= MAX_T:
            chunk["chunk_text"] = full[:8192]  # hard cap to the Milvus VARCHAR length
            chunk["token_count"] = tc
            assign_index(chunk)
            processed.append(chunk)
        else:
            # Split oversized chunks on blank lines. NOTE: a single paragraph
            # longer than MAX_T is kept whole (best effort, not re-split).
            parts = full.split("\n\n")
            cur, subs = "", []
            for p in parts:
                cand = cur + "\n\n" + p if cur else p
                if count_tokens(cand) > MAX_T:
                    if cur.strip():
                        subs.append(cur.strip())
                    cur = p
                else:
                    cur = cand
            if cur.strip():
                subs.append(cur.strip())

            for si, sub in enumerate(subs):
                st = count_tokens(sub)
                if st < MIN_T:
                    continue
                sc = chunk.copy()
                # Derive a new stable id from the parent id + sub index.
                sc["chunk_id"] = hashlib.sha256(f"{chunk['chunk_id']}::{si}".encode()).hexdigest()[:32]
                sc["chunk_text"] = sub[:8192]
                sc["token_count"] = st
                assign_index(sc)
                processed.append(sc)

    logger.info("Chunked %d -> %d chunks", len(raw), len(processed))

    with open(chunked_data.path, "w") as f:
        for c in processed:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["sentence-transformers==2.7.0", "torch==2.3.0"],
)
def embed_code(
    chunked_data: Input[Dataset],
    embedding_model: str,
    embedded_data: Output[Dataset],
):
    """Embed code chunks using a configurable SentenceTransformer model.

    Args:
        chunked_data: Input dataset of chunked code (JSONL).
        embedding_model: Model name for embeddings.
        embedded_data: Output dataset with an "embedding" list added per chunk.
    """
    import json
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("embedder")

    chunks = []
    with open(chunked_data.path) as f:
        for line in f:
            if line.strip():
                chunks.append(json.loads(line))

    # Fix: short-circuit on empty input so we never download/load the
    # (heavy) model just to embed nothing; still emit a valid empty output.
    if not chunks:
        logger.warning("No chunks to embed; writing empty output")
        open(embedded_data.path, "w").close()
        return

    from sentence_transformers import SentenceTransformer

    logger.info("Embedding %d code chunks with %s", len(chunks), embedding_model)
    model = SentenceTransformer(embedding_model)

    texts = [c["chunk_text"] for c in chunks]
    bs = 32  # batch size; bounds peak memory during encoding
    all_embs = []
    for i in range(0, len(texts), bs):
        batch = texts[i:i + bs]
        embs = model.encode(batch, show_progress_bar=False)
        all_embs.extend([e.tolist() for e in embs])
        logger.info("Batch %d/%d", i // bs + 1, (len(texts) + bs - 1) // bs)

    for c, e in zip(chunks, all_embs):
        c["embedding"] = e

    with open(embedded_data.path, "w") as f:
        for c in chunks:
            f.write(json.dumps(c, ensure_ascii=False) + "\n")
@dsl.component(
    base_image="python:3.11-slim",
    packages_to_install=["pymilvus==2.4.0"],
)
def load_code(
    embedded_data: Input[Dataset],
    milvus_host: str,
    milvus_port: str,
    collection_name: str,
    embedding_dim: int,
):
    """Load embedded code chunks into the Milvus code collection.

    Creates the collection (with an HNSW/COSINE index) on first use, then
    upserts all chunks in batches. Field values are truncated to the schema's
    VARCHAR limits before insertion.

    Args:
        embedded_data: Input dataset with embedded chunks (JSONL).
        milvus_host: Milvus server host.
        milvus_port: Milvus server port.
        collection_name: Target collection name.
        embedding_dim: Vector dimension (used only when creating the collection).
    """
    import json
    import logging

    from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                          connections, utility)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("loader")

    connections.connect("default", host=milvus_host, port=milvus_port)

    if not utility.has_collection(collection_name):
        # Field lengths mirror the truncation applied when building rows below.
        fields = [
            FieldSchema("chunk_id", DataType.VARCHAR, max_length=128, is_primary=True),
            FieldSchema("file_path", DataType.VARCHAR, max_length=512),
            FieldSchema("extension", DataType.VARCHAR, max_length=16),
            FieldSchema("language", DataType.VARCHAR, max_length=32),
            FieldSchema("symbol_name", DataType.VARCHAR, max_length=256),
            FieldSchema("folder_context", DataType.VARCHAR, max_length=128),
            FieldSchema("chunk_text", DataType.VARCHAR, max_length=8192),
            FieldSchema("start_line", DataType.INT64),
            FieldSchema("end_line", DataType.INT64),
            FieldSchema("commit_sha", DataType.VARCHAR, max_length=64),
            FieldSchema("chunk_index", DataType.INT64),
            FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=embedding_dim),
        ]
        schema = CollectionSchema(fields, "Kubeflow manifests code chunks")
        collection = Collection(collection_name, schema)
        collection.create_index("embedding", {
            "metric_type": "COSINE", "index_type": "HNSW",
            "params": {"M": 16, "efConstruction": 200},
        })
    else:
        collection = Collection(collection_name)

    collection.load()

    chunks = []
    with open(embedded_data.path) as f:
        for line in f:
            if line.strip():
                chunks.append(json.loads(line))

    # Fix: make the empty-input case explicit instead of falling through
    # to a zero-iteration upsert loop.
    if not chunks:
        logger.warning("No chunks to load into %s", collection_name)
        return

    rows = []
    for c in chunks:
        rows.append({
            "chunk_id": str(c["chunk_id"])[:128],
            "file_path": str(c.get("file_path", ""))[:512],
            "extension": str(c.get("extension", ""))[:16],
            "language": str(c.get("language", ""))[:32],
            "symbol_name": str(c.get("symbol_name", ""))[:256],
            "folder_context": str(c.get("folder_context", ""))[:128],
            "chunk_text": str(c.get("chunk_text", ""))[:8192],
            "start_line": int(c.get("start_line", 0)),
            "end_line": int(c.get("end_line", 0)),
            "commit_sha": str(c.get("commit_sha", ""))[:64],
            "chunk_index": int(c.get("chunk_index", 0)),
            "embedding": c["embedding"],
        })

    bs = 100  # upsert batch size
    inserted = 0
    for i in range(0, len(rows), bs):
        batch = rows[i:i + bs]
        # Upsert (not insert) so re-runs with the same chunk_ids are idempotent.
        collection.upsert(batch)
        inserted += len(batch)

    collection.flush()
    logger.info("Loaded %d chunks into %s. Total: %d",
                inserted, collection_name, collection.num_entities)
"end_line": int(c.get("end_line", 0)), + "commit_sha": str(c.get("commit_sha", ""))[:64], + "chunk_index": int(c.get("chunk_index", 0)), + "embedding": c["embedding"], + }) + + bs = 100 + inserted = 0 + for i in range(0, len(rows), bs): + batch = rows[i:i + bs] + collection.upsert(batch) + inserted += len(batch) + + collection.flush() + logger.info("Loaded %d chunks into %s. Total: %d", + inserted, collection_name, collection.num_entities) + + +# ─── Pipeline Definition ──────────────────────────────────────────────────── + +@dsl.pipeline( + name="code-ingestion-pipeline", + description="Clone kubeflow/manifests, parse code by language, embed, and load into Milvus", +) +def code_ingestion_pipeline( + repo_url: str = "https://github.com/kubeflow/manifests", + branch: str = "master", + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + milvus_host: str = "localhost", + milvus_port: str = "19530", + collection_name: str = "code_collection", + embedding_dim: int = 384, +): + """Full code ingestion pipeline: clone -> parse -> chunk -> embed -> load.""" + + clone_task = clone_repo(repo_url=repo_url, branch=branch) + clone_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + parse_task = parse_code(clone_data=clone_task.outputs["clone_data"]) + parse_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + chunk_task = chunk_code(parsed_data=parse_task.outputs["parsed_data"]) + chunk_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + embed_task = embed_code( + chunked_data=chunk_task.outputs["chunked_data"], + embedding_model=embedding_model, + ) + embed_task.set_retry(num_retries=3, backoff_duration="30s", backoff_factor=2.0) + + load_task = load_code( + embedded_data=embed_task.outputs["embedded_data"], + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name=collection_name, + embedding_dim=embedding_dim, + ) + load_task.set_retry(num_retries=3, 
backoff_duration="30s", backoff_factor=2.0) + + +# ─── Parent Pipeline (Composes Both) ──────────────────────────────────────── + +if docs_ingestion_pipeline is not None: + @dsl.pipeline( + name="full-ingestion-pipeline", + description="Run both docs and code ingestion pipelines in parallel", + ) + def full_ingestion_pipeline( + # Docs params + docs_base_url: str = "https://www.kubeflow.org", + docs_crawl_delay: float = 1.0, + docs_max_pages: int = 0, + chunk_size: int = 500, + chunk_overlap: int = 50, + # Code params + code_repo_url: str = "https://github.com/kubeflow/manifests", + code_branch: str = "master", + # Shared params + embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", + milvus_host: str = "localhost", + milvus_port: str = "19530", + embedding_dim: int = 384, + ): + """Parent pipeline that runs docs + code ingestion in parallel.""" + docs_ingestion_pipeline( + base_url=docs_base_url, + crawl_delay=docs_crawl_delay, + max_pages=docs_max_pages, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + embedding_model=embedding_model, + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name="docs_collection", + embedding_dim=embedding_dim, + ) + code_ingestion_pipeline( + repo_url=code_repo_url, + branch=code_branch, + embedding_model=embedding_model, + milvus_host=milvus_host, + milvus_port=milvus_port, + collection_name="code_collection", + embedding_dim=embedding_dim, + ) +else: + full_ingestion_pipeline = None + + + + +# ─── Main ──────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + if "--local" in sys.argv: + print("Running code ingestion pipeline locally...") + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) + from pipelines.code_ingestion.components.repo_cloner import clone_repo as do_clone + from pipelines.code_ingestion.components.ast_parser import parse_all_files + from pipelines.code_ingestion.components.chunker import process_chunks + from 
pipelines.code_ingestion.components.embedder import embed_code_chunks + from pipelines.code_ingestion.components.loader import load_to_milvus + import logging, shutil + logging.basicConfig(level=logging.INFO) + + result = do_clone() + chunks = parse_all_files(result["repo_dir"], result["file_list"], result["commit_sha"]) + processed = process_chunks(chunks) + embedded = embed_code_chunks(processed) + summary = load_to_milvus(embedded) + print(f"Pipeline complete: {summary}") + shutil.rmtree(result["repo_dir"], ignore_errors=True) + else: + output_path = os.path.join(os.path.dirname(__file__), "pipeline.yaml") + kfp.compiler.Compiler().compile( + pipeline_func=code_ingestion_pipeline, + package_path=output_path, + ) + print(f"Compiled code ingestion pipeline to: {output_path}") + + if full_ingestion_pipeline is not None: + full_output_path = os.path.join( + os.path.dirname(__file__), + "full_pipeline.yaml", + ) + kfp.compiler.Compiler().compile( + pipeline_func=full_ingestion_pipeline, + package_path=full_output_path, + ) + print(f"Compiled full ingestion pipeline to: {full_output_path}") + else: + print( + "Skipped full ingestion pipeline compilation because the docs " + "pipeline import was unavailable." 
+ ) diff --git a/pipelines/code_ingestion/pipeline.yaml b/pipelines/code_ingestion/pipeline.yaml new file mode 100644 index 0000000..97e4a7e --- /dev/null +++ b/pipelines/code_ingestion/pipeline.yaml @@ -0,0 +1,790 @@ +# PIPELINE DEFINITION +# Name: code-ingestion-pipeline +# Description: Clone kubeflow/manifests, parse code by language, embed, and load into Milvus +# Inputs: +# branch: str [Default: 'master'] +# collection_name: str [Default: 'code_collection'] +# embedding_dim: int [Default: 384.0] +# embedding_model: str [Default: 'sentence-transformers/all-MiniLM-L6-v2'] +# milvus_host: str [Default: 'localhost'] +# milvus_port: str [Default: '19530'] +# repo_url: str [Default: 'https://github.com/kubeflow/manifests'] +components: + comp-chunk-code: + executorLabel: exec-chunk-code + inputDefinitions: + artifacts: + parsed_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of parsed chunks. + outputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-clone-repo: + executorLabel: exec-clone-repo + inputDefinitions: + parameters: + branch: + description: Branch name to clone. + parameterType: STRING + repo_url: + description: Repository URL to clone. + parameterType: STRING + outputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-embed-code: + executorLabel: exec-embed-code + inputDefinitions: + artifacts: + chunked_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset of chunked code. + parameters: + embedding_model: + description: Model name for embeddings. 
+ parameterType: STRING + outputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + comp-load-code: + executorLabel: exec-load-code + inputDefinitions: + artifacts: + embedded_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset with embedded chunks. + parameters: + collection_name: + description: Target collection name. + parameterType: STRING + embedding_dim: + description: Vector dimension. + parameterType: NUMBER_INTEGER + milvus_host: + description: Milvus server host. + parameterType: STRING + milvus_port: + description: Milvus server port. + parameterType: STRING + comp-parse-code: + executorLabel: exec-parse-code + inputDefinitions: + artifacts: + clone_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 + description: Input dataset from repo cloner. + outputDefinitions: + artifacts: + parsed_data: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-chunk-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - chunk_code + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'tiktoken==0.7.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef chunk_code(\n parsed_data: Input[Dataset],\n chunked_data:\ + \ Output[Dataset],\n):\n \"\"\"Post-process parsed chunks with token\ + \ limits and context headers.\n\n Args:\n parsed_data: Input dataset\ + \ of parsed chunks.\n chunked_data: Output dataset of token-bounded\ + \ chunks.\n \"\"\"\n import hashlib\n import json\n import logging\n\ + \n import tiktoken\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"chunker\")\n\n enc = tiktoken.get_encoding(\"\ + cl100k_base\")\n count = lambda t: len(enc.encode(t))\n\n MIN_T, MAX_T\ + \ = 50, 512\n\n def build_path_hints(chunk):\n raw = \" \".join(str(chunk.get(key,\ + \ \"\")) for key in (\"file_path\", \"folder_context\", \"symbol_name\"\ + ))\n expanded = raw.replace(\"/\", \" \").replace(\"_\", \" \").replace(\"\ + -\", \" \")\n expanded = \"\".join(\n (\n \ + \ f\" {char}\" if index > 0 and char.isupper() and expanded[index - 1].islower()\ + \ else char\n )\n for index, char in enumerate(expanded)\n\ + \ )\n return \" \".join(expanded.split()).lower()\n\n raw\ + \ = []\n with open(parsed_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n raw.append(json.loads(line))\n\ + \n processed = []\n for chunk in raw:\n header = (\n \ + \ 
f\"# File: {chunk.get('file_path', '?')} | Symbol: {chunk.get('symbol_name',\ + \ '?')} \"\n f\"| Lang: {chunk.get('language', '?')} | Folder:\ + \ {chunk.get('folder_context', '?')}\"\n )\n path_hints =\ + \ build_path_hints(chunk)\n if path_hints:\n header =\ + \ f\"{header}\\n# Path Hints: {path_hints}\"\n full = f\"{header}\\\ + n\\n{chunk['chunk_text']}\"\n tc = count(full)\n\n if tc <\ + \ MIN_T:\n continue\n\n if tc <= MAX_T:\n chunk[\"\ + chunk_text\"] = full[:8192]\n chunk[\"token_count\"] = tc\n \ + \ processed.append(chunk)\n else:\n # Split\ + \ oversized\n parts = full.split(\"\\n\\n\")\n cur,\ + \ subs = \"\", []\n for p in parts:\n cand = cur\ + \ + \"\\n\\n\" + p if cur else p\n if count(cand) > MAX_T:\n\ + \ if cur.strip():\n subs.append(cur.strip())\n\ + \ cur = p\n else:\n \ + \ cur = cand\n if cur.strip():\n subs.append(cur.strip())\n\ + \n for si, sub in enumerate(subs):\n st = count(sub)\n\ + \ if st < MIN_T:\n continue\n \ + \ sc = chunk.copy()\n sc[\"chunk_id\"] = hashlib.sha256(f\"\ + {chunk['chunk_id']}::{si}\".encode()).hexdigest()[:32]\n \ + \ sc[\"chunk_text\"] = sub[:8192]\n sc[\"token_count\"] =\ + \ st\n processed.append(sc)\n\n logger.info(\"Chunked\ + \ %d -> %d chunks\", len(raw), len(processed))\n\n with open(chunked_data.path,\ + \ \"w\") as f:\n for c in processed:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim + exec-clone-repo: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - clone_repo + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'gitpython==3.1.43'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef clone_repo(\n repo_url: str,\n branch: str,\n clone_data:\ + \ Output[Dataset],\n):\n \"\"\"Clone a git repository and collect file\ + \ metadata.\n\n Args:\n repo_url: Repository URL to clone.\n \ + \ branch: Branch name to clone.\n clone_data: Output dataset\ + \ artifact.\n \"\"\"\n import json\n import logging\n import\ + \ os\n import subprocess\n import tempfile\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"repo_cloner\")\n\n SKIP_DIRS = {\"\ + .git\", \"__pycache__\", \"node_modules\", \".tox\", \".mypy_cache\"}\n\ + \ EXTENSIONS = {\".py\", \".go\", \".yaml\", \".yml\", \".md\"}\n \ + \ MIN_SIZE, MAX_SIZE = 200, 100_000\n\n clone_dir = tempfile.mkdtemp(prefix=\"\ + code-ingest-\")\n logger.info(\"Cloning %s -> %s\", repo_url, clone_dir)\n\ + \n subprocess.run(\n [\"git\", \"clone\", \"--depth\", \"1\",\ + \ \"--branch\", branch, repo_url, clone_dir],\n check=True, capture_output=True,\ + \ text=True,\n )\n\n result = subprocess.run(\n [\"git\", \"\ + rev-parse\", \"HEAD\"],\n capture_output=True, text=True, cwd=clone_dir,\ + \ check=True,\n )\n commit_sha = result.stdout.strip()\n logger.info(\"\ + Commit: %s\", commit_sha[:12])\n\n files = []\n for root, dirs, fnames\ + \ in 
os.walk(clone_dir):\n dirs[:] = [d for d in dirs if d not in\ + \ SKIP_DIRS and not d.startswith(\".\")]\n for fn in fnames:\n \ + \ fp = os.path.join(root, fn)\n rel = os.path.relpath(fp,\ + \ clone_dir)\n _, ext = os.path.splitext(fn)\n if\ + \ ext.lower() not in EXTENSIONS:\n continue\n \ + \ try:\n sz = os.path.getsize(fp)\n except OSError:\n\ + \ continue\n if sz < MIN_SIZE or sz > MAX_SIZE:\n\ + \ continue\n parts = rel.split(os.sep)\n \ + \ folder = parts[0] if len(parts) > 1 else \"root\"\n files.append({\"\ + path\": rel, \"extension\": ext.lower(),\n \"size_bytes\"\ + : sz, \"folder_context\": folder})\n\n logger.info(\"Collected %d files\"\ + , len(files))\n\n # Save file list + contents\n output = []\n for\ + \ f in files:\n full = os.path.join(clone_dir, f[\"path\"])\n \ + \ try:\n with open(full, \"r\", encoding=\"utf-8\", errors=\"\ + replace\") as fh:\n content = fh.read()\n except Exception:\n\ + \ continue\n output.append({**f, \"content\": content,\ + \ \"commit_sha\": commit_sha})\n\n with open(clone_data.path, \"w\")\ + \ as fh:\n for item in output:\n fh.write(json.dumps(item,\ + \ ensure_ascii=False) + \"\\n\")\n\n # Cleanup\n import shutil\n \ + \ shutil.rmtree(clone_dir, ignore_errors=True)\n\n" + image: python:3.11-slim + exec-embed-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - embed_code + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'sentence-transformers==2.7.0'\ + \ 'torch==2.3.0' && python3 -m pip install --quiet --no-warn-script-location\ + \ 'kfp==2.16.0' '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"\ + 3.9\"' && \"$0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef embed_code(\n chunked_data: Input[Dataset],\n embedding_model:\ + \ str,\n embedded_data: Output[Dataset],\n):\n \"\"\"Embed code chunks\ + \ using configurable model.\n\n Args:\n chunked_data: Input dataset\ + \ of chunked code.\n embedding_model: Model name for embeddings.\n\ + \ embedded_data: Output dataset with embeddings.\n \"\"\"\n \ + \ import json\n import logging\n\n from sentence_transformers import\ + \ SentenceTransformer\n\n logging.basicConfig(level=logging.INFO)\n \ + \ logger = logging.getLogger(\"embedder\")\n\n chunks = []\n with\ + \ open(chunked_data.path) as f:\n for line in f:\n if\ + \ line.strip():\n chunks.append(json.loads(line))\n\n \ + \ logger.info(\"Embedding %d code chunks with %s\", len(chunks), embedding_model)\n\ + \ model = SentenceTransformer(embedding_model)\n\n texts = [c[\"chunk_text\"\ + ] for c in chunks]\n bs = 32\n all_embs = []\n for i in range(0,\ + \ len(texts), bs):\n batch = texts[i:i + bs]\n embs = model.encode(batch,\ + \ show_progress_bar=False)\n all_embs.extend([e.tolist() for e in\ + \ embs])\n logger.info(\"Batch %d/%d\", i // bs + 1, (len(texts)\ + \ + bs - 1) // bs)\n\n for c, e in zip(chunks, all_embs):\n c[\"\ + embedding\"] = e\n\n with 
open(embedded_data.path, \"w\") as f:\n \ + \ for c in chunks:\n f.write(json.dumps(c, ensure_ascii=False)\ + \ + \"\\n\")\n\n" + image: python:3.11-slim + exec-load-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - load_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'pymilvus==2.4.0'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef load_code(\n embedded_data: Input[Dataset],\n milvus_host:\ + \ str,\n milvus_port: str,\n collection_name: str,\n embedding_dim:\ + \ int,\n):\n \"\"\"Load embedded code chunks into Milvus code_collection.\n\ + \n Args:\n embedded_data: Input dataset with embedded chunks.\n\ + \ milvus_host: Milvus server host.\n milvus_port: Milvus server\ + \ port.\n collection_name: Target collection name.\n embedding_dim:\ + \ Vector dimension.\n \"\"\"\n import json\n import logging\n\n\ + \ from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,\n\ + \ connections, utility)\n\n logging.basicConfig(level=logging.INFO)\n\ + \ logger = logging.getLogger(\"loader\")\n\n connections.connect(\"\ + default\", host=milvus_host, port=milvus_port)\n\n if not utility.has_collection(collection_name):\n\ + \ fields = [\n FieldSchema(\"chunk_id\", DataType.VARCHAR,\ + \ max_length=128, is_primary=True),\n FieldSchema(\"file_path\"\ + 
, DataType.VARCHAR, max_length=512),\n FieldSchema(\"extension\"\ + , DataType.VARCHAR, max_length=16),\n FieldSchema(\"language\"\ + , DataType.VARCHAR, max_length=32),\n FieldSchema(\"symbol_name\"\ + , DataType.VARCHAR, max_length=256),\n FieldSchema(\"folder_context\"\ + , DataType.VARCHAR, max_length=128),\n FieldSchema(\"chunk_text\"\ + , DataType.VARCHAR, max_length=8192),\n FieldSchema(\"start_line\"\ + , DataType.INT64),\n FieldSchema(\"end_line\", DataType.INT64),\n\ + \ FieldSchema(\"commit_sha\", DataType.VARCHAR, max_length=64),\n\ + \ FieldSchema(\"embedding\", DataType.FLOAT_VECTOR, dim=embedding_dim),\n\ + \ ]\n schema = CollectionSchema(fields, \"Kubeflow manifests\ + \ code chunks\")\n collection = Collection(collection_name, schema)\n\ + \ collection.create_index(\"embedding\", {\n \"metric_type\"\ + : \"COSINE\", \"index_type\": \"HNSW\",\n \"params\": {\"M\"\ + : 16, \"efConstruction\": 200},\n })\n else:\n collection\ + \ = Collection(collection_name)\n\n collection.load()\n\n chunks =\ + \ []\n with open(embedded_data.path) as f:\n for line in f:\n\ + \ if line.strip():\n chunks.append(json.loads(line))\n\ + \n rows = []\n for c in chunks:\n rows.append({\n \ + \ \"chunk_id\": str(c[\"chunk_id\"])[:128],\n \"file_path\"\ + : str(c.get(\"file_path\", \"\"))[:512],\n \"extension\": str(c.get(\"\ + extension\", \"\"))[:16],\n \"language\": str(c.get(\"language\"\ + , \"\"))[:32],\n \"symbol_name\": str(c.get(\"symbol_name\",\ + \ \"\"))[:256],\n \"folder_context\": str(c.get(\"folder_context\"\ + , \"\"))[:128],\n \"chunk_text\": str(c.get(\"chunk_text\", \"\ + \"))[:8192],\n \"start_line\": int(c.get(\"start_line\", 0)),\n\ + \ \"end_line\": int(c.get(\"end_line\", 0)),\n \"\ + commit_sha\": str(c.get(\"commit_sha\", \"\"))[:64],\n \"embedding\"\ + : c[\"embedding\"],\n })\n\n bs = 100\n inserted = 0\n for\ + \ i in range(0, len(rows), bs):\n batch = rows[i:i + bs]\n \ + \ collection.upsert(batch)\n inserted += len(batch)\n\n 
collection.flush()\n\ + \ logger.info(\"Loaded %d chunks into %s. Total: %d\",\n \ + \ inserted, collection_name, collection.num_entities)\n\n" + image: python:3.11-slim + exec-parse-code: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - parse_code + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'PyYAML==6.0.1'\ + \ && python3 -m pip install --quiet --no-warn-script-location 'kfp==2.16.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef parse_code(\n clone_data: Input[Dataset],\n parsed_data:\ + \ Output[Dataset],\n):\n \"\"\"Parse files into logical code chunks using\ + \ language-specific parsers.\n\n Args:\n clone_data: Input dataset\ + \ from repo cloner.\n parsed_data: Output dataset of parsed chunks.\n\ + \ \"\"\"\n import ast as pyast\n import hashlib\n import json\n\ + \ import logging\n import os\n import re\n\n import yaml\n\n\ + \ logging.basicConfig(level=logging.INFO)\n logger = logging.getLogger(\"\ + ast_parser\")\n\n PATH_ALIAS_HINTS = {\n \"common/istio\": [\n\ + \ \"istio\", \"service mesh\", \"gateway\", \"authorization policy\"\ + ,\n \"peer authentication\", \"virtual service\", \"sidecar\"\ + , \"envoy\", \"mtls\", \"ingress\",\n ],\n \"common/knative\"\ + : [\n \"knative\", \"serving\", \"eventing\", \"serverless\"\ + , \"scale to zero\",\n \"activator\", \"revision\", \"service\"\ + , \"net istio\", \"webhook\",\n ],\n 
\"common/dex\": [\n \ + \ \"dex\", \"oidc\", \"oauth2\", \"authentication\", \"identity\ + \ provider\",\n \"connector\", \"login\",\n ],\n \ + \ \"common/cert-manager\": [\n \"cert manager\", \"certificate\"\ + , \"issuer\", \"clusterissuer\",\n \"cainjector\", \"tls\", \"\ + webhook\",\n ],\n \"applications/pipeline\": [\n \ + \ \"kubeflow pipelines\", \"kfp\", \"pipeline api server\", \"deployment\"\ + ,\n \"service\", \"configmap\", \"role\", \"rolebinding\", \"\ + serviceaccount\",\n \"crd\", \"webhook\", \"scheduled workflow\"\ + ,\n ],\n \"applications/profiles\": [\n \"profiles\"\ + , \"namespaces\", \"rbac\", \"rolebinding\", \"serviceaccount\", \"user\ + \ profile\",\n ],\n \"tests\": [\"tests\", \"e2e\", \"integration\"\ + , \"validation\", \"presubmit\"],\n }\n\n def gen_id(fp, sym, idx):\n\ + \ return hashlib.sha256(f\"{fp}::{sym}::{idx}\".encode()).hexdigest()[:32]\n\ + \n def split_terms(value):\n expanded = re.sub(r\"([a-z0-9])([A-Z])\"\ + , r\"\\1 \\2\", value)\n normalized = re.sub(r\"[^A-Za-z0-9]+\",\ + \ \" \", expanded)\n return [token.lower() for token in normalized.split()\ + \ if token]\n\n def unique_terms(values, limit=24):\n seen = set()\n\ + \ ordered = []\n for value in values:\n for token\ + \ in split_terms(str(value)):\n if token not in seen:\n \ + \ seen.add(token)\n ordered.append(token)\n\ + \ if len(ordered) >= limit:\n \ + \ return ordered\n return ordered\n\n def summarize_list(values,\ + \ limit=8):\n if not isinstance(values, list):\n return\ + \ \"\"\n flattened = [str(item) for item in values if item]\n \ + \ return \", \".join(flattened[:limit])\n\n def get_path_aliases(fp):\n\ + \ normalized = fp.replace(\"\\\\\", \"/\").lower()\n aliases\ + \ = []\n for prefix, hints in PATH_ALIAS_HINTS.items():\n \ + \ if normalized.startswith(prefix):\n aliases.extend(hints)\n\ + \ return aliases\n\n def extract_container_names(parsed):\n \ + \ spec = parsed.get(\"spec\")\n if not isinstance(spec, dict):\n\ + \ return []\n template = 
spec.get(\"template\", {})\n\ + \ if isinstance(template, dict):\n template_spec = template.get(\"\ + spec\", {})\n if isinstance(template_spec, dict):\n \ + \ containers = template_spec.get(\"containers\", [])\n \ + \ if isinstance(containers, list):\n return [\n \ + \ str(container.get(\"name\"))\n \ + \ for container in containers\n if isinstance(container,\ + \ dict) and container.get(\"name\")\n ]\n job_template\ + \ = spec.get(\"jobTemplate\", {})\n if isinstance(job_template, dict):\n\ + \ nested_spec = job_template.get(\"spec\", {})\n if\ + \ isinstance(nested_spec, dict):\n nested_template = nested_spec.get(\"\ + template\", {})\n if isinstance(nested_template, dict):\n\ + \ nested_template_spec = nested_template.get(\"spec\"\ + , {})\n if isinstance(nested_template_spec, dict):\n\ + \ containers = nested_template_spec.get(\"containers\"\ + , [])\n if isinstance(containers, list):\n \ + \ return [\n str(container.get(\"\ + name\"))\n for container in containers\n\ + \ if isinstance(container, dict) and container.get(\"\ + name\")\n ]\n return []\n\n def build_manifest_context(parsed,\ + \ fp, ctx):\n metadata = parsed.get(\"metadata\", {})\n metadata\ + \ = metadata if isinstance(metadata, dict) else {}\n kind = str(parsed.get(\"\ + kind\", \"Unknown\"))\n api_version = str(parsed.get(\"apiVersion\"\ + , \"unknown\"))\n name = str(metadata.get(\"name\", \"unknown\"))\n\ + \ namespace = str(metadata.get(\"namespace\", \"cluster-scoped\"\ + ))\n path_terms = unique_terms([fp, os.path.basename(fp), ctx], limit=18)\n\ + \ alias_terms = unique_terms(get_path_aliases(fp), limit=18)\n \ + \ top_level_keys = summarize_list(list(parsed.keys()))\n label_keys\ + \ = summarize_list(list((metadata.get(\"labels\") or {}).keys()))\n \ + \ annotation_keys = summarize_list(list((metadata.get(\"annotations\"\ + ) or {}).keys()))\n\n lines = [\n f\"Manifest file path:\ + \ {fp}\",\n f\"Folder context: {ctx}\",\n f\"Resource\ + \ kind: {kind}\",\n f\"API version: 
{api_version}\",\n \ + \ f\"Metadata name: {name}\",\n f\"Namespace: {namespace}\"\ + ,\n ]\n if path_terms:\n lines.append(f\"Path hints:\ + \ {' '.join(path_terms)}\")\n if alias_terms:\n lines.append(f\"\ + Domain hints: {' '.join(alias_terms)}\")\n if top_level_keys:\n \ + \ lines.append(f\"Top-level keys: {top_level_keys}\")\n \ + \ if label_keys:\n lines.append(f\"Label keys: {label_keys}\"\ + )\n if annotation_keys:\n lines.append(f\"Annotation keys:\ + \ {annotation_keys}\")\n\n spec = parsed.get(\"spec\")\n spec\ + \ = spec if isinstance(spec, dict) else {}\n\n if kind.lower() ==\ + \ \"kustomization\" or os.path.basename(fp).lower() == \"kustomization.yaml\"\ + :\n resources = summarize_list(parsed.get(\"resources\"))\n \ + \ components = summarize_list(parsed.get(\"components\"))\n \ + \ bases = summarize_list(parsed.get(\"bases\"))\n patches\ + \ = summarize_list(parsed.get(\"patchesStrategicMerge\"))\n if\ + \ resources:\n lines.append(f\"Kustomize resources: {resources}\"\ + )\n if components:\n lines.append(f\"Kustomize\ + \ components: {components}\")\n if bases:\n lines.append(f\"\ + Kustomize bases: {bases}\")\n if patches:\n lines.append(f\"\ + Kustomize patches: {patches}\")\n\n if kind in {\"Deployment\", \"\ + StatefulSet\", \"DaemonSet\", \"Job\", \"CronJob\"}:\n container_names\ + \ = summarize_list(extract_container_names(parsed))\n service_account\ + \ = spec.get(\"serviceAccountName\")\n if not service_account\ + \ and isinstance(spec.get(\"template\"), dict):\n template_spec\ + \ = spec.get(\"template\", {}).get(\"spec\", {})\n if isinstance(template_spec,\ + \ dict):\n service_account = template_spec.get(\"serviceAccountName\"\ + )\n if container_names:\n lines.append(f\"Workload\ + \ containers: {container_names}\")\n if service_account:\n \ + \ lines.append(f\"Service account: {service_account}\")\n\n\ + \ if kind == \"Service\":\n service_type = spec.get(\"\ + type\")\n selector = spec.get(\"selector\")\n ports\ + \ = 
spec.get(\"ports\")\n if service_type:\n lines.append(f\"\ + Service type: {service_type}\")\n if isinstance(selector, dict)\ + \ and selector:\n lines.append(f\"Service selector keys:\ + \ {', '.join(list(selector.keys())[:8])}\")\n if isinstance(ports,\ + \ list) and ports:\n port_values = [str(port.get('port'))\ + \ for port in ports if isinstance(port, dict) and port.get('port')]\n \ + \ if port_values:\n lines.append(f\"Service\ + \ ports: {', '.join(port_values[:8])}\")\n\n if kind == \"CustomResourceDefinition\"\ + :\n names = spec.get(\"names\", {}) if isinstance(spec.get(\"\ + names\"), dict) else {}\n versions = spec.get(\"versions\", [])\n\ + \ if spec.get(\"group\"):\n lines.append(f\"CRD\ + \ group: {spec.get('group')}\")\n if names.get(\"kind\"):\n \ + \ lines.append(f\"CRD served kind: {names.get('kind')}\")\n\ + \ if isinstance(versions, list) and versions:\n \ + \ version_names = [str(version.get(\"name\")) for version in versions\ + \ if isinstance(version, dict) and version.get(\"name\")]\n \ + \ if version_names:\n lines.append(f\"CRD versions:\ + \ {', '.join(version_names[:8])}\")\n\n if kind in {\"Role\", \"\ + ClusterRole\"}:\n rules = spec.get(\"rules\", parsed.get(\"rules\"\ + ))\n if isinstance(rules, list) and rules:\n resource_names\ + \ = []\n verbs = []\n for rule in rules[:4]:\n\ + \ if isinstance(rule, dict):\n \ + \ resource_names.extend(str(item) for item in rule.get(\"resources\", [])[:4])\n\ + \ verbs.extend(str(item) for item in rule.get(\"\ + verbs\", [])[:4])\n if resource_names:\n \ + \ lines.append(f\"RBAC resources: {', '.join(resource_names[:10])}\"\ + )\n if verbs:\n lines.append(f\"RBAC verbs:\ + \ {', '.join(verbs[:10])}\")\n\n if kind in {\"RoleBinding\", \"\ + ClusterRoleBinding\"}:\n role_ref = parsed.get(\"roleRef\", {})\n\ + \ subjects = parsed.get(\"subjects\", [])\n if isinstance(role_ref,\ + \ dict) and role_ref.get(\"name\"):\n lines.append(f\"Binding\ + \ roleRef: {role_ref.get('name')}\")\n if 
isinstance(subjects,\ + \ list) and subjects:\n subject_names = [str(subject.get(\"\ + name\")) for subject in subjects if isinstance(subject, dict) and subject.get(\"\ + name\")]\n if subject_names:\n lines.append(f\"\ + Binding subjects: {', '.join(subject_names[:10])}\")\n\n if kind\ + \ in {\"AuthorizationPolicy\", \"PeerAuthentication\", \"VirtualService\"\ + , \"Gateway\", \"DestinationRule\"}:\n selector = spec.get(\"\ + selector\", {})\n if isinstance(selector, dict):\n \ + \ match_labels = selector.get(\"matchLabels\", {})\n \ + \ if isinstance(match_labels, dict) and match_labels:\n \ + \ lines.append(f\"Istio selector labels: {', '.join(list(match_labels.keys())[:8])}\"\ + )\n gateways = spec.get(\"gateways\")\n hosts = spec.get(\"\ + hosts\")\n if isinstance(gateways, list) and gateways:\n \ + \ lines.append(f\"Istio gateways: {', '.join(str(g) for g in\ + \ gateways[:8])}\")\n if isinstance(hosts, list) and hosts:\n\ + \ lines.append(f\"Istio hosts: {', '.join(str(h) for h in\ + \ hosts[:8])}\")\n\n return \"\\n\".join(f\"# {line}\" for line in\ + \ lines if line)\n\n def parse_python(content, fp, sha, ctx):\n \ + \ chunks, lines = [], content.split(\"\\n\")\n try:\n \ + \ tree = pyast.parse(content)\n except SyntaxError:\n \ + \ return [{\"chunk_id\": gen_id(fp, \"module\", 0), \"file_path\": fp,\n\ + \ \"extension\": \".py\", \"language\": \"python\",\n\ + \ \"symbol_name\": os.path.basename(fp), \"chunk_text\"\ + : content,\n \"start_line\": 1, \"end_line\": len(lines),\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ idx = 0\n for node in pyast.walk(tree):\n if isinstance(node,\ + \ (pyast.FunctionDef, pyast.AsyncFunctionDef, pyast.ClassDef)):\n \ + \ sl, el = node.lineno, node.end_lineno or node.lineno\n \ + \ ct = \"\\n\".join(lines[sl - 1:el])\n tp = \"\ + class\" if isinstance(node, pyast.ClassDef) else \"function\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, node.name, idx), \"file_path\"\ + : fp,\n \"extension\": \".py\", 
\"language\"\ + : \"python\",\n \"symbol_name\": f\"{tp}:{node.name}\"\ + , \"chunk_text\": ct,\n \"start_line\": sl,\ + \ \"end_line\": el,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx})\n idx += 1\n if not chunks:\n\ + \ chunks.append({\"chunk_id\": gen_id(fp, \"module\", 0), \"\ + file_path\": fp,\n \"extension\": \".py\", \"\ + language\": \"python\",\n \"symbol_name\": f\"\ + module:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": len(lines),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n def parse_go(content, fp, sha, ctx):\n \ + \ pat = re.compile(r\"^(?:func\\s+(?:\\([^)]+\\)\\s+)?(\\w+)|type\\s+(\\\ + w+)\\s+struct)\\b\", re.MULTILINE)\n matches = list(pat.finditer(content))\n\ + \ if not matches:\n return [{\"chunk_id\": gen_id(fp,\ + \ \"file\", 0), \"file_path\": fp,\n \"extension\":\ + \ \".go\", \"language\": \"go\",\n \"symbol_name\":\ + \ f\"file:{os.path.basename(fp)}\", \"chunk_text\": content,\n \ + \ \"start_line\": 1, \"end_line\": content.count(\"\\n\") + 1,\n\ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n \ + \ chunks = []\n for i, m in enumerate(matches):\n \ + \ sym = m.group(1) or m.group(2)\n s, e = m.start(), matches[i\ + \ + 1].start() if i + 1 < len(matches) else len(content)\n ct\ + \ = content[s:e].rstrip()\n sl = content[:s].count(\"\\n\") +\ + \ 1\n tp = \"struct\" if m.group(2) else \"func\"\n \ + \ chunks.append({\"chunk_id\": gen_id(fp, sym, i), \"file_path\": fp,\n\ + \ \"extension\": \".go\", \"language\": \"go\"\ + ,\n \"symbol_name\": f\"{tp}:{sym}\", \"chunk_text\"\ + : ct,\n \"start_line\": sl, \"end_line\": sl +\ + \ ct.count(\"\\n\"),\n \"commit_sha\": sha, \"\ + folder_context\": ctx})\n return chunks\n\n def parse_yaml_file(content,\ + \ fp, sha, ctx):\n ext = os.path.splitext(fp)[1].lower()\n \ + \ docs = content.split(\"\\n---\")\n chunks = []\n for idx,\ + \ doc in enumerate(docs):\n doc = doc.strip()\n if\ + \ not doc:\n 
continue\n try:\n \ + \ parsed = yaml.safe_load(doc)\n except yaml.YAMLError:\n \ + \ parsed = None\n if isinstance(parsed, dict):\n\ + \ kind = parsed.get(\"kind\", \"Unknown\")\n \ + \ md = parsed.get(\"metadata\", {})\n name = md.get(\"\ + name\", \"unknown\") if isinstance(md, dict) else \"unknown\"\n \ + \ sym = f\"{kind}:{name}\"\n manifest_context = build_manifest_context(parsed,\ + \ fp, ctx)\n chunk_body = f\"{manifest_context}\\n\\n{doc}\"\ + \ if manifest_context else doc\n else:\n sym =\ + \ f\"fragment:{idx}\"\n chunk_body = doc\n pre\ + \ = \"\\n---\".join(docs[:idx])\n sl = pre.count(\"\\n\") + 1\ + \ if pre else 1\n chunks.append({\"chunk_id\": gen_id(fp, sym,\ + \ idx), \"file_path\": fp,\n \"extension\": ext,\ + \ \"language\": \"yaml\",\n \"symbol_name\": sym,\ + \ \"chunk_text\": chunk_body,\n \"start_line\"\ + : sl, \"end_line\": sl + doc.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n return chunks\ + \ or [{\"chunk_id\": gen_id(fp, \"file\", 0), \"file_path\": fp,\n \ + \ \"extension\": ext, \"language\": \"yaml\",\n \ + \ \"symbol_name\": f\"file:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1,\ + \ \"end_line\": content.count(\"\\n\") + 1,\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx}]\n\n def parse_md(content,\ + \ fp, sha, ctx):\n pat = re.compile(r\"^(#{2,3})\\s+(.+)$\", re.MULTILINE)\n\ + \ matches = list(pat.finditer(content))\n if not matches:\n\ + \ return [{\"chunk_id\": gen_id(fp, \"doc\", 0), \"file_path\"\ + : fp,\n \"extension\": \".md\", \"language\": \"markdown\"\ + ,\n \"symbol_name\": f\"doc:{os.path.basename(fp)}\"\ + , \"chunk_text\": content,\n \"start_line\": 1, \"end_line\"\ + : content.count(\"\\n\") + 1,\n \"commit_sha\": sha,\ + \ \"folder_context\": ctx}]\n chunks = []\n for i, m in enumerate(matches):\n\ + \ h = m.group(2).strip()\n s = m.start()\n \ + \ e = matches[i + 1].start() if i + 1 < len(matches) else len(content)\n\ + \ text = 
content[s:e].strip()\n sl = content[:s].count(\"\ + \\n\") + 1\n chunks.append({\"chunk_id\": gen_id(fp, h, i), \"\ + file_path\": fp,\n \"extension\": \".md\", \"\ + language\": \"markdown\",\n \"symbol_name\": f\"\ + heading:{h[:100]}\", \"chunk_text\": text,\n \"\ + start_line\": sl, \"end_line\": sl + text.count(\"\\n\"),\n \ + \ \"commit_sha\": sha, \"folder_context\": ctx})\n \ + \ return chunks\n\n PARSERS = {\".py\": parse_python, \".go\": parse_go,\n\ + \ \".yaml\": parse_yaml_file, \".yml\": parse_yaml_file, \"\ + .md\": parse_md}\n\n files = []\n with open(clone_data.path) as f:\n\ + \ for line in f:\n if line.strip():\n files.append(json.loads(line))\n\ + \n all_chunks = []\n for fi in files:\n parser = PARSERS.get(fi[\"\ + extension\"])\n if not parser:\n continue\n try:\n\ + \ chunks = parser(fi[\"content\"], fi[\"path\"], fi[\"commit_sha\"\ + ], fi[\"folder_context\"])\n all_chunks.extend(chunks)\n \ + \ except Exception as ex:\n logger.warning(\"Error parsing\ + \ %s: %s\", fi[\"path\"], ex)\n\n logger.info(\"Parsed %d chunks from\ + \ %d files\", len(all_chunks), len(files))\n\n with open(parsed_data.path,\ + \ \"w\") as f:\n for c in all_chunks:\n f.write(json.dumps(c,\ + \ ensure_ascii=False) + \"\\n\")\n\n" + image: python:3.11-slim +pipelineInfo: + description: Clone kubeflow/manifests, parse code by language, embed, and load into + Milvus + name: code-ingestion-pipeline +root: + dag: + tasks: + chunk-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-chunk-code + dependentTasks: + - parse-code + inputs: + artifacts: + parsed_data: + taskOutputArtifact: + outputArtifactKey: parsed_data + producerTask: parse-code + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: chunk-code + clone-repo: + cachingOptions: + enableCache: true + componentRef: + name: comp-clone-repo + inputs: + parameters: + branch: + componentInputParameter: branch + repo_url: + 
componentInputParameter: repo_url + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: clone-repo + embed-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-embed-code + dependentTasks: + - chunk-code + inputs: + artifacts: + chunked_data: + taskOutputArtifact: + outputArtifactKey: chunked_data + producerTask: chunk-code + parameters: + embedding_model: + componentInputParameter: embedding_model + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: embed-code + load-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-load-code + dependentTasks: + - embed-code + inputs: + artifacts: + embedded_data: + taskOutputArtifact: + outputArtifactKey: embedded_data + producerTask: embed-code + parameters: + collection_name: + componentInputParameter: collection_name + embedding_dim: + componentInputParameter: embedding_dim + milvus_host: + componentInputParameter: milvus_host + milvus_port: + componentInputParameter: milvus_port + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: load-code + parse-code: + cachingOptions: + enableCache: true + componentRef: + name: comp-parse-code + dependentTasks: + - clone-repo + inputs: + artifacts: + clone_data: + taskOutputArtifact: + outputArtifactKey: clone_data + producerTask: clone-repo + retryPolicy: + backoffDuration: 30s + backoffFactor: 2.0 + backoffMaxDuration: 3600s + maxRetryCount: 3 + taskInfo: + name: parse-code + inputDefinitions: + parameters: + branch: + defaultValue: master + isOptional: true + parameterType: STRING + collection_name: + defaultValue: code_collection + isOptional: true + parameterType: STRING + embedding_dim: + defaultValue: 384.0 + isOptional: true + parameterType: NUMBER_INTEGER + embedding_model: + defaultValue: 
sentence-transformers/all-MiniLM-L6-v2 + isOptional: true + parameterType: STRING + milvus_host: + defaultValue: localhost + isOptional: true + parameterType: STRING + milvus_port: + defaultValue: '19530' + isOptional: true + parameterType: STRING + repo_url: + defaultValue: https://github.com/kubeflow/manifests + isOptional: true + parameterType: STRING +schemaVersion: 2.1.0 +sdkVersion: kfp-2.16.0 diff --git a/pipelines/shared/__init__.py b/pipelines/shared/__init__.py new file mode 100644 index 0000000..002a1f6 --- /dev/null +++ b/pipelines/shared/__init__.py @@ -0,0 +1 @@ +# Shared utilities for docs-agent ingestion pipelines diff --git a/pipelines/shared/embedding_utils.py b/pipelines/shared/embedding_utils.py new file mode 100644 index 0000000..25771a7 --- /dev/null +++ b/pipelines/shared/embedding_utils.py @@ -0,0 +1,236 @@ +""" +Shared embedding utilities for docs-agent ingestion pipelines. + +Supports multiple embedding backends: + - sentence-transformers (local, default for development) + - openai (API-based, for production) + +Configure via environment variables: + EMBEDDING_MODEL: Model name/path (default: sentence-transformers/all-MiniLM-L6-v2) + OPENAI_API_KEY: Required only when EMBEDDING_MODEL=openai +""" + +import logging +import os +import time +from typing import List, Optional + +logger = logging.getLogger(__name__) + + +def get_embedding_model_name() -> str: + """Get the configured embedding model name from environment.""" + return os.environ.get( + "EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2" + ) + + +def get_embedding_dimension() -> int: + """Return the embedding dimension for the configured model. + + Returns: + int: Vector dimension size. 
+ """ + model_name = get_embedding_model_name() + dimension_map = { + "sentence-transformers/all-MiniLM-L6-v2": 384, + "sentence-transformers/all-mpnet-base-v2": 768, + "nomic-embed-text": 768, + "openai": 1536, + "text-embedding-3-small": 1536, + } + for key, dim in dimension_map.items(): + if key in model_name: + return dim + # Default fallback + logger.warning( + "Unknown model '%s', defaulting to 384 dimensions.", model_name + ) + return 384 + + +class EmbeddingClient: + """Unified embedding client supporting local and API-based models. + + Usage: + client = EmbeddingClient() + vectors = client.embed_batch(["hello world", "kubeflow pipelines"]) + """ + + def __init__(self, model_name: Optional[str] = None, batch_size: int = 32): + """Initialize the embedding client. + + Args: + model_name: Override for EMBEDDING_MODEL env var. + batch_size: Number of texts to embed per batch. + """ + self.model_name = model_name or get_embedding_model_name() + self.batch_size = batch_size + self._model = None + self._client = None + + logger.info("Embedding client initialized with model: %s", self.model_name) + + def _is_openai(self) -> bool: + """Check if using OpenAI API backend.""" + return "openai" in self.model_name or "text-embedding" in self.model_name + + def _load_local_model(self): + """Lazy-load the sentence-transformers model.""" + if self._model is None: + from sentence_transformers import SentenceTransformer + + model_path = self.model_name + # Strip the prefix if it's a sentence-transformers model + if "/" in model_path and not model_path.startswith("/"): + pass # Use full HuggingFace path + logger.info("Loading local model: %s", model_path) + self._model = SentenceTransformer(model_path) + logger.info("Model loaded successfully.") + return self._model + + def _get_openai_client(self): + """Lazy-initialize the OpenAI client.""" + if self._client is None: + import openai + + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise ValueError( + 
"OPENAI_API_KEY environment variable is required " + "when using OpenAI embeddings." + ) + self._client = openai.OpenAI(api_key=api_key) + logger.info("OpenAI client initialized.") + return self._client + + def embed_texts(self, texts: List[str]) -> List[List[float]]: + """Embed a list of texts with automatic batching and retry. + + Args: + texts: List of text strings to embed. + + Returns: + List of embedding vectors (list of floats). + """ + if not texts: + return [] + + all_embeddings: List[List[float]] = [] + + for i in range(0, len(texts), self.batch_size): + batch = texts[i : i + self.batch_size] + batch_num = i // self.batch_size + 1 + total_batches = (len(texts) + self.batch_size - 1) // self.batch_size + + embeddings = self._embed_batch_with_retry(batch) + all_embeddings.extend(embeddings) + + logger.info( + "Embedded batch %d/%d (%d texts)", + batch_num, + total_batches, + len(batch), + ) + + return all_embeddings + + def _embed_batch_with_retry( + self, texts: List[str], max_retries: int = 3 + ) -> List[List[float]]: + """Embed a single batch with exponential backoff retry. + + Args: + texts: Batch of texts to embed. + max_retries: Maximum number of retry attempts. + + Returns: + List of embedding vectors. + + Raises: + RuntimeError: If all retries are exhausted. + """ + for attempt in range(max_retries): + try: + if self._is_openai(): + return self._embed_openai(texts) + else: + return self._embed_local(texts) + except Exception as e: + wait_time = (2 ** attempt) + (0.1 * attempt) + logger.warning( + "Embedding failed (attempt %d/%d): %s. Retrying in %.1fs...", + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + else: + raise RuntimeError( + f"Embedding failed after {max_retries} attempts: {e}" + ) from e + return [] # unreachable, but satisfies type checker + + def _embed_local(self, texts: List[str]) -> List[List[float]]: + """Embed using local sentence-transformers model. 
+ + Args: + texts: Batch of texts to embed. + + Returns: + List of embedding vectors. + """ + model = self._load_local_model() + embeddings = model.encode(texts, show_progress_bar=False) + return [emb.tolist() for emb in embeddings] + + def _embed_openai(self, texts: List[str]) -> List[List[float]]: + """Embed using OpenAI API. + + Args: + texts: Batch of texts to embed. + + Returns: + List of embedding vectors. + """ + client = self._get_openai_client() + model_name = self.model_name + if "openai" in model_name and "text-embedding" not in model_name: + model_name = "text-embedding-3-small" + + response = client.embeddings.create(input=texts, model=model_name) + return [item.embedding for item in response.data] + + +# Convenience function +def embed_texts(texts: List[str], model_name: Optional[str] = None) -> List[List[float]]: + """Convenience function to embed texts with default settings. + + Args: + texts: List of texts to embed. + model_name: Optional model override. + + Returns: + List of embedding vectors. + """ + client = EmbeddingClient(model_name=model_name) + return client.embed_texts(texts) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + # Quick smoke test + test_texts = [ + "How to install Kubeflow on Kubernetes", + "KFP pipeline component decorator", + "Milvus vector database schema design", + ] + logger.info("Testing embedding with model: %s", get_embedding_model_name()) + logger.info("Expected dimensions: %d", get_embedding_dimension()) + + client = EmbeddingClient() + vectors = client.embed_texts(test_texts) + for i, (text, vec) in enumerate(zip(test_texts, vectors)): + logger.info("Text %d: '%s...' -> dim=%d", i, text[:40], len(vec)) diff --git a/pipelines/shared/milvus_utils.py b/pipelines/shared/milvus_utils.py new file mode 100644 index 0000000..7570ef3 --- /dev/null +++ b/pipelines/shared/milvus_utils.py @@ -0,0 +1,297 @@ +""" +Shared Milvus utilities for docs-agent ingestion pipelines. 
+ +Provides connection management, collection creation, upsert, and search +operations with retry logic and exponential backoff. + +Configure via environment variables: + MILVUS_HOST: Milvus server host (default: localhost) + MILVUS_PORT: Milvus server port (default: 19530) + MILVUS_TOKEN: Authentication token (default: empty, no auth) +""" + +import logging +import os +import time +from typing import Any, Dict, List, Optional + +from pymilvus import ( + Collection, + CollectionSchema, + DataType, + FieldSchema, + connections, + utility, +) + +logger = logging.getLogger(__name__) + + +def get_milvus_config() -> Dict[str, str]: + """Get Milvus connection configuration from environment. + + Returns: + Dict with host, port, and token. + """ + return { + "host": os.environ.get("MILVUS_HOST", "localhost"), + "port": os.environ.get("MILVUS_PORT", "19530"), + "token": os.environ.get("MILVUS_TOKEN", ""), + } + + +def connect( + alias: str = "default", + host: Optional[str] = None, + port: Optional[str] = None, + token: Optional[str] = None, + max_retries: int = 3, +) -> None: + """Connect to Milvus with retry logic. + + Args: + alias: Connection alias. + host: Override for MILVUS_HOST env var. + port: Override for MILVUS_PORT env var. + token: Override for MILVUS_TOKEN env var. + max_retries: Maximum retry attempts. + + Raises: + ConnectionError: If all retries are exhausted. + """ + config = get_milvus_config() + host = host or config["host"] + port = port or config["port"] + token = token or config["token"] + + for attempt in range(max_retries): + try: + connect_params = {"alias": alias, "host": host, "port": port} + if token: + connect_params["token"] = token + + connections.connect(**connect_params) + logger.info("Connected to Milvus at %s:%s", host, port) + return + except Exception as e: + wait_time = (2 ** attempt) + 1 + logger.warning( + "Milvus connection failed (attempt %d/%d): %s. 
Retrying in %ds...", + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + else: + raise ConnectionError( + f"Failed to connect to Milvus after {max_retries} attempts: {e}" + ) from e + + +def create_collection_if_not_exists( + collection_name: str, + fields: List[FieldSchema], + description: str = "", + index_field: str = "embedding", + index_params: Optional[Dict[str, Any]] = None, +) -> Collection: + """Create a Milvus collection if it doesn't already exist. + + Args: + collection_name: Name of the collection. + fields: List of FieldSchema objects defining the schema. + description: Collection description. + index_field: Name of the vector field to index. + index_params: Custom index parameters. Defaults to HNSW + COSINE. + + Returns: + The Milvus Collection object. + """ + if utility.has_collection(collection_name): + logger.info("Collection '%s' already exists. Loading.", collection_name) + collection = Collection(collection_name) + collection.load() + return collection + + schema = CollectionSchema(fields, description=description) + collection = Collection(name=collection_name, schema=schema) + logger.info("Created collection: %s", collection_name) + + # Default HNSW index params + if index_params is None: + index_params = { + "metric_type": "COSINE", + "index_type": "HNSW", + "params": {"M": 16, "efConstruction": 200}, + } + + collection.create_index(field_name=index_field, index_params=index_params) + logger.info( + "Created HNSW index on '%s' for collection '%s'", + index_field, + collection_name, + ) + + collection.load() + return collection + + +def upsert_batch( + collection: Collection, + rows: List[Dict[str, Any]], + batch_size: int = 100, + max_retries: int = 3, +) -> Dict[str, int]: + """Upsert rows into a Milvus collection in batches. + + Uses the primary key to handle duplicates (Milvus upsert semantics). + + Args: + collection: The target Milvus collection. 
+ rows: List of row dicts matching the collection schema. + batch_size: Number of rows per insert batch. + max_retries: Retry attempts per batch. + + Returns: + Dict with counts: inserted, failed, total. + """ + total = len(rows) + inserted = 0 + failed = 0 + + for i in range(0, total, batch_size): + batch = rows[i : i + batch_size] + batch_num = i // batch_size + 1 + total_batches = (total + batch_size - 1) // batch_size + + success = False + for attempt in range(max_retries): + try: + collection.upsert(batch) + inserted += len(batch) + success = True + logger.info( + "Upserted batch %d/%d (%d rows)", + batch_num, + total_batches, + len(batch), + ) + break + except Exception as e: + wait_time = (2 ** attempt) + 1 + logger.warning( + "Upsert failed (batch %d, attempt %d/%d): %s. " + "Retrying in %ds...", + batch_num, + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + + if not success: + failed += len(batch) + logger.error("Batch %d permanently failed after %d retries.", batch_num, max_retries) + + collection.flush() + summary = {"inserted": inserted, "failed": failed, "total": total} + logger.info("Upsert complete: %s", summary) + return summary + + +def search( + collection: Collection, + query_vector: List[float], + top_k: int = 3, + output_fields: Optional[List[str]] = None, + search_params: Optional[Dict[str, Any]] = None, + max_retries: int = 3, +) -> List[Dict[str, Any]]: + """Search a Milvus collection by vector similarity. + + Args: + collection: The Milvus collection to search. + query_vector: The query embedding vector. + top_k: Number of results to return. + output_fields: Fields to include in results. + search_params: Custom search parameters. + max_retries: Retry attempts. + + Returns: + List of result dicts with fields and distance score. 
+ """ + if search_params is None: + search_params = {"metric_type": "COSINE", "params": {"ef": 64}} + + if output_fields is None: + output_fields = ["chunk_text"] + + for attempt in range(max_retries): + try: + results = collection.search( + data=[query_vector], + anns_field="embedding", + param=search_params, + limit=top_k, + output_fields=output_fields, + ) + + hits = [] + for hit in results[0]: + hit_dict = {"id": hit.id, "distance": hit.distance} + for field in output_fields: + hit_dict[field] = hit.entity.get(field) + hits.append(hit_dict) + + logger.info("Search returned %d results.", len(hits)) + return hits + + except Exception as e: + wait_time = (2 ** attempt) + 1 + logger.warning( + "Search failed (attempt %d/%d): %s. Retrying in %ds...", + attempt + 1, + max_retries, + str(e), + wait_time, + ) + if attempt < max_retries - 1: + time.sleep(wait_time) + else: + logger.error("Search failed after %d retries: %s", max_retries, e) + return [] + + return [] + + +def drop_collection(collection_name: str) -> bool: + """Drop a collection if it exists. + + Args: + collection_name: Name of the collection to drop. + + Returns: + True if dropped, False if it didn't exist. + """ + if utility.has_collection(collection_name): + utility.drop_collection(collection_name) + logger.info("Dropped collection: %s", collection_name) + return True + logger.info("Collection '%s' does not exist. 
Nothing to drop.", collection_name) + return False + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + # Quick connection test + try: + connect() + logger.info("Milvus connection test: SUCCESS") + collections = utility.list_collections() + logger.info("Existing collections: %s", collections) + except ConnectionError as e: + logger.error("Milvus connection test: FAILED — %s", e) diff --git a/pipelines/shared/retrieval_strategy.py b/pipelines/shared/retrieval_strategy.py new file mode 100644 index 0000000..898fe2b --- /dev/null +++ b/pipelines/shared/retrieval_strategy.py @@ -0,0 +1,359 @@ +""" +Shared retrieval strategy helpers for docs-agent search and validation. + +This module adds lightweight hybrid-retrieval behavior on top of vector search: + - query expansion for manifest-heavy questions + - collection preference inference (docs vs code) + - path/domain-aware reranking of candidate hits +""" + +from __future__ import annotations + +import re +from typing import Dict, Iterable, List + +PATH_ALIAS_HINTS = { + "common/istio": [ + "istio", + "service mesh", + "gateway", + "authorization policy", + "peer authentication", + "virtual service", + "sidecar", + "envoy", + "mtls", + "ingress", + ], + "common/knative": [ + "knative", + "serving", + "eventing", + "serverless", + "scale to zero", + "activator", + "revision", + "service", + "net istio", + "webhook", + ], + "common/dex": [ + "dex", + "oidc", + "oauth2", + "authentication", + "identity provider", + "connector", + "login", + ], + "common/cert-manager": [ + "cert manager", + "certificate", + "issuer", + "clusterissuer", + "cainjector", + "tls", + "webhook", + ], + "applications/pipeline": [ + "kubeflow pipelines", + "kfp", + "pipeline api server", + "deployment", + "service", + "configmap", + "role", + "rolebinding", + "serviceaccount", + "crd", + "webhook", + "scheduled workflow", + ], + "applications/profiles": [ + "profiles", + "namespaces", + "rbac", + "rolebinding", + 
"serviceaccount", + "user profile", + ], + "tests": [ + "tests", + "e2e", + "integration", + "validation", + "presubmit", + ], +} + +QUERY_EXPANSIONS = { + "istio": [ + "istio", + "service mesh", + "gateway", + "authorization policy", + "peer authentication", + "virtual service", + "mtls", + ], + "knative": [ + "knative", + "serving", + "eventing", + "serverless", + "scale to zero", + "activator", + "revision", + ], + "dex": [ + "dex", + "oidc", + "oauth2", + "authentication", + "identity provider", + "connector", + ], + "cert-manager": [ + "cert manager", + "certificate", + "issuer", + "clusterissuer", + "cainjector", + "tls", + ], + "component": [ + "dsl component", + "lightweight python component", + "lightweight python components", + "containerized python component", + "base image", + "@dsl.component", + ], + "compile": [ + "compile pipeline", + "pipeline compiler", + "kfp compiler", + "pipeline yaml", + "compiler compile", + ], + "resources": [ + "deployment", + "service", + "configmap", + "role", + "rolebinding", + "serviceaccount", + "custom resource definition", + ], + "testing": [ + "tests", + "e2e", + "integration", + "validation", + "presubmit", + ], +} + +CODE_INTENT_TERMS = { + "yaml", + "manifest", + "manifests", + "deployment", + "deployments", + "service", + "services", + "configmap", + "configmaps", + "rolebinding", + "clusterrolebinding", + "clusterrole", + "serviceaccount", + "crd", + "resources", + "rbac", + "istio", + "knative", + "dex", + "cert", + "cert-manager", + "namespace", + "namespaces", + "authorizationpolicy", + "authorizationpolicies", + "clustertrainingruntime", + "clusterservingruntimes", + "pvcviewer", + "networkpolicy", + "horizontalpodautoscaler", + "webhook", + "kustomization", + "dockerfile", + "helm", +} + +# Stronger signal terms that definitively mean the user wants code/manifest +# results rather than documentation pages. 
+STRONG_CODE_TERMS = { + "authorizationpolicy", "authorizationpolicies", + "clusterrolebinding", "clusterrole", + "clustertrainingruntime", "clusterservingruntimes", + "clusterservingruntime", + "pvcviewer", "networkpolicy", + "kustomization", "dockerfile", + "helm", "cache server", + "metadata service", "metadata-grpc", +} + +DOCS_INTENT_TERMS = { + "how", + "what", + "overview", + "introduction", + "guide", + "concept", + "architecture", + "tutorial", +} + + +def split_terms(value: str) -> List[str]: + """Split free text, paths, and identifiers into normalized terms.""" + expanded = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", value) + normalized = re.sub(r"[^A-Za-z0-9]+", " ", expanded) + return [token.lower() for token in normalized.split() if token] + + +def unique_terms(values: Iterable[str], limit: int = 32) -> List[str]: + """Return unique normalized terms while preserving order.""" + seen = set() + ordered: List[str] = [] + for value in values: + for token in split_terms(str(value)): + if token not in seen: + seen.add(token) + ordered.append(token) + if len(ordered) >= limit: + return ordered + return ordered + + +def source_alias_terms(source: str) -> List[str]: + """Return semantic alias terms for a source path or URL.""" + normalized = source.replace("\\", "/").lower() + aliases: List[str] = [] + for prefix, hints in PATH_ALIAS_HINTS.items(): + if prefix in normalized: + aliases.extend(hints) + return unique_terms(aliases, limit=20) + + +def analyze_query(question: str) -> Dict[str, object]: + """Analyze a user question and produce retrieval hints.""" + lowered = question.lower() + expanded_terms = [question] + + for trigger, additions in QUERY_EXPANSIONS.items(): + if trigger in lowered: + expanded_terms.extend(additions) + + question_terms = set(split_terms(question)) + prefer_code = bool(question_terms & CODE_INTENT_TERMS) + # If any strong code term is present, strongly prefer code. 
+ strongly_prefer_code = bool( + question_terms & STRONG_CODE_TERMS + ) or any(term in lowered for term in STRONG_CODE_TERMS) + prefer_docs = not prefer_code and bool(question_terms & DOCS_INTENT_TERMS) + + priority_terms = unique_terms(expanded_terms, limit=28) + enhanced_query = question + if len(priority_terms) > len(split_terms(question)): + enhanced_query = ( + f"{question}\n" + f"Relevant retrieval hints: {' '.join(priority_terms)}" + ) + + return { + "question": question, + "enhanced_query": enhanced_query, + "priority_terms": priority_terms, + "prefer_code": prefer_code, + "strongly_prefer_code": strongly_prefer_code, + "prefer_docs": prefer_docs, + } + + +def rerank_hits( + hits: List[Dict[str, object]], + query_analysis: Dict[str, object], + top_k: int, +) -> List[Dict[str, object]]: + """Rerank candidate hits with lightweight hybrid-retrieval heuristics.""" + priority_terms = set(query_analysis.get("priority_terms", [])) + prefer_code = bool(query_analysis.get("prefer_code")) + strongly_prefer_code = bool(query_analysis.get("strongly_prefer_code")) + prefer_docs = bool(query_analysis.get("prefer_docs")) + question_lower = str(query_analysis.get("question", "")).lower() + + reranked: List[Dict[str, object]] = [] + + for hit in hits: + score = float(hit.get("distance", 0.0)) + collection = str(hit.get("collection", "")) + source = str(hit.get("source_url") or hit.get("file_path") or "") + symbol_name = str(hit.get("symbol_name", "")) + heading = str(hit.get("heading", "")) + text = str(hit.get("chunk_text", "")) + + haystack = " ".join([source, symbol_name, heading, text]).lower() + haystack_terms = set(split_terms(haystack)) + path_aliases = set(source_alias_terms(source)) + + # --- Collection preference --- + if strongly_prefer_code: + # Strongly boost code results when query mentions specific K8s resources + if collection == "code_collection": + score += 0.15 + elif collection == "docs_collection": + score -= 0.06 + elif prefer_code: + if collection == 
"code_collection": + score += 0.08 + elif collection == "docs_collection": + score -= 0.03 + + if prefer_docs: + if collection == "docs_collection": + score += 0.09 + elif collection == "code_collection": + score -= 0.04 + + # --- Term-overlap scoring --- + term_overlap = len(priority_terms & haystack_terms) + alias_overlap = len(priority_terms & path_aliases) + score += min(0.16, 0.014 * term_overlap) + score += min(0.10, 0.025 * alias_overlap) + + # --- Path-keyword boosting --- + # Extract meaningful keywords from the query and boost hits whose + # file_path or source_url directly contain those keywords. + source_lower = source.lower() + path_keywords = [ + "cache", "metadata", "rbac", "authorization", "runtimes", + "catalog", "pvcviewer", "release", "webhook", "training-operator", + "trainer", "kserve", "pipeline", "model-registry", + ] + for kw in path_keywords: + if kw in question_lower and kw in source_lower: + score += 0.06 + + if prefer_code and source.endswith((".yaml", ".yml")): + score += 0.02 + if "kustomization.yaml" in source: + score += 0.02 + + reranked_hit = dict(hit) + reranked_hit["rerank_score"] = score + reranked.append(reranked_hit) + + reranked.sort(key=lambda item: item.get("rerank_score", item.get("distance", 0.0)), reverse=True) + return reranked[:top_k]