From dcaf03da45efce13d726f5e8ce9c318f0d3c03bb Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 26 Feb 2026 13:00:21 +1300 Subject: [PATCH 1/8] [ML] Add per-PR changelog YAML entries with schema validation Replace the monolithic CHANGELOG.md with per-PR YAML changelog files in docs/changelog/. Each PR that changes user-visible behaviour adds a small YAML file (.yaml) with structured metadata (area, type, summary). This eliminates merge conflicts in CHANGELOG.md and simplifies backports. Includes: - JSON schema for validating changelog entries - Python validation script (validate_changelogs.py) - Python bundler script (bundle_changelogs.py) for release notes - Gradle tasks: validateChangelogs, bundleChangelogs - Buildkite CI step (soft-fail during rollout) - Skip validation via >test, >refactoring, >docs, >build labels Made-with: Cursor --- .../pipelines/format_and_validation.yml.sh | 9 + .../scripts/steps/validate-changelogs.sh | 58 +++++++ build.gradle | 14 ++ dev-tools/bundle_changelogs.py | 129 +++++++++++++++ dev-tools/validate_changelogs.py | 154 ++++++++++++++++++ docs/changelog/.gitkeep | 0 docs/changelog/0000.yaml.sample | 6 + docs/changelog/README.md | 65 ++++++++ docs/changelog/changelog-schema.json | 53 ++++++ 9 files changed, 488 insertions(+) create mode 100755 .buildkite/scripts/steps/validate-changelogs.sh create mode 100755 dev-tools/bundle_changelogs.py create mode 100755 dev-tools/validate_changelogs.py create mode 100644 docs/changelog/.gitkeep create mode 100644 docs/changelog/0000.yaml.sample create mode 100644 docs/changelog/README.md create mode 100644 docs/changelog/changelog-schema.json diff --git a/.buildkite/pipelines/format_and_validation.yml.sh b/.buildkite/pipelines/format_and_validation.yml.sh index ffff9e54d5..15bec940d2 100755 --- a/.buildkite/pipelines/format_and_validation.yml.sh +++ b/.buildkite/pipelines/format_and_validation.yml.sh @@ -18,4 +18,13 @@ steps: notify: - github_commit_status: context: "Validate formatting with clang-format" + - label: "Validate changelog entries" + key: "validate_changelogs" + command: ".buildkite/scripts/steps/validate-changelogs.sh" + agents: + image: "docker.elastic.co/ml-dev/ml-check-style:2" + soft_fail: true + notify: + - github_commit_status: + context: "Validate changelog entries" EOL diff --git a/.buildkite/scripts/steps/validate-changelogs.sh b/.buildkite/scripts/steps/validate-changelogs.sh new file mode 100755 index 0000000000..7a9de78980 --- /dev/null +++ b/.buildkite/scripts/steps/validate-changelogs.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the following additional limitation. Functionality enabled by the +# files subject to the Elastic License 2.0 may only be used in production when +# invoked by an Elasticsearch process with a license key installed that permits +# use of machine learning features. You may not use this file except in +# compliance with the Elastic License 2.0 and the foregoing additional +# limitation. + +set -euo pipefail + +SKIP_LABELS=">test >refactoring >docs >build" + +# On PR builds, check if the PR has a label that skips changelog validation. +# BUILDKITE_PULL_REQUEST_LABELS is a comma-separated list set by Buildkite. +if [[ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then + IFS=',' read -ra LABELS <<< "${BUILDKITE_PULL_REQUEST_LABELS}" + for label in "${LABELS[@]}"; do + label="$(echo "${label}" | xargs)" # trim whitespace + for skip in ${SKIP_LABELS}; do + if [[ "${label}" == "${skip}" ]]; then + echo "Skipping changelog validation: PR has label '${label}'" + exit 0 + fi + done + done +fi + +# Install Python dependencies +pip3 install --quiet pyyaml jsonschema 2>/dev/null || pip install --quiet pyyaml jsonschema + +# Find changelog files changed in this PR (compared to main/target branch) +TARGET_BRANCH="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}" + +# Fetch the target branch so we can diff against it +git fetch origin "${TARGET_BRANCH}" --depth=1 2>/dev/null || true + +CHANGED_CHANGELOGS=$(git diff --name-only --diff-filter=ACM "origin/${TARGET_BRANCH}"...HEAD -- 'docs/changelog/*.yaml' || true) + +if [[ -z "${CHANGED_CHANGELOGS}" ]]; then + echo "No changelog files found in this PR." + echo "If this PR changes user-visible behaviour, please add a changelog entry." + echo "See docs/changelog/README.md for details." + echo "To skip this check, add one of these labels: ${SKIP_LABELS}" + + # Soft warning rather than hard failure during rollout + if [[ "${CHANGELOG_REQUIRED:-false}" == "true" ]]; then + exit 1 + fi + exit 0 +fi + +echo "Validating changelog files:" +echo "${CHANGED_CHANGELOGS}" +echo "" + +python3 dev-tools/validate_changelogs.py ${CHANGED_CHANGELOGS} diff --git a/build.gradle b/build.gradle index 843e8718d7..080714884e 100644 --- a/build.gradle +++ b/build.gradle @@ -169,6 +169,20 @@ task format(type: Exec) { workingDir "${projectDir}" } +task validateChangelogs(type: Exec) { + commandLine 'python3', 'dev-tools/validate_changelogs.py' + workingDir "${projectDir}" + description = 'Validate changelog YAML entries against the schema' + group = 'verification' +} + +task bundleChangelogs(type: Exec) { + commandLine 'python3', 'dev-tools/bundle_changelogs.py', '--version', project.version + workingDir "${projectDir}" + description = 'Generate consolidated changelog from per-PR YAML entries' + group = 'documentation' +} + task precommit(type: Exec) { commandLine shell workingDir "${projectDir}" diff --git a/dev-tools/bundle_changelogs.py b/dev-tools/bundle_changelogs.py new file mode 100755 index 0000000000..a76c1f1a0f --- /dev/null +++ b/dev-tools/bundle_changelogs.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Bundle per-PR changelog YAML files into a consolidated changelog for release. + +Usage: + python3 bundle_changelogs.py [--dir DIR] [--version VERSION] [--format FORMAT] + +Outputs a formatted changelog grouped by type and area, suitable for inclusion +in release notes. + +Formats: + markdown (default) - Markdown suitable for GitHub releases + asciidoc - AsciiDoc suitable for Elastic docs +""" + +import argparse +import sys +from collections import defaultdict +from pathlib import Path + +try: + import yaml +except ImportError: + print("Missing pyyaml. Install with: pip3 install pyyaml", file=sys.stderr) + sys.exit(2) + + +TYPE_ORDER = [ + ("breaking", "Breaking changes"), + ("deprecation", "Deprecations"), + ("feature", "New features"), + ("enhancement", "Enhancements"), + ("bug", "Bug fixes"), + ("regression", "Regression fixes"), +] + + +def load_entries(changelog_dir): + entries = [] + for path in sorted(changelog_dir.glob("*.yaml")): + with open(path) as f: + data = yaml.safe_load(f) + if data and isinstance(data, dict): + data["_file"] = path.name + entries.append(data) + return entries + + +def format_markdown(entries, version=None): + lines = [] + if version: + lines.append(f"## {version}\n") + + grouped = defaultdict(lambda: defaultdict(list)) + for entry in entries: + grouped[entry["type"]][entry["area"]].append(entry) + + for type_key, type_label in TYPE_ORDER: + if type_key not in grouped: + continue + lines.append(f"### {type_label}\n") + for area in sorted(grouped[type_key].keys()): + lines.append(f"**{area}**") + for entry in sorted(grouped[type_key][area], key=lambda e: e["pr"]): + pr = entry["pr"] + summary = entry["summary"] + issues = entry.get("issues", []) + issue_refs = ", ".join(f"#{i}" for i in issues) + line = f"- {summary} [#{pr}](https://github.com/elastic/ml-cpp/pull/{pr})" + if issue_refs: + line += f" ({issue_refs})" + lines.append(line) + lines.append("") + + return "\n".join(lines) + + +def format_asciidoc(entries, version=None): + lines = [] + if version: + lines.append(f"== {version}\n") + + grouped = defaultdict(lambda: defaultdict(list)) + for entry in entries: + grouped[entry["type"]][entry["area"]].append(entry) + + for type_key, type_label in TYPE_ORDER: + if type_key not in grouped: + continue + lines.append(f"=== {type_label}\n") + for area in sorted(grouped[type_key].keys()): + lines.append(f"*{area}*") + for entry in sorted(grouped[type_key][area], key=lambda e: e["pr"]): + pr = entry["pr"] + summary = entry["summary"] + issues = entry.get("issues", []) + issue_refs = ", ".join(f"https://github.com/elastic/ml-cpp/issues/{i}[#{i}]" for i in issues) + line = f"* {summary} https://github.com/elastic/ml-cpp/pull/{pr}[#{pr}]" + if issue_refs: + line += f" ({issue_refs})" + lines.append(line) + lines.append("") + + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser(description="Bundle changelog YAML files") + parser.add_argument("--dir", default=None, help="Changelog directory") + parser.add_argument("--version", default=None, help="Version string for heading") + parser.add_argument("--format", default="markdown", choices=["markdown", "asciidoc"]) + args = parser.parse_args() + + repo_root = Path(__file__).resolve().parent.parent + changelog_dir = Path(args.dir) if args.dir else repo_root / "docs" / "changelog" + + entries = load_entries(changelog_dir) + if not entries: + print("No changelog entries found.", file=sys.stderr) + sys.exit(0) + + if args.format == "asciidoc": + print(format_asciidoc(entries, args.version)) + else: + print(format_markdown(entries, args.version)) + + +if __name__ == "__main__": + main() diff --git a/dev-tools/validate_changelogs.py b/dev-tools/validate_changelogs.py new file mode 100755 index 0000000000..856975658d --- /dev/null +++ b/dev-tools/validate_changelogs.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Validate changelog YAML files against the changelog JSON schema. + +Usage: + python3 validate_changelogs.py [--schema SCHEMA] [--dir DIR] [FILES...] + +If FILES are given, only those files are validated. +Otherwise all *.yaml files in DIR (default: docs/changelog/) are validated. + +Exit codes: + 0 All files valid (or no files to validate) + 1 One or more validation errors + 2 Missing dependencies or bad arguments +""" + +import argparse +import json +import os +import re +import sys +from pathlib import Path + + +def check_dependencies(): + """Check that required Python packages are available.""" + missing = [] + try: + import yaml # noqa: F401 + except ImportError: + missing.append("pyyaml") + try: + import jsonschema # noqa: F401 + except ImportError: + missing.append("jsonschema") + if missing: + print( + f"Missing Python packages: {', '.join(missing)}\n" + f"Install with: pip3 install {' '.join(missing)}", + file=sys.stderr, + ) + sys.exit(2) + + +def load_schema(schema_path): + with open(schema_path) as f: + return json.load(f) + + +def validate_file(filepath, schema): + """Validate a single YAML file. Returns a list of error strings.""" + import jsonschema + import yaml + + errors = [] + filename = os.path.basename(filepath) + + # Check filename convention: .yaml + stem = Path(filepath).stem + if not re.match(r"^\d+$", stem): + errors.append(f"{filename}: filename must be a PR number (e.g. 1234.yaml)") + + try: + with open(filepath) as f: + data = yaml.safe_load(f) + except yaml.YAMLError as e: + errors.append(f"{filename}: invalid YAML: {e}") + return errors + + if data is None: + errors.append(f"{filename}: file is empty") + return errors + + if not isinstance(data, dict): + errors.append(f"{filename}: expected a YAML mapping, got {type(data).__name__}") + return errors + + # Validate against JSON schema + validator = jsonschema.Draft7Validator(schema) + for error in sorted(validator.iter_errors(data), key=lambda e: list(e.path)): + path = ".".join(str(p) for p in error.absolute_path) or "(root)" + errors.append(f"{filename}: {path}: {error.message}") + + # Cross-check: PR number in filename should match pr field + if "pr" in data and stem.isdigit(): + if data["pr"] != int(stem): + errors.append( + f"{filename}: pr field ({data['pr']}) does not match filename ({stem})" + ) + + return errors + + +def main(): + parser = argparse.ArgumentParser(description="Validate changelog YAML files") + parser.add_argument( + "--schema", + default=None, + help="Path to the JSON schema (default: docs/changelog/changelog-schema.json)", + ) + parser.add_argument( + "--dir", + default=None, + help="Directory containing changelog YAML files (default: docs/changelog/)", + ) + parser.add_argument( + "files", + nargs="*", + help="Specific files to validate (overrides --dir)", + ) + args = parser.parse_args() + + check_dependencies() + + # Resolve paths relative to repo root + repo_root = Path(__file__).resolve().parent.parent + schema_path = Path(args.schema) if args.schema else repo_root / "docs" / "changelog" / "changelog-schema.json" + changelog_dir = Path(args.dir) if args.dir else repo_root / "docs" / "changelog" + + if not schema_path.exists(): + print(f"Schema not found: {schema_path}", file=sys.stderr) + sys.exit(2) + + schema = load_schema(schema_path) + + # Collect files to validate + if args.files: + yaml_files = [Path(f) for f in args.files] + else: + yaml_files = sorted(changelog_dir.glob("*.yaml")) + + if not yaml_files: + print("No changelog files to validate.") + return + + all_errors = [] + for filepath in yaml_files: + if not filepath.exists(): + all_errors.append(f"{filepath}: file not found") + continue + errors = validate_file(filepath, schema) + all_errors.extend(errors) + + if all_errors: + print(f"Changelog validation failed ({len(all_errors)} error(s)):\n") + for error in all_errors: + print(f" - {error}") + sys.exit(1) + else: + print(f"Validated {len(yaml_files)} changelog file(s) successfully.") + + +if __name__ == "__main__": + main() diff --git a/docs/changelog/.gitkeep b/docs/changelog/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/changelog/0000.yaml.sample b/docs/changelog/0000.yaml.sample new file mode 100644 index 0000000000..6cb9df71f8 --- /dev/null +++ b/docs/changelog/0000.yaml.sample @@ -0,0 +1,6 @@ +pr: 1234 +summary: Fix anomaly detection model state persistence for large jobs +area: Anomaly Detection +type: bug +issues: + - 1230 diff --git a/docs/changelog/README.md b/docs/changelog/README.md new file mode 100644 index 0000000000..2b812f4e9f --- /dev/null +++ b/docs/changelog/README.md @@ -0,0 +1,65 @@ +# Changelog entries + +Each pull request that changes user-visible behaviour should include a changelog +entry as a YAML file in this directory, named `.yaml`. + +## Format + +```yaml +pr: 2914 +summary: Split build and test into separate pipeline steps +area: Build +type: enhancement +issues: [] +``` + +### Required fields + +| Field | Description | +|-----------|-------------| +| `pr` | The pull request number (integer). | +| `summary` | A concise, user-facing description of the change. | +| `area` | The area of the codebase affected (see below). | +| `type` | The type of change (see below). | + +### Optional fields + +| Field | Description | +|----------|-------------| +| `issues` | List of related GitHub issue numbers (integers). Default: `[]` | + +### Valid areas + +- **Anomaly Detection** – anomaly detection jobs, modelling, and results +- **Data Frame Analytics** – classification, regression, and outlier detection +- **NLP** – natural language processing and PyTorch inference +- **Core** – core libraries, platform support, and utilities +- **API** – REST API layer and state persistence +- **Build** – build system, CI, packaging, and developer tooling +- **Inference** – inference service integration + +### Valid types + +- **breaking** – a change that breaks backwards compatibility +- **bug** – a fix for an existing defect +- **deprecation** – deprecation of existing functionality +- **enhancement** – an improvement to existing functionality +- **feature** – a wholly new feature +- **regression** – a fix for a recently introduced defect + +## When is a changelog entry required? + +A changelog entry is **required** for any PR that: +- Fixes a bug +- Adds or changes user-visible functionality +- Changes the API or data formats +- Deprecates or removes functionality + +A changelog entry is **not required** for: +- Pure refactoring with no behaviour change +- Test-only changes +- CI/build infrastructure changes (unless they affect the shipped artefact) +- Documentation-only changes + +PRs that do not require a changelog entry should be labelled with +`>test`, `>refactoring`, `>docs`, or `>build` to skip validation. diff --git a/docs/changelog/changelog-schema.json b/docs/changelog/changelog-schema.json new file mode 100644 index 0000000000..1ea3f22054 --- /dev/null +++ b/docs/changelog/changelog-schema.json @@ -0,0 +1,53 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://github.com/elastic/ml-cpp/tree/main/docs/changelog", + "description": "Schema for ml-cpp changelog YAML entries", + "type": "object", + "properties": { + "pr": { + "type": "integer", + "minimum": 1 + }, + "summary": { + "type": "string", + "minLength": 1 + }, + "area": { + "type": "string", + "enum": [ + "Anomaly Detection", + "Data Frame Analytics", + "NLP", + "Core", + "API", + "Build", + "Inference" + ] + }, + "type": { + "type": "string", + "enum": [ + "breaking", + "bug", + "deprecation", + "enhancement", + "feature", + "regression" + ] + }, + "issues": { + "type": "array", + "items": { + "type": "integer", + "minimum": 1 + } + } + }, + "required": [ + "pr", + "summary", + "area", + "type" + ], + "additionalProperties": false +} From 67817657b1cbd4967d86562f875a4772c569fe05 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 26 Feb 2026 13:02:27 +1300 Subject: [PATCH 2/8] [ML] Add >non-issue to changelog validation skip labels Made-with: Cursor --- .buildkite/scripts/steps/validate-changelogs.sh | 2 +- docs/changelog/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/steps/validate-changelogs.sh b/.buildkite/scripts/steps/validate-changelogs.sh index 7a9de78980..af30918fe7 100755 --- a/.buildkite/scripts/steps/validate-changelogs.sh +++ b/.buildkite/scripts/steps/validate-changelogs.sh @@ -10,7 +10,7 @@ set -euo pipefail -SKIP_LABELS=">test >refactoring >docs >build" +SKIP_LABELS=">test >refactoring >docs >build >non-issue" # On PR builds, check if the PR has a label that skips changelog validation. # BUILDKITE_PULL_REQUEST_LABELS is a comma-separated list set by Buildkite. diff --git a/docs/changelog/README.md b/docs/changelog/README.md index 2b812f4e9f..9975fdde30 100644 --- a/docs/changelog/README.md +++ b/docs/changelog/README.md @@ -62,4 +62,4 @@ A changelog entry is **not required** for: - Documentation-only changes PRs that do not require a changelog entry should be labelled with -`>test`, `>refactoring`, `>docs`, or `>build` to skip validation. +`>test`, `>refactoring`, `>docs`, `>build`, or `>non-issue` to skip validation. From 199c6cb18a73464daecf91ef1a8bbb9ae1eaa74b Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 12 Mar 2026 16:15:56 +1300 Subject: [PATCH 3/8] Fix changelog validation CI: use Python image and portable pip The validate-changelogs step ran on ml-check-style:2 (Alpine with only clang/bash/git, no Python), causing "pip: command not found". Switch the step to python:3.11-slim and install git on demand. Use python3 -m pip with --break-system-packages for PEP 668 compat. Made-with: Cursor --- .buildkite/pipelines/format_and_validation.yml.sh | 2 +- .buildkite/scripts/steps/validate-changelogs.sh | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipelines/format_and_validation.yml.sh b/.buildkite/pipelines/format_and_validation.yml.sh index 15bec940d2..b21d1c0fdb 100755 --- a/.buildkite/pipelines/format_and_validation.yml.sh +++ b/.buildkite/pipelines/format_and_validation.yml.sh @@ -22,7 +22,7 @@ steps: key: "validate_changelogs" command: ".buildkite/scripts/steps/validate-changelogs.sh" agents: - image: "docker.elastic.co/ml-dev/ml-check-style:2" + image: "python:3.11-slim" soft_fail: true notify: - github_commit_status: diff --git a/.buildkite/scripts/steps/validate-changelogs.sh b/.buildkite/scripts/steps/validate-changelogs.sh index af30918fe7..797f001381 100755 --- a/.buildkite/scripts/steps/validate-changelogs.sh +++ b/.buildkite/scripts/steps/validate-changelogs.sh @@ -27,8 +27,12 @@ if [[ -n "${BUILDKITE_PULL_REQUEST_LABELS:-}" ]]; then done fi -# Install Python dependencies -pip3 install --quiet pyyaml jsonschema 2>/dev/null || pip install --quiet pyyaml jsonschema +# Install system and Python dependencies +if ! command -v git &>/dev/null; then + apt-get update -qq && apt-get install -y -qq git >/dev/null 2>&1 +fi +python3 -m pip install --quiet --break-system-packages pyyaml jsonschema 2>/dev/null \ + || python3 -m pip install --quiet pyyaml jsonschema # Find changelog files changed in this PR (compared to main/target branch) TARGET_BRANCH="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}" From ac50206a66a600c2d7e6b34377dae779556e4ac4 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 9 Apr 2026 13:03:28 +1200 Subject: [PATCH 4/8] [ML] Align changelog schema with Elasticsearch Replaces the ml-cpp-specific changelog schema with the exact Elasticsearch changelog schema so that entries can be consumed directly by the ES release notes pipeline (Phase 2 Option A). Key changes: - area enum: ES-wide values (most entries use "Machine Learning") - type enum: adds breaking-java, known-issue, new-aggregation, security, upgrade - Adds highlight, breaking, and deprecation sub-objects - pr/area not required for known-issue and security types - Validator allows descriptive filenames for entries without a pr - Bundler handles all new types and entries without pr/area - AsciiDoc output uses {ml-pull} macros for consistency Made-with: Cursor --- dev-tools/bundle_changelogs.py | 36 ++- dev-tools/validate_changelogs.py | 16 +- docs/changelog/0000.yaml.sample | 2 +- docs/changelog/README.md | 79 +++++-- docs/changelog/changelog-schema.json | 338 +++++++++++++++++++++++---- 5 files changed, 391 insertions(+), 80 deletions(-) diff --git a/dev-tools/bundle_changelogs.py b/dev-tools/bundle_changelogs.py index a76c1f1a0f..9614b536a8 100755 --- a/dev-tools/bundle_changelogs.py +++ b/dev-tools/bundle_changelogs.py @@ -26,14 +26,22 @@ TYPE_ORDER = [ + ("known-issue", "Known issues"), + ("security", "Security fixes"), ("breaking", "Breaking changes"), + ("breaking-java", "Breaking Java changes"), ("deprecation", "Deprecations"), ("feature", "New features"), + ("new-aggregation", "New aggregations"), ("enhancement", "Enhancements"), ("bug", "Bug fixes"), ("regression", "Regression fixes"), + ("upgrade", "Upgrades"), ] +ML_CPP_PULL_URL = "https://github.com/elastic/ml-cpp/pull" +ML_CPP_ISSUE_URL = "https://github.com/elastic/ml-cpp/issues" + def load_entries(changelog_dir): entries = [] @@ -53,7 +61,8 @@ def format_markdown(entries, version=None): grouped = defaultdict(lambda: defaultdict(list)) for entry in entries: - grouped[entry["type"]][entry["area"]].append(entry) + area = entry.get("area", "General") + grouped[entry["type"]][area].append(entry) for type_key, type_label in TYPE_ORDER: if type_key not in grouped: @@ -61,12 +70,15 @@ def format_markdown(entries, version=None): lines.append(f"### {type_label}\n") for area in sorted(grouped[type_key].keys()): lines.append(f"**{area}**") - for entry in sorted(grouped[type_key][area], key=lambda e: e["pr"]): - pr = entry["pr"] + for entry in sorted(grouped[type_key][area], key=lambda e: e.get("pr", 0)): + pr = entry.get("pr") summary = entry["summary"] issues = entry.get("issues", []) issue_refs = ", ".join(f"#{i}" for i in issues) - line = f"- {summary} [#{pr}](https://github.com/elastic/ml-cpp/pull/{pr})" + if pr: + line = f"- {summary} [#{pr}]({ML_CPP_PULL_URL}/{pr})" + else: + line = f"- {summary}" if issue_refs: line += f" ({issue_refs})" lines.append(line) @@ -82,7 +94,8 @@ def format_asciidoc(entries, version=None): grouped = defaultdict(lambda: defaultdict(list)) for entry in entries: - grouped[entry["type"]][entry["area"]].append(entry) + area = entry.get("area", "General") + grouped[entry["type"]][area].append(entry) for type_key, type_label in TYPE_ORDER: if type_key not in grouped: @@ -90,12 +103,17 @@ def format_asciidoc(entries, version=None): lines.append(f"=== {type_label}\n") for area in sorted(grouped[type_key].keys()): lines.append(f"*{area}*") - for entry in sorted(grouped[type_key][area], key=lambda e: e["pr"]): - pr = entry["pr"] + for entry in sorted(grouped[type_key][area], key=lambda e: e.get("pr", 0)): + pr = entry.get("pr") summary = entry["summary"] issues = entry.get("issues", []) - issue_refs = ", ".join(f"https://github.com/elastic/ml-cpp/issues/{i}[#{i}]" for i in issues) - line = f"* {summary} https://github.com/elastic/ml-cpp/pull/{pr}[#{pr}]" + issue_refs = ", ".join( + f"{ML_CPP_ISSUE_URL}/{i}[#{i}]" for i in issues + ) + if pr: + line = f"* {summary} {{ml-pull}}{pr}[#{pr}]" + else: + line = f"* {summary}" if issue_refs: line += f" ({issue_refs})" lines.append(line) diff --git a/dev-tools/validate_changelogs.py b/dev-tools/validate_changelogs.py index 856975658d..1bc8437c97 100755 --- a/dev-tools/validate_changelogs.py +++ b/dev-tools/validate_changelogs.py @@ -54,11 +54,7 @@ def validate_file(filepath, schema): errors = [] filename = os.path.basename(filepath) - - # Check filename convention: .yaml stem = Path(filepath).stem - if not re.match(r"^\d+$", stem): - errors.append(f"{filename}: filename must be a PR number (e.g. 1234.yaml)") try: with open(filepath) as f: @@ -81,12 +77,18 @@ def validate_file(filepath, schema): path = ".".join(str(p) for p in error.absolute_path) or "(root)" errors.append(f"{filename}: {path}: {error.message}") - # Cross-check: PR number in filename should match pr field - if "pr" in data and stem.isdigit(): - if data["pr"] != int(stem): + # Filename convention: numeric filenames must match the pr field. + # Types without a pr field (known-issue, security) may use descriptive names. + if re.match(r"^\d+$", stem): + if "pr" in data and data["pr"] != int(stem): errors.append( f"{filename}: pr field ({data['pr']}) does not match filename ({stem})" ) + elif "pr" in data: + errors.append( + f"{filename}: file has a pr field ({data['pr']}), " + f"so filename should be {data['pr']}.yaml" + ) return errors diff --git a/docs/changelog/0000.yaml.sample b/docs/changelog/0000.yaml.sample index 6cb9df71f8..bd1d40314a 100644 --- a/docs/changelog/0000.yaml.sample +++ b/docs/changelog/0000.yaml.sample @@ -1,6 +1,6 @@ pr: 1234 summary: Fix anomaly detection model state persistence for large jobs -area: Anomaly Detection +area: Machine Learning type: bug issues: - 1230 diff --git a/docs/changelog/README.md b/docs/changelog/README.md index 9975fdde30..6ab3193508 100644 --- a/docs/changelog/README.md +++ b/docs/changelog/README.md @@ -3,12 +3,16 @@ Each pull request that changes user-visible behaviour should include a changelog entry as a YAML file in this directory, named `.yaml`. +The schema is aligned with the +[Elasticsearch changelog schema](https://github.com/elastic/elasticsearch/blob/main/build-tools-internal/src/main/resources/changelog-schema.json) +so that ml-cpp entries can be consumed directly by the ES release notes pipeline. + ## Format ```yaml pr: 2914 summary: Split build and test into separate pipeline steps -area: Build +area: Machine Learning type: enhancement issues: [] ``` @@ -17,35 +21,68 @@ issues: [] | Field | Description | |-----------|-------------| -| `pr` | The pull request number (integer). | -| `summary` | A concise, user-facing description of the change. | -| `area` | The area of the codebase affected (see below). | -| `type` | The type of change (see below). | +| `type` | The type of change (see below). Always required. | +| `summary` | A concise, user-facing description of the change. Always required. | +| `pr` | The pull request number (integer). Required unless type is `known-issue` or `security`. | +| `area` | The area of the codebase affected (see below). Required unless type is `known-issue` or `security`. | ### Optional fields -| Field | Description | -|----------|-------------| -| `issues` | List of related GitHub issue numbers (integers). Default: `[]` | +| Field | Description | +|---------------|-------------| +| `issues` | List of related GitHub issue numbers (integers). Default: `[]` | +| `highlight` | Release highlight object (see below). | +| `breaking` | Breaking change details. **Required** when type is `breaking` or `breaking-java`. | +| `deprecation` | Deprecation details. **Required** when type is `deprecation`. | ### Valid areas -- **Anomaly Detection** – anomaly detection jobs, modelling, and results -- **Data Frame Analytics** – classification, regression, and outlier detection -- **NLP** – natural language processing and PyTorch inference -- **Core** – core libraries, platform support, and utilities -- **API** – REST API layer and state persistence -- **Build** – build system, CI, packaging, and developer tooling -- **Inference** – inference service integration +Most ml-cpp entries should use **Machine Learning**. Other valid areas from the +ES schema (e.g. **Inference**) may be used when appropriate. The full list of +valid areas is defined in `changelog-schema.json`. ### Valid types -- **breaking** – a change that breaks backwards compatibility -- **bug** – a fix for an existing defect -- **deprecation** – deprecation of existing functionality -- **enhancement** – an improvement to existing functionality -- **feature** – a wholly new feature -- **regression** – a fix for a recently introduced defect +| Type | Description | +|------|-------------| +| `breaking` | A change that breaks backwards compatibility (requires `breaking` object) | +| `breaking-java` | A breaking change to the Java API (requires `breaking` object) | +| `bug` | A fix for an existing defect | +| `deprecation` | Deprecation of existing functionality (requires `deprecation` object) | +| `enhancement` | An improvement to existing functionality | +| `feature` | A wholly new feature | +| `known-issue` | A known issue (`pr` and `area` not required) | +| `new-aggregation` | A new aggregation type | +| `regression` | A fix for a recently introduced defect | +| `security` | A security fix (`pr` and `area` not required) | +| `upgrade` | An upgrade-related change | + +### Highlight object + +For changes worthy of a release highlight: + +```yaml +highlight: + notable: true + title: "Short title for the highlight" + body: "Longer description in AsciiDoc format (no triple-backtick code blocks)." +``` + +### Breaking / Deprecation object + +Required when `type` is `breaking`, `breaking-java`, or `deprecation`: + +```yaml +breaking: + area: Machine Learning + title: "Short title describing the breaking change" + details: "Detailed description of what changed (AsciiDoc, no triple-backticks)." + impact: "What users need to do to adapt." + notable: true +``` + +Valid areas for breaking/deprecation changes are a subset of the main areas, +defined in `changelog-schema.json` under `compatibilityChangeArea`. ## When is a changelog entry required? diff --git a/docs/changelog/changelog-schema.json b/docs/changelog/changelog-schema.json index 1ea3f22054..1995103dd1 100644 --- a/docs/changelog/changelog-schema.json +++ b/docs/changelog/changelog-schema.json @@ -1,53 +1,307 @@ { "$schema": "http://json-schema.org/draft-07/schema#", "$id": "https://github.com/elastic/ml-cpp/tree/main/docs/changelog", - "description": "Schema for ml-cpp changelog YAML entries", - "type": "object", - "properties": { - "pr": { - "type": "integer", - "minimum": 1 + "$ref": "#/definitions/Changelog", + "definitions": { + "Changelog": { + "type": "object", + "properties": { + "pr": { + "type": "integer" + }, + "issues": { + "type": "array", + "items": { + "type": "integer" + } + }, + "area": { + "type": "string", + "enum": [ + "Aggregations", + "Allocation", + "Analysis", + "Application", + "Audit", + "Authentication", + "Authorization", + "Autoscaling", + "CAT APIs", + "CCR", + "CCS", + "CRUD", + "Client", + "Cluster Coordination", + "Codec", + "Data streams", + "DLM", + "Discovery-Plugins", + "Distributed", + "Downsampling", + "EQL", + "ES|QL", + "Engine", + "Experiences", + "Extract&Transform", + "FIPS", + "Features", + "Geo", + "Graph", + "Health", + "Highlighting", + "ILM", + "IdentityProvider", + "Indices APIs", + "Inference", + "Infra/CLI", + "Infra/Circuit Breakers", + "Infra/Core", + "Infra/Logging", + "Infra/Node Lifecycle", + "Infra/Plugins", + "Infra/REST API", + "Infra/Resiliency", + "Infra/Scripting", + "Infra/Settings", + "Infra/Transport API", + "Infra/Metrics", + "Ingest", + "Ingest Node", + "Java High Level REST Client", + "Java Low Level REST Client", + "License", + "Logs", + "Machine Learning", + "Mapping", + "Monitoring", + "Network", + "Packaging", + "Percolator", + "Performance", + "PromQL", + "Query Languages", + "Ranking", + "Recovery", + "Reindex", + "Relevance", + "Rollup", + "SQL", + "Search", + "Searchable Snapshots", + "Security", + "SLM", + "Snapshot/Restore", + "Stats", + "Store", + "Suggesters", + "Task Management", + "TLS", + "Transform", + "TSDB", + "Vector Search", + "Watcher" + ] + }, + "type": { + "type": "string", + "enum": [ + "breaking", + "breaking-java", + "bug", + "deprecation", + "enhancement", + "feature", + "known-issue", + "new-aggregation", + "regression", + "security", + "upgrade" + ] + }, + "summary": { + "type": "string", + "minLength": 1 + }, + "highlight": { + "$ref": "#/definitions/Highlight" + }, + "breaking": { + "$ref": "#/definitions/CompatibilityChange" + }, + "deprecation": { + "$ref": "#/definitions/CompatibilityChange" + } + }, + "required": [ + "type", + "summary" + ], + "anyOf": [ + { + "$comment": "PR number and area fields not required for known-issue type", + "if": { + "not": { + "properties": { + "type": { + "const": "known-issue" + } + } + } + }, + "then": { + "required": [ + "pr", + "area" + ] + } + }, + { + "$comment": "PR number and area fields not required for security type", + "if": { + "not": { + "properties": { + "type": { + "const": "security" + } + } + } + }, + "then": { + "required": [ + "pr", + "area" + ] + } + } + ], + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "breaking" + } + } + }, + "then": { + "required": [ + "breaking" + ] + } + }, + { + "if": { + "properties": { + "type": { + "const": "breaking-java" + } + } + }, + "then": { + "required": [ + "breaking" + ] + } + } + ], + "if": { + "properties": { + "type": { + "const": "deprecation" + } + } + }, + "then": { + "required": [ + "deprecation" + ] + }, + "additionalProperties": false }, - "summary": { - "type": "string", - "minLength": 1 + "Highlight": { + "properties": { + "notable": { + "type": "boolean" + }, + "title": { + "type": "string", + "minLength": 1 + }, + "body": { + "type": "string", + "pattern": "(?s)^((?!```).)*$", + "minLength": 1 + } + }, + "required": [ + "title", + "body" + ], + "additionalProperties": false }, - "area": { - "type": "string", - "enum": [ - "Anomaly Detection", - "Data Frame Analytics", - "NLP", - "Core", - "API", - "Build", - "Inference" - ] + "CompatibilityChange": { + "properties": { + "area": { + "$ref": "#/definitions/compatibilityChangeArea" + }, + "title": { + "type": "string", + "minLength": 1 + }, + "details": { + "type": "string", + "pattern": "(?s)^((?!```).)*$", + "minLength": 1 + }, + "impact": { + "type": "string", + "pattern": "(?s)^((?!```).)*$", + "minLength": 1 + }, + "notable": { + "type": "boolean" + }, + "ess_setting_change": { + "type": "boolean" + } + }, + "required": [ + "area", + "title", + "details", + "impact" + ], + "additionalProperties": false }, - "type": { + "compatibilityChangeArea": { "type": "string", "enum": [ - "breaking", - "bug", - "deprecation", - "enhancement", - "feature", - "regression" + "Aggregations", + "Analysis", + "Authorization", + "Cluster and node setting", + "Command line tool", + "CRUD", + "ES|QL", + "ILM", + "Index setting", + "Ingest", + "JVM option", + "Java API", + "Logging", + "Logs", + "Machine Learning", + "Mapping", + "Metrics", + "Packaging", + "Painless", + "REST API", + "Rollup", + "Search", + "System requirement", + "Transform" ] }, - "issues": { - "type": "array", - "items": { - "type": "integer", - "minimum": 1 - } - } - }, - "required": [ - "pr", - "summary", - "area", - "type" - ], - "additionalProperties": false + "additionalProperties": false + } } From 3050d2f393052225f596b32cdbd2f0f510c30098 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 9 Apr 2026 13:04:55 +1200 Subject: [PATCH 5/8] [ML] Backfill changelog YAML entries for active branches Adds structured changelog entries for all changes in the active release branches: main/9.4.0, 9.3.x, 9.2.x, and 8.19.x. Also includes entries for recent hardening PRs (#3008, #3015) and the flaky test fix (#3017) that were not yet in CHANGELOG.asciidoc. Made-with: Cursor --- docs/changelog/2841.yaml | 5 +++++ docs/changelog/2846.yaml | 5 +++++ docs/changelog/2848.yaml | 5 +++++ docs/changelog/2863.yaml | 5 +++++ docs/changelog/2889.yaml | 5 +++++ docs/changelog/2894.yaml | 5 +++++ docs/changelog/2895.yaml | 5 +++++ docs/changelog/3008.yaml | 5 +++++ docs/changelog/3015.yaml | 5 +++++ docs/changelog/3017.yaml | 5 +++++ 10 files changed, 50 insertions(+) create mode 100644 docs/changelog/2841.yaml create mode 100644 docs/changelog/2846.yaml create mode 100644 docs/changelog/2848.yaml create mode 100644 docs/changelog/2863.yaml create mode 100644 docs/changelog/2889.yaml create mode 100644 docs/changelog/2894.yaml create mode 100644 docs/changelog/2895.yaml create mode 100644 docs/changelog/3008.yaml create mode 100644 docs/changelog/3015.yaml create mode 100644 docs/changelog/3017.yaml diff --git a/docs/changelog/2841.yaml b/docs/changelog/2841.yaml new file mode 100644 index 0000000000..a7d97b7041 --- /dev/null +++ b/docs/changelog/2841.yaml @@ -0,0 +1,5 @@ +pr: 2841 +summary: "Better messaging regarding OOM process termination" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/2846.yaml b/docs/changelog/2846.yaml new file mode 100644 index 0000000000..d225e9a1cb --- /dev/null +++ b/docs/changelog/2846.yaml @@ -0,0 +1,5 @@ +pr: 2846 +summary: "Report the actual memory usage of the autodetect process" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/2848.yaml b/docs/changelog/2848.yaml new file mode 100644 index 0000000000..ccca2a64c6 --- /dev/null +++ b/docs/changelog/2848.yaml @@ -0,0 +1,5 @@ +pr: 2848 +summary: "Improve adherence to memory limits for the bucket gatherer" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/2863.yaml b/docs/changelog/2863.yaml new file mode 100644 index 0000000000..7042ed337a --- /dev/null +++ b/docs/changelog/2863.yaml @@ -0,0 +1,5 @@ +pr: 2863 +summary: "Update the PyTorch library to version 2.7.1" +area: Machine Learning +type: upgrade +issues: [] diff --git a/docs/changelog/2889.yaml b/docs/changelog/2889.yaml new file mode 100644 index 0000000000..7206fb8e5e --- /dev/null +++ b/docs/changelog/2889.yaml @@ -0,0 +1,5 @@ +pr: 2889 +summary: "Downgrade log severity for a batch of recoverable errors" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/2894.yaml b/docs/changelog/2894.yaml new file mode 100644 index 0000000000..528edff646 --- /dev/null +++ b/docs/changelog/2894.yaml @@ -0,0 +1,5 @@ +pr: 2894 +summary: "Better error handling regarding quantiles state documents" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/2895.yaml b/docs/changelog/2895.yaml new file mode 100644 index 0000000000..6d5e810b39 --- /dev/null +++ b/docs/changelog/2895.yaml @@ -0,0 +1,5 @@ +pr: 2895 +summary: "Better handling of invalid JSON state documents" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/3008.yaml b/docs/changelog/3008.yaml new file mode 100644 index 0000000000..e851b35d57 --- /dev/null +++ b/docs/changelog/3008.yaml @@ -0,0 +1,5 @@ +pr: 3008 +summary: "Harden pytorch_inference with TorchScript model graph validation" +area: Machine Learning +type: enhancement +issues: [2890] diff --git a/docs/changelog/3015.yaml b/docs/changelog/3015.yaml new file mode 100644 index 0000000000..9b9dee8485 --- /dev/null +++ b/docs/changelog/3015.yaml @@ -0,0 +1,5 @@ +pr: 3015 +summary: "Add EuroBERT and Jina v5 ops to graph validation allowlist" +area: Machine Learning +type: enhancement +issues: [] diff --git a/docs/changelog/3017.yaml b/docs/changelog/3017.yaml new file mode 100644 index 0000000000..ae0820ff1a --- /dev/null +++ b/docs/changelog/3017.yaml @@ -0,0 +1,5 @@ +pr: 3017 +summary: "Fix flaky CIoManagerTest/testFileIoGood test" +area: Machine Learning +type: bug +issues: [] From a935699bb47929892f3875d5354b3442555a87a6 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 9 Apr 2026 14:19:49 +1200 Subject: [PATCH 6/8] [ML] Add release-time changelog export script Interim tool (Option C) to bridge ml-cpp changelogs into the ES release notes pipeline until the full BundleChangelogsTask integration (Option A) is implemented. The script copies changelog YAML entries from docs/changelog/ to the ES repo's docs/changelog/ with a 'ml-cpp-' filename prefix to avoid PR number collisions. Supports: - --dry-run to preview what would be exported - --target to specify the ES docs/changelog/ directory - --create-pr to automatically create a PR in the ES repo - --prune to delete source entries after a successful release - --version to label the export with a version number Made-with: Cursor --- dev-tools/export_changelogs.py | 187 +++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100755 dev-tools/export_changelogs.py diff --git a/dev-tools/export_changelogs.py b/dev-tools/export_changelogs.py new file mode 100755 index 0000000000..20e52c37fb --- /dev/null +++ b/dev-tools/export_changelogs.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +""" +Export ml-cpp changelog entries for inclusion in Elasticsearch release notes. + +Copies changelog YAML files from docs/changelog/ to a target directory +(typically elastic/elasticsearch's docs/changelog/) with a 'ml-cpp-' filename +prefix to avoid PR number collisions with ES-native entries. + +Usage: + # Preview what would be exported + python3 dev-tools/export_changelogs.py --dry-run + + # Export to a local ES checkout + python3 dev-tools/export_changelogs.py --target ~/src/elasticsearch/docs/changelog + + # Export and create a PR in the ES repo + python3 dev-tools/export_changelogs.py --target ~/src/elasticsearch/docs/changelog --create-pr + + # Export specific files only + python3 dev-tools/export_changelogs.py --target /tmp/out docs/changelog/3008.yaml +""" + +import argparse +import shutil +import subprocess +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + print("Missing pyyaml. Install with: pip3 install pyyaml", file=sys.stderr) + sys.exit(2) + + +PREFIX = "ml-cpp-" + + +def collect_entries(changelog_dir, specific_files=None): + """Collect changelog YAML files, returning (source_path, target_name, data) tuples.""" + if specific_files: + paths = [Path(f) for f in specific_files] + else: + paths = sorted(changelog_dir.glob("*.yaml")) + + entries = [] + for path in paths: + if not path.exists(): + print(f"Warning: {path} not found, skipping", file=sys.stderr) + continue + with open(path) as f: + data = yaml.safe_load(f) + if not data or not isinstance(data, dict): + continue + + target_name = PREFIX + path.name + entries.append((path, target_name, data)) + + return entries + + +def export_entries(entries, target_dir, dry_run=False): + """Copy entries to the target directory with prefixed filenames.""" + target = Path(target_dir) + if not dry_run and not target.is_dir(): + print(f"Error: target directory {target} does not exist", file=sys.stderr) + sys.exit(1) + + for source_path, target_name, data in entries: + dest = target / target_name + pr = data.get("pr", "n/a") + summary = data.get("summary", "")[:60] + if dry_run: + print(f" {target_name} (PR #{pr}: {summary})") + else: + shutil.copy2(source_path, dest) + print(f" Copied {source_path.name} -> {dest}") + + return [target / name for _, name, _ in entries] + + +def create_pr(es_repo_dir, exported_files, version=None): + """Create a git branch and PR in the ES repo with the exported entries.""" + es_repo = Path(es_repo_dir).resolve() + branch_name = f"ml-cpp-changelog-export" + if version: + branch_name += f"-{version}" + + try: + subprocess.run(["git", "checkout", "-b", branch_name], cwd=es_repo, check=True) + subprocess.run(["git", "add"] + [str(f) for f in exported_files], cwd=es_repo, check=True) + + msg = "[ML] Add ml-cpp changelog entries" + if version: + msg += f" for {version}" + subprocess.run(["git", "commit", "-m", msg], cwd=es_repo, check=True) + subprocess.run(["git", "push", "-u", "origin", branch_name], cwd=es_repo, check=True) + + pr_body = f"Adds ml-cpp changelog entries to the ES release notes.\n\nSource: elastic/ml-cpp docs/changelog/" + if version: + pr_body += f"\nVersion: {version}" + result = subprocess.run( + ["gh", "pr", "create", "--title", msg, "--body", pr_body], + cwd=es_repo, capture_output=True, text=True + ) + if result.returncode == 0: + print(f"\nPR created: {result.stdout.strip()}") + else: + print(f"\nFailed to create PR: {result.stderr}", file=sys.stderr) + sys.exit(1) + except subprocess.CalledProcessError as e: + print(f"Git error: {e}", file=sys.stderr) + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Export ml-cpp changelog entries for ES release notes" + ) + parser.add_argument( + "--target", + help="Target directory (e.g. ~/src/elasticsearch/docs/changelog)", + ) + parser.add_argument( + "--dir", + default=None, + help="Source changelog directory (default: docs/changelog/)", + ) + parser.add_argument( + "--version", + default=None, + help="Version label (used in PR title/branch if --create-pr)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be exported without copying files", + ) + parser.add_argument( + "--create-pr", + action="store_true", + help="Create a PR in the ES repo (requires --target to be inside an ES checkout)", + ) + parser.add_argument( + "--prune", + action="store_true", + help="Delete source YAML files after successful export (use after release)", + ) + parser.add_argument( + "files", + nargs="*", + help="Specific changelog files to export (default: all *.yaml in --dir)", + ) + args = parser.parse_args() + + repo_root = Path(__file__).resolve().parent.parent + changelog_dir = Path(args.dir) if args.dir else repo_root / "docs" / "changelog" + + entries = collect_entries(changelog_dir, args.files if args.files else None) + if not entries: + print("No changelog entries found.") + return + + print(f"Found {len(entries)} changelog entry(ies):\n") + + if args.dry_run or not args.target: + export_entries(entries, args.target or "/dev/null", dry_run=True) + if not args.target: + print("\nUse --target to export, or --dry-run to preview.") + return + + exported = export_entries(entries, args.target) + print(f"\nExported {len(exported)} file(s) to {args.target}") + + if args.create_pr: + es_repo_dir = Path(args.target).resolve().parent.parent + create_pr(es_repo_dir, exported, args.version) + + if args.prune: + for source_path, _, _ in entries: + source_path.unlink() + print(f" Pruned {source_path}") + print(f"\nPruned {len(entries)} source file(s)") + + +if __name__ == "__main__": + main() From 0e2cf597a6593b9d8ea4e5241dedab2f04913b2e Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 9 Apr 2026 14:41:27 +1200 Subject: [PATCH 7/8] [ML] Harden export script with validation and conflict handling - Validate all entries against the JSON schema before exporting - Verify the target directory is inside an ES checkout (checks for build.gradle, settings.gradle, and docs/changelog/) - Detect pre-existing files at the destination: - Identical files are silently skipped - Different files show a unified diff and prompt the user to overwrite, skip, or abort the entire export - Use the verified ES repo root for --create-pr instead of fragile parent-of-parent path assumption Made-with: Cursor --- dev-tools/export_changelogs.py | 149 ++++++++++++++++++++++++++++++--- 1 file changed, 136 insertions(+), 13 deletions(-) diff --git a/dev-tools/export_changelogs.py b/dev-tools/export_changelogs.py index 20e52c37fb..c767c818b7 100755 --- a/dev-tools/export_changelogs.py +++ b/dev-tools/export_changelogs.py @@ -21,6 +21,8 @@ """ import argparse +import difflib +import json import shutil import subprocess import sys @@ -32,10 +34,30 @@ print("Missing pyyaml. Install with: pip3 install pyyaml", file=sys.stderr) sys.exit(2) +try: + import jsonschema +except ImportError: + print("Missing jsonschema. Install with: pip3 install jsonschema", file=sys.stderr) + sys.exit(2) + PREFIX = "ml-cpp-" +def validate_entries(entries, schema_path): + """Validate all entries against the JSON schema. Returns list of errors.""" + with open(schema_path) as f: + schema = json.load(f) + + validator = jsonschema.Draft7Validator(schema) + errors = [] + for source_path, _, data in entries: + for error in validator.iter_errors(data): + path = ".".join(str(p) for p in error.absolute_path) or "(root)" + errors.append(f"{source_path.name}: {path}: {error.message}") + return errors + + def collect_entries(changelog_dir, specific_files=None): """Collect changelog YAML files, returning (source_path, target_name, data) tuples.""" if specific_files: @@ -59,30 +81,104 @@ def collect_entries(changelog_dir, specific_files=None): return entries +def resolve_conflict(source_path, dest, target_name): + """Handle a pre-existing file at the destination. Returns the action taken.""" + source_lines = source_path.read_text().splitlines(keepends=True) + dest_lines = dest.read_text().splitlines(keepends=True) + + if source_lines == dest_lines: + print(f" {target_name}: identical to existing file, skipping") + return "skip" + + print(f"\n {target_name}: file already exists with different content.\n") + diff = difflib.unified_diff( + dest_lines, source_lines, + fromfile=f"existing: {dest.name}", + tofile=f"incoming: {source_path.name}", + ) + sys.stdout.writelines(" " + line for line in diff) + print() + + while True: + choice = input(f" [{target_name}] (o)verwrite / (s)kip / (a)bort export? ").strip().lower() + if choice in ("o", "overwrite"): + shutil.copy2(source_path, dest) + print(f" {target_name}: overwritten") + return "overwrite" + elif choice in ("s", "skip"): + print(f" {target_name}: skipped") + return "skip" + elif choice in ("a", "abort"): + print("\nExport aborted.") + sys.exit(1) + else: + print(" Please enter 'o' (overwrite), 's' (skip), or 'a' (abort).") + + +def verify_es_repo(target_dir): + """Verify that the target looks like an ES docs/changelog directory.""" + target = Path(target_dir).resolve() + + if not target.is_dir(): + print(f"Error: target directory does not exist: {target}", file=sys.stderr) + sys.exit(1) + + es_repo_root = target.parent.parent + markers = [ + es_repo_root / "build.gradle", + es_repo_root / "settings.gradle", + es_repo_root / "docs" / "changelog", + ] + if not all(m.exists() for m in markers): + print( + f"Warning: {es_repo_root} does not look like an Elasticsearch checkout.\n" + f" Expected to find build.gradle, settings.gradle, and docs/changelog/\n" + f" at the repo root (two levels above --target).\n", + file=sys.stderr, + ) + choice = input(" Continue anyway? (y/n) ").strip().lower() + if choice not in ("y", "yes"): + print("Export aborted.") + sys.exit(1) + + return es_repo_root + + def export_entries(entries, target_dir, dry_run=False): """Copy entries to the target directory with prefixed filenames.""" target = Path(target_dir) - if not dry_run and not target.is_dir(): - print(f"Error: target directory {target} does not exist", file=sys.stderr) - sys.exit(1) + exported = [] + skipped = 0 for source_path, target_name, data in entries: dest = target / target_name pr = data.get("pr", "n/a") summary = data.get("summary", "")[:60] if dry_run: - print(f" {target_name} (PR #{pr}: {summary})") + flag = " [EXISTS]" if dest.exists() else "" + print(f" {target_name} (PR #{pr}: {summary}){flag}") + exported.append(dest) + elif dest.exists(): + action = resolve_conflict(source_path, dest, target_name) + if action == "overwrite": + exported.append(dest) + else: + skipped += 1 else: shutil.copy2(source_path, dest) - print(f" Copied {source_path.name} -> {dest}") + print(f" Copied {source_path.name} -> {target_name}") + exported.append(dest) - return [target / name for _, name, _ in entries] + if skipped > 0 and not dry_run: + print(f"\n ({skipped} file(s) skipped due to conflicts)") + + return exported def create_pr(es_repo_dir, exported_files, version=None): """Create a git branch and PR in the ES repo with the exported entries.""" es_repo = Path(es_repo_dir).resolve() - branch_name = f"ml-cpp-changelog-export" + branch_name = "ml-cpp-changelog-export" if version: branch_name += f"-{version}" @@ -96,12 +192,15 @@ def create_pr(es_repo_dir, exported_files, version=None): subprocess.run(["git", "commit", "-m", msg], cwd=es_repo, check=True) subprocess.run(["git", "push", "-u", "origin", branch_name], cwd=es_repo, check=True) - pr_body = f"Adds ml-cpp changelog entries to the ES release notes.\n\nSource: elastic/ml-cpp docs/changelog/" + pr_body = ( + "Adds ml-cpp changelog entries to the ES release notes.\n\n" + "Source: elastic/ml-cpp docs/changelog/" + ) if version: pr_body += f"\nVersion: {version}" result = subprocess.run( ["gh", "pr", "create", "--title", msg, "--body", pr_body], - cwd=es_repo, capture_output=True, text=True + cwd=es_repo, capture_output=True, text=True, ) if result.returncode == 0: print(f"\nPR created: {result.stdout.strip()}") @@ -115,7 +214,7 @@ def create_pr(es_repo_dir, exported_files, version=None): def main(): parser = argparse.ArgumentParser( - description="Export ml-cpp changelog entries for ES release notes" + description="Export ml-cpp changelog entries for ES release notes", ) parser.add_argument( "--target", @@ -155,13 +254,31 @@ def main(): repo_root = Path(__file__).resolve().parent.parent changelog_dir = Path(args.dir) if args.dir else repo_root / "docs" / "changelog" + schema_path = repo_root / "docs" / "changelog" / "changelog-schema.json" entries = collect_entries(changelog_dir, args.files if args.files else None) if not entries: print("No changelog entries found.") return - print(f"Found {len(entries)} changelog entry(ies):\n") + print(f"Found {len(entries)} changelog entry(ies).") + + # Validate all entries before exporting + if schema_path.exists(): + print("Validating entries against schema... ", end="", flush=True) + errors = validate_entries(entries, schema_path) + if errors: + print(f"FAILED ({len(errors)} error(s)):\n") + for error in errors: + print(f" - {error}") + print("\nFix validation errors before exporting.") + sys.exit(1) + print("OK") + else: + print(f"Warning: schema not found at {schema_path}, skipping validation", + file=sys.stderr) + + print() if args.dry_run or not args.target: export_entries(entries, args.target or "/dev/null", dry_run=True) @@ -169,12 +286,18 @@ def main(): print("\nUse --target to export, or --dry-run to preview.") return + # Verify the target is a real ES checkout + es_repo_root = verify_es_repo(args.target) + exported = export_entries(entries, args.target) + if not exported: + print("\nNo files exported.") + return + print(f"\nExported {len(exported)} file(s) to {args.target}") if args.create_pr: - es_repo_dir = Path(args.target).resolve().parent.parent - create_pr(es_repo_dir, exported, args.version) + create_pr(es_repo_root, exported, args.version) if args.prune: for source_path, _, _ in entries: From 84a21dacaf64ca0b88bdfe8cf57d26f19f106084 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 9 Apr 2026 15:01:32 +1200 Subject: [PATCH 8/8] [ML] Add source_repo field to schema and export script Adds the optional source_repo field to the changelog schema, matching the corresponding change in the Elasticsearch repo. This field tells the ES release notes generator which GitHub repo to use for PR links. The export script now injects source_repo: elastic/ml-cpp into exported entries automatically, so they link correctly in the ES release notes. Made-with: Cursor --- dev-tools/export_changelogs.py | 18 ++++++++++++++---- docs/changelog/changelog-schema.json | 4 ++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dev-tools/export_changelogs.py b/dev-tools/export_changelogs.py index c767c818b7..5d264eceeb 100755 --- a/dev-tools/export_changelogs.py +++ b/dev-tools/export_changelogs.py @@ -23,7 +23,6 @@ import argparse import difflib import json -import shutil import subprocess import sys from pathlib import Path @@ -42,6 +41,7 @@ PREFIX = "ml-cpp-" +SOURCE_REPO = "elastic/ml-cpp" def validate_entries(entries, schema_path): @@ -102,7 +102,7 @@ def resolve_conflict(source_path, dest, target_name): while True: choice = input(f" [{target_name}] (o)verwrite / (s)kip / (a)bort export? ").strip().lower() if choice in ("o", "overwrite"): - shutil.copy2(source_path, dest) + write_entry_with_source_repo(source_path, dest) print(f" {target_name}: overwritten") return "overwrite" elif choice in ("s", "skip"): @@ -144,8 +144,18 @@ def verify_es_repo(target_dir): return es_repo_root +def write_entry_with_source_repo(source_path, dest): + """Write a changelog entry to dest, injecting source_repo if not already present.""" + with open(source_path) as f: + data = yaml.safe_load(f) + if "source_repo" not in data: + data["source_repo"] = SOURCE_REPO + with open(dest, "w") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False) + + def export_entries(entries, target_dir, dry_run=False): - """Copy entries to the target directory with prefixed filenames.""" + """Export entries to the target directory with prefixed filenames and source_repo.""" target = Path(target_dir) exported = [] @@ -165,7 +175,7 @@ def export_entries(entries, target_dir, dry_run=False): else: skipped += 1 else: - shutil.copy2(source_path, dest) + write_entry_with_source_repo(source_path, dest) print(f" Copied {source_path.name} -> {target_name}") exported.append(dest) diff --git a/docs/changelog/changelog-schema.json b/docs/changelog/changelog-schema.json index 1995103dd1..2777aedb4a 100644 --- a/docs/changelog/changelog-schema.json +++ b/docs/changelog/changelog-schema.json @@ -130,6 +130,10 @@ }, "deprecation": { "$ref": "#/definitions/CompatibilityChange" + }, + "source_repo": { + "type": "string", + "description": "GitHub repository (owner/name) for entries from external repos, e.g. elastic/ml-cpp. Defaults to elastic/elasticsearch when absent." } }, "required": [