From 24528211be7a88c16157ef4d16e2dfc01982ca0d Mon Sep 17 00:00:00 2001 From: James Darby Date: Sun, 8 Mar 2026 15:52:01 +0000 Subject: [PATCH 01/12] initial file structure --- .../compression/analyse_compression.py | 444 ++++++++++++++++++ .../physicality/compression/metrics.yml | 26 + .../compression/app_compression.py | 260 ++++++++++ .../compression/calc_compression.py | 327 +++++++++++++ .../physicality/compression/data/README.md | 11 + .../physicality/diatomics/calc_diatomics.py | 2 +- 6 files changed, 1069 insertions(+), 1 deletion(-) create mode 100644 ml_peg/analysis/physicality/compression/analyse_compression.py create mode 100644 ml_peg/analysis/physicality/compression/metrics.yml create mode 100644 ml_peg/app/physicality/compression/app_compression.py create mode 100644 ml_peg/calcs/physicality/compression/calc_compression.py create mode 100644 ml_peg/calcs/physicality/compression/data/README.md diff --git a/ml_peg/analysis/physicality/compression/analyse_compression.py b/ml_peg/analysis/physicality/compression/analyse_compression.py new file mode 100644 index 000000000..6dbb6e5c6 --- /dev/null +++ b/ml_peg/analysis/physicality/compression/analyse_compression.py @@ -0,0 +1,444 @@ +"""Analyse compression benchmark.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest +from scipy.signal import find_peaks + +from ml_peg.analysis.utils.decorators import build_table +from ml_peg.analysis.utils.utils import load_metrics_config +from ml_peg.app import APP_ROOT +from ml_peg.calcs import CALCS_ROOT +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +MODELS = get_model_names(current_models) +CALC_PATH = CALCS_ROOT / "physicality" / "compression" / "outputs" +OUT_PATH = APP_ROOT / "data" / "physicality" / "compression" +CURVE_PATH = OUT_PATH / "curves" + +METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") 
+DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, _ = load_metrics_config(METRICS_CONFIG_PATH) + + +def load_model_data(model_name: str) -> pd.DataFrame: + """ + Load compression curve data for a model. + + Parameters + ---------- + model_name + Model identifier. + + Returns + ------- + pd.DataFrame + Dataframe containing structure, scale, volume, energy, and pressure columns. + """ + csv_path = CALC_PATH / model_name / "compression.csv" + if not csv_path.exists(): + return pd.DataFrame() + return pd.read_csv(csv_path) + + +def prepare_structure_series( + struct_dataframe: pd.DataFrame, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Sort and align energy/pressure series for a crystal structure. + + Parameters + ---------- + struct_dataframe + Structure-specific dataframe. + + Returns + ------- + tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray] + Volume per atom, shifted energy per atom, pressure, and scale factors + sorted by increasing volume. + """ + df_sorted = struct_dataframe.sort_values("volume_per_atom").drop_duplicates( + "volume_per_atom" + ) + if len(df_sorted) < 3: + return np.array([]), np.array([]), np.array([]), np.array([]) + + volumes = df_sorted["volume_per_atom"].to_numpy() + energies = df_sorted["energy_per_atom"].to_numpy() + pressures = df_sorted["pressure"].to_numpy() + scales = df_sorted["scale"].to_numpy() + + # Shift energies so the equilibrium value (scale closest to 1.0) is zero + eq_idx = int(np.argmin(np.abs(scales - 1.0))) + shifted_energies = energies - energies[eq_idx] + + return volumes, shifted_energies, pressures, scales + + +def count_sign_changes(array: np.ndarray, tol: float) -> int: + """ + Count sign changes in a sequence while ignoring small magnitudes. + + Parameters + ---------- + array + Input values. + tol + Absolute tolerance below which values are treated as zero. + + Returns + ------- + int + Number of sign changes exceeding the specified tolerance. 
+ """ + if array.size < 3: + return 0 + clipped = array[np.abs(array) > tol] + if clipped.size < 2: + return 0 + signs = np.sign(clipped) + sign_flips = signs[:-1] != signs[1:] + return int(np.sum(sign_flips)) + + +def compute_structure_metrics( + df_struct: pd.DataFrame, +) -> dict[str, float] | None: + """ + Compute diagnostics for a single crystal structure compression curve. + + Parameters + ---------- + df_struct + Structure-specific dataframe. + + Returns + ------- + dict[str, float] | None + Dictionary of metrics, or None if insufficient data. + """ + volumes, shifted_energies, pressures, scales = prepare_structure_series(df_struct) + if volumes.size < 3: + return None + + energy_gradient = np.gradient(shifted_energies, volumes) + energy_curvature = np.gradient(energy_gradient, volumes) + + # Energy minima: find peaks in inverted energy + minima = 0 + if shifted_energies.size >= 3: + minima_indices, _ = find_peaks( + -shifted_energies, + prominence=0.001, + width=1, + ) + minima = len(minima_indices) + + inflections = count_sign_changes(energy_curvature, tol=0.01) + + # Pressure sign flips + pressure_flips = count_sign_changes(pressures, tol=1e-4) + + # Spearman correlations in compressed and expanded regimes + spearman_compression = np.nan + spearman_expansion = np.nan + + try: + from scipy import stats + + eq_idx = int(np.argmin(shifted_energies)) + + # Compressed regime: volumes smaller than equilibrium + # Energy should increase as volume decreases (positive correlation + # between energy and volume in the compressed region means energy + # rises when volume shrinks — but since volumes are sorted ascending, + # the compressed side has *lower* volumes with *higher* energies). 
+ if volumes[:eq_idx].size > 2: + spearman_compression = float( + stats.spearmanr( + volumes[: eq_idx + 1], shifted_energies[: eq_idx + 1] + ).statistic + ) + # Expanded regime: volumes larger than equilibrium + if volumes[eq_idx:].size > 2: + spearman_expansion = float( + stats.spearmanr( + volumes[eq_idx:], shifted_energies[eq_idx:] + ).statistic + ) + except Exception: + pass + + return { + "Energy minima": float(minima), + "Energy inflections": float(inflections), + "Pressure sign flips": float(pressure_flips), + "ρ(E, compression)": float(spearman_compression), + "ρ(E, expansion)": float(spearman_expansion), + } + + +def aggregate_model_metrics( + model_dataframe: pd.DataFrame, +) -> dict[str, float]: + """ + Aggregate metrics across all crystal structures for a model. + + Parameters + ---------- + model_dataframe + Per-model compression dataset. + + Returns + ------- + dict[str, float] + Aggregated model metrics (averaged across all structures). + """ + if model_dataframe.empty: + return {} + + structure_metrics: list[dict[str, float]] = [] + + for _struct, struct_dataframe in model_dataframe.groupby("structure"): + metrics = compute_structure_metrics(struct_dataframe) + if metrics is None: + continue + structure_metrics.append(metrics) + + if not structure_metrics: + return {} + + return { + key: float( + np.nanmean([m.get(key, np.nan) for m in structure_metrics]) + ) + for key in DEFAULT_THRESHOLDS.keys() + } + + +# Helper to load per-model data ----------------------------------------------- + + +def _load_structure_data() -> dict[str, pd.DataFrame]: + """ + Load per-model compression curve dataframes from calculator outputs. + + Returns + ------- + dict[str, pd.DataFrame] + Mapping of model name to curve dataframe. 
+ """ + structure_data: dict[str, pd.DataFrame] = {} + for model_name in MODELS: + model_dataframe = load_model_data(model_name) + if not model_dataframe.empty: + structure_data[model_name] = model_dataframe + return structure_data + + +def _write_curve_payloads( + curve_dir: Path, model_name: str, frame: pd.DataFrame +) -> None: + """ + Serialise per-structure JSON payloads for the Dash app callbacks. + + Parameters + ---------- + curve_dir + Base directory for curve JSON files. + model_name + Name of the model. + frame + Dataframe with all structure data for this model. + """ + model_curve_dir = curve_dir / model_name + model_curve_dir.mkdir(parents=True, exist_ok=True) + + for struct_label, group in frame.groupby("structure"): + ordered = group.sort_values("volume_per_atom").drop_duplicates( + "volume_per_atom" + ) + volumes = ordered["volume_per_atom"].tolist() + energies = ordered["energy_per_atom"].tolist() + pressures = ordered["pressure"].tolist() + scales = ordered["scale"].tolist() + + # Compute dE/dV (numerical derivative of energy per atom w.r.t. volume) + vol_arr = np.array(volumes) + eng_arr = np.array(energies) + if vol_arr.size >= 2: + de_dv = np.gradient(eng_arr, vol_arr).tolist() + else: + de_dv = [0.0] * len(volumes) + + payload = { + "structure": str(struct_label), + "volume_per_atom": volumes, + "energy_per_atom": energies, + "pressure": pressures, + "scale": scales, + "dEdV": de_dv, + } + filepath = model_curve_dir / f"{struct_label}.json" + with filepath.open("w", encoding="utf8") as fh: + json.dump(payload, fh) + + +def persist_compression_data() -> dict[str, pd.DataFrame]: + """ + Persist curve payloads and return per-model dataframes. + + Returns + ------- + dict[str, pd.DataFrame] + Mapping of model name to per-structure curve data. 
+ """ + data = _load_structure_data() + CURVE_PATH.mkdir(parents=True, exist_ok=True) + for model_name, frame in data.items(): + if frame is not None and not frame.empty: + _write_curve_payloads(CURVE_PATH, model_name, frame) + return data + + +@pytest.fixture +def compression_data_fixture() -> dict[str, pd.DataFrame]: + """ + Load curve data and persist gallery assets for pytest use. + + Returns + ------- + dict[str, pd.DataFrame] + Mapping of model name to per-structure curve data. + """ + return persist_compression_data() + + +def collect_metrics( + structure_data: dict[str, pd.DataFrame] | None = None, +) -> pd.DataFrame: + """ + Gather metrics for all models. + + Metrics are averaged across all crystal structures. + + Parameters + ---------- + structure_data + Optional mapping of model names to curve dataframes. When ``None``, + the data is loaded via ``persist_compression_data``. + + Returns + ------- + pd.DataFrame + Aggregated metrics table (all structures). + """ + metrics_rows: list[dict[str, float | str]] = [] + + OUT_PATH.mkdir(parents=True, exist_ok=True) + + data = ( + structure_data if structure_data is not None else persist_compression_data() + ) + + for model_name, model_dataframe in data.items(): + metrics = aggregate_model_metrics(model_dataframe) + row = {"Model": model_name} | metrics + metrics_rows.append(row) + + columns = ["Model"] + list(DEFAULT_THRESHOLDS.keys()) + + return pd.DataFrame(metrics_rows).reindex(columns=columns) + + +@pytest.fixture +def compression_collection( + compression_data_fixture: dict[str, pd.DataFrame], +) -> pd.DataFrame: + """ + Collect compression metrics across all models. + + Parameters + ---------- + compression_data_fixture + Mapping of model names to curve dataframes generated by the fixture. + + Returns + ------- + pd.DataFrame + Aggregated metrics dataframe. 
+ """ + return collect_metrics(compression_data_fixture) + + +@pytest.fixture +def compression_metrics_dataframe( + compression_collection: pd.DataFrame, +) -> pd.DataFrame: + """ + Provide the aggregated compression metrics dataframe. + + Parameters + ---------- + compression_collection + Metrics dataframe produced by ``collect_metrics``. + + Returns + ------- + pd.DataFrame + Aggregated compression metrics indexed by model. + """ + return compression_collection + + +@pytest.fixture +@build_table( + filename=OUT_PATH / "compression_metrics_table.json", + metric_tooltips=DEFAULT_TOOLTIPS, + thresholds=DEFAULT_THRESHOLDS, + weights=None, +) +def metrics( + compression_metrics_dataframe: pd.DataFrame, +) -> dict[str, dict]: + """ + Compute compression metrics for all models. + + Parameters + ---------- + compression_metrics_dataframe + Aggregated per-model metrics produced by ``collect_metrics``. + + Returns + ------- + dict[str, dict] + Mapping of metric names to per-model results. + """ + metrics_df = compression_metrics_dataframe + metrics_dict: dict[str, dict[str, float | None]] = {} + for column in metrics_df.columns: + if column == "Model": + continue + values = [ + value if pd.notna(value) else None for value in metrics_df[column].tolist() + ] + metrics_dict[column] = dict(zip(metrics_df["Model"], values, strict=False)) + return metrics_dict + + +def test_compression(metrics: dict[str, dict]) -> None: + """ + Run compression analysis. + + Parameters + ---------- + metrics + Benchmark metrics generated by fixtures. 
+ """ + return diff --git a/ml_peg/analysis/physicality/compression/metrics.yml b/ml_peg/analysis/physicality/compression/metrics.yml new file mode 100644 index 000000000..448719524 --- /dev/null +++ b/ml_peg/analysis/physicality/compression/metrics.yml @@ -0,0 +1,26 @@ +metrics: + Energy minima: + good: 1.0 + bad: 5.0 + unit: null + tooltip: Average number of energy-per-atom minima per structure + Energy inflections: + good: 1.0 + bad: 5.0 + unit: null + tooltip: Average number of energy curvature sign changes per structure + Pressure sign flips: + good: 1.0 + bad: 5.0 + unit: null + tooltip: Average number of hydrostatic-pressure sign changes per structure + ρ(E, compression): + good: 1.0 + bad: -1.0 + unit: null + tooltip: Spearman correlation between energy and volume in the compressed regime + ρ(E, expansion): + good: -1.0 + bad: 1.0 + unit: null + tooltip: Spearman correlation between energy and volume in the expanded regime diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py new file mode 100644 index 000000000..363485add --- /dev/null +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -0,0 +1,260 @@ +"""Run compression benchmark app.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from dash import Dash, Input, Output, callback, dcc +from dash.dcc import Loading +from dash.exceptions import PreventUpdate +from dash.html import Div, Label +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +from ml_peg.app import APP_ROOT +from ml_peg.app.base_app import BaseApp +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +# Get all models +MODELS = get_model_names(current_models) +BENCHMARK_NAME = "Compression" +DATA_PATH = APP_ROOT / "data" / "physicality" / "compression" +CURVE_PATH = DATA_PATH / "curves" +DOCS_URL = ( + 
"https://ddmms.github.io/ml-peg/user_guide/benchmarks/" + "physicality.html#compression" +) + + +def _available_structures(model_name: str) -> list[str]: + """ + List structure labels available for a given model. + + Parameters + ---------- + model_name + Selected model identifier. + + Returns + ------- + list[str] + Sorted list of structure labels with stored curve data. + """ + model_dir = CURVE_PATH / model_name + if not model_dir.exists(): + return [] + return sorted(p.stem for p in model_dir.glob("*.json")) + + +def _load_curve(model_name: str, structure: str) -> dict | None: + """ + Load a single curve payload from disk. + + Parameters + ---------- + model_name + Model identifier. + structure + Structure label (filename stem). + + Returns + ------- + dict | None + Curve payload dict, or None if the file is missing / unreadable. + """ + filepath = CURVE_PATH / model_name / f"{structure}.json" + if not filepath.exists(): + return None + try: + with filepath.open(encoding="utf8") as fh: + return json.load(fh) + except Exception: + return None + + +class CompressionApp(BaseApp): + """Compression benchmark app layout and callbacks.""" + + def register_callbacks(self) -> None: + """Register dropdown-driven compression curve callbacks.""" + model_dropdown_id = f"{BENCHMARK_NAME}-model-dropdown" + structure_dropdown_id = f"{BENCHMARK_NAME}-structure-dropdown" + figure_id = f"{BENCHMARK_NAME}-figure" + + @callback( + Output(structure_dropdown_id, "options"), + Output(structure_dropdown_id, "value"), + Input(model_dropdown_id, "value"), + ) + def _update_structure_options(model_name: str): + """ + Populate structure dropdown for the selected model. + + Parameters + ---------- + model_name + Selected model value from the model dropdown. + + Returns + ------- + tuple[list[dict], str | None] + Structure dropdown options and default selection. 
+ """ + if not model_name: + raise PreventUpdate + structures = _available_structures(model_name) + options = [{"label": s, "value": s} for s in structures] + default = structures[0] if structures else None + return options, default + + @callback( + Output(figure_id, "figure"), + Input(model_dropdown_id, "value"), + Input(structure_dropdown_id, "value"), + ) + def _update_figure(model_name: str, structure: str | None): + """ + Render energy-vs-volume and dE/dV curves for the selected structure. + + Parameters + ---------- + model_name + Selected model identifier. + structure + Selected structure label. + + Returns + ------- + go.Figure + Plotly figure with two subplots. + """ + if not model_name or not structure: + raise PreventUpdate + + payload = _load_curve(model_name, structure) + if payload is None: + raise PreventUpdate + + volumes = payload.get("volume_per_atom", []) + energies = payload.get("energy_per_atom", []) + de_dv = payload.get("dEdV", []) + + if not volumes or not energies: + raise PreventUpdate + + fig = make_subplots( + rows=2, + cols=1, + shared_xaxes=True, + vertical_spacing=0.08, + subplot_titles=( + "Energy per atom vs Volume per atom", + "dE/dV vs Volume per atom", + ), + ) + + fig.add_trace( + go.Scatter( + x=volumes, + y=energies, + mode="lines+markers", + name="E/atom", + line={"color": "royalblue"}, + marker={"size": 4}, + ), + row=1, + col=1, + ) + + if de_dv: + fig.add_trace( + go.Scatter( + x=volumes, + y=de_dv, + mode="lines+markers", + name="dE/dV", + line={"color": "firebrick"}, + marker={"size": 4}, + ), + row=2, + col=1, + ) + + fig.update_xaxes(title_text="Volume per atom (ų)", row=2, col=1) + fig.update_yaxes(title_text="Energy per atom (eV)", row=1, col=1) + fig.update_yaxes(title_text="dE/dV (eV/ų)", row=2, col=1) + + fig.update_layout( + title=f"{model_name} — {structure}", + height=700, + showlegend=True, + template="plotly_white", + ) + + return fig + + +def get_app() -> CompressionApp: + """ + Get compression benchmark app 
layout and callback registration. + + Returns + ------- + CompressionApp + Benchmark layout and callback registration. + """ + model_options = [{"label": model, "value": model} for model in MODELS] + default_model = model_options[0]["value"] if model_options else None + + extra_components = [ + Div( + [ + Label("Select model:"), + dcc.Dropdown( + id=f"{BENCHMARK_NAME}-model-dropdown", + options=model_options, + value=default_model, + clearable=False, + style={"width": "300px", "marginBottom": "20px"}, + ), + Label("Select structure:"), + dcc.Dropdown( + id=f"{BENCHMARK_NAME}-structure-dropdown", + options=[], + value=None, + clearable=False, + style={"width": "300px"}, + ), + ], + style={"marginBottom": "20px"}, + ), + Loading( + dcc.Graph( + id=f"{BENCHMARK_NAME}-figure", + style={"height": "700px", "width": "100%", "marginTop": "20px"}, + ), + type="circle", + ), + ] + + return CompressionApp( + name=BENCHMARK_NAME, + description=( + "Uniform crystal compression explorer. Structures are isotropically " + "scaled and the energy per atom, its derivative dE/dV, and stress are " + "recorded. Metrics are averaged across all structures." 
+ ), + docs_url=DOCS_URL, + table_path=DATA_PATH / "compression_metrics_table.json", + extra_components=extra_components, + ) + + +if __name__ == "__main__": + dash_app = Dash(__name__, assets_folder=DATA_PATH.parent.parent) + compression_app = get_app() + dash_app.layout = compression_app.layout + compression_app.register_callbacks() + dash_app.run(port=8056, debug=True) diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py new file mode 100644 index 000000000..9873cc364 --- /dev/null +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -0,0 +1,327 @@ +"""Run calculations for uniform crystal compression benchmark.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from ase import Atoms +from ase.build import bulk, make_supercell +from ase.data import covalent_radii, atomic_numbers +from ase.io import write +import numpy as np +import pandas as pd +import pytest +from tqdm import tqdm + +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) + +# Local directory for calculator outputs +OUT_PATH = Path(__file__).parent / "outputs" +DATA_PATH = Path(__file__).parent / "data" + +# Benchmark configuration +# Compression/expansion factors applied uniformly to the cell volume. +# A factor of 1.0 corresponds to the equilibrium structure. +MIN_SCALE = 0.25 +MAX_SCALE = 3.0 +N_POINTS = 20 + +#ELEMENTS: list[str] = [symbol for symbol in chemical_symbols if symbol] +ELEMENTS: list[str] = ["H", "C", "N", "O"] # limit to common elements for testing +PROTOTYPES: list[str] = ["sc", "bcc","fcc", "hcp", "diamond"] # common crystal prototypes + +def _scale_grid( + min_scale: float, + max_scale: float, + n_points: int, +) -> np.ndarray: + """ + Return a uniformly spaced grid of isotropic scaling factors. 
+ + Each factor multiplies the *linear* cell dimensions so that the + corresponding volume scales as ``factor**3``. + + Parameters + ---------- + min_scale, max_scale + Bounds of the linear scaling grid. + n_points + Number of points in the grid. + + Returns + ------- + np.ndarray + Monotonic array of scaling factors. + """ + return np.linspace(min_scale, max_scale, n_points, dtype=float) + + +def _generate_prototype_structures( + elements: list[str], + prototypes: list[str], +) -> dict[str, Atoms]: + """ + Generate crystal structures from element/prototype combinations using ASE. + + For each element and crystal prototype, ``ase.build.bulk`` is called + with a lattice constant of 1.0. The resulting structure is then + rescaled so that the shortest interatomic distance equals twice the + covalent radius of the element (from ``ase.data.covalent_radii``). + Combinations that ASE cannot build are silently skipped. + + Parameters + ---------- + elements + Chemical symbols to build structures for. + prototypes + Crystal-structure prototypes recognised by ``ase.build.bulk`` + (e.g. ``"fcc"``, ``"bcc"``, ``"hcp"``, ``"diamond"``, ``"sc"``). + + Returns + ------- + dict[str, Atoms] + Mapping of ``"{element}_{prototype}"`` labels to ASE ``Atoms``. 
+ """ + structures: dict[str, Atoms] = {} + for element in elements: + z = atomic_numbers[element] + target_bond = 2.0 * covalent_radii[z] + for proto in prototypes: + label = f"{element}_{proto}" + #try: + atoms = bulk(element, crystalstructure=proto, a=1.0) + + # Find shortest interatomic distance at a=1.0 + sc = make_supercell(atoms, 2 * np.eye(3)) # ensure we have a full unit cell for distance calculation + distances = sc.get_all_distances(mic=True) + np.fill_diagonal(distances, np.inf) + min_dist = float(distances.min()) + + if min_dist <= 0.0: + print(f"Skipping {label}: degenerate geometry at a=1.0") + continue + + # Scale so shortest distance equals the covalent bond length + scale_factor = target_bond / min_dist + atoms.set_cell(atoms.cell * scale_factor, scale_atoms=True) + print("Atoms is", label, atoms) + assert np.all(atoms.pbc), f"Generated non-periodic structure for {label}" + assert atoms.get_volume() > 0.0, f"Generated zero-volume structure for {label}" + + structures[label] = atoms + # except (ValueError, KeyError, RuntimeError) as exc: + # # Not every element/prototype combination is supported + # print(f"Skipping {label}: {exc}") + # continue + return structures + + +def _load_structures(data_path: Path) -> dict[str, Atoms]: + """ + Load reference crystal structures from the data directory. + + The directory is expected to contain individual ``.xyz`` or + ``.extxyz`` files, each holding a single periodic structure with a + cell defined. + + Parameters + ---------- + data_path + Directory containing reference crystal structure files. + + Returns + ------- + dict[str, Atoms] + Mapping of structure label (stem of filename) to ASE ``Atoms``. 
+ """ + from ase.io import read + + structures: dict[str, Atoms] = {} + for ext in ("*.xyz", "*.extxyz", "*.cif"): + for filepath in sorted(data_path.glob(ext)): + atoms = read(filepath) + if not np.any(atoms.pbc): + atoms.pbc = True + structures[filepath.stem] = atoms + return structures + + +def _collect_structures( + elements: list[str], + prototypes: list[str], + data_path: Path, +) -> dict[str, Atoms]: + """ + Build the full set of structures to benchmark. + + Combines prototype-generated structures (one per element/prototype + combination produced by ``ase.build.bulk``) with any additional + structures loaded from files in ``data_path``. + + Parameters + ---------- + elements + Chemical symbols for prototype generation. + prototypes + Crystal prototypes for ``ase.build.bulk``. + data_path + Directory containing additional reference structure files. + + Returns + ------- + dict[str, Atoms] + Combined mapping of structure labels to ASE ``Atoms``. + Prototype labels have the form ``"{element}_{prototype}"``; + file-loaded labels use the filename stem. If a file label + collides with a prototype label, the file-loaded structure + takes precedence. + """ + structures = _generate_prototype_structures(elements, prototypes) + file_structures = _load_structures(data_path) + # File-loaded structures override any prototype with the same label + structures.update(file_structures) + return structures + + +def run_compression(model_name: str, model) -> None: + """ + Evaluate energy-volume compression curves for a single model. + + For every reference crystal structure, the cell is uniformly scaled + over a range of linear factors. At each scale point the energy, + volume (per atom), and stress tensor are recorded. + + Parameters + ---------- + model_name + Name of the model being evaluated. + model + Model wrapper providing ``get_calculator``. 
+ """ + calc = model.get_calculator() + write_dir = OUT_PATH / model_name + write_dir.mkdir(parents=True, exist_ok=True) + traj_dir = write_dir / "compression" + traj_dir.mkdir(parents=True, exist_ok=True) + + + reference_structures = _collect_structures(ELEMENTS, PROTOTYPES, DATA_PATH) + if not reference_structures: + print(f"[{model_name}] No structures generated or found in {DATA_PATH}") + return + + records: list[dict[str, float | str]] = [] + supported_structures: set[str] = set() + failed_structures: dict[str, str] = {} + + scales = _scale_grid(MIN_SCALE, MAX_SCALE, N_POINTS) + + for struct_label, ref_atoms in tqdm( + reference_structures.items(), + desc=f"{model_name} compression", + unit="struct", + ): + try: + ref_cell = ref_atoms.cell.copy() + ref_positions = ref_atoms.get_scaled_positions() + n_atoms = len(ref_atoms) + trajectory: list[Atoms] = [] + + for scale in scales: + atoms = ref_atoms.copy() + # Uniformly scale the cell (isotropic compression/expansion) + atoms.set_cell(ref_cell * scale, scale_atoms=True) + atoms.set_scaled_positions(ref_positions) + + atoms.info.setdefault("charge", 0) + atoms.info.setdefault("spin", 1) + + atoms.calc = calc + + energy = float(atoms.get_potential_energy()) + volume = float(atoms.get_volume()) + volume_per_atom = volume / n_atoms + energy_per_atom = energy / n_atoms + + # Stress tensor (Voigt notation, eV/ų) + try: + stress_voigt = atoms.get_stress(voigt=False) + pressure = -1/3 * np.trace(stress_voigt) # hydrostatic pressure + except Exception: + stress_voigt = np.zeros(6) + pressure = np.nan + + # Store trajectory snapshot + atoms_copy = atoms.copy() + atoms_copy.calc = None + atoms_copy.info.update( + { + "structure": struct_label, + "scale": float(scale), + "energy": energy, + "energy_per_atom": energy_per_atom, + "volume": volume, + "volume_per_atom": volume_per_atom, + "pressure": pressure, + "model": model_name, + } + ) + trajectory.append(atoms_copy) + + records.append( + { + "structure": struct_label, + 
"scale": float(scale), + "energy": energy, + "energy_per_atom": energy_per_atom, + "volume": volume, + "volume_per_atom": volume_per_atom, + "pressure": pressure, + } + ) + + if trajectory: + write( + traj_dir / f"{struct_label}.xyz", + trajectory, + format="extxyz", + ) + supported_structures.add(struct_label) + + except Exception as exc: + failed_structures[struct_label] = str(exc) + print(f"[{model_name}] Skipping {struct_label}: {exc}") + continue + + if records: + df = pd.DataFrame.from_records(records) + df.to_csv(write_dir / "compression.csv", index=False) + + metadata = { + "supported_structures": sorted(supported_structures), + "failed_structures": failed_structures, + "config": { + "min_scale": MIN_SCALE, + "max_scale": MAX_SCALE, + "n_points": N_POINTS, + }, + } + (write_dir / "metadata.json").write_text(json.dumps(metadata, indent=2)) + + +@pytest.mark.slow +@pytest.mark.parametrize("model_name", MODELS) +def test_compression(model_name: str) -> None: + """ + Run compression benchmark for each registered model. + + Parameters + ---------- + model_name + Name of the model to evaluate. + """ + run_compression(model_name, MODELS[model_name]) diff --git a/ml_peg/calcs/physicality/compression/data/README.md b/ml_peg/calcs/physicality/compression/data/README.md new file mode 100644 index 000000000..c2f9155d9 --- /dev/null +++ b/ml_peg/calcs/physicality/compression/data/README.md @@ -0,0 +1,11 @@ +# Compression Benchmark Reference Structures + +Place reference crystal structure files here (``.xyz``, ``.extxyz``, or ``.cif`` format). + +Each file should contain a single periodic structure with a unit cell defined. +The filename stem is used as the structure label in results and plots. 
+ +Example files: +- ``NaCl.xyz`` +- ``Si_diamond.cif`` +- ``Cu_fcc.extxyz`` diff --git a/ml_peg/calcs/physicality/diatomics/calc_diatomics.py b/ml_peg/calcs/physicality/diatomics/calc_diatomics.py index 84c06752a..b3182e7d6 100644 --- a/ml_peg/calcs/physicality/diatomics/calc_diatomics.py +++ b/ml_peg/calcs/physicality/diatomics/calc_diatomics.py @@ -25,7 +25,7 @@ # Benchmark configuration (matches historical benchmark settings) ELEMENTS: list[str] = [symbol for symbol in chemical_symbols if symbol] -INCLUDE_HETERONUCLEAR = True +INCLUDE_HETERONUCLEAR = False MIN_DISTANCE = 0.18 MAX_DISTANCE = 6.0 # for testing, reduce the number of points to e.g. 5 From 4a2776442ee14952506d87b8bf9c447e6bb2f5d4 Mon Sep 17 00:00:00 2001 From: James Darby Date: Sun, 8 Mar 2026 17:59:56 +0000 Subject: [PATCH 02/12] added in ability to generate random structures, uses pyxtal package --- .../compression/calc_compression.py | 360 ++++++++++++++++-- 1 file changed, 323 insertions(+), 37 deletions(-) diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py index 9873cc364..025a3b3b4 100644 --- a/ml_peg/calcs/physicality/compression/calc_compression.py +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -5,10 +5,12 @@ import json from pathlib import Path +import itertools + from ase import Atoms from ase.build import bulk, make_supercell from ase.data import covalent_radii, atomic_numbers -from ase.io import write +from ase.io import read as ase_read, write import numpy as np import pandas as pd import pytest @@ -17,6 +19,8 @@ from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models +import pyxtal #used for generating random crystal structures + MODELS = load_models(current_models) # Local directory for calculator outputs @@ -33,6 +37,8 @@ #ELEMENTS: list[str] = [symbol for symbol in chemical_symbols if symbol] ELEMENTS: list[str] = ["H", "C", "N", "O"] # limit to 
common elements for testing PROTOTYPES: list[str] = ["sc", "bcc","fcc", "hcp", "diamond"] # common crystal prototypes +MAX_ATOMS_PER_CELL = 6 # limit to small cells for testing +RANDOM_STRUCTURES: list[dict[str, int]] = [[1, 10, 10], [2, 10, 10]] # [[Number of elements, number of compositions, repeats per composition], ...] def _scale_grid( min_scale: float, @@ -60,6 +66,62 @@ def _scale_grid( return np.linspace(min_scale, max_scale, n_points, dtype=float) +def _scale_using_isotropic_guess(atoms: Atoms) -> Atoms: + """ + Isotropically scale a structure so nearest-neighbour distances match covalent radii. + + For each pair of atoms the target bond length is the sum of their + covalent radii (from ``ase.data.covalent_radii``). The structure is + uniformly rescaled so that the shortest interatomic distance equals + the smallest such target across all pairs. + + Parameters + ---------- + atoms + Input structure (must be periodic with ``pbc=True``). + + Returns + ------- + Atoms + A copy of *atoms* with the cell and positions rescaled. + + Raises + ------ + ValueError + If the shortest interatomic distance at the current cell is zero + or negative (degenerate geometry). 
+ """ + atoms = atoms.copy() + sc = make_supercell(atoms, 2 * np.eye(3)) + distances = sc.get_all_distances(mic=True) + np.fill_diagonal(distances, np.inf) + + # Build a matrix of target bond lengths (sum of covalent radii per pair) + radii = np.array([covalent_radii[atomic_numbers[s]] for s in sc.get_chemical_symbols()]) + target_matrix = radii[:, None] + radii[None, :] + + # Mask infinite entries (self-distances) + mask = np.isfinite(distances) + if not np.any(mask): + raise ValueError("No finite interatomic distances found") + + # Find the pair with the smallest distance and its corresponding target + ratios = np.full_like(distances, np.inf) + ratios[mask] = target_matrix[mask] / distances[mask] + + # The required scale factor is the ratio for the closest pair + closest_idx = np.unravel_index(np.argmin(distances), distances.shape) + min_dist = distances[closest_idx] + target_bond = target_matrix[closest_idx] + + if min_dist <= 0.0: + raise ValueError("Degenerate geometry: shortest interatomic distance <= 0") + + scale_factor = target_bond / min_dist + atoms.set_cell(atoms.cell * scale_factor, scale_atoms=True) + return atoms + + def _generate_prototype_structures( elements: list[str], prototypes: list[str], @@ -69,9 +131,10 @@ def _generate_prototype_structures( For each element and crystal prototype, ``ase.build.bulk`` is called with a lattice constant of 1.0. The resulting structure is then - rescaled so that the shortest interatomic distance equals twice the - covalent radius of the element (from ``ase.data.covalent_radii``). - Combinations that ASE cannot build are silently skipped. + rescaled via ``_scale_using_isotropic_guess`` so that the shortest + interatomic distance matches the sum of covalent radii for the + closest pair. Combinations that ASE cannot build are silently + skipped. 
Parameters ---------- @@ -88,35 +151,233 @@ def _generate_prototype_structures( """ structures: dict[str, Atoms] = {} for element in elements: - z = atomic_numbers[element] - target_bond = 2.0 * covalent_radii[z] for proto in prototypes: label = f"{element}_{proto}" - #try: - atoms = bulk(element, crystalstructure=proto, a=1.0) + try: + atoms = bulk(element, crystalstructure=proto, a=1.0) + atoms = _scale_using_isotropic_guess(atoms) + + assert np.all(atoms.pbc), f"Generated non-periodic structure for {label}" + assert atoms.get_volume() > 0.0, f"Generated zero-volume structure for {label}" - # Find shortest interatomic distance at a=1.0 - sc = make_supercell(atoms, 2 * np.eye(3)) # ensure we have a full unit cell for distance calculation - distances = sc.get_all_distances(mic=True) - np.fill_diagonal(distances, np.inf) - min_dist = float(distances.min()) + structures[label] = atoms + except (ValueError, KeyError, RuntimeError) as exc: + print(f"Skipping {label}: {exc}") + continue + return structures + + +def _gen_random_structure( + composition: dict[str, int], + space_group: int | None = None, + dim: int = 3, + seed: int | None = None, + max_attempts: int = 230, +) -> Atoms: + """ + Generate a random crystal structure using PyXtal. + + Parameters + ---------- + composition + Mapping of chemical symbol to number of atoms in the + conventional cell (e.g. ``{"C": 12}`` or ``{"Ba": 1, "Ti": 1, "O": 3}``). + space_group + International space-group number (1–230). When ``None`` (default), + a space group is chosen at random. If the chosen space group is + incompatible with the composition, additional random space groups + are tried until one succeeds. + dim + Dimensionality of the crystal (3 for bulk). Default is 3. + seed + Random seed for reproducibility. When ``None`` (default), a + random seed is generated automatically. + max_attempts + Maximum number of space-group attempts before giving up. 
+ + Returns + ------- + Atoms + ASE ``Atoms`` with periodic boundary conditions and a cell + scaled so that nearest-neighbour distances match the sum of + covalent radii. + + Raises + ------ + RuntimeError + If no valid structure could be generated within *max_attempts*. + """ + from pyxtal import pyxtal as PyXtal + + rng = np.random.default_rng(seed) + + species = list(composition.keys()) + num_atoms = list(composition.values()) + + # If a specific space group was requested, try it first + sg_candidates: list[int] = [] + if space_group is not None: + #sg_candidates.append(space_group) + sg_candidates = [space_group] * max_attempts # try the same one repeatedly to allow for randomness in the structure generation - if min_dist <= 0.0: - print(f"Skipping {label}: degenerate geometry at a=1.0") + # Fill remaining attempts with random space groups + while len(sg_candidates) < max_attempts: + sg_candidates.append(int(rng.integers(1, 231))) + + for sg in sg_candidates: + try: + crystal = PyXtal() + crystal.from_random( + dim=dim, + group=sg, + species=species, + numIons=num_atoms, + random_state=int(rng.integers(0, 2**31)), + ) + if not crystal.valid: continue - # Scale so shortest distance equals the covalent bond length - scale_factor = target_bond / min_dist - atoms.set_cell(atoms.cell * scale_factor, scale_atoms=True) - print("Atoms is", label, atoms) - assert np.all(atoms.pbc), f"Generated non-periodic structure for {label}" - assert atoms.get_volume() > 0.0, f"Generated zero-volume structure for {label}" - - structures[label] = atoms - # except (ValueError, KeyError, RuntimeError) as exc: - # # Not every element/prototype combination is supported - # print(f"Skipping {label}: {exc}") - # continue + atoms = crystal.to_ase() + atoms.pbc = True + atoms = _scale_using_isotropic_guess(atoms) + return atoms + except Exception: + continue + + raise RuntimeError( + f"PyXtal failed to generate a valid structure for " + f"composition={composition} after {max_attempts} 
attempts" + ) + + +def _composition_label(composition: dict[str, int]) -> str: + """ + Build a canonical string label for a composition. + + Elements are sorted alphabetically and counts are appended only when + greater than one (e.g. ``{"C": 1, "O": 2}`` → ``"CO2"``). + + Parameters + ---------- + composition + Mapping of element symbol to atom count. + + Returns + ------- + str + Compact composition label. + """ + parts: list[str] = [] + for elem in sorted(composition): + count = composition[elem] + parts.append(f"{elem}{count if count > 1 else ''}") + return "".join(parts) + + +def _generate_all_random( + random_specs: list[list[int]], + elements: list[str], + max_atoms: int, + data_path: Path, + seed: int | None = None, +) -> dict[str, Atoms]: + """ + Generate random crystal structures for multiple compositions. + + For each entry ``[n_elements, n_compositions, repeats]`` in + *random_specs*, *n_compositions* unique compositions are sampled + (drawing *n_elements* species from *elements* and random atom + counts respecting *max_atoms*). For every composition, + ``_gen_random_structure`` is called *repeats* times, producing + structures labelled ``"{composition}_RSS_{i}"``. + + Single-element structures are additionally saved to *data_path* so + they can be reloaded later via ``_load_structures``. + + Parameters + ---------- + random_specs + List of ``[n_elements, n_compositions, repeats]`` triples. + elements + Pool of chemical symbols to draw from. + max_atoms + Maximum total number of atoms in any generated cell. + data_path + Directory where the configs ``.xyz`` files are written. + seed + Master random seed for reproducibility. + + Returns + ------- + dict[str, Atoms] + All successfully generated structures keyed by label. 
+ """ + rng = np.random.default_rng(seed) + structures: dict[str, Atoms] = {} + data_path.mkdir(parents=True, exist_ok=True) + + # Collect frames grouped by number of elements for saving + frames_by_n_elements: dict[int, list[Atoms]] = {} + + for n_elements, n_compositions, repeats in random_specs: + if n_elements > len(elements): + print( + f"Requested {n_elements} elements but only {len(elements)} " + f"available — skipping this spec." + ) + continue + + # Generate unique compositions + generated_compositions: list[dict[str, int]] = [] + seen_labels: set[str] = set() + max_comp_attempts = n_compositions * 50 # safety limit + attempts = 0 + + while len(generated_compositions) < n_compositions and attempts < max_comp_attempts: + attempts += 1 + chosen_elements = sorted( + rng.choice(elements, size=n_elements, replace=False).tolist() + ) + # Random atom counts that sum to at most max_atoms (each >= 1) + counts = rng.integers(1, max(2, max_atoms - n_elements + 2), size=n_elements) + # Clip total to max_atoms + total = int(counts.sum()) + if total > max_atoms: + counts = (counts * max_atoms / total).astype(int) + counts = np.maximum(counts, 1) + + composition = dict(zip(chosen_elements, counts.tolist())) + label = _composition_label(composition) + if label not in seen_labels: + seen_labels.add(label) + generated_compositions.append(composition) + + # Generate structures for each composition + for composition in generated_compositions: + comp_label = _composition_label(composition) + for i in range(repeats): + struct_label = f"{comp_label}_RSS_{i}" + try: + atoms = _gen_random_structure( + composition, + seed=int(rng.integers(0, 2**31)), + space_group=1, # random space group + ) + atoms.info["label"] = struct_label + structures[struct_label] = atoms + + # Collect for saving grouped by number of elements + frames_by_n_elements.setdefault(n_elements, []).append(atoms) + except Exception as exc: + print(f"Failed to generate {struct_label}: {exc}") + continue + + # Save 
structures to data_path as multi-frame xyz, grouped by element count + for n_elem, atom_list in frames_by_n_elements.items(): + filepath = data_path / f"{n_elem}.xyz" + write(filepath, atom_list, format="extxyz") + print(f"Saved {len(atom_list)} structures to {filepath}") + return structures @@ -124,9 +385,9 @@ def _load_structures(data_path: Path) -> dict[str, Atoms]: """ Load reference crystal structures from the data directory. - The directory is expected to contain individual ``.xyz`` or - ``.extxyz`` files, each holding a single periodic structure with a - cell defined. + Each ``.xyz`` / ``.extxyz`` file may contain multiple frames. For + single-frame files the label is the filename stem; for multi-frame + files each frame is labelled ``"{stem}_{index}"``. Parameters ---------- @@ -136,17 +397,31 @@ def _load_structures(data_path: Path) -> dict[str, Atoms]: Returns ------- dict[str, Atoms] - Mapping of structure label (stem of filename) to ASE ``Atoms``. + Mapping of structure label to ASE ``Atoms``. """ - from ase.io import read - structures: dict[str, Atoms] = {} for ext in ("*.xyz", "*.extxyz", "*.cif"): for filepath in sorted(data_path.glob(ext)): - atoms = read(filepath) - if not np.any(atoms.pbc): - atoms.pbc = True - structures[filepath.stem] = atoms + try: + frames = ase_read(filepath, index=":") + except Exception: + continue + if not isinstance(frames, list): + frames = [frames] + stem = filepath.stem + if len(frames) == 1: + atoms = frames[0] + # Prefer the stored label if present + label = atoms.info.get("label", stem) + if not np.any(atoms.pbc): + atoms.pbc = True + structures[label] = atoms + else: + for idx, atoms in enumerate(frames): + label = atoms.info.get("label", f"{stem}_{idx}") + if not np.any(atoms.pbc): + atoms.pbc = True + structures[label] = atoms return structures @@ -325,3 +600,14 @@ def test_compression(model_name: str) -> None: Name of the model to evaluate. 
""" run_compression(model_name, MODELS[model_name]) + + +if __name__ == "__main__": + #generate the random structures and save to data directory (only needs to be done once) + _generate_all_random( + RANDOM_STRUCTURES, + ELEMENTS, + MAX_ATOMS_PER_CELL, + DATA_PATH, + seed=42, + ) \ No newline at end of file From 21b691e9334c200d7c237de4838c2bc417ebbda4 Mon Sep 17 00:00:00 2001 From: James Darby Date: Sun, 8 Mar 2026 18:35:18 +0000 Subject: [PATCH 03/12] tweaking settings to get sensible random structures --- .../compression/calc_compression.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py index 025a3b3b4..981b274cb 100644 --- a/ml_peg/calcs/physicality/compression/calc_compression.py +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -20,6 +20,7 @@ from ml_peg.models.models import current_models import pyxtal #used for generating random crystal structures +from pyxtal.tolerance import Tol_matrix MODELS = load_models(current_models) @@ -173,6 +174,7 @@ def _gen_random_structure( dim: int = 3, seed: int | None = None, max_attempts: int = 230, + volume_factor: float = 0.35, ) -> Atoms: """ Generate a random crystal structure using PyXtal. 
@@ -224,26 +226,30 @@ def _gen_random_structure( while len(sg_candidates) < max_attempts: sg_candidates.append(int(rng.integers(1, 231))) + custom_tol = Tol_matrix(prototype="atomic", factor=1.0) for sg in sg_candidates: - try: - crystal = PyXtal() - crystal.from_random( - dim=dim, - group=sg, - species=species, - numIons=num_atoms, - random_state=int(rng.integers(0, 2**31)), - ) - if not crystal.valid: - continue - - atoms = crystal.to_ase() - atoms.pbc = True - atoms = _scale_using_isotropic_guess(atoms) - return atoms - except Exception: + crystal = PyXtal() + crystal.from_random( + dim=dim, + group=sg, + species=species, + numIons=num_atoms, + random_state=int(rng.integers(0, 2**31)), + tm=custom_tol, + max_count=100000, + factor=volume_factor + ) + if not crystal.valid: continue + atoms = crystal.to_ase() + atoms.pbc = True + atoms = _scale_using_isotropic_guess(atoms) + + return atoms + # except Exception: + # continue + raise RuntimeError( f"PyXtal failed to generate a valid structure for " f"composition={composition} after {max_attempts} attempts" @@ -351,7 +357,8 @@ def _generate_all_random( if label not in seen_labels: seen_labels.add(label) generated_compositions.append(composition) - + + print(f"Generated {len(generated_compositions)} unique compositions with {n_elements} elements: {generated_compositions}") # Generate structures for each composition for composition in generated_compositions: comp_label = _composition_label(composition) From 91b7f4f3eaea84b5fb4f39cf1b5589372c273e9c Mon Sep 17 00:00:00 2001 From: James Darby Date: Sun, 8 Mar 2026 19:27:58 +0000 Subject: [PATCH 04/12] pre-plotting --- .../compression/calc_compression.py | 27 ++++++++++++++----- .../physicality/compression/data/README.md | 9 +++---- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py index 981b274cb..d0aad956d 100644 --- 
a/ml_peg/calcs/physicality/compression/calc_compression.py +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -39,7 +39,7 @@ ELEMENTS: list[str] = ["H", "C", "N", "O"] # limit to common elements for testing PROTOTYPES: list[str] = ["sc", "bcc","fcc", "hcp", "diamond"] # common crystal prototypes MAX_ATOMS_PER_CELL = 6 # limit to small cells for testing -RANDOM_STRUCTURES: list[dict[str, int]] = [[1, 10, 10], [2, 10, 10]] # [[Number of elements, number of compositions, repeats per composition], ...] +RANDOM_STRUCTURES: list[dict[str, int]] = [[1, 2*len(ELEMENTS), 5], [2, 5, 10]] # [[Number of elements, number of compositions, repeats per composition], ...] def _scale_grid( min_scale: float, @@ -341,16 +341,29 @@ def _generate_all_random( while len(generated_compositions) < n_compositions and attempts < max_comp_attempts: attempts += 1 - chosen_elements = sorted( - rng.choice(elements, size=n_elements, replace=False).tolist() - ) + if n_elements == 1: + #pick the element which occurs least frequently in the already generated compositions to increase diversity + elem_counts = {elem: 0 for elem in elements} + for comp in generated_compositions: + for elem in comp: + elem_counts[elem] += comp[elem] + least_common = sorted(elem_counts, key=elem_counts.get) + chosen_elements = [least_common[0]] + else: + chosen_elements = sorted( + rng.choice(elements, size=n_elements, replace=False).tolist() + ) # Random atom counts that sum to at most max_atoms (each >= 1) counts = rng.integers(1, max(2, max_atoms - n_elements + 2), size=n_elements) + if len(counts) == 1: + counts = np.random.choice(range(max_atoms//2, max_atoms + 1), size=1) # for single element, just pick a random count between max_atoms//2 and max_atoms + # Clip total to max_atoms total = int(counts.sum()) if total > max_atoms: counts = (counts * max_atoms / total).astype(int) counts = np.maximum(counts, 1) + composition = dict(zip(chosen_elements, counts.tolist())) label = 
_composition_label(composition) @@ -363,7 +376,7 @@ def _generate_all_random( for composition in generated_compositions: comp_label = _composition_label(composition) for i in range(repeats): - struct_label = f"{comp_label}_RSS_{i}" + struct_label = f"{comp_label}_pyxtal_{i}" try: atoms = _gen_random_structure( composition, @@ -610,7 +623,9 @@ def test_compression(model_name: str) -> None: if __name__ == "__main__": - #generate the random structures and save to data directory (only needs to be done once) + # generate the random structures and save to data directory + # this only needs to be done once + # code included here so that the structures can be easily regenerated in the same way if needed, or more can be created etc. _generate_all_random( RANDOM_STRUCTURES, ELEMENTS, diff --git a/ml_peg/calcs/physicality/compression/data/README.md b/ml_peg/calcs/physicality/compression/data/README.md index c2f9155d9..9ba7177e1 100644 --- a/ml_peg/calcs/physicality/compression/data/README.md +++ b/ml_peg/calcs/physicality/compression/data/README.md @@ -2,10 +2,7 @@ Place reference crystal structure files here (``.xyz``, ``.extxyz``, or ``.cif`` format). -Each file should contain a single periodic structure with a unit cell defined. -The filename stem is used as the structure label in results and plots. +1.xyz contains random unary structures +2.xyz contains random binary structures etc. 
-Example files: -- ``NaCl.xyz`` -- ``Si_diamond.cif`` -- ``Cu_fcc.extxyz`` +Random structures generated using PyXtal From b2725bc5ba5baf1a17585b4f1b1bf2cb51e3284a Mon Sep 17 00:00:00 2001 From: James Darby Date: Sun, 8 Mar 2026 19:59:57 +0000 Subject: [PATCH 05/12] Claude fiddling with the plots --- .../compression/app_compression.py | 341 ++++++++++++++---- 1 file changed, 272 insertions(+), 69 deletions(-) diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py index 363485add..575f512d3 100644 --- a/ml_peg/app/physicality/compression/app_compression.py +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -3,12 +3,14 @@ from __future__ import annotations import json +import re from pathlib import Path from dash import Dash, Input, Output, callback, dcc from dash.dcc import Loading from dash.exceptions import PreventUpdate from dash.html import Div, Label +import numpy as np import plotly.graph_objects as go from plotly.subplots import make_subplots @@ -27,10 +29,63 @@ "physicality.html#compression" ) +# Regex to strip the _RSS_ suffix to get the composition +_RSS_SUFFIX = re.compile(r"_RSS_\d+$") +# Regex to extract element symbols from a composition string +_ELEMENT_RE = re.compile(r"[A-Z][a-z]?") -def _available_structures(model_name: str) -> list[str]: + +def _composition_from_label(label: str) -> str: + """ + Extract the composition prefix from a structure label. + + Parameters + ---------- + label + Full structure label (e.g. ``"C2H3_RSS_0"``). + + Returns + ------- + str + Composition string (e.g. ``"C2H3"``). If the label has no + ``_RSS_`` suffix it is returned unchanged. + """ + return _RSS_SUFFIX.sub("", label) + + +def _element_group(composition: str) -> tuple[str, ...]: + """ + Extract the sorted tuple of unique element symbols from a composition. + + ``"C2H3"`` → ``("C", "H")``, ``"C3"`` → ``("C",)``. 
+ + Parameters + ---------- + composition + Composition string (no ``_RSS_`` suffix). + + Returns + ------- + tuple[str, ...] + Sorted unique elements. + """ + return tuple(sorted(set(_ELEMENT_RE.findall(composition)))) + + +def _element_group_label(group: tuple[str, ...]) -> str: + """ + Human-readable label for an element group, e.g. ``"C"`` or ``"C-H"``. + """ + return "-".join(group) + + +def _available_element_groups(model_name: str) -> list[tuple[str, ...]]: """ - List structure labels available for a given model. + List unique element groups available for a given model. + + Structures whose compositions share the same *set* of elements are + placed into the same group. For example ``"C"``, ``"C2"``, and + ``"C3"`` all belong to the group ``("C",)``. Parameters ---------- @@ -39,39 +94,147 @@ def _available_structures(model_name: str) -> list[str]: Returns ------- - list[str] - Sorted list of structure labels with stored curve data. + list[tuple[str, ...]] + Sorted list of unique element groups. """ model_dir = CURVE_PATH / model_name if not model_dir.exists(): return [] - return sorted(p.stem for p in model_dir.glob("*.json")) + groups: set[tuple[str, ...]] = set() + for p in model_dir.glob("*.json"): + groups.add(_element_group(_composition_from_label(p.stem))) + return sorted(groups) -def _load_curve(model_name: str, structure: str) -> dict | None: +def _load_curves_for_element_group( + model_name: str, group: tuple[str, ...] +) -> list[tuple[str, dict]]: """ - Load a single curve payload from disk. + Load all curve payloads whose element group matches *group*. Parameters ---------- model_name Model identifier. - structure - Structure label (filename stem). + group + Target element group (e.g. ``("C",)`` or ``("C", "H")``). Returns ------- - dict | None - Curve payload dict, or None if the file is missing / unreadable. + list[tuple[str, dict]] + List of ``(label, payload)`` tuples for every matching structure. 
+ """ + model_dir = CURVE_PATH / model_name + if not model_dir.exists(): + return [] + results: list[tuple[str, dict]] = [] + for p in sorted(model_dir.glob("*.json")): + if _element_group(_composition_from_label(p.stem)) == group: + try: + with p.open(encoding="utf8") as fh: + results.append((p.stem, json.load(fh))) + except Exception: + continue + return results + + +# Fraction of the y-axis occupied by the linear region (±linthresh). +LINEAR_FRAC: float = 0.6 + + +def _symlog( + values: list[float] | np.ndarray, + linthresh: float = 10.0, + linear_frac: float = LINEAR_FRAC, + decades: int = 4, +) -> list[float]: """ - filepath = CURVE_PATH / model_name / f"{structure}.json" - if not filepath.exists(): - return None - try: - with filepath.open(encoding="utf8") as fh: - return json.load(fh) - except Exception: - return None + Apply a symmetric-log transform to *values*. + + Linear within ``[-linthresh, linthresh]``, logarithmic outside. + The linear region is scaled so that it occupies *linear_frac* of the + total axis range (assuming *decades* log decades on each side). + + Parameters + ---------- + values + Raw data values. + linthresh + Linear threshold. + linear_frac + Fraction of the full axis height reserved for the linear region. + decades + Number of log decades shown on each side (used to compute the + compression factor for the logarithmic tails). + + Returns + ------- + list[float] + Transformed values. + """ + lin_half = linear_frac / 2.0 + log_scale = (1.0 - linear_frac) / (2.0 * max(decades, 1)) + + arr = np.asarray(values, dtype=float) + out = np.where( + np.abs(arr) <= linthresh, + lin_half * arr / linthresh, + np.sign(arr) + * (lin_half + log_scale * np.log10(np.abs(arr) / linthresh)), + ) + return out.tolist() + + +def _symlog_ticks( + linthresh: float = 10.0, + decades: int = 4, + linear_frac: float = LINEAR_FRAC, +) -> tuple[list[float], list[str]]: + """ + Generate tick positions and labels for a symlog axis. 
+ + Parameters + ---------- + linthresh + Linear threshold matching ``_symlog``. + decades + Number of decades to show on each side of zero. + linear_frac + Must match the value passed to ``_symlog``. + + Returns + ------- + tuple[list[float], list[str]] + Tick values (in transformed space) and their display labels. + """ + ticks_val: list[float] = [] + ticks_txt: list[str] = [] + + kw = {"linthresh": linthresh, "linear_frac": linear_frac, "decades": decades} + + # Negative decades + for d in range(decades, 0, -1): + raw = -(linthresh * 10 ** d) + ticks_val.append(_symlog([raw], **kw)[0]) + ticks_txt.append(f"{raw:.0e}") + # Linear region + for v in [-linthresh, 0, linthresh]: + ticks_val.append(_symlog([v], **kw)[0]) + ticks_txt.append(f"{v:g}") + # Positive decades + for d in range(1, decades + 1): + raw = linthresh * 10 ** d + ticks_val.append(_symlog([raw], **kw)[0]) + ticks_txt.append(f"{raw:.0e}") + + return ticks_val, ticks_txt + + +# A palette for overlaid RSS curves +_PALETTE = [ + "royalblue", "firebrick", "seagreen", "darkorange", "mediumpurple", + "deeppink", "teal", "goldenrod", "slategray", "crimson", +] class CompressionApp(BaseApp): @@ -80,17 +243,17 @@ class CompressionApp(BaseApp): def register_callbacks(self) -> None: """Register dropdown-driven compression curve callbacks.""" model_dropdown_id = f"{BENCHMARK_NAME}-model-dropdown" - structure_dropdown_id = f"{BENCHMARK_NAME}-structure-dropdown" + composition_dropdown_id = f"{BENCHMARK_NAME}-composition-dropdown" figure_id = f"{BENCHMARK_NAME}-figure" @callback( - Output(structure_dropdown_id, "options"), - Output(structure_dropdown_id, "value"), + Output(composition_dropdown_id, "options"), + Output(composition_dropdown_id, "value"), Input(model_dropdown_id, "value"), ) - def _update_structure_options(model_name: str): + def _update_composition_options(model_name: str): """ - Populate structure dropdown for the selected model. + Populate composition dropdown for the selected model. 
Parameters ---------- @@ -100,49 +263,54 @@ def _update_structure_options(model_name: str): Returns ------- tuple[list[dict], str | None] - Structure dropdown options and default selection. + Composition dropdown options and default selection. """ if not model_name: raise PreventUpdate - structures = _available_structures(model_name) - options = [{"label": s, "value": s} for s in structures] - default = structures[0] if structures else None + groups = _available_element_groups(model_name) + options = [ + {"label": _element_group_label(g), "value": _element_group_label(g)} + for g in groups + ] + default = options[0]["value"] if options else None return options, default @callback( Output(figure_id, "figure"), Input(model_dropdown_id, "value"), - Input(structure_dropdown_id, "value"), + Input(composition_dropdown_id, "value"), ) - def _update_figure(model_name: str, structure: str | None): + def _update_figure(model_name: str, composition: str | None): """ - Render energy-vs-volume and dE/dV curves for the selected structure. + Render energy-per-atom and pressure curves vs linear scale + factor for all structures sharing the selected composition, + using symlog y-axes. Parameters ---------- model_name Selected model identifier. - structure - Selected structure label. + composition + Selected composition label. Returns ------- go.Figure Plotly figure with two subplots. 
""" - if not model_name or not structure: + if not model_name or not composition: raise PreventUpdate - payload = _load_curve(model_name, structure) - if payload is None: + group = tuple(composition.split("-")) + curves = _load_curves_for_element_group(model_name, group) + if not curves: raise PreventUpdate - volumes = payload.get("volume_per_atom", []) - energies = payload.get("energy_per_atom", []) - de_dv = payload.get("dEdV", []) - - if not volumes or not energies: - raise PreventUpdate + linthresh = 10.0 + decades = 4 + tick_vals, tick_text = _symlog_ticks( + linthresh, decades=decades, linear_frac=LINEAR_FRAC, + ) fig = make_subplots( rows=2, @@ -150,44 +318,79 @@ def _update_figure(model_name: str, structure: str | None): shared_xaxes=True, vertical_spacing=0.08, subplot_titles=( - "Energy per atom vs Volume per atom", - "dE/dV vs Volume per atom", + "Energy per atom vs Scale factor", + "Pressure vs Scale factor", ), ) - fig.add_trace( - go.Scatter( - x=volumes, - y=energies, - mode="lines+markers", - name="E/atom", - line={"color": "royalblue"}, - marker={"size": 4}, - ), - row=1, - col=1, - ) + for idx, (label, payload) in enumerate(curves): + scales = payload.get("scale", []) + energies = payload.get("energy_per_atom", []) + pressures = payload.get("pressure", []) + color = _PALETTE[idx % len(_PALETTE)] + # Use the full composition + RSS index as the legend entry + short_label = label + + if not scales or not energies: + continue - if de_dv: fig.add_trace( go.Scatter( - x=volumes, - y=de_dv, + x=scales, + y=_symlog(energies, linthresh, LINEAR_FRAC, decades), mode="lines+markers", - name="dE/dV", - line={"color": "firebrick"}, - marker={"size": 4}, + name=f"E — {short_label}", + line={"color": color}, + marker={"size": 3}, + legendgroup=short_label, + customdata=energies, + hovertemplate=( + "scale: %{x:.3f}
" + "E/atom: %{customdata:.4f} eV" + ), ), - row=2, + row=1, col=1, ) - fig.update_xaxes(title_text="Volume per atom (ų)", row=2, col=1) - fig.update_yaxes(title_text="Energy per atom (eV)", row=1, col=1) - fig.update_yaxes(title_text="dE/dV (eV/ų)", row=2, col=1) + if pressures: + fig.add_trace( + go.Scatter( + x=scales, + y=_symlog(pressures, linthresh, LINEAR_FRAC, decades), + mode="lines+markers", + name=f"P — {short_label}", + line={"color": color}, + marker={"size": 3}, + legendgroup=short_label, + showlegend=False, + customdata=pressures, + hovertemplate=( + "scale: %{x:.3f}
" + "P: %{customdata:.4f} eV/ų" + ), + ), + row=2, + col=1, + ) + + # Apply symlog tick formatting to both y-axes + fig.update_yaxes( + tickvals=tick_vals, + ticktext=tick_text, + title_text="Energy per atom (eV, symlog)", + row=1, col=1, + ) + fig.update_yaxes( + tickvals=tick_vals, + ticktext=tick_text, + title_text="Pressure (eV/ų, symlog)", + row=2, col=1, + ) + fig.update_xaxes(title_text="Scale factor", row=2, col=1) fig.update_layout( - title=f"{model_name} — {structure}", + title=f"{model_name} — {composition}", height=700, showlegend=True, template="plotly_white", @@ -219,9 +422,9 @@ def get_app() -> CompressionApp: clearable=False, style={"width": "300px", "marginBottom": "20px"}, ), - Label("Select structure:"), + Label("Select composition:"), dcc.Dropdown( - id=f"{BENCHMARK_NAME}-structure-dropdown", + id=f"{BENCHMARK_NAME}-composition-dropdown", options=[], value=None, clearable=False, From 0dba915b82629ccacfe12cfb9c493da2fbff7b56 Mon Sep 17 00:00:00 2001 From: James Darby Date: Mon, 9 Mar 2026 21:57:22 +0000 Subject: [PATCH 06/12] manual tweaks to chemicals formula grouping --- .../compression/app_compression.py | 100 +++++------------- .../compression/calc_compression.py | 14 ++- 2 files changed, 34 insertions(+), 80 deletions(-) diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py index 575f512d3..bf520325b 100644 --- a/ml_peg/app/physicality/compression/app_compression.py +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -6,6 +6,8 @@ import re from pathlib import Path +from ase.formula import Formula + from dash import Dash, Input, Output, callback, dcc from dash.dcc import Loading from dash.exceptions import PreventUpdate @@ -14,6 +16,7 @@ import plotly.graph_objects as go from plotly.subplots import make_subplots + from ml_peg.app import APP_ROOT from ml_peg.app.base_app import BaseApp from ml_peg.models.get_models import get_model_names @@ -29,64 +32,16 @@ 
"physicality.html#compression" ) -# Regex to strip the _RSS_ suffix to get the composition -_RSS_SUFFIX = re.compile(r"_RSS_\d+$") -# Regex to extract element symbols from a composition string -_ELEMENT_RE = re.compile(r"[A-Z][a-z]?") - - -def _composition_from_label(label: str) -> str: - """ - Extract the composition prefix from a structure label. - - Parameters - ---------- - label - Full structure label (e.g. ``"C2H3_RSS_0"``). - - Returns - ------- - str - Composition string (e.g. ``"C2H3"``). If the label has no - ``_RSS_`` suffix it is returned unchanged. - """ - return _RSS_SUFFIX.sub("", label) - +def _chemical_formula_from_label(label: str) -> str: + """convert a label like "C2H4_pyxtal_0" to a reduce chemical formula string like "CH2".""" + formula_str = label.split("_")[0] + f = Formula(formula_str) + return str(f.reduce()[0]) -def _element_group(composition: str) -> tuple[str, ...]: +def _available_formulas(model_name: str) -> list[str]: """ - Extract the sorted tuple of unique element symbols from a composition. - - ``"C2H3"`` → ``("C", "H")``, ``"C3"`` → ``("C",)``. - - Parameters - ---------- - composition - Composition string (no ``_RSS_`` suffix). - - Returns - ------- - tuple[str, ...] - Sorted unique elements. - """ - return tuple(sorted(set(_ELEMENT_RE.findall(composition)))) - - -def _element_group_label(group: tuple[str, ...]) -> str: - """ - Human-readable label for an element group, e.g. ``"C"`` or ``"C-H"``. - """ - return "-".join(group) - - -def _available_element_groups(model_name: str) -> list[tuple[str, ...]]: - """ - List unique element groups available for a given model. - - Structures whose compositions share the same *set* of elements are - placed into the same group. For example ``"C"``, ``"C2"``, and - ``"C3"`` all belong to the group ``("C",)``. - + List unique formulas available for a given model. 
+ Parameters ---------- model_name @@ -94,30 +49,29 @@ def _available_element_groups(model_name: str) -> list[tuple[str, ...]]: Returns ------- - list[tuple[str, ...]] - Sorted list of unique element groups. + list[str] + Sorted list of unique formulas. """ model_dir = CURVE_PATH / model_name if not model_dir.exists(): return [] - groups: set[tuple[str, ...]] = set() + groups: set[str] = set() for p in model_dir.glob("*.json"): - groups.add(_element_group(_composition_from_label(p.stem))) + groups.add(_chemical_formula_from_label(p.stem)) return sorted(groups) - -def _load_curves_for_element_group( - model_name: str, group: tuple[str, ...] +def _load_curves_for_formula( + model_name: str, formula: str ) -> list[tuple[str, dict]]: """ - Load all curve payloads whose element group matches *group*. + Load all curve payloads whose reduced chemical formula matches *formula*. Parameters ---------- model_name Model identifier. - group - Target element group (e.g. ``("C",)`` or ``("C", "H")``). + formula + Reduced chemical formula (e.g. ``"CH2"``). 
Returns ------- @@ -129,7 +83,7 @@ def _load_curves_for_element_group( return [] results: list[tuple[str, dict]] = [] for p in sorted(model_dir.glob("*.json")): - if _element_group(_composition_from_label(p.stem)) == group: + if _chemical_formula_from_label(p.stem) == formula: try: with p.open(encoding="utf8") as fh: results.append((p.stem, json.load(fh))) @@ -267,12 +221,9 @@ def _update_composition_options(model_name: str): """ if not model_name: raise PreventUpdate - groups = _available_element_groups(model_name) - options = [ - {"label": _element_group_label(g), "value": _element_group_label(g)} - for g in groups - ] - default = options[0]["value"] if options else None + formulas = _available_formulas(model_name) + options = [{"label": f, "value": f} for f in formulas] + default = formulas[0] if formulas else None return options, default @callback( @@ -301,8 +252,7 @@ def _update_figure(model_name: str, composition: str | None): if not model_name or not composition: raise PreventUpdate - group = tuple(composition.split("-")) - curves = _load_curves_for_element_group(model_name, group) + curves = _load_curves_for_formula(model_name, composition) if not curves: raise PreventUpdate diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py index d0aad956d..b0b07943e 100644 --- a/ml_peg/calcs/physicality/compression/calc_compression.py +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -11,6 +11,7 @@ from ase.build import bulk, make_supercell from ase.data import covalent_radii, atomic_numbers from ase.io import read as ase_read, write +from ase.formula import Formula import numpy as np import pandas as pd import pytest @@ -39,7 +40,7 @@ ELEMENTS: list[str] = ["H", "C", "N", "O"] # limit to common elements for testing PROTOTYPES: list[str] = ["sc", "bcc","fcc", "hcp", "diamond"] # common crystal prototypes MAX_ATOMS_PER_CELL = 6 # limit to small cells for testing 
-RANDOM_STRUCTURES: list[dict[str, int]] = [[1, 2*len(ELEMENTS), 5], [2, 5, 10]] # [[Number of elements, number of compositions, repeats per composition], ...] +RANDOM_STRUCTURES: list[dict[str, int]] = [[1, len(ELEMENTS), 1], [2, 2, 1]] # [[Number of elements, number of compositions, repeats per composition], ...] def _scale_grid( min_scale: float, @@ -261,7 +262,8 @@ def _composition_label(composition: dict[str, int]) -> str: Build a canonical string label for a composition. Elements are sorted alphabetically and counts are appended only when - greater than one (e.g. ``{"C": 1, "O": 2}`` → ``"CO2"``). + greater than one (e.g. ``{"C": 1, "O": 2}`` → ``"C-O2"``). + elements split by "-" to avoid confusion with the underscore used for separating the composition from the rest of the label (e.g. "C-O2_pyxtal_0") Parameters ---------- @@ -277,7 +279,7 @@ def _composition_label(composition: dict[str, int]) -> str: for elem in sorted(composition): count = composition[elem] parts.append(f"{elem}{count if count > 1 else ''}") - return "".join(parts) + return "-".join(parts) def _generate_all_random( @@ -374,15 +376,17 @@ def _generate_all_random( print(f"Generated {len(generated_compositions)} unique compositions with {n_elements} elements: {generated_compositions}") # Generate structures for each composition for composition in generated_compositions: - comp_label = _composition_label(composition) + #comp_label = _composition_label(composition) for i in range(repeats): - struct_label = f"{comp_label}_pyxtal_{i}" + #struct_label = f"{comp_label}_pyxtal_{i}" try: atoms = _gen_random_structure( composition, seed=int(rng.integers(0, 2**31)), space_group=1, # random space group ) + + struct_label = f"{atoms.get_chemical_formula()}_pyxtal_{i}" atoms.info["label"] = struct_label structures[struct_label] = atoms From dc8460819b38490516de1ffcce83d2d77ff9ccd9 Mon Sep 17 00:00:00 2001 From: James Darby Date: Tue, 10 Mar 2026 12:26:44 +0000 Subject: [PATCH 07/12] added in the metrics 
--- .../compression/analyse_compression.py | 47 +++++++++++++------ .../physicality/compression/metrics.yml | 35 ++++++++++---- .../compression/app_compression.py | 2 +- .../compression/calc_compression.py | 5 +- ml_peg/models/models.yml | 2 +- 5 files changed, 62 insertions(+), 29 deletions(-) diff --git a/ml_peg/analysis/physicality/compression/analyse_compression.py b/ml_peg/analysis/physicality/compression/analyse_compression.py index 6dbb6e5c6..ae4e28c9e 100644 --- a/ml_peg/analysis/physicality/compression/analyse_compression.py +++ b/ml_peg/analysis/physicality/compression/analyse_compression.py @@ -75,8 +75,9 @@ def prepare_structure_series( scales = df_sorted["scale"].to_numpy() # Shift energies so the equilibrium value (scale closest to 1.0) is zero - eq_idx = int(np.argmin(np.abs(scales - 1.0))) - shifted_energies = energies - energies[eq_idx] + #eq_idx = int(np.argmin(np.abs(scales - 1.0))) + #shifted_energies = energies - energies[eq_idx] + shifted_energies = energies - energies[-1] # Shift by the last energy value (largest volume) return volumes, shifted_energies, pressures, scales @@ -127,8 +128,8 @@ def compute_structure_metrics( if volumes.size < 3: return None - energy_gradient = np.gradient(shifted_energies, volumes) - energy_curvature = np.gradient(energy_gradient, volumes) + energy_gradient = np.gradient(shifted_energies, scales) + energy_curvature = np.gradient(energy_gradient, scales) # Energy minima: find peaks in inverted energy minima = 0 @@ -139,11 +140,25 @@ def compute_structure_metrics( width=1, ) minima = len(minima_indices) + + minima_indices, _ = find_peaks( + -shifted_energies, + prominence=0.1, + width=1, + ) + deep_minima = len(minima_indices) + + # Is there a hole present or not + relative_Es = shifted_energies - shifted_energies[-1] # Relative to the largest volume energy + holes = 0 + if np.any(relative_Es < -100): # If any energy is 100 eV/atom lower than the largest volume energy, we safely consider it a hole + holes = 1 
inflections = count_sign_changes(energy_curvature, tol=0.01) # Pressure sign flips pressure_flips = count_sign_changes(pressures, tol=1e-4) + big_pressure_flips = count_sign_changes(pressures, tol=1) # Spearman correlations in compressed and expanded regimes spearman_compression = np.nan @@ -151,21 +166,19 @@ def compute_structure_metrics( try: from scipy import stats + + #NOTE: this isn't perfect, minimum will be in the wrong place if holes are present + #BUT for models without holes, this is another way of finding spurious minima + eq_idx = np.argmin(shifted_energies) - eq_idx = int(np.argmin(shifted_energies)) - - # Compressed regime: volumes smaller than equilibrium - # Energy should increase as volume decreases (positive correlation - # between energy and volume in the compressed region means energy - # rises when volume shrinks — but since volumes are sorted ascending, - # the compressed side has *lower* volumes with *higher* energies). + # Compressed regime if volumes[:eq_idx].size > 2: spearman_compression = float( stats.spearmanr( volumes[: eq_idx + 1], shifted_energies[: eq_idx + 1] ).statistic ) - # Expanded regime: volumes larger than equilibrium + # Expanded regime if volumes[eq_idx:].size > 2: spearman_expansion = float( stats.spearmanr( @@ -176,11 +189,14 @@ def compute_structure_metrics( pass return { + "Holes": float(holes), "Energy minima": float(minima), - "Energy inflections": float(inflections), + "Deep Energy minima": float(deep_minima), + #"Energy inflections": float(inflections), + "Big Pressure sign flips": float(big_pressure_flips), "Pressure sign flips": float(pressure_flips), - "ρ(E, compression)": float(spearman_compression), - "ρ(E, expansion)": float(spearman_expansion), + "ρ(-E,Vsmall)": -float(spearman_compression), + "ρ(E,Vlarge)": float(spearman_expansion), } @@ -207,6 +223,7 @@ def aggregate_model_metrics( for _struct, struct_dataframe in model_dataframe.groupby("structure"): metrics = compute_structure_metrics(struct_dataframe) 
+ print(f"Computed metrics for structure {_struct}: {metrics}") if metrics is None: continue structure_metrics.append(metrics) diff --git a/ml_peg/analysis/physicality/compression/metrics.yml b/ml_peg/analysis/physicality/compression/metrics.yml index 448719524..a7e3fa6d3 100644 --- a/ml_peg/analysis/physicality/compression/metrics.yml +++ b/ml_peg/analysis/physicality/compression/metrics.yml @@ -1,26 +1,41 @@ metrics: - Energy minima: + Holes: + good: 0.0 + bad: 1.0 + unit: null + tooltip: Presence of holes in the energy-per-atom curve, any curve with an energy value that is 100 eV/atom lower than the largest volume energy contains a hole + Deep Energy minima: good: 1.0 bad: 5.0 unit: null - tooltip: Average number of energy-per-atom minima per structure - Energy inflections: + tooltip: Average number of deep, > 0.1 eV/atom, energy-per-atom minima per structure. Designed to ignore very shallow minma, particularly in the expanded regime. + Energy minima: good: 1.0 bad: 5.0 unit: null - tooltip: Average number of energy curvature sign changes per structure + tooltip: Average number of energy-per-atom minima per structure. Tolerance of 1 meV/atom is used. + # Energy inflections: + # good: 1.0 + # bad: 5.0 + # unit: null + # tooltip: Average number of energy curvature sign changes per structure Pressure sign flips: good: 1.0 bad: 5.0 unit: null tooltip: Average number of hydrostatic-pressure sign changes per structure - ρ(E, compression): + Big Pressure sign flips: + good: 1.0 + bad: 5.0 + unit: null + tooltip: Average number of large, > 1GPa, hydrostatic-pressure sign changes per structure + ρ(-E,Vsmall): good: 1.0 bad: -1.0 unit: null - tooltip: Spearman correlation between energy and volume in the compressed regime - ρ(E, expansion): - good: -1.0 - bad: 1.0 + tooltip: Spearman correlation between energy and volume in the compressed region, defined as V < V(Emin) so this metric is polluted when Holes are present. 
+ ρ(E,Vlarge): + good: 1.0 + bad: -1.0 unit: null - tooltip: Spearman correlation between energy and volume in the expanded regime + tooltip: Spearman correlation between energy and volume in the expanded region, defined as V > V(Emin) so this metric is polluted when Holes are present. diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py index bf520325b..e8155661b 100644 --- a/ml_peg/app/physicality/compression/app_compression.py +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -93,7 +93,7 @@ def _load_curves_for_formula( # Fraction of the y-axis occupied by the linear region (±linthresh). -LINEAR_FRAC: float = 0.6 +LINEAR_FRAC: float = 0.8 def _symlog( diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py index b0b07943e..644e75a92 100644 --- a/ml_peg/calcs/physicality/compression/calc_compression.py +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -12,6 +12,7 @@ from ase.data import covalent_radii, atomic_numbers from ase.io import read as ase_read, write from ase.formula import Formula +from ase import units import numpy as np import pandas as pd import pytest @@ -40,7 +41,7 @@ ELEMENTS: list[str] = ["H", "C", "N", "O"] # limit to common elements for testing PROTOTYPES: list[str] = ["sc", "bcc","fcc", "hcp", "diamond"] # common crystal prototypes MAX_ATOMS_PER_CELL = 6 # limit to small cells for testing -RANDOM_STRUCTURES: list[dict[str, int]] = [[1, len(ELEMENTS), 1], [2, 2, 1]] # [[Number of elements, number of compositions, repeats per composition], ...] +RANDOM_STRUCTURES: list[dict[str, int]] = [[1, len(ELEMENTS), 5], [2, 5, 10]] # [[Number of elements, number of compositions, repeats per composition], ...] 
def _scale_grid( min_scale: float, @@ -549,7 +550,7 @@ def run_compression(model_name: str, model) -> None: # Stress tensor (Voigt notation, eV/ų) try: stress_voigt = atoms.get_stress(voigt=False) - pressure = -1/3 * np.trace(stress_voigt) # hydrostatic pressure + pressure = -1/3 * np.trace(stress_voigt)/units.GPa # hydrostatic pressure except Exception: stress_voigt = np.zeros(6) pressure = np.nan diff --git a/ml_peg/models/models.yml b/ml_peg/models/models.yml index ce83dd31f..ac86d0165 100644 --- a/ml_peg/models/models.yml +++ b/ml_peg/models/models.yml @@ -172,7 +172,7 @@ orb-v3-consv-inf-omat: pet-mad: module: pet_mad.calculator class_name: PETMADCalculator - device: "cpu" + device: "auto" default_dtype: float32 trained_on_d3: false level_of_theory: PBEsol From 915b389f5a7c9845f0059e5be25f2fbc6198ce9e Mon Sep 17 00:00:00 2001 From: James Darby Date: Tue, 10 Mar 2026 12:36:09 +0000 Subject: [PATCH 08/12] added in extra yticks in the linear region --- .../compression/app_compression.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py index e8155661b..729378733 100644 --- a/ml_peg/app/physicality/compression/app_compression.py +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -93,7 +93,13 @@ def _load_curves_for_formula( # Fraction of the y-axis occupied by the linear region (±linthresh). 
-LINEAR_FRAC: float = 0.8 +LINEAR_FRAC: float = 0.6 + +# Separate linear thresholds for energy and pressure symlog axes +LINTHRESH_ENERGY: float = 10.0 # eV/atom +LINTHRESH_PRESSURE: float = 100.0 # GPa + +# Conversion factor: 1 eV/ų = 160.21766 GPa def _symlog( @@ -172,7 +178,7 @@ def _symlog_ticks( ticks_val.append(_symlog([raw], **kw)[0]) ticks_txt.append(f"{raw:.0e}") # Linear region - for v in [-linthresh, 0, linthresh]: + for v in [-linthresh, -linthresh / 2, 0, linthresh / 2, linthresh]: ticks_val.append(_symlog([v], **kw)[0]) ticks_txt.append(f"{v:g}") # Positive decades @@ -256,10 +262,12 @@ def _update_figure(model_name: str, composition: str | None): if not curves: raise PreventUpdate - linthresh = 10.0 decades = 4 - tick_vals, tick_text = _symlog_ticks( - linthresh, decades=decades, linear_frac=LINEAR_FRAC, + e_tick_vals, e_tick_text = _symlog_ticks( + LINTHRESH_ENERGY, decades=decades, linear_frac=LINEAR_FRAC, + ) + p_tick_vals, p_tick_text = _symlog_ticks( + LINTHRESH_PRESSURE, decades=decades, linear_frac=LINEAR_FRAC, ) fig = make_subplots( @@ -287,7 +295,7 @@ def _update_figure(model_name: str, composition: str | None): fig.add_trace( go.Scatter( x=scales, - y=_symlog(energies, linthresh, LINEAR_FRAC, decades), + y=_symlog(energies, LINTHRESH_ENERGY, LINEAR_FRAC, decades), mode="lines+markers", name=f"E — {short_label}", line={"color": color}, @@ -307,7 +315,7 @@ def _update_figure(model_name: str, composition: str | None): fig.add_trace( go.Scatter( x=scales, - y=_symlog(pressures, linthresh, LINEAR_FRAC, decades), + y=_symlog(pressures, LINTHRESH_PRESSURE, LINEAR_FRAC, decades), mode="lines+markers", name=f"P — {short_label}", line={"color": color}, @@ -317,7 +325,7 @@ def _update_figure(model_name: str, composition: str | None): customdata=pressures, hovertemplate=( "scale: %{x:.3f}
" - "P: %{customdata:.4f} eV/ų" + "P: %{customdata:.2f} GPa" ), ), row=2, @@ -326,15 +334,15 @@ def _update_figure(model_name: str, composition: str | None): # Apply symlog tick formatting to both y-axes fig.update_yaxes( - tickvals=tick_vals, - ticktext=tick_text, + tickvals=e_tick_vals, + ticktext=e_tick_text, title_text="Energy per atom (eV, symlog)", row=1, col=1, ) fig.update_yaxes( - tickvals=tick_vals, - ticktext=tick_text, - title_text="Pressure (eV/ų, symlog)", + tickvals=p_tick_vals, + ticktext=p_tick_text, + title_text="Pressure (GPa, symlog)", row=2, col=1, ) fig.update_xaxes(title_text="Scale factor", row=2, col=1) From 5f613da67694cf96eea6d9f6b08abe9369a76e80 Mon Sep 17 00:00:00 2001 From: James Darby Date: Tue, 10 Mar 2026 14:31:19 +0000 Subject: [PATCH 09/12] moved plotting to analysis and upgraded to symlog scaling --- .../compression/analyse_compression.py | 209 +++++++++++- .../compression/app_compression.py | 298 +----------------- 2 files changed, 220 insertions(+), 287 deletions(-) diff --git a/ml_peg/analysis/physicality/compression/analyse_compression.py b/ml_peg/analysis/physicality/compression/analyse_compression.py index ae4e28c9e..46b844ab5 100644 --- a/ml_peg/analysis/physicality/compression/analyse_compression.py +++ b/ml_peg/analysis/physicality/compression/analyse_compression.py @@ -3,10 +3,13 @@ from __future__ import annotations import json +from collections import defaultdict from pathlib import Path +from ase.formula import Formula import numpy as np import pandas as pd +import plotly.graph_objects as go import pytest from scipy.signal import find_peaks @@ -21,6 +24,106 @@ CALC_PATH = CALCS_ROOT / "physicality" / "compression" / "outputs" OUT_PATH = APP_ROOT / "data" / "physicality" / "compression" CURVE_PATH = OUT_PATH / "curves" +FIGURE_PATH = OUT_PATH / "figures" + +# Palette for overlaid structure curves +_PALETTE = [ + "royalblue", "firebrick", "seagreen", "darkorange", "mediumpurple", + "deeppink", "teal", "goldenrod", 
"slategray", "crimson", +] + +# Symlog scaling parameters +LINEAR_FRAC: float = 0.6 +LINTHRESH_ENERGY: float = 10.0 # eV/atom +SYMLOG_DECADES: int = 5 + +DEFAULT_WEIGHTS = {"Holes": 1.0, "Energy minima": 0.1, "Deep Energy minima": 1.0, "Energy inflections": 0.1, "Big Pressure sign flips": 1.0, "Pressure sign flips": 0.1, "ρ(-E,Vsmall)": 1.0, "ρ(E,Vlarge)": 0.1} + + +def _symlog( + values: list[float] | np.ndarray, + linthresh: float = LINTHRESH_ENERGY, + linear_frac: float = LINEAR_FRAC, + decades: int = SYMLOG_DECADES, +) -> list[float]: + """Apply a symmetric-log transform to *values*. + + Linear within ``[-linthresh, linthresh]``, logarithmic outside. + The linear region is scaled so that it occupies *linear_frac* of the + total axis range (assuming *decades* log decades on each side). + + Parameters + ---------- + values + Raw data values. + linthresh + Linear threshold. + linear_frac + Fraction of the full axis height reserved for the linear region. + decades + Number of log decades shown on each side. + + Returns + ------- + list[float] + Transformed values. + """ + lin_half = linear_frac / 2.0 + log_scale = (1.0 - linear_frac) / (2.0 * max(decades, 1)) + + arr = np.asarray(values, dtype=float) + out = np.where( + np.abs(arr) <= linthresh, + lin_half * arr / linthresh, + np.sign(arr) + * (lin_half + log_scale * np.log10(np.abs(arr) / linthresh)), + ) + return out.tolist() + + +def _symlog_ticks( + linthresh: float = LINTHRESH_ENERGY, + decades: int = SYMLOG_DECADES, + linear_frac: float = LINEAR_FRAC, +) -> tuple[list[float], list[str]]: + """Generate tick positions and labels for a symlog axis. + + Parameters + ---------- + linthresh + Linear threshold matching ``_symlog``. + decades + Number of decades to show on each side of zero. + linear_frac + Must match the value passed to ``_symlog``. + + Returns + ------- + tuple[list[float], list[str]] + Tick values (in transformed space) and their display labels. 
+ """ + ticks_val: list[float] = [] + ticks_txt: list[str] = [] + + kw = {"linthresh": linthresh, "linear_frac": linear_frac, "decades": decades} + + # Negative decades + for d in range(decades, 0, -1): + raw = -(linthresh * 10**d) + ticks_val.append(_symlog([raw], **kw)[0]) + ticks_txt.append(f"{raw:.0e}") + # Linear region + for v in [-linthresh, -linthresh / 2, 0, linthresh / 2, linthresh]: + ticks_val.append(_symlog([v], **kw)[0]) + ticks_txt.append(f"{v:g}") + # Positive decades + for d in range(1, decades + 1): + raw = linthresh * 10**d + ticks_val.append(_symlog([raw], **kw)[0]) + ticks_txt.append(f"{raw:.0e}") + + return ticks_val, ticks_txt + METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, _ = load_metrics_config(METRICS_CONFIG_PATH) @@ -307,9 +410,109 @@ def _write_curve_payloads( json.dump(payload, fh) +def _chemical_formula_from_label(label: str) -> str: + """Convert a structure label to its reduced chemical formula. + + Parameters + ---------- + label + Structure label such as ``"C2H4_pyxtal_0"`` or ``"C_bcc"``. + + Returns + ------- + str + Reduced formula, e.g. ``"CH2"`` or ``"C"``. + """ + formula_str = label.split("_")[0] + f = Formula(formula_str) + return str(f.reduce()[0]) + + +def _build_formula_figures( + model_name: str, frame: pd.DataFrame +) -> None: + """Build and save an energy-vs-scale scatter figure for each formula. + + All structures that share the same reduced chemical formula are overlaid + on a single figure. Figures are written to + ``FIGURE_PATH / model_name / {formula}.json``. + + Parameters + ---------- + model_name + Model identifier. + frame + Dataframe with columns *structure*, *scale*, *energy_per_atom* + (at minimum) for this model. 
+ """ + model_fig_dir = FIGURE_PATH / model_name + model_fig_dir.mkdir(parents=True, exist_ok=True) + + # Group structure labels by reduced formula + formula_groups: dict[str, list[str]] = defaultdict(list) + for struct_label in frame["structure"].unique(): + formula = _chemical_formula_from_label(str(struct_label)) + formula_groups[formula].append(str(struct_label)) + + # Pre-compute symlog ticks + tick_vals, tick_text = _symlog_ticks( + LINTHRESH_ENERGY, decades=SYMLOG_DECADES, linear_frac=LINEAR_FRAC, + ) + + for formula, struct_labels in formula_groups.items(): + fig = go.Figure() + y_plot_range = [0,0] + for idx, struct_label in enumerate(sorted(struct_labels)): + group = ( + frame[frame["structure"] == struct_label] + .sort_values("volume_per_atom") + .drop_duplicates("volume_per_atom") + ) + scales = group["scale"].tolist() + energies = group["energy_per_atom"].tolist() + if not scales or not energies: + continue + + color = _PALETTE[idx % len(_PALETTE)] + fig.add_trace( + go.Scatter( + x=scales, + y=_symlog(energies), + mode="lines", + name=struct_label, + line={"color": color}, + marker={"size": 5, "color": color}, + customdata=energies, + hovertemplate=( + "scale: %{x:.3f}
" + "E/atom: %{customdata:.4f} eV" + ), + ) + ) + y_plot_range[0] = min(y_plot_range[0], min(_symlog(energies))) + y_plot_range[1] = max(y_plot_range[1], max(_symlog(energies))) + + default_range = [max(tick_vals[0], y_plot_range[0]), min(tick_vals[-1], y_plot_range[1])] + fig.update_layout( + title=f"{model_name} — {formula}", + xaxis_title="Scale factor", + yaxis_title="Energy per atom (eV, symlog)", + yaxis={ + "tickvals": tick_vals, + "ticktext": tick_text, + "range": default_range, + }, + height=500, + showlegend=True, + template="plotly_white", + ) + + fig.write_json(model_fig_dir / f"{formula}.json") + + def persist_compression_data() -> dict[str, pd.DataFrame]: """ - Persist curve payloads and return per-model dataframes. + Persist curve payloads, build figures, and return per-model dataframes. Returns ------- @@ -318,9 +521,11 @@ def persist_compression_data() -> dict[str, pd.DataFrame]: """ data = _load_structure_data() CURVE_PATH.mkdir(parents=True, exist_ok=True) + FIGURE_PATH.mkdir(parents=True, exist_ok=True) for model_name, frame in data.items(): if frame is not None and not frame.empty: _write_curve_payloads(CURVE_PATH, model_name, frame) + _build_formula_figures(model_name, frame) return data @@ -419,7 +624,7 @@ def compression_metrics_dataframe( filename=OUT_PATH / "compression_metrics_table.json", metric_tooltips=DEFAULT_TOOLTIPS, thresholds=DEFAULT_THRESHOLDS, - weights=None, + weights=DEFAULT_WEIGHTS, ) def metrics( compression_metrics_dataframe: pd.DataFrame, diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py index 729378733..35a6c26eb 100644 --- a/ml_peg/app/physicality/compression/app_compression.py +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -2,20 +2,13 @@ from __future__ import annotations -import json -import re from pathlib import Path -from ase.formula import Formula - from dash import Dash, Input, Output, callback, dcc from dash.dcc import 
Loading from dash.exceptions import PreventUpdate from dash.html import Div, Label -import numpy as np -import plotly.graph_objects as go -from plotly.subplots import make_subplots - +from plotly.io import read_json from ml_peg.app import APP_ROOT from ml_peg.app.base_app import BaseApp @@ -26,22 +19,17 @@ MODELS = get_model_names(current_models) BENCHMARK_NAME = "Compression" DATA_PATH = APP_ROOT / "data" / "physicality" / "compression" -CURVE_PATH = DATA_PATH / "curves" +FIGURE_PATH = DATA_PATH / "figures" DOCS_URL = ( "https://ddmms.github.io/ml-peg/user_guide/benchmarks/" "physicality.html#compression" ) -def _chemical_formula_from_label(label: str) -> str: - """convert a label like "C2H4_pyxtal_0" to a reduce chemical formula string like "CH2".""" - formula_str = label.split("_")[0] - f = Formula(formula_str) - return str(f.reduce()[0]) def _available_formulas(model_name: str) -> list[str]: """ List unique formulas available for a given model. - + Parameters ---------- model_name @@ -52,149 +40,10 @@ def _available_formulas(model_name: str) -> list[str]: list[str] Sorted list of unique formulas. """ - model_dir = CURVE_PATH / model_name - if not model_dir.exists(): - return [] - groups: set[str] = set() - for p in model_dir.glob("*.json"): - groups.add(_chemical_formula_from_label(p.stem)) - return sorted(groups) - -def _load_curves_for_formula( - model_name: str, formula: str -) -> list[tuple[str, dict]]: - """ - Load all curve payloads whose reduced chemical formula matches *formula*. - - Parameters - ---------- - model_name - Model identifier. - formula - Reduced chemical formula (e.g. ``"CH2"``). - - Returns - ------- - list[tuple[str, dict]] - List of ``(label, payload)`` tuples for every matching structure. 
- """ - model_dir = CURVE_PATH / model_name + model_dir = FIGURE_PATH / model_name if not model_dir.exists(): return [] - results: list[tuple[str, dict]] = [] - for p in sorted(model_dir.glob("*.json")): - if _chemical_formula_from_label(p.stem) == formula: - try: - with p.open(encoding="utf8") as fh: - results.append((p.stem, json.load(fh))) - except Exception: - continue - return results - - -# Fraction of the y-axis occupied by the linear region (±linthresh). -LINEAR_FRAC: float = 0.6 - -# Separate linear thresholds for energy and pressure symlog axes -LINTHRESH_ENERGY: float = 10.0 # eV/atom -LINTHRESH_PRESSURE: float = 100.0 # GPa - -# Conversion factor: 1 eV/ų = 160.21766 GPa - - -def _symlog( - values: list[float] | np.ndarray, - linthresh: float = 10.0, - linear_frac: float = LINEAR_FRAC, - decades: int = 4, -) -> list[float]: - """ - Apply a symmetric-log transform to *values*. - - Linear within ``[-linthresh, linthresh]``, logarithmic outside. - The linear region is scaled so that it occupies *linear_frac* of the - total axis range (assuming *decades* log decades on each side). - - Parameters - ---------- - values - Raw data values. - linthresh - Linear threshold. - linear_frac - Fraction of the full axis height reserved for the linear region. - decades - Number of log decades shown on each side (used to compute the - compression factor for the logarithmic tails). - - Returns - ------- - list[float] - Transformed values. - """ - lin_half = linear_frac / 2.0 - log_scale = (1.0 - linear_frac) / (2.0 * max(decades, 1)) - - arr = np.asarray(values, dtype=float) - out = np.where( - np.abs(arr) <= linthresh, - lin_half * arr / linthresh, - np.sign(arr) - * (lin_half + log_scale * np.log10(np.abs(arr) / linthresh)), - ) - return out.tolist() - - -def _symlog_ticks( - linthresh: float = 10.0, - decades: int = 4, - linear_frac: float = LINEAR_FRAC, -) -> tuple[list[float], list[str]]: - """ - Generate tick positions and labels for a symlog axis. 
- - Parameters - ---------- - linthresh - Linear threshold matching ``_symlog``. - decades - Number of decades to show on each side of zero. - linear_frac - Must match the value passed to ``_symlog``. - - Returns - ------- - tuple[list[float], list[str]] - Tick values (in transformed space) and their display labels. - """ - ticks_val: list[float] = [] - ticks_txt: list[str] = [] - - kw = {"linthresh": linthresh, "linear_frac": linear_frac, "decades": decades} - - # Negative decades - for d in range(decades, 0, -1): - raw = -(linthresh * 10 ** d) - ticks_val.append(_symlog([raw], **kw)[0]) - ticks_txt.append(f"{raw:.0e}") - # Linear region - for v in [-linthresh, -linthresh / 2, 0, linthresh / 2, linthresh]: - ticks_val.append(_symlog([v], **kw)[0]) - ticks_txt.append(f"{v:g}") - # Positive decades - for d in range(1, decades + 1): - raw = linthresh * 10 ** d - ticks_val.append(_symlog([raw], **kw)[0]) - ticks_txt.append(f"{raw:.0e}") - - return ticks_val, ticks_txt - - -# A palette for overlaid RSS curves -_PALETTE = [ - "royalblue", "firebrick", "seagreen", "darkorange", "mediumpurple", - "deeppink", "teal", "goldenrod", "slategray", "crimson", -] + return sorted(p.stem for p in model_dir.glob("*.json")) class CompressionApp(BaseApp): @@ -212,19 +61,6 @@ def register_callbacks(self) -> None: Input(model_dropdown_id, "value"), ) def _update_composition_options(model_name: str): - """ - Populate composition dropdown for the selected model. - - Parameters - ---------- - model_name - Selected model value from the model dropdown. - - Returns - ------- - tuple[list[dict], str | None] - Composition dropdown options and default selection. 
- """ if not model_name: raise PreventUpdate formulas = _available_formulas(model_name) @@ -238,123 +74,15 @@ def _update_composition_options(model_name: str): Input(composition_dropdown_id, "value"), ) def _update_figure(model_name: str, composition: str | None): - """ - Render energy-per-atom and pressure curves vs linear scale - factor for all structures sharing the selected composition, - using symlog y-axes. - - Parameters - ---------- - model_name - Selected model identifier. - composition - Selected composition label. - - Returns - ------- - go.Figure - Plotly figure with two subplots. - """ + """Load pre-built energy-per-atom vs scale factor figure.""" if not model_name or not composition: raise PreventUpdate - curves = _load_curves_for_formula(model_name, composition) - if not curves: + figure_file = FIGURE_PATH / model_name / f"{composition}.json" + if not figure_file.exists(): raise PreventUpdate - decades = 4 - e_tick_vals, e_tick_text = _symlog_ticks( - LINTHRESH_ENERGY, decades=decades, linear_frac=LINEAR_FRAC, - ) - p_tick_vals, p_tick_text = _symlog_ticks( - LINTHRESH_PRESSURE, decades=decades, linear_frac=LINEAR_FRAC, - ) - - fig = make_subplots( - rows=2, - cols=1, - shared_xaxes=True, - vertical_spacing=0.08, - subplot_titles=( - "Energy per atom vs Scale factor", - "Pressure vs Scale factor", - ), - ) - - for idx, (label, payload) in enumerate(curves): - scales = payload.get("scale", []) - energies = payload.get("energy_per_atom", []) - pressures = payload.get("pressure", []) - color = _PALETTE[idx % len(_PALETTE)] - # Use the full composition + RSS index as the legend entry - short_label = label - - if not scales or not energies: - continue - - fig.add_trace( - go.Scatter( - x=scales, - y=_symlog(energies, LINTHRESH_ENERGY, LINEAR_FRAC, decades), - mode="lines+markers", - name=f"E — {short_label}", - line={"color": color}, - marker={"size": 3}, - legendgroup=short_label, - customdata=energies, - hovertemplate=( - "scale: %{x:.3f}
" - "E/atom: %{customdata:.4f} eV" - ), - ), - row=1, - col=1, - ) - - if pressures: - fig.add_trace( - go.Scatter( - x=scales, - y=_symlog(pressures, LINTHRESH_PRESSURE, LINEAR_FRAC, decades), - mode="lines+markers", - name=f"P — {short_label}", - line={"color": color}, - marker={"size": 3}, - legendgroup=short_label, - showlegend=False, - customdata=pressures, - hovertemplate=( - "scale: %{x:.3f}
" - "P: %{customdata:.2f} GPa" - ), - ), - row=2, - col=1, - ) - - # Apply symlog tick formatting to both y-axes - fig.update_yaxes( - tickvals=e_tick_vals, - ticktext=e_tick_text, - title_text="Energy per atom (eV, symlog)", - row=1, col=1, - ) - fig.update_yaxes( - tickvals=p_tick_vals, - ticktext=p_tick_text, - title_text="Pressure (GPa, symlog)", - row=2, col=1, - ) - fig.update_xaxes(title_text="Scale factor", row=2, col=1) - - fig.update_layout( - title=f"{model_name} — {composition}", - height=700, - showlegend=True, - template="plotly_white", - ) - - return fig + return read_json(figure_file) def get_app() -> CompressionApp: @@ -394,7 +122,7 @@ def get_app() -> CompressionApp: Loading( dcc.Graph( id=f"{BENCHMARK_NAME}-figure", - style={"height": "700px", "width": "100%", "marginTop": "20px"}, + style={"height": "500px", "width": "100%", "marginTop": "20px"}, ), type="circle", ), @@ -403,9 +131,9 @@ def get_app() -> CompressionApp: return CompressionApp( name=BENCHMARK_NAME, description=( - "Uniform crystal compression explorer. Structures are isotropically " - "scaled and the energy per atom, its derivative dE/dV, and stress are " - "recorded. Metrics are averaged across all structures." + "A handful of common prototype (and randomly generated via pyxtal) structures are isotropically scaled across a wide range. " + "A scale factor of 1.0 means that a pair of atoms in the structure is separated by the sum of their covalent radii. " + "e.g. min(d_ij/(r_cov_i + r_cov_j)) = 1.0." 
), docs_url=DOCS_URL, table_path=DATA_PATH / "compression_metrics_table.json", From 1b8347655013c356b03afb0baf6486e00cfea7ee Mon Sep 17 00:00:00 2001 From: James Darby Date: Tue, 10 Mar 2026 21:26:04 +0000 Subject: [PATCH 10/12] ruff formating --- .../compression/analyse_compression.py | 114 ++++++++------ .../compression/app_compression.py | 40 ++++- .../compression/calc_compression.py | 144 ++++++++++++------ .../physicality/compression/data/README.md | 2 +- ml_peg/models/models.yml | 2 +- 5 files changed, 198 insertions(+), 104 deletions(-) diff --git a/ml_peg/analysis/physicality/compression/analyse_compression.py b/ml_peg/analysis/physicality/compression/analyse_compression.py index 46b844ab5..9297cfd18 100644 --- a/ml_peg/analysis/physicality/compression/analyse_compression.py +++ b/ml_peg/analysis/physicality/compression/analyse_compression.py @@ -2,8 +2,8 @@ from __future__ import annotations -import json from collections import defaultdict +import json from pathlib import Path from ase.formula import Formula @@ -28,16 +28,32 @@ # Palette for overlaid structure curves _PALETTE = [ - "royalblue", "firebrick", "seagreen", "darkorange", "mediumpurple", - "deeppink", "teal", "goldenrod", "slategray", "crimson", + "royalblue", + "firebrick", + "seagreen", + "darkorange", + "mediumpurple", + "deeppink", + "teal", + "goldenrod", + "slategray", + "crimson", ] # Symlog scaling parameters LINEAR_FRAC: float = 0.6 LINTHRESH_ENERGY: float = 10.0 # eV/atom SYMLOG_DECADES: int = 5 - -DEFAULT_WEIGHTS = {"Holes": 1.0, "Energy minima": 0.1, "Deep Energy minima": 1.0, "Energy inflections": 0.1, "Big Pressure sign flips": 1.0, "Pressure sign flips": 0.1, "ρ(-E,Vsmall)": 1.0, "ρ(E,Vlarge)": 0.1} +DEFAULT_WEIGHTS = { + "Holes": 1.0, + "Energy minima": 0.1, + "Deep Energy minima": 1.0, + "Energy inflections": 0.1, + "Big Pressure sign flips": 1.0, + "Pressure sign flips": 0.1, + "ρ(-E,Vsmall)": 1.0, + "ρ(E,Vlarge)": 0.1, +} def _symlog( @@ -46,7 +62,8 @@ def _symlog( 
linear_frac: float = LINEAR_FRAC, decades: int = SYMLOG_DECADES, ) -> list[float]: - """Apply a symmetric-log transform to *values*. + """ + Apply a symmetric-log transform to *values*. Linear within ``[-linthresh, linthresh]``, logarithmic outside. The linear region is scaled so that it occupies *linear_frac* of the @@ -75,8 +92,7 @@ def _symlog( out = np.where( np.abs(arr) <= linthresh, lin_half * arr / linthresh, - np.sign(arr) - * (lin_half + log_scale * np.log10(np.abs(arr) / linthresh)), + np.sign(arr) * (lin_half + log_scale * np.log10(np.abs(arr) / linthresh)), ) return out.tolist() @@ -86,7 +102,8 @@ def _symlog_ticks( decades: int = SYMLOG_DECADES, linear_frac: float = LINEAR_FRAC, ) -> tuple[list[float], list[str]]: - """Generate tick positions and labels for a symlog axis. + """ + Generate tick positions and labels for a symlog axis. Parameters ---------- @@ -178,9 +195,11 @@ def prepare_structure_series( scales = df_sorted["scale"].to_numpy() # Shift energies so the equilibrium value (scale closest to 1.0) is zero - #eq_idx = int(np.argmin(np.abs(scales - 1.0))) - #shifted_energies = energies - energies[eq_idx] - shifted_energies = energies - energies[-1] # Shift by the last energy value (largest volume) + # eq_idx = int(np.argmin(np.abs(scales - 1.0))) + # shifted_energies = energies - energies[eq_idx] + shifted_energies = ( + energies - energies[-1] + ) # Shift by the last energy value (largest volume) return volumes, shifted_energies, pressures, scales @@ -231,9 +250,6 @@ def compute_structure_metrics( if volumes.size < 3: return None - energy_gradient = np.gradient(shifted_energies, scales) - energy_curvature = np.gradient(energy_gradient, scales) - # Energy minima: find peaks in inverted energy minima = 0 if shifted_energies.size >= 3: @@ -243,21 +259,22 @@ def compute_structure_metrics( width=1, ) minima = len(minima_indices) - + minima_indices, _ = find_peaks( -shifted_energies, prominence=0.1, width=1, ) deep_minima = len(minima_indices) - + # 
Is there a hole present or not - relative_Es = shifted_energies - shifted_energies[-1] # Relative to the largest volume energy + # Relative to the largest volume energy + relative_energies = shifted_energies - shifted_energies[-1] holes = 0 - if np.any(relative_Es < -100): # If any energy is 100 eV/atom lower than the largest volume energy, we safely consider it a hole - holes = 1 - - inflections = count_sign_changes(energy_curvature, tol=0.01) + # If any energy is 100 eV/atom lower than the largest volume + # energy, we safely consider it a hole + if np.any(relative_energies < -100): + holes = 1 # Pressure sign flips pressure_flips = count_sign_changes(pressures, tol=1e-4) @@ -269,9 +286,11 @@ def compute_structure_metrics( try: from scipy import stats - - #NOTE: this isn't perfect, minimum will be in the wrong place if holes are present - #BUT for models without holes, this is another way of finding spurious minima + + # NOTE: this isn't perfect, minimum will be in the + # wrong place if holes are present. + # BUT for models without holes, this is another way + # of finding spurious minima. 
eq_idx = np.argmin(shifted_energies) # Compressed regime @@ -284,9 +303,7 @@ def compute_structure_metrics( # Expanded regime if volumes[eq_idx:].size > 2: spearman_expansion = float( - stats.spearmanr( - volumes[eq_idx:], shifted_energies[eq_idx:] - ).statistic + stats.spearmanr(volumes[eq_idx:], shifted_energies[eq_idx:]).statistic ) except Exception: pass @@ -295,7 +312,7 @@ def compute_structure_metrics( "Holes": float(holes), "Energy minima": float(minima), "Deep Energy minima": float(deep_minima), - #"Energy inflections": float(inflections), + # "Energy inflections": float(inflections), "Big Pressure sign flips": float(big_pressure_flips), "Pressure sign flips": float(pressure_flips), "ρ(-E,Vsmall)": -float(spearman_compression), @@ -326,7 +343,6 @@ def aggregate_model_metrics( for _struct, struct_dataframe in model_dataframe.groupby("structure"): metrics = compute_structure_metrics(struct_dataframe) - print(f"Computed metrics for structure {_struct}: {metrics}") if metrics is None: continue structure_metrics.append(metrics) @@ -335,9 +351,7 @@ def aggregate_model_metrics( return {} return { - key: float( - np.nanmean([m.get(key, np.nan) for m in structure_metrics]) - ) + key: float(np.nanmean([m.get(key, np.nan) for m in structure_metrics])) for key in DEFAULT_THRESHOLDS.keys() } @@ -411,7 +425,8 @@ def _write_curve_payloads( def _chemical_formula_from_label(label: str) -> str: - """Convert a structure label to its reduced chemical formula. + """ + Convert a structure label to its reduced chemical formula. Parameters ---------- @@ -428,10 +443,9 @@ def _chemical_formula_from_label(label: str) -> str: return str(f.reduce()[0]) -def _build_formula_figures( - model_name: str, frame: pd.DataFrame -) -> None: - """Build and save an energy-vs-scale scatter figure for each formula. +def _build_formula_figures(model_name: str, frame: pd.DataFrame) -> None: + """ + Build and save an energy-vs-scale scatter figure for each formula. 
All structures that share the same reduced chemical formula are overlaid on a single figure. Figures are written to @@ -456,12 +470,14 @@ def _build_formula_figures( # Pre-compute symlog ticks tick_vals, tick_text = _symlog_ticks( - LINTHRESH_ENERGY, decades=SYMLOG_DECADES, linear_frac=LINEAR_FRAC, + LINTHRESH_ENERGY, + decades=SYMLOG_DECADES, + linear_frac=LINEAR_FRAC, ) for formula, struct_labels in formula_groups.items(): fig = go.Figure() - y_plot_range = [0,0] + y_plot_range = [0, 0] for idx, struct_label in enumerate(sorted(struct_labels)): group = ( frame[frame["structure"] == struct_label] @@ -484,15 +500,21 @@ def _build_formula_figures( marker={"size": 5, "color": color}, customdata=energies, hovertemplate=( - "scale: %{x:.3f}
" - "E/atom: %{customdata:.4f} eV" + "scale: %{x:.3f}
E/atom: %{customdata:.4f} eV" ), ) ) y_plot_range[0] = min(y_plot_range[0], min(_symlog(energies))) y_plot_range[1] = max(y_plot_range[1], max(_symlog(energies))) - - default_range = [max(tick_vals[0], y_plot_range[0]), min(tick_vals[-1], y_plot_range[1])] + + default_range = [ + max(tick_vals[0], y_plot_range[0]), + min(tick_vals[-1], y_plot_range[1]), + ] + + default_range[0] -= abs(default_range[0]) * 0.1 + default_range[1] += abs(default_range[1]) * 0.1 + fig.update_layout( title=f"{model_name} — {formula}", xaxis_title="Scale factor", @@ -565,9 +587,7 @@ def collect_metrics( OUT_PATH.mkdir(parents=True, exist_ok=True) - data = ( - structure_data if structure_data is not None else persist_compression_data() - ) + data = structure_data if structure_data is not None else persist_compression_data() for model_name, model_dataframe in data.items(): metrics = aggregate_model_metrics(model_dataframe) diff --git a/ml_peg/app/physicality/compression/app_compression.py b/ml_peg/app/physicality/compression/app_compression.py index 35a6c26eb..8cbbb960d 100644 --- a/ml_peg/app/physicality/compression/app_compression.py +++ b/ml_peg/app/physicality/compression/app_compression.py @@ -2,8 +2,6 @@ from __future__ import annotations -from pathlib import Path - from dash import Dash, Input, Output, callback, dcc from dash.dcc import Loading from dash.exceptions import PreventUpdate @@ -21,8 +19,7 @@ DATA_PATH = APP_ROOT / "data" / "physicality" / "compression" FIGURE_PATH = DATA_PATH / "figures" DOCS_URL = ( - "https://ddmms.github.io/ml-peg/user_guide/benchmarks/" - "physicality.html#compression" + "https://ddmms.github.io/ml-peg/user_guide/benchmarks/physicality.html#compression" ) @@ -61,6 +58,19 @@ def register_callbacks(self) -> None: Input(model_dropdown_id, "value"), ) def _update_composition_options(model_name: str): + """ + Update composition dropdown options based on selected model. + + Parameters + ---------- + model_name + Currently selected model identifier. 
+ + Returns + ------- + tuple + Dropdown options list and default value. + """ if not model_name: raise PreventUpdate formulas = _available_formulas(model_name) @@ -74,7 +84,21 @@ def _update_composition_options(model_name: str): Input(composition_dropdown_id, "value"), ) def _update_figure(model_name: str, composition: str | None): - """Load pre-built energy-per-atom vs scale factor figure.""" + """ + Load pre-built energy-per-atom vs scale factor figure. + + Parameters + ---------- + model_name + Currently selected model identifier. + composition + Reduced chemical formula for the composition group. + + Returns + ------- + Figure + Plotly figure loaded from the pre-built JSON file. + """ if not model_name or not composition: raise PreventUpdate @@ -131,8 +155,10 @@ def get_app() -> CompressionApp: return CompressionApp( name=BENCHMARK_NAME, description=( - "A handful of common prototype (and randomly generated via pyxtal) structures are isotropically scaled across a wide range. " - "A scale factor of 1.0 means that a pair of atoms in the structure is separated by the sum of their covalent radii. " + "A handful of common prototype (and randomly generated via pyxtal)" + "structures are isotropically scaled across a wide range." + "A scale factor of 1.0 means that a pair of atoms in the" + "structure is separated by the sum of their covalent radii. " "e.g. min(d_ij/(r_cov_i + r_cov_j)) = 1.0." 
), docs_url=DOCS_URL, diff --git a/ml_peg/calcs/physicality/compression/calc_compression.py b/ml_peg/calcs/physicality/compression/calc_compression.py index 644e75a92..45bf8b8b9 100644 --- a/ml_peg/calcs/physicality/compression/calc_compression.py +++ b/ml_peg/calcs/physicality/compression/calc_compression.py @@ -5,25 +5,20 @@ import json from pathlib import Path -import itertools - -from ase import Atoms +from ase import Atoms, units from ase.build import bulk, make_supercell -from ase.data import covalent_radii, atomic_numbers -from ase.io import read as ase_read, write -from ase.formula import Formula -from ase import units +from ase.data import atomic_numbers, covalent_radii +from ase.io import read as ase_read +from ase.io import write import numpy as np import pandas as pd import pytest -from tqdm import tqdm +from pyxtal.tolerance import Tol_matrix +import tqdm from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models -import pyxtal #used for generating random crystal structures -from pyxtal.tolerance import Tol_matrix - MODELS = load_models(current_models) # Local directory for calculator outputs @@ -35,13 +30,36 @@ # A factor of 1.0 corresponds to the equilibrium structure. 
MIN_SCALE = 0.25 MAX_SCALE = 3.0 -N_POINTS = 20 - -#ELEMENTS: list[str] = [symbol for symbol in chemical_symbols if symbol] -ELEMENTS: list[str] = ["H", "C", "N", "O"] # limit to common elements for testing -PROTOTYPES: list[str] = ["sc", "bcc","fcc", "hcp", "diamond"] # common crystal prototypes +N_POINTS = 100 + +# ELEMENTS: list[str] = [symbol for symbol in chemical_symbols if symbol] +ELEMENTS: list[str] = [ + "H", + "C", + "N", + "O", + "Ti", + "Cu", + "Cl", + "P", + "W", + "Rb", + "Xe", +] # limit to common elements for testing +PROTOTYPES: list[str] = [ + "sc", + "bcc", + "fcc", + "hcp", + "diamond", +] # common crystal prototypes MAX_ATOMS_PER_CELL = 6 # limit to small cells for testing -RANDOM_STRUCTURES: list[dict[str, int]] = [[1, len(ELEMENTS), 5], [2, 5, 10]] # [[Number of elements, number of compositions, repeats per composition], ...] +RANDOM_STRUCTURES: list[dict[str, int]] = [ + [1, len(ELEMENTS), 5], + [2, 10, 10], + [3, 10, 10], +] # [[Number of elements, number of compositions, repeats per composition], ...] 
+ def _scale_grid( min_scale: float, @@ -100,7 +118,9 @@ def _scale_using_isotropic_guess(atoms: Atoms) -> Atoms: np.fill_diagonal(distances, np.inf) # Build a matrix of target bond lengths (sum of covalent radii per pair) - radii = np.array([covalent_radii[atomic_numbers[s]] for s in sc.get_chemical_symbols()]) + radii = np.array( + [covalent_radii[atomic_numbers[s]] for s in sc.get_chemical_symbols()] + ) target_matrix = radii[:, None] + radii[None, :] # Mask infinite entries (self-distances) @@ -160,8 +180,12 @@ def _generate_prototype_structures( atoms = bulk(element, crystalstructure=proto, a=1.0) atoms = _scale_using_isotropic_guess(atoms) - assert np.all(atoms.pbc), f"Generated non-periodic structure for {label}" - assert atoms.get_volume() > 0.0, f"Generated zero-volume structure for {label}" + assert np.all(atoms.pbc), ( + f"Generated non-periodic structure for {label}" + ) + assert atoms.get_volume() > 0.0, ( + f"Generated zero-volume structure for {label}" + ) structures[label] = atoms except (ValueError, KeyError, RuntimeError) as exc: @@ -198,6 +222,9 @@ def _gen_random_structure( random seed is generated automatically. max_attempts Maximum number of space-group attempts before giving up. + volume_factor + Volume factor used when generating random structures. + Smaller factors will results in denser structures. Returns ------- @@ -211,7 +238,7 @@ def _gen_random_structure( RuntimeError If no valid structure could be generated within *max_attempts*. 
""" - from pyxtal import pyxtal as PyXtal + from pyxtal import pyxtal as pyxtal_cls rng = np.random.default_rng(seed) @@ -221,8 +248,11 @@ def _gen_random_structure( # If a specific space group was requested, try it first sg_candidates: list[int] = [] if space_group is not None: - #sg_candidates.append(space_group) - sg_candidates = [space_group] * max_attempts # try the same one repeatedly to allow for randomness in the structure generation + # sg_candidates.append(space_group) + sg_candidates = [ + space_group + ] * max_attempts # try the same one repeatedly to allow for + # randomness in the structure generation # Fill remaining attempts with random space groups while len(sg_candidates) < max_attempts: @@ -230,7 +260,7 @@ def _gen_random_structure( custom_tol = Tol_matrix(prototype="atomic", factor=1.0) for sg in sg_candidates: - crystal = PyXtal() + crystal = pyxtal_cls() crystal.from_random( dim=dim, group=sg, @@ -238,8 +268,8 @@ def _gen_random_structure( numIons=num_atoms, random_state=int(rng.integers(0, 2**31)), tm=custom_tol, - max_count=100000, - factor=volume_factor + max_count=1000, + factor=volume_factor, ) if not crystal.valid: continue @@ -248,7 +278,7 @@ def _gen_random_structure( atoms.pbc = True atoms = _scale_using_isotropic_guess(atoms) - return atoms + return atoms # noqa: RET504 # except Exception: # continue @@ -264,7 +294,9 @@ def _composition_label(composition: dict[str, int]) -> str: Elements are sorted alphabetically and counts are appended only when greater than one (e.g. ``{"C": 1, "O": 2}`` → ``"C-O2"``). - elements split by "-" to avoid confusion with the underscore used for separating the composition from the rest of the label (e.g. "C-O2_pyxtal_0") + Elements are split by ``"-"`` to avoid confusion with the + underscore used for separating the composition from the rest + of the label (e.g. ``"C-O2_pyxtal_0"``). 
Parameters ---------- @@ -342,11 +374,16 @@ def _generate_all_random( max_comp_attempts = n_compositions * 50 # safety limit attempts = 0 - while len(generated_compositions) < n_compositions and attempts < max_comp_attempts: + while ( + len(generated_compositions) < n_compositions + and attempts < max_comp_attempts + ): attempts += 1 if n_elements == 1: - #pick the element which occurs least frequently in the already generated compositions to increase diversity - elem_counts = {elem: 0 for elem in elements} + # Pick the element which occurs least frequently + # in the already generated compositions to + # increase diversity. + elem_counts = dict.fromkeys(elements, 0) for comp in generated_compositions: for elem in comp: elem_counts[elem] += comp[elem] @@ -357,36 +394,44 @@ def _generate_all_random( rng.choice(elements, size=n_elements, replace=False).tolist() ) # Random atom counts that sum to at most max_atoms (each >= 1) - counts = rng.integers(1, max(2, max_atoms - n_elements + 2), size=n_elements) + counts = rng.integers( + 1, max(2, max_atoms - n_elements + 2), size=n_elements + ) if len(counts) == 1: - counts = np.random.choice(range(max_atoms//2, max_atoms + 1), size=1) # for single element, just pick a random count between max_atoms//2 and max_atoms - + counts = np.random.choice( + range(max_atoms // 2, max_atoms + 1), size=1 + ) # for single element, pick a random count + # between max_atoms//2 and max_atoms + # Clip total to max_atoms total = int(counts.sum()) if total > max_atoms: counts = (counts * max_atoms / total).astype(int) counts = np.maximum(counts, 1) - - composition = dict(zip(chosen_elements, counts.tolist())) + composition = dict(zip(chosen_elements, counts.tolist(), strict=True)) label = _composition_label(composition) if label not in seen_labels: seen_labels.add(label) generated_compositions.append(composition) - - print(f"Generated {len(generated_compositions)} unique compositions with {n_elements} elements: {generated_compositions}") + + 
print( + f"Generated {len(generated_compositions)} unique " + f"compositions with {n_elements} elements: " + f"{generated_compositions}" + ) # Generate structures for each composition - for composition in generated_compositions: - #comp_label = _composition_label(composition) + for composition in tqdm.tqdm(generated_compositions): + # comp_label = _composition_label(composition) for i in range(repeats): - #struct_label = f"{comp_label}_pyxtal_{i}" + # struct_label = f"{comp_label}_pyxtal_{i}" try: atoms = _gen_random_structure( composition, seed=int(rng.integers(0, 2**31)), space_group=1, # random space group ) - + struct_label = f"{atoms.get_chemical_formula()}_pyxtal_{i}" atoms.info["label"] = struct_label structures[struct_label] = atoms @@ -507,7 +552,6 @@ def run_compression(model_name: str, model) -> None: write_dir.mkdir(parents=True, exist_ok=True) traj_dir = write_dir / "compression" traj_dir.mkdir(parents=True, exist_ok=True) - reference_structures = _collect_structures(ELEMENTS, PROTOTYPES, DATA_PATH) if not reference_structures: @@ -520,7 +564,7 @@ def run_compression(model_name: str, model) -> None: scales = _scale_grid(MIN_SCALE, MAX_SCALE, N_POINTS) - for struct_label, ref_atoms in tqdm( + for struct_label, ref_atoms in tqdm.tqdm( reference_structures.items(), desc=f"{model_name} compression", unit="struct", @@ -550,7 +594,9 @@ def run_compression(model_name: str, model) -> None: # Stress tensor (Voigt notation, eV/ų) try: stress_voigt = atoms.get_stress(voigt=False) - pressure = -1/3 * np.trace(stress_voigt)/units.GPa # hydrostatic pressure + pressure = ( + -1 / 3 * np.trace(stress_voigt) / units.GPa + ) # hydrostatic pressure except Exception: stress_voigt = np.zeros(6) pressure = np.nan @@ -628,13 +674,15 @@ def test_compression(model_name: str) -> None: if __name__ == "__main__": - # generate the random structures and save to data directory + # generate the random structures and save to data directory # this only needs to be done once - # code 
included here so that the structures can be easily regenerated in the same way if needed, or more can be created etc. + # code included here so that the structures can be easily + # regenerated in the same way if needed, or more can be + # created etc. _generate_all_random( RANDOM_STRUCTURES, ELEMENTS, MAX_ATOMS_PER_CELL, DATA_PATH, seed=42, - ) \ No newline at end of file + ) diff --git a/ml_peg/calcs/physicality/compression/data/README.md b/ml_peg/calcs/physicality/compression/data/README.md index 9ba7177e1..f128488c9 100644 --- a/ml_peg/calcs/physicality/compression/data/README.md +++ b/ml_peg/calcs/physicality/compression/data/README.md @@ -3,6 +3,6 @@ Place reference crystal structure files here (``.xyz``, ``.extxyz``, or ``.cif`` format). 1.xyz contains random unary structures -2.xyz contains random binary structures etc. +2.xyz contains random binary structures etc. Random structures generated using PyXtal diff --git a/ml_peg/models/models.yml b/ml_peg/models/models.yml index 0a46eff25..cb28285e2 100644 --- a/ml_peg/models/models.yml +++ b/ml_peg/models/models.yml @@ -66,7 +66,7 @@ mace-matpes-r2scan: orb-v3-consv-inf-omat: module: orb_models.forcefield class_name: OrbCalc - device: "cpu" + device: "auto" default_dtype: float32-high trained_on_dispersion: false level_of_theory: PBE From 1f44934970899559533df1a156ede778b8220c96 Mon Sep 17 00:00:00 2001 From: James Darby Date: Tue, 10 Mar 2026 21:55:05 +0000 Subject: [PATCH 11/12] user guide and undoing default setting changes elsewhere --- .../user_guide/benchmarks/physicality.rst | 60 +++++++++++++++++++ .../compression/analyse_compression.py | 2 +- .../physicality/diatomics/calc_diatomics.py | 2 +- ml_peg/models/models.yml | 4 +- 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/docs/source/user_guide/benchmarks/physicality.rst b/docs/source/user_guide/benchmarks/physicality.rst index 0d5b55d25..e5979e687 100644 --- a/docs/source/user_guide/benchmarks/physicality.rst +++ 
b/docs/source/user_guide/benchmarks/physicality.rst
@@ -135,3 +135,63 @@ Data availability
 -----------------
 
 None required; diatomics are generated in ASE.
+
+
+Compression
+===========
+
+Summary
+-------
+This benchmark is a many-body analogue to the diatomics benchmark.
+Single element crystal structures are generated using common prototypes, namely (sc, bcc, fcc, hcp and diamond).
+Further structures are randomly generated, subject to minimum distance constraints, using pyxtal.
+The final structures are then isotropically scaled such that min(d_ij/(r_i + r_j)) = 1.0,
+where d_ij is the distance between atoms i and j, and r_i and r_j are the covalent radii of the respective atoms.
+A linear grid of scale factors containing 100 points is generated from 0.25 to 3.0.
+The structures are then scaled (scale factor applied to all cell vectors) and the energy and pressure are calculated.
+The resulting energies and projected forces are analysed for unphysical oscillations and "holes".
+
+Metrics
+-------
+
+1. Pressure sign flips and Big Pressure sign flips
+
+    Analogous to force flips in the diatomics benchmark, this metric counts the number of times the isotropic pressure changes sign.
+    Pressure sign flips uses a tolerance of 0.001 GPa whilst Big Pressure sign flips uses a tolerance of 1 GPa, to avoid counting noise-induced flips.
+    Whilst it is plausible that some, very contrived, configurations should show multiple sign flips in the pressure curve, almost all materials should show 1.
+
+
+2. Energy minima and Deep Energy minima
+
+    Mean count of distinct minima in the energy-scale factor profile. Local minima are
+    found from the second derivative using a tolerance of 1 meV/atom and 100 meV/atom for Energy minima and Deep Energy minima respectively.
+
+
+3. Holes
+
+    A "hole" is defined as a point on the energy-scale factor curve where the energy is less than the energy at the equilibrium scale factor by more than 100 eV/atom.
+    This is a crude and "cautious" definition, with the idea of flagging catastrophic model failures only.
+
+4. :math:`\rho(-E, V_{\text{small}})`
+
+    Spearman correlation between scale factor and minus energy for :math:`scale < scale(E_{\min})` (i.e. the compressed side of the curve).
+    A perfect curve should show a strong positive correlation, so a value of +1, indicating that as the structure is compressed, the energy increases.
+    As above, this assumes a single minimum in the energy-scale factor curve, which is the case for almost all materials.
+    This metric will be polluted in models containing holes.
+
+5. :math:`\rho(E, V_{\text{large}})`
+
+    Spearman correlation between scale factor and energy for :math:`scale > scale(E_{\min})` (i.e. the expanded side of the curve).
+    A perfect curve should show a strong positive correlation, so a value of +1, indicating that as the structure is expanded, the energy increases.
+    As above, this assumes a single minimum in the energy-scale factor curve, which is the case for almost all materials.
+    This metric will be polluted in models containing holes.
+
+Computational cost
+------------------
+
+High: Expected to take hours to run on GPU, or around one day for slower MLIPs.
+
+Data availability
+-----------------
+
+None required; prototype structures are generated in ASE and random structures are generated using pyxtal.
diff --git a/ml_peg/analysis/physicality/compression/analyse_compression.py b/ml_peg/analysis/physicality/compression/analyse_compression.py index 9297cfd18..0f8d26efa 100644 --- a/ml_peg/analysis/physicality/compression/analyse_compression.py +++ b/ml_peg/analysis/physicality/compression/analyse_compression.py @@ -277,7 +277,7 @@ def compute_structure_metrics( holes = 1 # Pressure sign flips - pressure_flips = count_sign_changes(pressures, tol=1e-4) + pressure_flips = count_sign_changes(pressures, tol=1e-3) big_pressure_flips = count_sign_changes(pressures, tol=1) # Spearman correlations in compressed and expanded regimes diff --git a/ml_peg/calcs/physicality/diatomics/calc_diatomics.py b/ml_peg/calcs/physicality/diatomics/calc_diatomics.py index b3182e7d6..84c06752a 100644 --- a/ml_peg/calcs/physicality/diatomics/calc_diatomics.py +++ b/ml_peg/calcs/physicality/diatomics/calc_diatomics.py @@ -25,7 +25,7 @@ # Benchmark configuration (matches historical benchmark settings) ELEMENTS: list[str] = [symbol for symbol in chemical_symbols if symbol] -INCLUDE_HETERONUCLEAR = False +INCLUDE_HETERONUCLEAR = True MIN_DISTANCE = 0.18 MAX_DISTANCE = 6.0 # for testing, reduce the number of points to e.g. 
5 diff --git a/ml_peg/models/models.yml b/ml_peg/models/models.yml index cb28285e2..1d69c64be 100644 --- a/ml_peg/models/models.yml +++ b/ml_peg/models/models.yml @@ -66,7 +66,7 @@ mace-matpes-r2scan: orb-v3-consv-inf-omat: module: orb_models.forcefield class_name: OrbCalc - device: "auto" + device: "cpu" default_dtype: float32-high trained_on_dispersion: false level_of_theory: PBE @@ -173,7 +173,7 @@ orb-v3-consv-inf-omat: pet-mad: module: pet_mad.calculator class_name: PETMADCalculator - device: "auto" + device: "cpu" default_dtype: float32 trained_on_dispersion: false level_of_theory: PBEsol From 03dff7bf0485455eac295b4d839437ab28483d62 Mon Sep 17 00:00:00 2001 From: James Darby Date: Tue, 10 Mar 2026 22:00:12 +0000 Subject: [PATCH 12/12] removed old readme --- docs/source/user_guide/benchmarks/physicality.rst | 1 + ml_peg/calcs/physicality/compression/data/README.md | 8 -------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 ml_peg/calcs/physicality/compression/data/README.md diff --git a/docs/source/user_guide/benchmarks/physicality.rst b/docs/source/user_guide/benchmarks/physicality.rst index e5979e687..d28e04acf 100644 --- a/docs/source/user_guide/benchmarks/physicality.rst +++ b/docs/source/user_guide/benchmarks/physicality.rst @@ -150,6 +150,7 @@ where d_ij is the distance between atoms i and j, and r_i and r_j are the covale A linear grid of scale factors containing 100 points is generated from 0.25 to 3.0. The structures are then scaled (scale factor applied to all cell vectors) and the energy and pressure are calculated. The resulting energies and projected forces are analysed for unphysical oscillations and "holes". +All structures were generated by running ml_peg/calcs/physicality/compression/calc_compression.py directly. 
Metrics ------- diff --git a/ml_peg/calcs/physicality/compression/data/README.md b/ml_peg/calcs/physicality/compression/data/README.md deleted file mode 100644 index f128488c9..000000000 --- a/ml_peg/calcs/physicality/compression/data/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# Compression Benchmark Reference Structures - -Place reference crystal structure files here (``.xyz``, ``.extxyz``, or ``.cif`` format). - -1.xyz contains random unary structures -2.xyz contains random binary structures etc. - -Random structures generated using PyXtal