Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
62077b9
claude first attempt to add per variable loss to TrainAgg
Arcomano1234 Mar 16, 2026
1d1aded
make per channel loss configurable and clean up code
Arcomano1234 Mar 17, 2026
7079529
test era5 config
Arcomano1234 Mar 17, 2026
ef1b9f6
set per channel loss in TrainAgg to False
Arcomano1234 Mar 17, 2026
87af429
update tests
Arcomano1234 Mar 17, 2026
6223e0e
incorporate comments
Arcomano1234 Mar 18, 2026
35f2f1d
fix doc
Arcomano1234 Mar 18, 2026
84211b1
remove coupling of train agg and generic trainer
Arcomano1234 Mar 18, 2026
7def3ca
Merge branch 'main' into feature/per-channel-loss-train-agg
Arcomano1234 Mar 18, 2026
a67c993
revert changes to ERA5 configs
Arcomano1234 Mar 18, 2026
377fdc0
Merge branch 'feature/per-channel-loss-train-agg' of github.com:ai2cm…
Arcomano1234 Mar 18, 2026
61811a0
claude attempt for breaking things up to PRs
Arcomano1234 Mar 20, 2026
b310017
claude attempt for breaking things up to PRs
Arcomano1234 Mar 20, 2026
022dc70
claude losses return 1d vector over channel dim
Arcomano1234 Mar 23, 2026
c608a9b
claude refactor to no reduce losses
Arcomano1234 Mar 24, 2026
3ad2b01
Merge branch 'main' into feature/losses-return-1d-loss-vector
Arcomano1234 Mar 26, 2026
1f75eba
make loss reduction an argument
Arcomano1234 Mar 30, 2026
373d6ec
move reduction arg to StepLoss forward
Arcomano1234 Mar 30, 2026
f64dc68
address naming comment
Arcomano1234 Mar 30, 2026
d236f93
Merge branch 'main' into feature/per-channel-loss-train-agg
Arcomano1234 Mar 30, 2026
a78a4a8
Merge branch 'feature/losses-return-1d-loss-vector' into feature/per-…
Arcomano1234 Mar 30, 2026
72c0e19
Add newly created regression files
Arcomano1234 Mar 30, 2026
f82f599
clean up loss and tests to remove unused code after the loss-return-1…
Arcomano1234 Mar 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions fme/ace/aggregator/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,93 @@ def test_aggregator_gets_logs_with_no_batches(config: TrainAggregatorConfig):
logs = agg.get_logs(label="test")
assert np.isnan(logs.pop("test/mean/loss"))
assert logs == {}


def test_aggregator_logs_per_channel_loss():
    """
    Per-channel (per-variable) loss is accumulated from batch.metrics and reported.
    """
    batch_size, n_ensemble, n_time = 4, 1, 2
    nx, ny = 2, 2
    device = get_device()
    operations = LatLonOperations(
        area_weights=torch.ones(nx, ny, device=device)
    )
    agg = TrainAggregator(
        config=TrainAggregatorConfig(
            spherical_power_spectrum=False, weighted_rmse=False, per_channel_loss=True
        ),
        operations=operations,
    )
    target_data = EnsembleTensorDict(
        {"a": torch.randn(batch_size, 1, n_time, nx, ny, device=device)},
    )
    gen_data = EnsembleTensorDict(
        {"a": torch.randn(batch_size, n_ensemble, n_time, nx, ny, device=device)},
    )
    # Record two batches with different totals; the aggregator should report
    # the mean over batches for both the total and the per-variable loss.
    for total_loss, channel_loss in [(1.0, 0.5), (2.0, 1.0)]:
        agg.record_batch(
            batch=TrainOutput(
                metrics={
                    "loss": torch.tensor(total_loss, device=device),
                    "loss/a": torch.tensor(channel_loss, device=device),
                },
                target_data=target_data,
                gen_data=gen_data,
                time=xr.DataArray(
                    np.zeros((batch_size, n_time)), dims=["sample", "time"]
                ),
                normalize=lambda x: x,
            ),
        )
    logs = agg.get_logs(label="train")
    assert logs["train/mean/loss"] == 1.5
    assert logs["train/mean/loss/a"] == 0.75


def test_aggregator_per_channel_loss_disabled():
    """When per_channel_loss=False, get_logs does not include per-variable loss."""
    batch_size, n_ensemble, n_time = 4, 1, 2
    nx, ny = 2, 2
    device = get_device()
    operations = LatLonOperations(
        area_weights=torch.ones(nx, ny, device=device)
    )
    agg = TrainAggregator(
        config=TrainAggregatorConfig(
            spherical_power_spectrum=False,
            weighted_rmse=False,
            per_channel_loss=False,
        ),
        operations=operations,
    )
    target_data = EnsembleTensorDict(
        {"a": torch.randn(batch_size, 1, n_time, nx, ny, device=device)},
    )
    gen_data = EnsembleTensorDict(
        {"a": torch.randn(batch_size, n_ensemble, n_time, nx, ny, device=device)},
    )
    # A per-variable metric is recorded, but with the feature disabled it
    # must not appear in the logs.
    batch = TrainOutput(
        metrics={
            "loss": torch.tensor(1.0, device=device),
            "loss/a": torch.tensor(0.5, device=device),
        },
        target_data=target_data,
        gen_data=gen_data,
        time=xr.DataArray(np.zeros((batch_size, n_time)), dims=["sample", "time"]),
        normalize=lambda x: x,
    )
    agg.record_batch(batch=batch)
    logs = agg.get_logs(label="train")
    assert logs["train/mean/loss"] == 1.0
    assert "train/mean/loss/a" not in logs
23 changes: 23 additions & 0 deletions fme/ace/aggregator/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
from fme.core.tensors import fold_ensemble_dim, fold_sized_ensemble_dim
from fme.core.typing_ import TensorMapping

# Metric key prefix for per-variable loss (must match stepper's metrics["loss/<var>"]).
PER_CHANNEL_LOSS_PREFIX = "loss/"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not ideal to have this coupling with the naming in the stepper metrics, but this already exists for the other loss terms so I think it's okay.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I was not a fan of this at all either, but Claude and I couldn't think of a good way. I guess the one thing I can do is make this an aggregator itself and decouple it from the stepper. This would also help reduce the need to record it when we aren't using it during training.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem then becomes getting the loss function to the aggregator, which has its own complications. I defer to you or Jeremy on whether it's worth decoupling this from the stepper and just passing a loss_fn to a "PerChannelLossAggregator".

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, like I said it's a pre-existing issue so I don't think we should worry about decoupling in this PR. But open to other thoughts from @mcgibbon on this.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest using a new attribute on the TrainOutput instead of a string label.



@dataclasses.dataclass
class TrainAggregatorConfig:
Expand All @@ -23,10 +26,13 @@ class TrainAggregatorConfig:
Attributes:
spherical_power_spectrum: Whether to compute the spherical power spectrum.
weighted_rmse: Whether to compute the weighted RMSE.
per_channel_loss: Whether to accumulate and report per-variable (per-channel)
loss in get_logs (e.g. train/mean/loss/<var_name>).
"""

spherical_power_spectrum: bool = True
weighted_rmse: bool = True
per_channel_loss: bool = True


class Aggregator(Protocol):
Expand All @@ -48,6 +54,8 @@ class TrainAggregator(AggregatorABC[TrainOutput]):
def __init__(self, config: TrainAggregatorConfig, operations: GriddedOperations):
self._n_loss_batches = 0
self._loss = torch.tensor(0.0, device=get_device())
self._per_channel_loss: dict[str, torch.Tensor] = {}
self._per_channel_loss_enabled = config.per_channel_loss
self._paired_aggregators: dict[str, Aggregator] = {}
if config.spherical_power_spectrum:
try:
Expand All @@ -73,6 +81,16 @@ def __init__(self, config: TrainAggregatorConfig, operations: GriddedOperations)
def record_batch(self, batch: TrainOutput):
self._loss += batch.metrics["loss"]
self._n_loss_batches += 1
if self._per_channel_loss_enabled:
for key, value in batch.metrics.items():
if not key.startswith(PER_CHANNEL_LOSS_PREFIX):
continue
var_name = key.removeprefix(PER_CHANNEL_LOSS_PREFIX)
acc = self._per_channel_loss.get(
var_name,
torch.tensor(0.0, device=get_device(), dtype=value.dtype),
)
self._per_channel_loss[var_name] = acc + value

folded_gen_data, n_ensemble = fold_ensemble_dim(batch.gen_data)
folded_target_data = fold_sized_ensemble_dim(batch.target_data, n_ensemble)
Expand Down Expand Up @@ -100,6 +118,11 @@ def get_logs(self, label: str) -> dict[str, torch.Tensor]:
logs[f"{label}/mean/loss"] = float(
dist.reduce_mean(self._loss / self._n_loss_batches).cpu().numpy()
)
if self._n_loss_batches > 0 and self._per_channel_loss_enabled:
for var_name, acc in self._per_channel_loss.items():
logs[f"{label}/mean/loss/{var_name}"] = float(
dist.reduce_mean(acc / self._n_loss_batches).cpu().numpy()
)
return logs

@torch.no_grad()
Expand Down
19 changes: 16 additions & 3 deletions fme/ace/stepper/single_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -1606,6 +1606,7 @@ def _accumulate_loss(
"data requirements are retrieved, so this is a bug."
)
n_forward_steps = stochastic_n_forward_steps
per_channel_sum: dict[str, torch.Tensor] | None = None
for step in range(n_forward_steps):
optimize_step = (
step == n_forward_steps - 1 or not self._config.optimize_last_step_only
Expand All @@ -1626,10 +1627,22 @@ def _accumulate_loss(
for k, v in target_data.data.items()
}
)
step_loss = self._loss_obj(gen_step, target_step, step=step)
metrics[f"loss_step_{step}"] = step_loss.detach()
step_loss = self._loss_obj(
gen_step, target_step, step=step, reduce=False
)
step_total_loss = step_loss.sum()
metrics[f"loss_step_{step}"] = step_total_loss.detach()
per_ch = self._loss_obj.loss.packer.unpack(step_loss.detach(), axis=0)
if per_channel_sum is None:
per_channel_sum = {k: v.clone() for k, v in per_ch.items()}
else:
for k in per_channel_sum:
per_channel_sum[k] = per_channel_sum[k] + per_ch[k]
if optimize_step:
optimization.accumulate_loss(step_loss)
optimization.accumulate_loss(step_total_loss)
if per_channel_sum is not None:
for k, v in per_channel_sum.items():
metrics[f"loss/{k}"] = v.detach()
return output_list

def update_training_history(self, training_job: TrainingJob) -> None:
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
5 changes: 4 additions & 1 deletion fme/core/generics/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,10 @@ def train_one_epoch(self):
stop_batch=self.config.train_evaluation_batches
):
with GlobalTimer():
stepped = self.stepper.train_on_batch(batch, self._no_optimization)
stepped = self.stepper.train_on_batch(
batch,
self._no_optimization,
)
aggregator.record_batch(stepped)
if (
self._should_save_checkpoints()
Expand Down
67 changes: 62 additions & 5 deletions fme/core/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,42 @@
from fme.core.typing_ import TensorMapping


def _channel_dim_positive(ndim: int, channel_dim: int) -> int:
return channel_dim if channel_dim >= 0 else ndim + channel_dim


def _uniform_broadcast_per_channel(
total: torch.Tensor, n_channels: int
) -> torch.Tensor:
"""Evenly split a scalar across ``n_channels`` so ``.sum() == total``."""
return (total / n_channels).expand(n_channels)


def _reduce_to_per_channel(
loss_value: torch.Tensor,
channel_dim: int,
n_channels: int,
) -> torch.Tensor:
"""Reduce any loss tensor to shape ``(n_channels,)``.

Handles element-wise tensors, partially-reduced tensors
(e.g. after area-weighted mean removes spatial dims), and
scalars (uniformly broadcast).

``channel_dim`` must be a **non-negative** index, computed from
the *input* tensor (before the loss potentially removes dims).

The result satisfies ``tensor.sum() == loss_value.mean()``
for element-wise losses.
"""
if loss_value.ndim == 0:
return _uniform_broadcast_per_channel(loss_value, n_channels)
dims = tuple(i for i in range(loss_value.ndim) if i != channel_dim)
if not dims:
return loss_value / n_channels
return loss_value.mean(dim=dims) / n_channels


class NaNLoss(torch.nn.Module):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -59,7 +95,16 @@ def __call__(
self,
predict_dict: TensorMapping,
target_dict: TensorMapping,
):
reduce: bool = True,
) -> torch.Tensor:
"""
Args:
predict_dict: The predicted data.
target_dict: The target data.
reduce: If True (default), return a scalar loss.
If False, return a per-channel loss vector of
shape ``(n_channels,)``.
"""
predict_tensors = self.packer.pack(
self.normalizer.normalize(predict_dict), axis=self.channel_dim
)
Expand All @@ -71,7 +116,14 @@ def __call__(
predict_tensors = torch.where(nan_mask, 0.0, predict_tensors)
target_tensors = torch.where(nan_mask, 0.0, target_tensors)

return self.loss(predict_tensors, target_tensors)
result = self.loss(predict_tensors, target_tensors)
if not reduce:
cdim = _channel_dim_positive(predict_tensors.ndim, self.channel_dim)
n_c = int(predict_tensors.shape[cdim])
return _reduce_to_per_channel(result, cdim, n_c)
if result.ndim > 0:
return result.mean()
return result

def get_normalizer_state(self) -> dict[str, float]:
return self.normalizer.get_state()
Expand Down Expand Up @@ -457,18 +509,23 @@ def forward(
predict_dict: TensorMapping,
target_dict: TensorMapping,
step: int,
reduce: bool = True,
) -> torch.Tensor:
"""
Args:
predict_dict: The predicted data.
target_dict: The target data.
step: The step number, indexed from 0 for the first step.
reduce: If True (default), return a scalar loss.
If False, return a per-channel loss vector of
shape ``(n_channels,)``.

Returns:
The loss.
The loss, scalar or ``(n_channels,)`` depending on
``reduce``.
"""
step_weight = (1.0 + self.sqrt_loss_decay_constant * step) ** (-0.5)
return self.loss(predict_dict, target_dict) * step_weight
return self.loss(predict_dict, target_dict, reduce=reduce) * step_weight


@dataclasses.dataclass
Expand Down Expand Up @@ -525,7 +582,7 @@ def build(
channel_dim: int = -3,
) -> StepLoss:
loss = self.loss_config.build(
reduction="mean",
reduction="none",
gridded_operations=gridded_ops,
)
return StepLoss(
Expand Down
Loading
Loading