Laurits7 · Laurits7 · Jul 2, 2025 · Jun 22, 2025 · Jun 23, 2025 · Jun 24, 2025
diff --git a/ml4cc/config/datasets/CEPC.yaml b/ml4cc/config/datasets/CEPC.yaml
@@ -1,6 +1,6 @@
 name: CEPC
 input_dim: 3000
-max_peak_cands: 1580
+max_peak_cands: 1600
 branches: null
 tree_path: sim
 particle_types:

diff --git a/ml4cc/config/datasets/FCC.yaml b/ml4cc/config/datasets/FCC.yaml
@@ -1,6 +1,6 @@
 name: FCC
 input_dim: 1200
-max_peak_cands: 645
+max_peak_cands: 700
 branches: null
 tree_path: sim
 particle_types:

diff --git a/ml4cc/config/evaluation/evaluation.yaml b/ml4cc/config/evaluation/evaluation.yaml
@@ -2,4 +2,4 @@ dataset:
   num_evaluation_waveforms: 1000
   results_output_dir: ${training.results_dir}/data
 training:
-  eval_all_always: false
+  eval_all: ${training.eval_all}
diff --git a/ml4cc/config/models/one_step/models/transformer.yaml b/ml4cc/config/models/one_step/models/transformer.yaml
@@ -1,13 +1,13 @@
 _target_: ml4cc.models.transformer.TransformerModule
 name: transformer
 hyperparameters:
-    input_dim: ${dataset.input_dim} # Actually this should be 1 or 15.
-    d_model: 128 #512
+    input_dim: 15 # per-token feature count (1 or 15); previously ${dataset.input_dim}
+    d_model: 128 # 512
     num_heads: 8
-    num_layers: 2 #3
-    hidden_dim: 248 #2048
+    num_layers: 2 # 3
+    hidden_dim: 256 # 2048
     num_classes: 1
-    max_len: ${dataset.input_dim}
+    max_len: ${dataset.input_dim} # TODO: confirm whether this should be ${dataset.max_peak_cands} instead
     lr: 1e-6
 checkpoint:
     model: null

diff --git a/ml4cc/config/models/two_step/clusterization/DNN.yaml b/ml4cc/config/models/two_step/clusterization/DNN.yaml
@@ -1,7 +1,7 @@
 _target_: ml4cc.models.simpler_models.DNNModule
 name: DNN
 hyperparameters:
-    n_features: 1650
+    n_features: ${dataset.max_peak_cands}
     linear_layer_1:
         out_features: 32
     linear_layer_2:

diff --git a/ml4cc/config/models/two_step/two_step.yaml b/ml4cc/config/models/two_step/two_step.yaml
@@ -1,4 +1,4 @@
 defaults:
   - _self_
   - peak_finding@peak_finding.model: LSTM
-  - clusterization@clusterization.model: CNN
+  - clusterization@clusterization.model: CNN
diff --git a/ml4cc/config/models/two_step_minimal/models/LSTM.yaml b/ml4cc/config/models/two_step_minimal/models/LSTM.yaml
@@ -5,7 +5,7 @@ hyperparameters:
     num_features: 15
     lstm_hidden_dim: 32
     num_lstm_layers: 1
-    regression: true
+    regression: false
 
 checkpoint:
     model: null

diff --git a/ml4cc/config/training.yaml b/ml4cc/config/training.yaml
@@ -7,7 +7,7 @@ training:
     models_dir: ${training.output_dir_}/models
     log_dir: ${training.output_dir_}/logs
     predictions_dir: ${training.output_dir_}/predictions
-    results_dir: ${training.output_dir}/results
+    results_dir: ${training.output_dir_}/results
     dataloader:
         batch_sizes:
             one_step: 128

diff --git a/ml4cc/models/LSTM.py b/ml4cc/models/LSTM.py
@@ -3,29 +3,30 @@
 import torch.optim as optim
 import torch.nn.functional as F
 from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
 
 # TODO: Is this implemented like in their paper? In their paper they have
 # multiple LSTMs.
 class LSTM(torch.nn.Module):
-    def __init__(self, num_features, lstm_hidden_dim: int = 32, num_lstm_layers: int = 1, regression: bool = False):
+    def __init__(self, num_features, lstm_hidden_dim: int = 32, num_lstm_layers: int = 1):
         super().__init__()
-        self.regression = regression
         self.lstm = torch.nn.LSTM(
             input_size=num_features, num_layers=num_lstm_layers, hidden_size=lstm_hidden_dim, batch_first=True
         )
         self.fc3 = torch.nn.Linear(lstm_hidden_dim, 32)
         self.fc4 = torch.nn.Linear(32, 1)
 
-    def forward(self, x):
-        ula, (h, _) = self.lstm(x)
-        out = h[-1] if self.regression else ula
-        # If we would like to have a prediction for each point in wf, then we
-        # would use ula instead of out here
-        out = F.relu(self.fc3(out))
+    def forward(self, x, mask):
+        lengths = mask.sum(dim=1)  # number of non-zero mask entries per sequence
+        packed_x = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
+        ula, (h, _) = self.lstm(packed_x)
+        # Unpack to the full input length (not just the longest sequence in the batch) so the output aligns with the target
+        padded_out, _ = pad_packed_sequence(ula, batch_first=True, total_length=x.size(1))
+        out = F.relu(self.fc3(padded_out))
         out = self.fc4(out)
-        clf = out if self.regression else F.sigmoid(out)
-        return clf
+        out = torch.sigmoid(out)  # use torch.sigmoid, not F.sigmoid (deprecated)
+        return out
 
 
 class LSTMModule(L.LightningModule):
@@ -37,7 +38,6 @@ def __init__(self, name: str, hyperparameters: dict, checkpoint: dict = None):
             num_features=self.hyperparameters["num_features"],
             lstm_hidden_dim=self.hyperparameters["lstm_hidden_dim"],
             num_lstm_layers=self.hyperparameters["num_lstm_layers"],
-            regression=self.hyperparameters.get("regression", False),
         )
 
     def training_step(self, batch, batch_idx):
@@ -51,8 +51,9 @@ def training_step(self, batch, batch_idx):
 
     def validation_step(self, batch, batch_idx):
         predicted_labels, target = self.forward(batch)
-        masked_target = target[target != -1]
-        masked_predicted_labels = predicted_labels[target != -1]
+        target_mask = (target != -1) & (target != -999)  # keep entries whose target is neither -1 nor -999 (presumably padding/ignore markers - confirm)
+        masked_target = target[target_mask]
+        masked_predicted_labels = predicted_labels[target_mask]
         loss = F.binary_cross_entropy(masked_predicted_labels, masked_target)
         self.log("val_loss", loss)
         return loss
@@ -76,6 +77,6 @@ def test_step(self, batch, batch_idx):
         return predicted_labels
 
     def forward(self, batch):
-        waveform, target = batch
-        predicted_labels = self.lstm(waveform).squeeze(dim=-1)
+        waveform, target, mask = batch  # batches carry a mask next to waveform and target
+        predicted_labels = self.lstm(waveform, mask).squeeze(dim=-1)  # drop the trailing size-1 output dim
         return predicted_labels, target
diff --git a/ml4cc/models/simpler_models.py b/ml4cc/models/simpler_models.py
@@ -66,7 +66,7 @@ def __init__(self, hyperparameters):
         self.flatten = nn.Flatten()
         self.fc1 = nn.Linear(
             # Compute flattened input size manually
-            in_features=6560,
+            in_features=2752,
             out_features=hyperparameters.linear_layer_1.out_features,
         )
         self.output = nn.Linear(
@@ -121,7 +121,7 @@ def forward(self, x, mask):
         lengths = mask.sum(dim=1)
         # Pack the padded sequence
         packed_x = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
-        ula, (h, _) = self.lstm(x)
+        ula, (h, _) = self.lstm(packed_x)
         # Output and hidden state
         out = h[-1]  # Take the last output for prediction
         x = self.fc1(out)

diff --git a/ml4cc/models/transformer.py b/ml4cc/models/transformer.py
@@ -23,7 +23,7 @@ def forward(self, x):
 class WaveFormTransformer(nn.Module):
     def __init__(
         self,
-        input_dim: int,  # 1024 or 3000
+        input_dim: int,  # 1 or 15
         d_model: int,  # 512
         num_heads: int,  # 16
         num_layers: int,  # 3
@@ -37,20 +37,24 @@ def __init__(
         self.positional_encoding = PositionalEncoding(d_model, max_len)
 
         encoder_layer = nn.TransformerEncoderLayer(
-            d_model=d_model, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout
+            d_model=d_model, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True
         )
         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
 
         self.peak_finding_classifier = nn.Linear(d_model, num_classes)
         self.layernorm = nn.LayerNorm(d_model)
 
-    def forward(self, x):
+    def forward(self, x, mask):
         mean = x.mean(dim=1, keepdim=True)
         std = x.std(dim=1, keepdim=True)
+        if mask is not None:
+            # src_key_padding_mask must be True where a token is **ignored**; the batch mask is
+            # assumed to mark valid samples (the LSTM models sum it into lengths), so invert it
+            mask = ~mask.bool()
         x = (x - mean) / (std + 1e-6)
         x = self.input_projection(x)
         x = self.positional_encoding(x)
-        x = self.transformer_encoder(x)
+        x = self.transformer_encoder(x, src_key_padding_mask=mask)
         # x = self.layernorm(x)
         # Shape: [batch_size, seq_length, num_classes]
         x = self.peak_finding_classifier(x)
@@ -107,10 +111,8 @@ def test_step(self, batch, batch_idx):
         return predicted_labels
 
     def forward(self, batch):
-        waveform, target = batch
-        predicted_labels = self.transformer(waveform).squeeze()
-        print("PREDICTED:", predicted_labels)
-        print("TARGET:", target)
+        waveform, target, mask = batch
+        predicted_labels = self.transformer(waveform, mask).squeeze(dim=-1)  # squeeze only the last dim so a batch of 1 keeps its batch axis
         return predicted_labels, target
 
     def on_after_backward(self):