diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 4d24d45..39f83b6 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -3,7 +3,47 @@ name: Pylint on: [pull_request] jobs: + + lint-autofix: + runs-on: ubuntu-latest + if: github.actor != 'github-actions[bot]' + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + persist-credentials: false # Needed for manual push + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install black + + - name: Auto-fix with black + run: black --line-length 120 . + + - name: Commit and push changes + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git fetch origin ${{ github.head_ref }} + git checkout ${{ github.head_ref }} + git add . + if ! git diff --cached --quiet; then + git commit -m "chore: auto-fix Python lint issues" + git rebase origin/${{ github.head_ref }} + git push https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }} HEAD:refs/heads/${{ github.head_ref }} + fi + env: + # Required if using a token for push + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + build: + needs: lint-autofix runs-on: ubuntu-latest strategy: matrix: @@ -19,6 +59,13 @@ jobs: python -m pip install --upgrade pip pip install pylint pip install -r requirements.txt + - name: Checkout updated code + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Run Pylint for logs (with score) + if: always() + run: pylint --rcfile=config/.pylintrc $(git ls-files '*.py') --exit-zero - name: Analysing the code with pylint run: | pylint --rcfile=config/.pylintrc $(git ls-files '*.py') --exit-zero --output-format=json > pylint_output.json @@ -41,7 +88,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: pylint-report - path: pylint-report.json + path: pylint_output.json - name: Show Pylint warnings if: always() run: | @@ -53,3 +100,4 @@ jobs: echo "Errors:" jq -r '.[] | select(.type == "error" or .type == "fatal") | "\(.path):\(.line): \(.message)"' pylint_output.json || true + diff --git a/config/.pylintrc b/config/.pylintrc index 3f816e4..39f6b58 100644 --- a/config/.pylintrc +++ b/config/.pylintrc @@ -3,3 +3,7 @@ disable=C0114, C0116 [REPORTS] output-format=colorized +score = yes + +[FORMAT] +max-line-length=120 diff --git a/ml4cc/__init__.py b/ml4cc/__init__.py index 089f3a8..df3ac82 100644 --- a/ml4cc/__init__.py +++ b/ml4cc/__init__.py @@ -1,3 +1,3 @@ from . import models from . import tools -from . import data \ No newline at end of file +from . import data diff --git a/ml4cc/config/environment/lumi.yaml b/ml4cc/config/environment/lumi.yaml index a0fdb00..3c4adba 100644 --- a/ml4cc/config/environment/lumi.yaml +++ b/ml4cc/config/environment/lumi.yaml @@ -1,6 +1,6 @@ name: lumi project_dir: /scratch/project_465001293/ML4CC/ml4cc -data_dir: /scratch/project_465001293/ML4CC/data +data_dir: /scratch/project_465001293/ML4CC tmp_dir: /scratch/project_465001293/ML4CC/tmp slurm: queue: diff --git a/ml4cc/config/evaluation/evaluation.yaml b/ml4cc/config/evaluation/evaluation.yaml new file mode 100644 index 0000000..2702376 --- /dev/null +++ b/ml4cc/config/evaluation/evaluation.yaml @@ -0,0 +1,5 @@ +dataset: + num_evaluation_waveforms: 1000 + results_output_dir: ${training.results_dir}/data +training: + eval_all_always: false diff --git a/ml4cc/config/main.yaml b/ml4cc/config/main.yaml index c8694ef..f75f84a 100644 --- a/ml4cc/config/main.yaml +++ b/ml4cc/config/main.yaml @@ -3,5 +3,8 @@ defaults: - environment@host: manivald # Options: lumi, manivald - datasets@dataset: CEPC # Options: FCC, CEPC - models: models + - datasets@datasets.CEPC: CEPC + - datasets@datasets.FCC: FCC + - evaluation: evaluation - preprocessing - - training \ No newline at end of file + - training diff --git a/ml4cc/config/models/one_step/models/transformer.yaml b/ml4cc/config/models/one_step/models/transformer.yaml index 1677f71..3b44d9d 100644 --- a/ml4cc/config/models/one_step/models/transformer.yaml +++ b/ml4cc/config/models/one_step/models/transformer.yaml @@ -8,12 +8,7 @@ hyperparameters: hidden_dim: 2048 num_classes: 1 max_len: ${dataset.input_dim} + lr: 0.001 checkpoint: model: null losses: null - - -# TODO: Maybe need to have name and target under "model" key to instantiate the class? - -defaults: - - _self_ \ No newline at end of file diff --git a/ml4cc/config/models/two_step/clusterization/CNN.yaml b/ml4cc/config/models/two_step/clusterization/CNN.yaml index e4147f5..f2f1ff8 100644 --- a/ml4cc/config/models/two_step/clusterization/CNN.yaml +++ b/ml4cc/config/models/two_step/clusterization/CNN.yaml @@ -1,8 +1,25 @@ -_target_: ml4cc.models.simpler_models +_target_: ml4cc.models.simpler_models.DNNModule name: CNN +hyperparameters: + conv_layer_1: + in_channels: 1 + out_channels: 32 + kernel_size: 4 + pool_layer_1: + kernel_size: 2 + conv_layer_2: + out_channels: 16 + kernel_size: 4 + linear_layer_1: + out_features: 32 + output_layer: + in_features: 32 + out_features: 1 + num_features: ${dataset.input_dim} # TODO: Check if this is as done by Guang +optimizer: + target: torch.optim.AdamW + lr: 0.001 + checkpoint: model: null losses: null - -defaults: - - _self_ \ No newline at end of file diff --git a/ml4cc/config/models/two_step/clusterization/DGCNN.yaml b/ml4cc/config/models/two_step/clusterization/DGCNN.yaml index e28db29..8b0e395 100644 --- a/ml4cc/config/models/two_step/clusterization/DGCNN.yaml +++ b/ml4cc/config/models/two_step/clusterization/DGCNN.yaml @@ -1,9 +1,8 @@ -_target_: ml4cc.models.DGCNN +_target_: ml4cc.models.DGCNN.DGCNN name: DGCNN -checkpoint: - model: null - losses: null - +optimizer: + _target_: torch.optim.AdamW + lr: 0.001 hyperparameters: n_conv1: 32 n_conv2: 32 @@ -17,3 +16,7 @@ hyperparameters: k: 4 mlp_dropout: 0.5 out_channels: 2 + +checkpoint: + model: null + losses: null diff --git a/ml4cc/config/models/two_step/clusterization/DNN.yaml b/ml4cc/config/models/two_step/clusterization/DNN.yaml index 20ae21d..2ec028d 100644 --- a/ml4cc/config/models/two_step/clusterization/DNN.yaml +++ b/ml4cc/config/models/two_step/clusterization/DNN.yaml @@ -1,8 +1,18 @@ -_target_: ml4cc.models.simpler_models +_target_: ml4cc.models.simpler_models.DNNModule name: DNN +hyperparameters: + n_features: 1 # ${dataset.input_dim} # TODO: Fix + linear_layer_1: + out_features: 32 + linear_layer_2: + out_features: 32 + linear_layer_3: + out_features: 32 + output_layer: + out_features: 1 +optimizer: + target: torch.optim.AdamW + lr: 0.001 checkpoint: model: null - losses: null - -defaults: - - _self_ \ No newline at end of file + losses: null \ No newline at end of file diff --git a/ml4cc/config/models/two_step/clusterization/RNN.yaml b/ml4cc/config/models/two_step/clusterization/RNN.yaml index 305b4f1..ec614aa 100644 --- a/ml4cc/config/models/two_step/clusterization/RNN.yaml +++ b/ml4cc/config/models/two_step/clusterization/RNN.yaml @@ -1,8 +1,19 @@ -_target_: ml4cc.models.simpler_models +_target_: ml4cc.models.simpler_models.RNNModule name: RNN +hyperparameters: + LSTM_layers: + input_size: 1 + hidden_size: 16 + num_layers: 1 + batch_first: true + linear_layer_1: + out_features: 16 + output_layer: + out_features: 1 +optimizer: + target: torch.optim.AdamW + lr: 0.001 + checkpoint: model: null losses: null - -defaults: - - _self_ \ No newline at end of file diff --git a/ml4cc/config/models/two_step/peak_finding/LSTM.yaml b/ml4cc/config/models/two_step/peak_finding/LSTM.yaml index 997af4b..7a937c6 100644 --- a/ml4cc/config/models/two_step/peak_finding/LSTM.yaml +++ b/ml4cc/config/models/two_step/peak_finding/LSTM.yaml @@ -4,6 +4,3 @@ hyperparameters: input_dim: ${dataset.input_dim} lstm_hidden_dim: 32 num_lstm_layers: 1 - -defaults: - - _self_ diff --git a/ml4cc/config/models/two_step/two_step.yaml b/ml4cc/config/models/two_step/two_step.yaml index a598cf4..b1fbd9f 100644 --- a/ml4cc/config/models/two_step/two_step.yaml +++ b/ml4cc/config/models/two_step/two_step.yaml @@ -1,4 +1,4 @@ defaults: - _self_ - peak_finding@peak_finding.model: LSTM - - clusterization@clusterization.model: DNN \ No newline at end of file + - clusterization@clusterization.model: RNN \ No newline at end of file diff --git a/ml4cc/config/models/two_step_minimal/models/LSTM.yaml b/ml4cc/config/models/two_step_minimal/models/LSTM.yaml index 997af4b..7a937c6 100644 --- a/ml4cc/config/models/two_step_minimal/models/LSTM.yaml +++ b/ml4cc/config/models/two_step_minimal/models/LSTM.yaml @@ -4,6 +4,3 @@ hyperparameters: input_dim: ${dataset.input_dim} lstm_hidden_dim: 32 num_lstm_layers: 1 - -defaults: - - _self_ diff --git a/ml4cc/config/training.yaml b/ml4cc/config/training.yaml index 9f018a0..c5b5bdb 100644 --- a/ml4cc/config/training.yaml +++ b/ml4cc/config/training.yaml @@ -1,19 +1,22 @@ training: + debug_run: false type: one_step # Options: one_step, two_step, two_step_minimal output_dir: null output_dir_: ${training.output_dir}/${training.type} models_dir: ${training.output_dir_}/models log_dir: ${training.output_dir_}/logs predictions_dir: ${training.output_dir_}/predictions + results_dir: ${training.output_dir}/results dataloader: batch_sizes: one_step: 128 two_step: 512 + two_step_minimal: 512 batch_size: ${training.dataloader.batch_sizes[${training.type}]} - num_dataloader_workers: 2 + num_dataloader_workers: 1 prefetch_factor: 100 trainer: - max_epochs: 5 # 50 epochs in Guang paper + max_epochs: 50 # 50 epochs in Guang paper model_evaluation_only: False hydra: diff --git a/ml4cc/models/LSTM.py b/ml4cc/models/LSTM.py index ca4522e..742fb74 100644 --- a/ml4cc/models/LSTM.py +++ b/ml4cc/models/LSTM.py @@ -4,18 +4,23 @@ import torch.nn.functional as F - -class LSTM(torch.nn.Module): # TODO: Is this implemented like in their paper? In their paper they have multiple LSTMs. - def __init__(self, input_dim: int = 3000, lstm_hidden_dim: int = 32, num_lstm_layers: int = 1): +# TODO: Is this implemented like in their paper? In their paper they have +# multiple LSTMs. +class LSTM(torch.nn.Module): + def __init__(self, lstm_hidden_dim: int = 32, num_lstm_layers: int = 1): super().__init__() - self.lstm = torch.nn.LSTM(input_size=1, num_layers=num_lstm_layers, hidden_size=lstm_hidden_dim, batch_first=True) + self.lstm = torch.nn.LSTM( + input_size=1, num_layers=num_lstm_layers, hidden_size=lstm_hidden_dim, batch_first=True + ) self.fc3 = torch.nn.Linear(lstm_hidden_dim, 32) self.fc4 = torch.nn.Linear(32, 1) def forward(self, x): ula, (h, _) = self.lstm(x) out = h[-1] - out = F.relu(self.fc3(out)) # If we would like to have a prediction for each point in wf, then we would use ula instead of out here + # If we would like to have a prediction for each point in wf, then we + # would use ula instead of out here + out = F.relu(self.fc3(out)) clf = F.sigmoid(self.fc4(out)).squeeze() return clf @@ -26,9 +31,8 @@ def __init__(self, name: str, hyperparameters: dict): self.hyperparameters = hyperparameters super().__init__() self.lstm = LSTM( - input_dim=self.hyperparameters["input_dim"], lstm_hidden_dim=self.hyperparameters["lstm_hidden_dim"], - num_lstm_layers=self.hyperparameters["num_lstm_layers"] + num_lstm_layers=self.hyperparameters["num_lstm_layers"], ) def training_step(self, batch, batch_idx): @@ -47,15 +51,15 @@ def configure_optimizers(self): return optim.AdamW(self.parameters(), lr=0.001) def predict_step(self, batch, batch_idx): - predicted_labels, target = self.forward(batch) + predicted_labels, _ = self.forward(batch) return predicted_labels def test_step(self, batch, batch_idx): - predicted_labels, target = self.forward(batch) + predicted_labels, _ = self.forward(batch) return predicted_labels def forward(self, batch): - waveform, target, wf_idx = batch + waveform, target = batch predicted_labels = self.lstm(waveform).squeeze() return predicted_labels, target diff --git a/ml4cc/models/simpler_models.py b/ml4cc/models/simpler_models.py index eeaa638..5120802 100644 --- a/ml4cc/models/simpler_models.py +++ b/ml4cc/models/simpler_models.py @@ -1,17 +1,39 @@ import torch import lightning as L import torch.nn as nn -import torch.optim as optim import torch.nn.functional as F +# from hydra.utils import instantiate +import importlib +from omegaconf import OmegaConf +import torch.optim as optim + + +def resolve_target(target_str): + """Resolve a string like class to the actual class.""" + module_path, class_name = target_str.rsplit(".", 1) + module = importlib.import_module(module_path) + return getattr(module, class_name) + class DNNModel(nn.Module): - def __init__(self, nfeature): - super(DNNModel, self).__init__() - self.fc1 = nn.Linear(nfeature, 32) - self.fc2 = nn.Linear(32, 32) - self.fc3 = nn.Linear(32, 32) - self.output = nn.Linear(32, 1) + def __init__(self, hyperparameters): + super().__init__() + self.fc1 = nn.Linear( + in_features=hyperparameters.n_features, out_features=hyperparameters.linear_layer_1.out_features + ) + self.fc2 = nn.Linear( + in_features=hyperparameters.linear_layer_1.out_features, + out_features=hyperparameters.linear_layer_2.out_features, + ) + self.fc3 = nn.Linear( + in_features=hyperparameters.linear_layer_2.out_features, + out_features=hyperparameters.linear_layer_3.out_features, + ) + self.output = nn.Linear( + in_features=hyperparameters.linear_layer_3.out_features, + out_features=hyperparameters.output_layer.out_features, + ) def forward(self, x): x = torch.relu(self.fc1(x)) @@ -22,18 +44,35 @@ def forward(self, x): class CNNModel(nn.Module): - def __init__(self, nfeature): + def __init__(self, hyperparameters): super().__init__() - self.conv1 = nn.Conv1d(1, 32, kernel_size=4) - self.pool1 = nn.MaxPool1d(kernel_size=2) - self.conv2 = nn.Conv1d(32, 16, kernel_size=4) - self.pool2 = nn.MaxPool1d(kernel_size=2) + self.conv1 = nn.Conv1d( + in_channels=hyperparameters.conv_layer_1.in_channels, + out_channels=hyperparameters.conv_layer_1.out_channels, + kernel_size=hyperparameters.conv_layer_1.kernel_size, + ) + self.pool1 = nn.MaxPool1d(kernel_size=hyperparameters.pool_layer_1.kernel_size) + self.conv2 = nn.Conv1d( + in_channels=hyperparameters.conv_layer_1.out_channels, + out_channels=hyperparameters.conv_layer_2.out_channels, + kernel_size=hyperparameters.conv_layer_2.kernel_size, + ) + self.pool2 = nn.MaxPool1d(kernel_size=hyperparameters.pool_layer_2.kernel_size) self.flatten = nn.Flatten() - self.fc1 = nn.Linear(16 * ((nfeature // 4) - 3), 32) # Compute flattened input size manually - self.output = nn.Linear(32, 1) + self.fc1 = nn.Linear( + # Compute flattened input size manually + in_features=16 * ((hyperparameters.num_features // 4) - 3), + out_features=hyperparameters.linear_layer_1.out_features, + ) + self.output = nn.Linear( + in_features=hyperparameters.linear_layer_1.out_features, + out_features=hyperparameters.linear_layer_1.out_features, + ) def forward(self, x): - x = x.unsqueeze(1) # To have a shape of (batch_size, in_channels, sequence_length) ; sequence length is the len of time series + # To have a shape of (batch_size, in_channels, sequence_length) ; + # sequence length is the len of time series + x = x.unsqueeze(1) x = torch.relu(self.conv1(x)) x = self.pool1(x) x = torch.relu(self.conv2(x)) @@ -46,11 +85,22 @@ def forward(self, x): class RNNModel(nn.Module): - def __init__(self, nfeature): + def __init__(self, hyperparameters): super().__init__() - self.lstm = nn.LSTM(input_size=1, hidden_size=16, num_layers=1, batch_first=True) - self.fc1 = nn.Linear(16, 16) - self.output = nn.Linear(16, 1) + self.lstm = nn.LSTM( + input_size=hyperparameters.LSTM_layers.input_size, + hidden_size=hyperparameters.LSTM_layers.hidden_size, + num_layers=hyperparameters.LSTM_layers.num_layers, + batch_first=hyperparameters.LSTM_layers.batch_first, + ) + self.fc1 = nn.Linear( + in_features=hyperparameters.LSTM_layers.hidden_size, + out_features=hyperparameters.linear_layer_1.out_features, + ) + self.output = nn.Linear( + in_features=hyperparameters.linear_layer_1.out_features, + out_features=hyperparameters.output_layer.out_features, + ) def forward(self, x): x = x.unsqueeze(-1) @@ -64,10 +114,10 @@ def forward(self, x): class SimplerModelModule(L.LightningModule): - def __init__(self, lr, model_, n_features): + def __init__(self, optimizer_cfg, model): super().__init__() - self.lr = lr - self.model = model_(n_features) + self.optimizer_cfg = optimizer_cfg + self.model = model def training_step(self, batch, batch_idx): predicted_labels, target = self.forward(batch) @@ -82,18 +132,50 @@ def validation_step(self, batch, batch_idx): return loss def configure_optimizers(self): - # return optim.RMSprop(self.parameters(), lr=0.001) # They use this - return optim.AdamW(self.parameters(), lr=0.001) + optimizer_class = resolve_target(self.optimizer_cfg["target"]) + kwargs = OmegaConf.to_container(self.optimizer_cfg, resolve=True) + kwargs.pop("target") + return optimizer_class(params=self.model.parameters(), **kwargs) def predict_step(self, batch, batch_idx): - predicted_labels, target = self.forward(batch) + predicted_labels, _ = self.forward(batch) return predicted_labels def test_step(self, batch, batch_idx): - predicted_labels, target = self.forward(batch) + predicted_labels, _ = self.forward(batch) return predicted_labels def forward(self, batch): peaks, target = batch predicted_labels = self.model(peaks).squeeze() return predicted_labels, target + + +class RNNModule(SimplerModelModule): + def __init__(self, name: str, hyperparameters: dict, optimizer: dict, checkpoint: dict): + self.name = name + self.checkpoint = checkpoint + self.optimizer_cfg = optimizer + self.hyperparameters = OmegaConf.create(hyperparameters) + model = RNNModel(hyperparameters=hyperparameters) + super().__init__(optimizer_cfg=self.optimizer_cfg, model=model) + + +class DNNModule(SimplerModelModule): + def __init__(self, name: str, hyperparameters: dict, optimizer: dict, checkpoint: dict): + self.name = name + self.checkpoint = checkpoint + self.optimizer_cfg = optimizer + self.hyperparameters = OmegaConf.create(hyperparameters) + model = DNNModel(hyperparameters=hyperparameters) + super().__init__(optimizer_cfg=self.optimizer_cfg, model=model) + + +class CNNModule(SimplerModelModule): + def __init__(self, name: str, hyperparameters: dict, optimizer: dict, checkpoint: dict): + self.name = name + self.checkpoint = checkpoint + self.optimizer_cfg = optimizer + self.hyperparameters = OmegaConf.create(hyperparameters) + model = CNNModel(hyperparameters) + super().__init__(optimizer_cfg=self.optimizer_cfg, model=model) diff --git a/ml4cc/models/transformer.py b/ml4cc/models/transformer.py index 733aca1..2666850 100644 --- a/ml4cc/models/transformer.py +++ b/ml4cc/models/transformer.py @@ -5,6 +5,7 @@ import torch.optim as optim import torch.nn.functional as F + class PositionalEncoding(nn.Module): def __init__(self, d_model: int, max_len: int): super().__init__() @@ -14,28 +15,30 @@ def __init__(self, d_model: int, max_len: int): pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) self.pe = pe.unsqueeze(0) - + def forward(self, x): - return x + self.pe[:, :x.size(1), :].to(x.device) + return x + self.pe[:, : x.size(1), :].to(x.device) class WaveFormTransformer(nn.Module): def __init__( - self, - input_dim: int, # 1024 or 3000 - d_model: int, # 512 - num_heads: int, # 16 - num_layers: int, # 3 - hidden_dim: int, # 4*d_model - num_classes: int, # 1, either bkg or signal - max_len: int, # As we have fixed nr, then it max_len=input_dim - dropout: float=0.1 + self, + input_dim: int, # 1024 or 3000 + d_model: int, # 512 + num_heads: int, # 16 + num_layers: int, # 3 + hidden_dim: int, # 4*d_model + num_classes: int, # 1, either bkg or signal + max_len: int, # As we have fixed nr, then it max_len=input_dim + dropout: float = 0.1, ): super().__init__() self.input_projection = nn.Linear(1, d_model) self.positional_encoding = PositionalEncoding(d_model, max_len) - encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout) + encoder_layer = nn.TransformerEncoderLayer( + d_model=d_model, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout + ) self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers) self.peak_finding_classifier = nn.Linear(d_model, num_classes) @@ -48,9 +51,10 @@ def forward(self, x): x = self.positional_encoding(x) x = self.transformer_encoder(x) x = self.layernorm(x) - x = self.peak_finding_classifier(x) # Shape: [batch_size, seq_length, num_classes] - # x = F.relu(x) + # Shape: [batch_size, seq_length, num_classes] + x = self.peak_finding_classifier(x) x = x.sum(dim=1) # Shape: [batch_size, num_classes] + x = F.relu(x) # x = self.clusterizer(x) return x @@ -70,6 +74,7 @@ def __init__(self, name: str, hyperparameters: dict, checkpoint: dict = None): num_classes=self.hyperparameters["num_classes"], max_len=self.hyperparameters["max_len"], ) + self.lr = self.hyperparameters["lr"] def training_step(self, batch, batch_idx): predicted_labels, target = self.forward(batch) @@ -84,7 +89,7 @@ def validation_step(self, batch, batch_idx): return loss def configure_optimizers(self): - return optim.AdamW(self.parameters(), lr=0.001) + return optim.AdamW(self.parameters(), lr=self.lr) def predict_step(self, batch, batch_idx): predicted_labels, _ = self.forward(batch) diff --git a/ml4cc/scripts/dataset_visualizer.py b/ml4cc/scripts/dataset_visualizer.py new file mode 100644 index 0000000..de4a65e --- /dev/null +++ b/ml4cc/scripts/dataset_visualizer.py @@ -0,0 +1,96 @@ +import os +import hydra +import numpy as np +import awkward as ak +from omegaconf import DictConfig +from ml4cc.tools.visualization import dataset as ds + + +def analyze_cluster_counts(cluster_counts: ak.Array) -> dict: + info = {"mean": np.mean(cluster_counts), "stdev": np.std(cluster_counts)} + return info + + +def analyze_FCC_case_data(cfg: DictConfig, particle: str, energy: str) -> dict: + path = os.path.join(cfg.data_dir, "one_step", "test", f"{energy}_1.parquet") + data = ak.from_parquet(path) + num_primary = ak.sum(data.target == 1, axis=-1) + num_secondary = ak.sum(data.target == 2, axis=-1) + secondary_peak_info = analyze_cluster_counts(num_secondary) + primary_peak_info = analyze_cluster_counts(num_primary) + info = { + "raw_num_primary": num_primary, + "raw_num_secondary": num_secondary, + "primary_peak_info": primary_peak_info, + "secondary_peak_info": secondary_peak_info, + } + return info + + +def analyze_CEPC_case_data(cfg: DictConfig, particle: str, energy: str) -> dict: + path = os.path.join(cfg.data_dir, "one_step", "test", f"signal_{particle}_{energy}_0.parquet") + data = ak.from_parquet(path) + num_primary = ak.sum(data.target == 1, axis=-1) + num_secondary = ak.sum(data.target == 2, axis=-1) + secondary_peak_info = analyze_cluster_counts(num_secondary) + primary_peak_info = analyze_cluster_counts(num_primary) + info = { + "raw_num_primary": num_primary, + "raw_num_secondary": num_secondary, + "primary_peak_info": primary_peak_info, + "secondary_peak_info": secondary_peak_info, + } + return info + + +def accumulate_statistics(cfg: DictConfig) -> dict: + full_info = {} + for dataset_name, dataset_values in cfg.datasets.items(): + full_info[dataset_name] = {} + energies = dataset_values.particle_energies + particle_types = dataset_values.particle_types + for particle_type in particle_types: + full_info[dataset_name][particle_type] = {} + for energy in energies: + if dataset_name == "CEPC": + info = analyze_CEPC_case_data(cfg, particle_type, energy) + elif dataset_name == "FCC": + info = analyze_FCC_case_data(cfg, particle_type, energy) + else: + raise ValueError("Something went wrong, experiment config not found") + full_info[dataset_name][particle_type][energy] = info + return full_info + + +def visualize_all(full_info: dict, cfg: DictConfig) -> None: + pp_output_path = os.path.join(cfg.evaluation.dataset.results_output_dir, "primary_peaks.pdf") + ds.visualize_num_peaks(full_info, pp_output_path, peak_type="primary", errorband=False) + sp_output_path = os.path.join(cfg.evaluation.dataset.results_output_dir, "secondary_peaks.pdf") + ds.visualize_num_peaks(full_info, sp_output_path, peak_type="secondary", errorband=False) + for dataset_name, dataset_values in cfg.datasets.items(): + energies = dataset_values.particle_energies + particle_types = dataset_values.particle_types + for particle_type in particle_types: + for energy in energies: + os.makedirs(cfg.evaluation.dataset.results_output_dir, exist_ok=True) + output_path = os.path.join( + cfg.evaluation.dataset.results_output_dir, f"{dataset_name}_{particle_type}_{energy}.png" + ) + ds.visualize_primary_v_secondary_peaks_2d_histogram( + full_info=full_info, + experiment=dataset_name, + particle_type=particle_type, + energy=energy, + output_path=output_path, + max_peaks=45, + ) + + +@hydra.main(config_path="../config", config_name="main.yaml", version_base=None) +def main(cfg: DictConfig): + full_info = accumulate_statistics(cfg=cfg) + visualize_all(full_info=full_info, cfg=cfg) + + +if __name__ == "__main__": + main() # pylint: disable=E1120 diff --git a/ml4cc/scripts/preprocessing.py b/ml4cc/scripts/preprocessing.py index 2423dd7..4dc2776 100644 --- a/ml4cc/scripts/preprocessing.py +++ b/ml4cc/scripts/preprocessing.py @@ -24,11 +24,7 @@ def prepare_fcc_inputs(cfg: DictConfig) -> list: def prepare_cepc_inputs(cfg: DictConfig) -> list: all_paths_to_process = [] for dataset in ["test", "train"]: - input_file_wcp = os.path.join( - cfg.raw_input_dir, - dataset, - "*" - ) + input_file_wcp = os.path.join(cfg.raw_input_dir, dataset, "*") raw_data_input_paths = list(glob.glob(input_file_wcp)) all_paths_to_process.extend(raw_data_input_paths) return all_paths_to_process @@ -64,9 +60,9 @@ def process_files(input_files: list, cfg: DictConfig) -> None: def run_job(cfg: DictConfig) -> None: input_paths = [] - with open(cfg.preprocessing.slurm.input_path, 'rt') as inFile: + with open(cfg.preprocessing.slurm.input_path, "rt", encoding="utf-8") as inFile: for line in inFile: - input_paths.append(line.strip('\n')) + input_paths.append(line.strip("\n")) total_start_time = time.time() process_files(input_files=input_paths, cfg=cfg) total_end_time = time.time() @@ -88,5 +84,5 @@ def main(cfg: DictConfig) -> None: prepare_inputs(cfg) -if __name__ == '__main__': - main() # pylint: disable=E1120 \ No newline at end of file +if __name__ == "__main__": + main() # pylint: disable=E1120 diff --git a/ml4cc/scripts/train.py b/ml4cc/scripts/train.py index fb233ed..364e151 100644 --- a/ml4cc/scripts/train.py +++ b/ml4cc/scripts/train.py @@ -1,3 +1,12 @@ +import warnings + +warnings.filterwarnings( + "ignore", + message=r".*onnxscript\.values\.OnnxFunction\.param_schemas.*", + category=FutureWarning, + module=r"onnxscript\.converter", +) + import os import glob import hydra @@ -6,6 +15,7 @@ import torch import lightning as L import awkward as ak +import torch.multiprocessing as mp from omegaconf import DictConfig from torch.utils.data import DataLoader, IterableDataset from lightning.pytorch.loggers import CSVLogger @@ -15,7 +25,7 @@ from ml4cc.tools.evaluation import two_step as tse from ml4cc.tools.evaluation import two_step_minimal as tsme - +torch.set_float32_matmul_precision("medium") # or 'high' DEVICE = "cuda" if torch.cuda.is_available() else "cpu" @@ -23,22 +33,26 @@ def base_train(cfg: DictConfig, training_type: str): os.makedirs(cfg.training.log_dir, exist_ok=True) - os.makedirs(cfg.training.models_dir, exist_ok=True) + models_dir = ( + os.path.join(cfg.training.models_dir, training_type) if training_type != "one_step" else cfg.training.models_dir + ) + os.makedirs(models_dir, exist_ok=True) checkpoint_callback = ModelCheckpoint( - dirpath=cfg.training.models_dir, + dirpath=models_dir, monitor="val_loss", mode="min", save_top_k=-1, save_weights_only=True, - filename="{epoch:02d}-{val_loss:.2f}" + filename="{epoch:02d}-{val_loss:.2f}", ) + max_epochs = 2 if cfg.training.debug_run else cfg.training.trainer.max_epochs trainer = L.Trainer( - max_epochs=cfg.training.trainer.max_epochs, + max_epochs=max_epochs, callbacks=[ TQDMProgressBar(refresh_rate=10), checkpoint_callback, ], - logger=CSVLogger(cfg.training.log_dir, name=training_type) + logger=CSVLogger(cfg.training.log_dir, name=training_type), ) return trainer, checkpoint_callback @@ -46,8 +60,8 @@ def base_train(cfg: DictConfig, training_type: str): def train_one_step(cfg: DictConfig, data_type: str): print(f"Training {cfg.models.one_step.model.name} for the one-step training.") model = instantiate(cfg.models.one_step.model) - datamodule = dl.OneStepDataModule(cfg=cfg, data_type=data_type) - if not cfg.model_evaluation_only: + datamodule = dl.OneStepDataModule(cfg=cfg, data_type=data_type, debug_run=cfg.training.debug_run) + if not cfg.training.model_evaluation_only: trainer, checkpoint_callback = base_train(cfg, training_type="one_step") trainer.fit(model=model, datamodule=datamodule) @@ -64,13 +78,13 @@ def train_one_step(cfg: DictConfig, data_type: str): def train_two_step_peak_finding(cfg: DictConfig, data_type: str): print(f"Training {cfg.models.two_step.peak_finding.model.name} for the two-step peak-finding training.") model = instantiate(cfg.models.two_step.peak_finding.model) - datamodule = dl.TwoStepPeakFindingDataModule(cfg=cfg, data_type=data_type) - if not cfg.model_evaluation_only: - trainer, checkpoint_callback = base_train(cfg, training_type="two_step_peak_finding") + datamodule = dl.TwoStepPeakFindingDataModule(cfg=cfg, data_type=data_type, debug_run=cfg.training.debug_run) + if not cfg.training.model_evaluation_only: + trainer, checkpoint_callback = base_train(cfg, training_type="two_step_pf") trainer.fit(model=model, datamodule=datamodule) best_model_path = checkpoint_callback.best_model_path - new_best_model_path = os.path.join(cfg.training.models_dir, "best_model.ckpt") + new_best_model_path = os.path.join(cfg.training.models_dir, "two_step_pf", "best_model.ckpt") shutil.copyfile(best_model_path, new_best_model_path) metrics_path = os.path.join(trainer.logger.log_dir, "metrics.csv") else: @@ -82,13 +96,13 @@ def train_two_step_peak_finding(cfg: DictConfig, data_type: str): def train_two_step_clusterization(cfg: DictConfig, data_type: str): print(f"Training {cfg.models.two_step.clusterization.model.name} for the two-step clusterization training.") model = instantiate(cfg.models.two_step.clusterization.model) - datamodule = dl.TwoStepClusterizationDataModule(cfg=cfg, data_type=data_type) - if not cfg.model_evaluation_only: - trainer, checkpoint_callback = base_train(cfg, training_type="two_step_clusterization") + datamodule = dl.TwoStepClusterizationDataModule(cfg=cfg, data_type=data_type, debug_run=cfg.training.debug_run) + if not cfg.training.model_evaluation_only: + trainer, checkpoint_callback = base_train(cfg, training_type="two_step_cl") trainer.fit(model=model, datamodule=datamodule) best_model_path = checkpoint_callback.best_model_path - new_best_model_path = os.path.join(cfg.training.models_dir, "best_model.ckpt") + new_best_model_path = os.path.join(cfg.training.models_dir, "two_step_cl", "best_model.ckpt") shutil.copyfile(best_model_path, new_best_model_path) metrics_path = os.path.join(trainer.logger.log_dir, "metrics.csv") else: @@ -100,8 +114,8 @@ def train_two_step_clusterization(cfg: DictConfig, data_type: str): def train_two_step_minimal(cfg: DictConfig, data_type: str): print(f"Training {cfg.models.two_step_minimal.model.name} for the two-step minimal training.") model = instantiate(cfg.models.two_step_minimal.model) - datamodule = dl.TwoStepMinimalDataModule(cfg=cfg, data_type=data_type) - if not cfg.model_evaluation_only: + datamodule = dl.TwoStepMinimalDataModule(cfg=cfg, data_type=data_type, debug_run=cfg.training.debug_run) + if not cfg.training.model_evaluation_only: trainer, checkpoint_callback = base_train(cfg, training_type="two_step_minimal") trainer.fit(model=model, datamodule=datamodule) @@ -132,49 +146,59 @@ def get_FCC_evaluation_scenarios(cfg: DictConfig) -> list: return evaluation_scenarios -def save_predictions(input_path: str, all_predictions: ak.Array, cfg: DictConfig): - predictions_dir = cfg.training.predictions_dir - os.makedirs(predictions_dir, exist_ok=True) - base_dir = cfg.dataset.data_dir - output_path = input_path.replace(base_dir, predictions_dir) - output_path = output_path.replace(".parquet", "_pred.parquet") +def save_predictions(input_path: str, all_predictions: ak.Array, cfg: DictConfig, scenario: str): + if not scenario == "two_step_cl": + predictions_dir = cfg.training.predictions_dir + base_scenario = "two_step" if "two_step" in scenario else scenario + additional_dir_level = scenario if base_scenario == "two_step" else "" + base_dir = cfg.dataset.data_dir + original_dir = os.path.join(base_dir, base_scenario) + predictions_dir = os.path.join(predictions_dir, additional_dir_level) + os.makedirs(predictions_dir, exist_ok=True) + output_path = input_path.replace(original_dir, predictions_dir) + output_path = output_path.replace(".parquet", "_pred.parquet") + else: + output_path = input_path.replace("two_step_pf", "two_step_cl") input_data = ak.from_parquet(input_path) - output_data = input_data.copy() - output_data["pred"] = all_predictions - + output_data = ak.copy(input_data) + output_data["pred"] = ak.Array(all_predictions) # pylint: disable=E1137 + print(f"Saving predictions to {output_path}") + os.makedirs(os.path.dirname(output_path), exist_ok=True) ak.to_parquet(output_data, output_path, row_group_size=cfg.preprocessing.row_group_size) -def create_prediction_files(file_list: list, iterable_dataset: IterableDataset, model, cfg: DictConfig): +def create_prediction_files(file_list: list, iterable_dataset: IterableDataset, model, cfg: DictConfig, scenario: str): + num_files = 2 if cfg.training.debug_run else None with torch.no_grad(): - for path in file_list: + for path in file_list[:num_files]: dataset = dl.RowGroupDataset(path) - iterable_dataset = iterable_dataset(dataset) + iterable_dataset_ = iterable_dataset(dataset, device=DEVICE) dataloader = DataLoader( - dataset=iterable_dataset, + dataset=iterable_dataset_, batch_size=cfg.training.dataloader.batch_size, - num_workers=cfg.training.dataloader.num_dataloader_workers, - prefetch_factor=cfg.training.dataloader.prefetch_factor, + # num_workers=cfg.training.dataloader.num_dataloader_workers, + # prefetch_factor=cfg.training.dataloader.prefetch_factor, ) - predictions = [] - for batch in dataloader: - predictions = model(batch) - all_predictions.append(predictions) - all_predictions = ak.concatenate(all_predictions, axis=0) - save_predictions(input_path=path, all_predictions=all_predictions, cfg=cfg) + all_predictions = [] + for i, batch in enumerate(dataloader): + predictions, _ = model(batch) + all_predictions.append(predictions.detach().cpu().numpy()) + all_predictions = ak.concatenate(all_predictions, axis=0) + save_predictions(input_path=path, all_predictions=all_predictions, cfg=cfg, scenario=scenario) def evaluate_one_step(cfg: DictConfig, model, metrics_path: str) -> list: model.to(DEVICE) model.eval() - wcp_path = os.path.join(cfg.dataset.data_dir, "one_step", "*", "*") + dir_ = "*" if cfg.evaluation.training.eval_all_always else "test" + wcp_path = os.path.join(cfg.dataset.data_dir, "one_step", dir_, "*") file_list = glob.glob(wcp_path) iterable_dataset = dl.OneStepIterableDataset # Create prediction files - create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg) + create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg, scenario="one_step") # Evaluate training ose.evaluate_training(cfg=cfg, metrics_path=metrics_path) @@ -183,12 +207,11 @@ def evaluate_one_step(cfg: DictConfig, model, metrics_path: str) -> list: def evaluate_two_step_peak_finding(cfg: DictConfig, model, metrics_path: str) -> list: model.to(DEVICE) model.eval() - wcp_path = os.path.join(cfg.dataset.data_dir, "one_step", "*", "*") + wcp_path = os.path.join(cfg.dataset.data_dir, "two_step", "*", "*") file_list = glob.glob(wcp_path) iterable_dataset = dl.TwoStepPeakFindingIterableDataset - # Create prediction files - create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg) + create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg, scenario="two_step_pf") # Evaluate training tse.evaluate_training(cfg=cfg, metrics_path=metrics_path, stage="peak_finding") @@ -197,12 +220,13 @@ def evaluate_two_step_peak_finding(cfg: DictConfig, model, metrics_path: str) -> def evaluate_two_step_clusterization(cfg: DictConfig, model, metrics_path: str) -> list: model.to(DEVICE) model.eval() - wcp_path = os.path.join(cfg.dataset.data_dir, "one_step", "*", "*") + dir_ = "*" if cfg.evaluation.training.eval_all_always else "test" + wcp_path = os.path.join(cfg.training.output_dir, "two_step", "predictions", "two_step_pf", dir_, "*") file_list = glob.glob(wcp_path) iterable_dataset = dl.TwoStepClusterizationIterableDataset # Create prediction files - create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg) + create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg, scenario="two_step_cl") # Evaluate training tse.evaluate_training(cfg=cfg, metrics_path=metrics_path, stage="clusterization") @@ -211,46 +235,46 @@ def evaluate_two_step_clusterization(cfg: DictConfig, model, metrics_path: str) def evaluate_two_step_minimal(cfg: DictConfig, model, metrics_path: str) -> list: model.to(DEVICE) model.eval() - wcp_path = os.path.join(cfg.dataset.data_dir, "one_step", "*", "*") + wcp_path = os.path.join(cfg.dataset.data_dir, "two_step", "*", "*") file_list = glob.glob(wcp_path) iterable_dataset = dl.TwoStepMinimalIterableDataset # Create prediction files - create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg) + create_prediction_files(file_list, iterable_dataset=iterable_dataset, model=model, cfg=cfg, scenario="two_step_min") # Evaluate training - tsme.evaluate_training(cfg=cfg, metrics_path=metrics_path, stage="clusterization") + tsme.evaluate_training(cfg=cfg, metrics_path=metrics_path) @hydra.main(config_path="../config", config_name="main.yaml", version_base=None) def main(cfg: DictConfig): training_type = cfg.training.type - if training_type == "one_step": + print("Training one-step model.") model, best_model_path, metrics_path = train_one_step(cfg, data_type="") checkpoint = torch.load(best_model_path, weights_only=False) - model.load_state_dict(checkpoint['state_dict']) + model.load_state_dict(checkpoint["state_dict"]) model.eval() evaluate_one_step(cfg, model, metrics_path) elif training_type == "two_step": model, best_model_path, metrics_path = train_two_step_peak_finding(cfg, data_type="") checkpoint = torch.load(best_model_path, weights_only=False) - model.load_state_dict(checkpoint['state_dict']) + model.load_state_dict(checkpoint["state_dict"]) model.eval() - evaluate_two_step_peak_finding(cfg, model, metrics_path, data_type="") + evaluate_two_step_peak_finding(cfg, model, metrics_path) - model, best_model_path, metrics_path = train_two_step_clusterization(cfg) - evaluate_two_step_clusterization(cfg, model, best_model_path, metrics_path, data_type="") + model, best_model_path, metrics_path = train_two_step_clusterization(cfg, data_type="") + evaluate_two_step_clusterization(cfg, model, metrics_path) elif training_type == "two_step_minimal": model, best_model_path, metrics_path = train_two_step_minimal(cfg, data_type="") checkpoint = torch.load(best_model_path, weights_only=False) - model.load_state_dict(checkpoint['state_dict']) + model.load_state_dict(checkpoint["state_dict"]) model.eval() evaluate_two_step_minimal(cfg, model, metrics_path) else: raise ValueError(f"Unknown training type: {training_type}") - if __name__ == "__main__": - main() # pylint: disable=E1120 \ No newline at end of file + mp.set_start_method("spawn") + main() # pylint: disable=E1120 diff --git a/ml4cc/scripts/train_all_scenarios.sh b/ml4cc/scripts/train_all_scenarios.sh index a9572d7..54172d9 100644 --- a/ml4cc/scripts/train_all_scenarios.sh +++ b/ml4cc/scripts/train_all_scenarios.sh @@ -1,7 +1,6 @@ #!/bin/bash # This script trains all scenarios for the ML4CC project. - PROGNAME=$0 # Parse user options. @@ -18,13 +17,11 @@ EOF } -TRAINING_SCENARIO=one_step -TRAINING_DATASET=FCC RUN_ON_LUMI=true TRAIN_TWO_STEP=false TRAIN_TWO_STEP_MINIMAL=false TRAIN_ONE_STEP=false -CLUSTERIZATION_MODEL=all +CLUSTERIZATION_MODEL=DNN HOST=lumi while getopts 'o:s:d:mc:' OPTION; do case $OPTION in @@ -42,8 +39,13 @@ done shift "$((OPTIND - 1))" -declare -a DATASETS=("FCC" "CEPC") -declare -a +# declare -a DATASETS=("FCC" "CEPC") +# declare -a SCENARIOS=("one_step" "two_step" "two_step_minimal") + +echo Output will be saved into: $BASE_DIR +echo Training scenario: $TRAINING_SCENARIO +echo Training dataset: $TRAINING_DATASET +echo Running on: $HOST if [ "$RUN_ON_LUMI" = true ] ; then TRAINING_SCRIPT=ml4cc/scripts/submit-gpu-lumi.sh @@ -62,4 +64,4 @@ if [ "$TRAINING_SCENARIO" = "all" ] ; then fi -sbatch $TRAINING_SCRIPT python3 ml4cc/scripts/train.py training.ouput_dir=$BASE_DIR dataset=$TRAINING_DATASET host=$HOST training.type=$TRAINING_SCENARIO +sbatch $TRAINING_SCRIPT python3 ml4cc/scripts/train.py training.output_dir=$BASE_DIR datasets@dataset=$TRAINING_DATASET environment@host=$HOST training.type=$TRAINING_SCENARIO # models.two_step.clusterization@clusterization.model=$CLUSTERIZATION_MODEL diff --git a/ml4cc/tools/data/dataloaders.py b/ml4cc/tools/data/dataloaders.py index 241b879..c9c13f1 100644 --- a/ml4cc/tools/data/dataloaders.py +++ b/ml4cc/tools/data/dataloaders.py @@ -9,18 +9,18 @@ from ml4cc.tools.data import io -##################################################################################### -##################################################################################### -###################### Base classes ########################## -##################################################################################### -##################################################################################### +########################################################################## +########################################################################## +###################### Base classes ############### +########################################################################## +########################################################################## class RowGroupDataset(Dataset): def __init__(self, data_loc: str): self.data_loc = data_loc self.input_paths = io.get_all_paths(data_loc) - self.row_groups = io.get_row_groups(self.input_paths) + self.row_groups = io.get_row_groups(self.input_paths) def __getitem__(self, index): return self.row_groups[index] @@ -30,17 +30,19 @@ def __len__(self): class BaseIterableDataset(IterableDataset): - """ Base iterable dataset class to be used for different types of trainings.""" - def __init__(self, dataset: Dataset): + """Base iterable dataset class to be used for different types of trainings.""" + + def __init__(self, dataset: Dataset, device: str): + super().__init__() self.dataset = dataset self.row_groups = [row_group for row_group in self.dataset] self.num_rows = sum([rg.num_rows for rg in self.row_groups]) + self.device = device print(f"There are {'{:,}'.format(self.num_rows)} waveforms in the dataset.") - super().__init__() def build_tensors(self, data: ak.Array): - """ Builds the input and target tensors from the data. - + """Builds the input and target tensors from the data. + Parameters: data : ak.Array The data used to build the tensors. The data is a chunk of the dataset loaded from a .parquet file. @@ -55,6 +57,11 @@ def build_tensors(self, data: ak.Array): def __len__(self): return self.num_rows + def _move_to_device(self, batch): + if isinstance(batch, (tuple, list)): + return [self._move_to_device(x) for x in batch] + return batch.to(self.device) + def __iter__(self): worker_info = torch.utils.data.get_worker_info() if worker_info is None: @@ -65,20 +72,27 @@ def __iter__(self): row_groups_start = worker_id * per_worker row_groups_end = row_groups_start + per_worker row_groups_to_process = self.row_groups[row_groups_start:row_groups_end] - for row_group in row_groups_to_process: # load one chunk from one file data = ak.from_parquet(row_group.filename, row_groups=[row_group.row_group]) tensors = self.build_tensors(data) - # return individual jets from the dataset for idx_wf in range(len(data)): - yield tensors[0][idx_wf], tensors[1][idx_wf] # features, targets + # features, targets + yield self._move_to_device(tensors[0][idx_wf]), self._move_to_device(tensors[1][idx_wf]) class BaseDataModule(LightningDataModule): - def __init__(self, cfg: DictConfig, iter_dataset: IterableDataset, data_type: str): - """ Base data module class to be used for different types of trainings. + def __init__( + self, + cfg: DictConfig, + iter_dataset: IterableDataset, + data_type: str, + debug_run: bool = False, + device: str = "cpu", + clusterization: bool = False, + ): + """Base data module class to be used for different types of trainings. Parameters: cfg : DictConfig The configuration file used to set up the data module. @@ -94,10 +108,14 @@ def __init__(self, cfg: DictConfig, iter_dataset: IterableDataset, data_type: st self.task = "two_step" if self.cfg.training.type == "two_step_minimal" else self.cfg.training.type self.data_type = data_type self.iter_dataset = iter_dataset + self.device = device self.train_loader = None self.val_loader = None + self.test_dataset = None self.train_dataset = None self.val_dataset = None + self.clusterization = clusterization + self.num_row_groups = 2 if debug_run else None self.save_hyperparameters() super().__init__() @@ -118,18 +136,56 @@ def get_CEPC_dataset_path(self, dataset_type: str) -> str: The directory of the test dataset files """ if dataset_type == "train": - train_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "train") - val_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "val") + if self.clusterization: + train_loc = os.path.join( + self.cfg.dataset.data_dir, + "two_step", + "predictions", + "two_step_pf", + "train", + f"{self.data_type}_*.parquet", + ) + val_loc = os.path.join( + self.cfg.dataset.data_dir, + "two_step", + "predictions", + "two_step_pf", + "val", + f"{self.data_type}_*.parquet", + ) + else: + train_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "train") + val_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "val") return train_loc, val_loc elif dataset_type == "test": - if self.cfg.dataset.train_dataset == "combined": - test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test") - elif self.cfg.dataset.train_dataset == "separate": - test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test", f"*{self.data_type}*.parquet") + if self.cfg.dataset.test_dataset == "combined": + if self.clusterization: + test_dir = os.path.join(self.cfg.dataset.data_dir, "two_step", "predictions", "two_step_pf", "test") + else: + test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test") + elif self.cfg.dataset.test_dataset == "separate": + if self.clusterization: + test_dir = os.path.join( + self.cfg.dataset.data_dir, + "two_step", + "predictions", + "two_step_pf", + "test", + f"{self.data_type}_*.parquet", + ) + else: + test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test", f"*{self.data_type}*.parquet") + else: + raise ValueError( + f"Unexpected method for test dataset: {self.cfg.dataset.train_dataset}. Options:\ + 'combined', 'separate']" + ) return test_dir else: - raise ValueError(f"Unexpected train dataset type: {self.cfg.dataset.train_dataset}.\ - Please use 'combined' or 'separate'.") + raise ValueError( + f"Unexpected train dataset type: {self.cfg.dataset.train_dataset}.\ + Please use 'combined' or 'separate'." + ) def get_FCC_dataset_path(self, dataset_type: str) -> str: """Returns the directory of the dataset files for FCC @@ -149,20 +205,70 @@ def get_FCC_dataset_path(self, dataset_type: str) -> str: """ if dataset_type == "train": if self.cfg.dataset.train_dataset == "combined": - train_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "train") - val_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "val") + if self.clusterization: + train_loc = os.path.join( + self.cfg.training.output_dir, "two_step", "predictions", "two_step_pf", "train" + ) + val_loc = os.path.join( + self.cfg.training.output_dir, "two_step", "predictions", "two_step_pf", "val" + ) + else: + train_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "train") + val_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "val") elif self.cfg.dataset.train_dataset == "separate": - train_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "train", f"{self.data_type}_*.parquet") - val_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "val", f"{self.data_type}_*.parquet") + if self.clusterization: + train_loc = os.path.join( + self.cfg.training.output_dir, + "two_step", + "predictions", + "two_step_pf", + "train", + f"{self.data_type}_*.parquet", + ) + val_loc = os.path.join( + self.cfg.training.output_dir, + "two_step", + "predictions", + "two_step_pf", + "val", + f"{self.data_type}_*.parquet", + ) + else: + train_loc = os.path.join( + self.cfg.dataset.data_dir, self.task, "train", f"{self.data_type}_*.parquet" + ) + val_loc = os.path.join(self.cfg.dataset.data_dir, self.task, "val", f"{self.data_type}_*.parquet") else: - raise ValueError(f"Unexpected train dataset type: {self.cfg.dataset.train_dataset}.\ - Please use 'combined' or 'separate'.") + raise ValueError( + f"Unexpected train dataset type: {self.cfg.dataset.train_dataset}.\ + Please use 'combined' or 'separate'." + ) return train_loc, val_loc elif dataset_type == "test": - if self.cfg.dataset.train_dataset == "combined": - test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test") - elif self.cfg.dataset.train_dataset == "separate": - test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test", f"{self.data_type}_*.parquet") + if self.cfg.dataset.test_dataset == "combined": + if self.clusterization: + test_dir = os.path.join( + self.cfg.training.output_dir, "two_step", "predictions", "two_step_pf", "test" + ) + else: + test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test") + elif self.cfg.dataset.test_dataset == "separate": + if self.clusterization: + test_dir = os.path.join( + self.cfg.training.output_dir, + "two_step", + "predictions", + "two_step_pf", + "test", + f"{self.data_type}_*.parquet", + ) + else: + test_dir = os.path.join(self.cfg.dataset.data_dir, self.task, "test", f"{self.data_type}_*.parquet") + else: + raise ValueError( + f"Unexpected method for testing dataset: {self.cfg.dataset.train_dataset}. Options:\ + 'combined', 'separate']" + ) return test_dir else: raise ValueError(f"Unexpected dataset type: {dataset_type}. Please use 'train' or 'test'.") @@ -171,29 +277,32 @@ def setup(self, stage: str) -> None: if stage == "fit": if self.cfg.dataset.name == "CEPC": train_dir, val_dir = self.get_CEPC_dataset_path(dataset_type="train") - self.train_dataset = RowGroupDataset(data_loc=train_dir) - self.val_dataset = RowGroupDataset(data_loc=val_dir) + self.train_dataset = RowGroupDataset(data_loc=train_dir)[: self.num_row_groups] + self.val_dataset = RowGroupDataset(data_loc=val_dir)[: self.num_row_groups] elif self.cfg.dataset.name == "FCC": train_dir, val_dir = self.get_FCC_dataset_path(dataset_type="train") - self.train_dataset = RowGroupDataset(data_loc=train_dir) - self.val_dataset = RowGroupDataset(data_loc=val_dir) + self.train_dataset = RowGroupDataset(data_loc=train_dir)[: self.num_row_groups] + self.val_dataset = RowGroupDataset(data_loc=val_dir)[: self.num_row_groups] self.train_dataset = self.iter_dataset( dataset=self.train_dataset, + device=self.device, ) self.val_dataset = self.iter_dataset( dataset=self.val_dataset, + device=self.device, ) self.train_loader = DataLoader( self.train_dataset, batch_size=self.cfg.training.dataloader.batch_size, num_workers=self.cfg.training.dataloader.num_dataloader_workers, - prefetch_factor=self.cfg.training.dataloader.prefetch_factor, + # prefetch_factor=self.cfg.training.dataloader.prefetch_factor, ) self.val_loader = DataLoader( self.val_dataset, batch_size=self.cfg.training.dataloader.batch_size, + persistent_workers=True, num_workers=self.cfg.training.dataloader.num_dataloader_workers, - prefetch_factor=self.cfg.training.dataloader.prefetch_factor, + # prefetch_factor=self.cfg.training.dataloader.prefetch_factor, ) elif stage == "test": if self.cfg.dataset.name == "CEPC": @@ -202,15 +311,16 @@ def setup(self, stage: str) -> None: test_dir = self.get_FCC_dataset_path(dataset_type="test") else: raise ValueError(f"Unexpected dataset type: {self.cfg.dataset.name}. Please use 'CEPC' or 'FCC'.") - self.test_dataset = RowGroupDataset(data_loc=test_dir) + self.test_dataset = RowGroupDataset(data_loc=test_dir)[: self.num_row_groups] self.test_dataset = self.iter_dataset( dataset=self.test_dataset, + device=self.device, ) self.test_loader = DataLoader( self.test_dataset, batch_size=self.cfg.training.dataloader.batch_size, num_workers=self.cfg.training.dataloader.num_dataloader_workers, - prefetch_factor=self.cfg.training.dataloader.prefetch_factor, + # prefetch_factor=self.cfg.training.dataloader.prefetch_factor, ) else: raise ValueError(f"Unexpected stage: {stage}") @@ -226,12 +336,12 @@ def test_dataloader(self): class OneStepIterableDataset(BaseIterableDataset): - def __init__(self, dataset: Dataset): - super().__init__(dataset) + def __init__(self, dataset: Dataset, device: str): + super().__init__(dataset, device=device) def build_tensors(self, data: ak.Array): - """ Builds the input and target tensors from the data. - + """Builds the input and target tensors from the data. + Parameters: data : ak.Array The data used to build the tensors. The data is a chunk of the dataset loaded from a .parquet file. @@ -249,11 +359,11 @@ def build_tensors(self, data: ak.Array): class TwoStepPeakFindingIterableDataset(BaseIterableDataset): - def __init__(self, dataset: Dataset): - super().__init__(dataset) + def __init__(self, dataset: Dataset, device: str): + super().__init__(dataset, device=device) def build_tensors(self, data: ak.Array): - """ This iterable dataset is to be used for the first step (peak finding). For this, we have a target for each + """This iterable dataset is to be used for the first step (peak finding). For this, we have a target for each waveform window. When building the tensors, we flatten the waveforms, so we predict one value for each window. We target both primary and secondary peaks, setting a target of 1 for both of them, whereas background has a target of 0. @@ -269,23 +379,23 @@ def build_tensors(self, data: ak.Array): """ waveforms = ak.Array(data.waveform) wf_targets = ak.Array(data.target) - wf_windows = ak.flatten(waveforms, axis=-2) + wf_windows: ak.Array = ak.flatten(waveforms, axis=-2) window_size_mask = ak.num(wf_windows) == 15 - wf_windows = wf_windows[window_size_mask] + wf_windows = wf_windows[window_size_mask] # pylint: disable=unsubscriptable-object wf_targets = ak.values_astype((wf_targets == 1) + (wf_targets == 2), int) - target_windows = ak.flatten(wf_targets, axis=-1) - target_windows = target_windows[window_size_mask] + target_windows: ak.Array = ak.flatten(wf_targets, axis=-1) + target_windows = target_windows[window_size_mask] # pylint: disable=unsubscriptable-object wf_windows = torch.tensor(wf_windows, dtype=torch.float32) target_windows = torch.tensor(target_windows, dtype=torch.float32) - return wf_windows, target_windows # TODO: Unsqueeze? + return wf_windows.unsqueeze(-1), target_windows class TwoStepClusterizationIterableDataset(BaseIterableDataset): - def __init__(self, dataset: Dataset): - super().__init__(dataset) + def __init__(self, dataset: Dataset, device: str): + super().__init__(dataset, device=device) def build_tensors(self, data: ak.Array): - """ This iterable dataset is to be used for the second step (clusterization). + """This iterable dataset is to be used for the second step (clusterization). Here we use the predictions from the first step (peak finding) as input. Parameters: @@ -297,19 +407,19 @@ def build_tensors(self, data: ak.Array): targets : torch.Tensor The target values of the data """ - peaks = ak.Array(data.pred) - targets = ak.sum(data.target == 1, axis = -1) + peaks = ak.Array(data.pred) + targets = ak.sum(data.target == 1, axis=-1) targets = torch.tensor(targets, dtype=torch.float32) peaks = torch.tensor(peaks, dtype=torch.float32) - return peaks, targets + return peaks.unsqueeze(-1), targets class TwoStepMinimalIterableDataset(BaseIterableDataset): - def __init__(self, dataset: Dataset): - super().__init__(dataset) + def __init__(self, dataset: Dataset, device: str): + super().__init__(dataset, device=device) def build_tensors(self, data: ak.Array): - """ This iterable dataset is to be used for the minimal two-step approach,where we only target the primary + """This iterable dataset is to be used for the minimal two-step approach,where we only target the primary peaks with the peak finding. In principle this allows us to skip clusterization step, as we can sum all the predicted peaks. This approach is used for evaluating how much clusterization adds on top of the peak finding. The difference with the vanilla peak-finding in the vanilla two-step approach is, that we use only "primary" @@ -326,62 +436,67 @@ def build_tensors(self, data: ak.Array): """ waveforms = ak.Array(data.waveform) wf_targets = ak.Array(data.target) - wf_windows = ak.flatten(waveforms, axis=-2) + wf_windows: ak.Array = ak.flatten(waveforms, axis=-2) window_size_mask = ak.num(wf_windows) == 15 - wf_windows = wf_windows[window_size_mask] + wf_windows = wf_windows[window_size_mask] # pylint: disable=unsubscriptable-object wf_targets = ak.values_astype((wf_targets == 1), int) - target_windows = ak.flatten(wf_targets, axis=-1) - target_windows = target_windows[window_size_mask] + target_windows: ak.Array = ak.flatten(wf_targets, axis=-1) + target_windows = target_windows[window_size_mask] # pylint: disable=unsubscriptable-object wf_windows = torch.tensor(wf_windows, dtype=torch.float32) target_windows = torch.tensor(target_windows, dtype=torch.float32) - return wf_windows, target_windows # TODO: Unsqueeze? + return wf_windows.unsqueeze(-1), target_windows class TwoStepMinimalDataModule(BaseDataModule): - def __init__(self, cfg: DictConfig, data_type: str): - """ Data module for the minimal two-step approach. This is a simplified version of the two-step approach, + def __init__(self, cfg: DictConfig, data_type: str, debug_run: bool = False, device: str = "cpu"): + """Data module for the minimal two-step approach. This is a simplified version of the two-step approach, where we only target the primary peaks with the peak finding. In principle this allows us to skip clusterization step, as we can sum all the predicted peaks. This approach is used for evaluating how much clusterization adds on top of the peak finding. The difference with the vanilla peak-finding in the vanilla two-step approach is, that we use only "primary" peaks as targets. """ iter_dataset = TwoStepMinimalIterableDataset - super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type) + super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type, debug_run=debug_run, device=device) class TwoStepPeakFindingDataModule(BaseDataModule): - def __init__(self, cfg: DictConfig, data_type: str): - """ Data module for the two-step approach. This is a simplified version of the two-step approach, where we only + def __init__(self, cfg: DictConfig, data_type: str, debug_run: bool = False, device: str = "cpu"): + """Data module for the two-step approach. This is a simplified version of the two-step approach, where we only target the primary peaks with the peak finding. In principle this allows us to skip clusterization step, as we can sum all the predicted peaks. This approach is used for evaluating how much clusterization adds on top of the peak finding. The difference with the vanilla peak-finding in the vanilla two-step approach is, that we use only "primary" peaks as targets. """ iter_dataset = TwoStepPeakFindingIterableDataset - super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type) + super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type, debug_run=debug_run, device=device) class TwoStepClusterizationDataModule(BaseDataModule): - def __init__(self, cfg: DictConfig, data_type: str): - """ Data module for the two-step approach. This is a simplified version of the two-step approach, where we only + def __init__(self, cfg: DictConfig, data_type: str, debug_run: bool = False, device: str = "cpu"): + """Data module for the two-step approach. This is a simplified version of the two-step approach, where we only target the primary peaks with the peak finding. In principle this allows us to skip clusterization step, as we can sum all the predicted peaks. This approach is used for evaluating how much clusterization adds on top of the peak finding. The difference with the vanilla peak-finding in the vanilla two-step approach is, that we use only "primary" peaks as targets. """ iter_dataset = TwoStepClusterizationIterableDataset - super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type) + super().__init__( + cfg=cfg, + iter_dataset=iter_dataset, + data_type=data_type, + debug_run=debug_run, + device=device, + clusterization=True, + ) class OneStepDataModule(BaseDataModule): - def __init__(self, cfg: DictConfig, data_type: str): - """ Data module for the one-step approach. This is a simplified version of the two-step approach, where we only + def __init__(self, cfg: DictConfig, data_type: str, debug_run: bool = False, device: str = "cpu"): + """Data module for the one-step approach. This is a simplified version of the two-step approach, where we only target the primary peaks with the peak finding. In principle this allows us to skip clusterization step, as we can sum all the predicted peaks. This approach is used for evaluating how much clusterization adds on top of the peak finding. The difference with the vanilla peak-finding in the vanilla two-step approach is, that we use only "primary" peaks as targets. """ iter_dataset = OneStepIterableDataset - super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type) - - + super().__init__(cfg=cfg, iter_dataset=iter_dataset, data_type=data_type, debug_run=debug_run, device=device) diff --git a/ml4cc/tools/data/io.py b/ml4cc/tools/data/io.py index bea2cc2..1f0f142 100644 --- a/ml4cc/tools/data/io.py +++ b/ml4cc/tools/data/io.py @@ -4,12 +4,11 @@ import random import numpy as np import awkward as ak -from ml4cc.tools.data import io from torch.utils.data import ConcatDataset, Subset def load_root_file(path: str, tree_path: str = "sim", branches: list = None) -> ak.Array: - """ Loads the CEPC dataset .root file. + """Loads the CEPC dataset .root file. Parameters: path : str @@ -29,9 +28,9 @@ def load_root_file(path: str, tree_path: str = "sim", branches: list = None) -> return arrays -def get_all_paths(input_loc, n_files: int = None, columns: list = None) -> list: - """Loads all .parquet files specified by the input. The input can be a list of input_paths, a directory where the files - are located or a wildcard path. +def get_all_paths(input_loc, n_files: int = None) -> list: + """Loads all .parquet files specified by the input. The input can be a list of input_paths, a directory where the + files are located or a wildcard path. Parameters: input_loc : str @@ -39,8 +38,8 @@ def get_all_paths(input_loc, n_files: int = None, columns: list = None) -> list: n_files : int [default: None] Maximum number of input files to be loaded. By default all will be loaded. columns : list - [default: None] Names of the columns/branches to be loaded from the .parquet file. By default all columns will - be loaded + [default: None] Names of the columns/branches to be loaded from the .parquet file. By default all columns + will be loaded Returns: input_paths : list @@ -59,14 +58,15 @@ def get_all_paths(input_loc, n_files: int = None, columns: list = None) -> list: elif os.path.isfile(input_loc): input_paths = [input_loc] else: - raise ValueError(f"Unexpected input_loc") + raise ValueError(f"Unexpected input_loc: {input_loc}") else: - raise ValueError(f"Unexpected input_loc") + raise ValueError(f"Unexpected input_loc: {input_loc}") return input_paths def get_row_groups(input_paths: list) -> list: - """Get the row groups of the input files. The row groups are used to split the data into smaller chunks for processing. + """Get the row groups of the input files. The row groups are used to split the data into smaller chunks for + processing. Parameters: input_paths : list @@ -82,7 +82,7 @@ def get_row_groups(input_paths: list) -> list: num_row_groups = metadata["num_row_groups"] col_counts = metadata["col_counts"] row_groups.extend( - [io.RowGroup(data_path, row_group, col_counts[row_group]) for row_group in range(num_row_groups)] + [RowGroup(data_path, row_group, col_counts[row_group]) for row_group in range(num_row_groups)] ) return row_groups @@ -93,8 +93,9 @@ def save_array_to_file(data: ak.Array, output_path: str) -> None: class RowGroup: - """Class to represent a row group in a .parquet file. The row group is used to split the data into smaller chunks for - processing.""" + """Class to represent a row group in a .parquet file. The row group is used to split the data into smaller chunks + for processing.""" + def __init__(self, filename, row_group, num_rows): """Initializes the row group. Parameters: @@ -110,7 +111,6 @@ def __init__(self, filename, row_group, num_rows): self.num_rows = num_rows - def train_val_split_shuffle( concat_dataset: ConcatDataset, val_split: float = 0.2, diff --git a/ml4cc/tools/data/postprocessing.py b/ml4cc/tools/data/postprocessing.py index 7e50317..866e96e 100644 --- a/ml4cc/tools/data/postprocessing.py +++ b/ml4cc/tools/data/postprocessing.py @@ -1 +1,2 @@ -# TODO: Postprocessing is needed for the two-step approach, as the results from peak-finding are fed into the clusterization step. \ No newline at end of file +# TODO: Postprocessing is needed for the two-step approach, as the results +# from peak-finding are fed into the clusterization step. diff --git a/ml4cc/tools/data/preprocessing.py b/ml4cc/tools/data/preprocessing.py index e47c5bf..c2ca867 100644 --- a/ml4cc/tools/data/preprocessing.py +++ b/ml4cc/tools/data/preprocessing.py @@ -6,7 +6,9 @@ from omegaconf import DictConfig -def save_processed_data(arrays: ak.Array, path: str, cfg: DictConfig, data_type: str = "one_step", dataset: str = "") -> None: +def save_processed_data( + arrays: ak.Array, path: str, cfg: DictConfig, data_type: str = "one_step", dataset: str = "" +) -> None: """ Parameters: arrays: ak.Array @@ -33,7 +35,7 @@ def save_processed_data(arrays: ak.Array, path: str, cfg: DictConfig, data_type: def indices_to_booleans(indices: ak.Array, array_to_slice: ak.Array) -> ak.Array: - """ Creates boolean array from indices for masking 'array_to_slice' + """Creates boolean array from indices for masking 'array_to_slice' Parameters: indices : ak.Array @@ -50,8 +52,8 @@ def indices_to_booleans(indices: ak.Array, array_to_slice: ak.Array) -> ak.Array def train_val_test_split(arrays: ak.Array, cfg: DictConfig, test_also: bool = True) -> tuple: - """ Splits the data into train, val and test sets. - + """Splits the data into train, val and test sets. + Parameters: arrays : ak.Array The array to be split into train, val and test sets @@ -70,24 +72,26 @@ def train_val_test_split(arrays: ak.Array, cfg: DictConfig, test_also: bool = Tr random.seed(42) random.shuffle(indices) if test_also: - num_train_rows = int(np.ceil(total_len*cfg.train_val_test_split[0])) - num_val_rows = int(np.ceil(total_len*cfg.train_val_test_split[1])) + num_train_rows = int(np.ceil(total_len * cfg.train_val_test_split[0])) + num_val_rows = int(np.ceil(total_len * cfg.train_val_test_split[1])) train_indices = indices[:num_train_rows] - val_indices = indices[num_train_rows: num_train_rows + num_val_rows] - test_indices = indices[num_train_rows + num_val_rows:] + val_indices = indices[num_train_rows : num_train_rows + num_val_rows] + test_indices = indices[num_train_rows + num_val_rows :] else: frac_train_raw = cfg.train_val_test_split[0] frac_val_raw = cfg.train_val_test_split[1] frac_train = frac_train_raw / (frac_train_raw + frac_val_raw) frac_val = frac_val_raw / (frac_train_raw + frac_val_raw) - train_indices = indices[:int(np.ceil(total_len*frac_train))] - val_indices = indices[int(np.ceil(total_len*frac_train)):int(np.ceil(total_len*(frac_train + frac_val)))] + train_indices = indices[: int(np.ceil(total_len * frac_train))] + val_indices = indices[int(np.ceil(total_len * frac_train)) : int(np.ceil(total_len * (frac_train + frac_val)))] test_indices = None return train_indices, test_indices, val_indices -def save_train_val_test_data(arrays, path, train_indices, val_indices, test_indices, cfg: DictConfig, data_type: str = "one_step") -> None: - """ Saves the split data into train, val and test sets. +def save_train_val_test_data( + arrays, path, train_indices, val_indices, test_indices, cfg: DictConfig, data_type: str = "one_step" +) -> None: + """Saves the split data into train, val and test sets. Parameters: arrays : ak.Array @@ -116,7 +120,7 @@ def save_train_val_test_data(arrays, path, train_indices, val_indices, test_indi def process_onestep_root_file(path: str, cfg: DictConfig) -> None: - """ Processes the .root file into a more ML friendly format for one-step training. + """Processes the .root file into a more ML friendly format for one-step training. Parameters: path : str @@ -128,23 +132,24 @@ def process_onestep_root_file(path: str, cfg: DictConfig) -> None: None """ arrays = io.load_root_file(path=path, tree_path=cfg.dataset.tree_path, branches=cfg.dataset.branches) - primary_ionization_mask = indices_to_booleans(arrays['time'][arrays['tag'] == 1], arrays['wf_i']) - secondary_ionization_mask = indices_to_booleans(arrays['time'][arrays['tag'] == 2], arrays['wf_i']) + primary_ionization_mask = indices_to_booleans(arrays["time"][arrays["tag"] == 1], arrays["wf_i"]) + secondary_ionization_mask = indices_to_booleans(arrays["time"][arrays["tag"] == 2], arrays["wf_i"]) target = (primary_ionization_mask * 1) + (secondary_ionization_mask * 2) - processed_array = ak.Array({ - "waveform": arrays['wf_i'], - "target": target - }) + processed_array = ak.Array({"waveform": arrays["wf_i"], "target": target}) if cfg.dataset.name == "FCC": train_indices, test_indices, val_indices = train_val_test_split(arrays, cfg.preprocessing) - save_train_val_test_data(processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="one_step") + save_train_val_test_data( + processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="one_step" + ) elif cfg.dataset.name == "CEPC": if "test" in path: save_processed_data(processed_array, path, data_type="one_step", cfg=cfg) elif "train" in path: train_indices, test_indices, val_indices = train_val_test_split(arrays, cfg.preprocessing, test_also=False) - save_train_val_test_data(processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="one_step") + save_train_val_test_data( + processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="one_step" + ) else: raise ValueError(f"Unknown dataset in path: {path}") else: @@ -152,7 +157,7 @@ def process_onestep_root_file(path: str, cfg: DictConfig) -> None: def process_twostep_root_file(path: str, cfg: DictConfig, nleft: int = 5, nright: int = 9) -> None: - """ Processes the peakfinding .root file into the format Guang used for 2-step training. + """Processes the peakfinding .root file into the format Guang used for 2-step training. Parameters: path : str @@ -175,27 +180,34 @@ def process_twostep_root_file(path: str, cfg: DictConfig, nleft: int = 5, nright target_window = [] peak_indices = np.array(arrays.time[event_idx], dtype=int) for peak_idx, peak_loc in enumerate(peak_indices): - if peak_loc < nleft: continue - wf_window = arrays.wf_i[event_idx][peak_loc - nleft: peak_loc + nright + 1] + if peak_loc < nleft: + continue + wf_window = arrays.wf_i[event_idx][peak_loc - nleft : peak_loc + nright + 1] target = arrays.tag[event_idx][peak_idx] wf_windows.append(wf_window) target_window.append(target) all_targets.append(target_window) all_windows.append(wf_windows) - processed_array = ak.Array({ - "target": all_targets, - "waveform": all_windows, - "wf_i": arrays.wf_i, - }) + processed_array = ak.Array( + { + "target": all_targets, + "waveform": all_windows, + "wf_i": arrays.wf_i, + } + ) if cfg.dataset.name == "FCC": train_indices, test_indices, val_indices = train_val_test_split(arrays, cfg.preprocessing) - save_train_val_test_data(processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="two_step") + save_train_val_test_data( + processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="two_step" + ) elif cfg.dataset.name == "CEPC": if "test" in path: save_processed_data(processed_array, path, data_type="two_step", cfg=cfg) elif "train" in path: train_indices, test_indices, val_indices = train_val_test_split(arrays, cfg.preprocessing, test_also=False) - save_train_val_test_data(processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="two_step") + save_train_val_test_data( + processed_array, path, train_indices, val_indices, test_indices, cfg=cfg, data_type="two_step" + ) else: raise ValueError(f"Unknown dataset in path: {path}") else: diff --git a/ml4cc/tools/data/slurm_tools.py b/ml4cc/tools/data/slurm_tools.py index 0b1516c..02cf169 100644 --- a/ml4cc/tools/data/slurm_tools.py +++ b/ml4cc/tools/data/slurm_tools.py @@ -5,7 +5,7 @@ def generate_run_id(run_id_len=10): - """ Creates a random alphanumeric string with a length of run_id_len + """Creates a random alphanumeric string with a length of run_id_len Args: run_id_len : int @@ -15,11 +15,11 @@ def generate_run_id(run_id_len=10): random_string : str The randomly generated alphanumeric string """ - return ''.join(random.choices(string.ascii_uppercase + string.digits, k=run_id_len)) + return "".join(random.choices(string.ascii_uppercase + string.digits, k=run_id_len)) def create_tmp_run_dir(cfg): - """ Creates the temporary directory where the slurm job, log and error files are saved for each job + """Creates the temporary directory where the slurm job, log and error files are saved for each job Args: cfg : DictConfig @@ -36,11 +36,11 @@ def create_tmp_run_dir(cfg): def prepare_job_file( - input_file, - job_idx, - output_dir, - run_script, - cfg, + input_file, + job_idx, + output_dir, + run_script, + cfg, ): """Writes the job file that will be executed by slurm @@ -68,17 +68,18 @@ def prepare_job_file( os.makedirs(error_dir, exist_ok=True) log_dir = os.path.join(output_dir, "log_files") os.makedirs(log_dir, exist_ok=True) - job_file = os.path.join(job_dir, 'execute' + str(job_idx) + '.sh') - error_file = os.path.join(error_dir, 'error' + str(job_idx)) - log_file = os.path.join(log_dir, 'output' + str(job_idx)) - with open(job_file, 'wt') as filehandle: - filehandle.writelines(dedent( - f""" + job_file = os.path.join(job_dir, "execute" + str(job_idx) + ".sh") + error_file = os.path.join(error_dir, "error" + str(job_idx)) + log_file = os.path.join(log_dir, "output" + str(job_idx)) + with open(job_file, "wt") as filehandle: + filehandle.writelines( + dedent( + f""" #!/bin/bash #SBATCH --job-name=ntupelizer #SBATCH --ntasks={cfg.slurm.queue.preprocessing.cpus} #SBATCH --partition={cfg.slurm.queue.preprocessing.partition} - #SBATCH --time=02:00:00 + #SBATCH --time={cfg.slurm.queue.preprocessing.time} #SBATCH --cpus-per-task=1 #SBATCH --mem-per-cpu={cfg.slurm.queue.preprocessing.mem} #SBATCH -e {error_file} @@ -86,7 +87,9 @@ def prepare_job_file( env date ./run.sh python {run_script} preprocessing.slurm.slurm_run=True preprocessing.slurm.input_path={input_file} - """).strip('\n')) + """ + ).strip("\n") + ) return job_file @@ -94,7 +97,9 @@ def multipath_slurm_processor(input_path_chunks, job_script, cfg): output_dir = create_tmp_run_dir(cfg=cfg) input_file_paths = create_job_input_list(input_path_chunks, output_dir) for job_idx, input_file_path in enumerate(input_file_paths): - prepare_job_file(input_file=input_file_path, job_idx=job_idx, output_dir=output_dir, run_script=job_script, cfg=cfg) + prepare_job_file( + input_file=input_file_path, job_idx=job_idx, output_dir=output_dir, run_script=job_script, cfg=cfg + ) print(f"Temporary directory created to {output_dir}") print(f"Run `bash ml4cc/scripts/submit_batchJobs.sh {output_dir}/executables/`") @@ -103,7 +108,7 @@ def create_job_input_list(input_path_chunks: list, output_dir: str): input_file_paths = [] for i, in_chunk in enumerate(input_path_chunks): input_file_path = os.path.join(output_dir, f"input_paths_{i}.txt") - with open(input_file_path, 'wt') as outFile: + with open(input_file_path, "wt") as outFile: for path in in_chunk: outFile.write(path + "\n") input_file_paths.append(input_file_path) diff --git a/ml4cc/tools/evaluation/clusterization.py b/ml4cc/tools/evaluation/clusterization.py index 4e448d2..2999990 100644 --- a/ml4cc/tools/evaluation/clusterization.py +++ b/ml4cc/tools/evaluation/clusterization.py @@ -9,6 +9,7 @@ from ml4cc.tools.data import io from ml4cc.tools.visualization import losses as l from ml4cc.tools.visualization import regression as r + hep.style.use(hep.styles.CMS) DEVICE = "cuda" if torch.cuda.is_available() else "cpu" @@ -16,11 +17,12 @@ def filter_losses(metrics_path: str): metrics_data = pd.read_csv(metrics_path) - val_loss = np.array(metrics_data['val_loss']) + val_loss = np.array(metrics_data["val_loss"]) # train_loss = np.array(metrics_data['train_loss']) val_loss = val_loss[~np.isnan(val_loss)] # train_loss = train_loss[~np.isnan(train_loss)] - return val_loss#, train_loss + return val_loss # , train_loss + def evaluate_training(model, dataloader, metrics_path, cfg): all_true = [] @@ -44,11 +46,13 @@ def evaluate_training(model, dataloader, metrics_path, cfg): prediction_save = ak.Array(prediction_save) waveform_save = ak.Array(waveform_save) true_save = ak.Array(true_save) - pred_file_data = ak.Array({ - "detected_peaks": prediction_save, - "waveform": waveform_save, - "target": true_save, - }) + pred_file_data = ak.Array( + { + "detected_peaks": prediction_save, + "waveform": waveform_save, + "target": true_save, + } + ) pred_file_path = os.path.join(cfg.training.output_dir, "predictions.parquet") io.save_array_to_file(data=pred_file_data, output_path=pred_file_path) diff --git a/ml4cc/tools/evaluation/general.py b/ml4cc/tools/evaluation/general.py index d0bb843..30471a7 100644 --- a/ml4cc/tools/evaluation/general.py +++ b/ml4cc/tools/evaluation/general.py @@ -4,8 +4,8 @@ def filter_losses(metrics_path: str): metrics_data = pd.read_csv(metrics_path) - val_loss = np.array(metrics_data['val_loss']) + val_loss = np.array(metrics_data["val_loss"]) # train_loss = np.array(metrics_data['train_loss']) val_loss = val_loss[~np.isnan(val_loss)] # train_loss = train_loss[~np.isnan(train_loss)] - return val_loss#, train_loss + return val_loss # , train_loss diff --git a/ml4cc/tools/evaluation/one_step.py b/ml4cc/tools/evaluation/one_step.py index 1f756ad..c7ff743 100644 --- a/ml4cc/tools/evaluation/one_step.py +++ b/ml4cc/tools/evaluation/one_step.py @@ -1,51 +1,27 @@ import os -import tqdm -import pandas as pd -import numpy as np +import glob import awkward as ak from ml4cc.tools.data import io from ml4cc.tools.visualization import losses as l from ml4cc.tools.visualization import regression as r +from ml4cc.tools.evaluation import general as g - -def filter_losses(metrics_path: str): - metrics_data = pd.read_csv(metrics_path) - val_loss = np.array(metrics_data['val_loss']) - # train_loss = np.array(metrics_data['train_loss']) - val_loss = val_loss[~np.isnan(val_loss)] - # train_loss = train_loss[~np.isnan(train_loss)] - return val_loss#, train_loss - - -def evaluate_training(model, dataloader, metrics_path, cfg): +def evaluate_training(cfg, metrics_path): + results_dir = cfg.training.results_dir + os.makedirs(results_dir, exist_ok=True) + predictions_dir = cfg.training.predictions_dir + test_dir = os.path.join(predictions_dir, "test") + test_wcp = glob.glob(os.path.join(test_dir, "*")) all_true = [] all_preds = [] - all_waveforms = [] - print("Prediction progress for TEST dataset") - for batch_idx, batch in tqdm.tqdm(enumerate(dataloader), total=len(dataloader)): - waveform, target = batch - pred = model(batch)[0] - all_true.append(target.detach().cpu().numpy()) - all_preds.append(pred.detach().cpu().numpy()) - all_waveforms.append(waveform.detach().cpu().numpy()) - pred_file_data = ak.Array({ - "detected_peaks": all_preds, - "waveform": all_waveforms, - "target": all_true, - }) - results_dir = os.path.join(cfg.training.output_dir, "results") - os.makedirs(results_dir, exist_ok=True) - pred_file_path = os.path.join(results_dir, "predictions.parquet") - io.save_array_to_file(data=pred_file_data, output_path=pred_file_path) + for path in test_wcp: + data = ak.from_parquet(path) + all_true.append(ak.sum(data.target, axis=-1)) + all_preds.append(data.pred) truth = ak.flatten(all_true, axis=None) preds = ak.flatten(all_preds, axis=None) - # cl.plot_classification( - # truth=ak.flatten(all_true, axis=None), - # preds=ak.flatten(all_preds, axis=None), - # output_dir=results_dir - # ) resolution_output_path = os.path.join(results_dir, "resolution.pdf") r.evaluate_resolution(truth, preds, output_path=resolution_output_path) @@ -53,7 +29,6 @@ def evaluate_training(model, dataloader, metrics_path, cfg): distribution_output_path = os.path.join(results_dir, "true_pred_distributions.pdf") r.plot_true_pred_distributions(truth, preds, output_path=distribution_output_path) - # val_loss, train_loss = filter_losses(metrics_path) - val_loss = filter_losses(metrics_path) + val_loss = g.filter_losses(metrics_path) losses_output_path = os.path.join(cfg.training.output_dir, "losses.png") - l.plot_loss_evolution(val_loss=val_loss, train_loss=None, output_path=losses_output_path) \ No newline at end of file + l.plot_loss_evolution(val_loss=val_loss, train_loss=None, output_path=losses_output_path) diff --git a/ml4cc/tools/evaluation/peakFinding.py b/ml4cc/tools/evaluation/peakFinding.py index 5615aac..342c449 100644 --- a/ml4cc/tools/evaluation/peakFinding.py +++ b/ml4cc/tools/evaluation/peakFinding.py @@ -10,6 +10,7 @@ from omegaconf import DictConfig from ml4cc.tools.visualization import losses as l from ml4cc.tools.visualization import classification as cl + hep.style.use(hep.styles.CMS) @@ -18,11 +19,11 @@ def filter_losses(metrics_path: str): metrics_data = pd.read_csv(metrics_path) - val_loss = np.array(metrics_data['val_loss']) + val_loss = np.array(metrics_data["val_loss"]) # train_loss = np.array(metrics_data['train_loss']) val_loss = val_loss[~np.isnan(val_loss)] # train_loss = train_loss[~np.isnan(train_loss)] - return val_loss#, train_loss + return val_loss # , train_loss def create_pred_values(preds: np.array, cfg: DictConfig): @@ -31,9 +32,10 @@ def create_pred_values(preds: np.array, cfg: DictConfig): zero_count = int((window_size - 1) / 2) for pred in preds: if pred > 0.5: # If detected peak - pred_vector.extend([0]*zero_count + [1] + [0]*zero_count) # So the middle of the window is the location of the peak + # So the middle of the window is the location of the peak + pred_vector.extend([0] * zero_count + [1] + [0] * zero_count) else: - pred_vector.extend([0]*window_size) + pred_vector.extend([0] * window_size) return np.array(pred_vector) @@ -43,9 +45,9 @@ def create_true_values(true: np.array, cfg: DictConfig): true_vector = [] for t in true: if t > 0.5: - true_vector.extend([0]*zero_count + [1] + [0]*zero_count) + true_vector.extend([0] * zero_count + [1] + [0] * zero_count) else: - true_vector.extend([0]*window_size) + true_vector.extend([0] * window_size) return np.array(true_vector) @@ -72,11 +74,13 @@ def evaluate_training(model, dataloader, metrics_path, cfg, output_dir=""): prediction_save = ak.Array(prediction_save) waveform_save = ak.Array(waveform_save) true_save = ak.Array(true_save) - pred_file_data = ak.Array({ - "detected_peaks": prediction_save, - "waveform": waveform_save, - "target": true_save, - }) + pred_file_data = ak.Array( + { + "detected_peaks": prediction_save, + "waveform": waveform_save, + "target": true_save, + } + ) if output_dir == "": output_dir = cfg.training.output_dir pred_file_path = os.path.join(output_dir, "predictions.parquet") @@ -98,9 +102,9 @@ def plot_prediction_v_true(wfs, vals, window_size=15): wf = np.concatenate(wfs.squeeze().numpy()) plt.plot(np.arange(len(wf)), wf) for i, x in enumerate(vals): - loc = (window_size/2) + i * window_size + loc = (window_size / 2) + i * window_size if x > 0.5: - plt.axvline(loc, ymax=4, linestyle='--', color='red') + plt.axvline(loc, ymax=4, linestyle="--", color="red") plt.ylabel("Amplitude") plt.xlabel("Time") plt.grid() diff --git a/ml4cc/tools/evaluation/two_step.py b/ml4cc/tools/evaluation/two_step.py index e69de29..c975421 100644 --- a/ml4cc/tools/evaluation/two_step.py +++ b/ml4cc/tools/evaluation/two_step.py @@ -0,0 +1,5 @@ +from omegaconf import DictConfig + + +def evaluate_training(cfg: DictConfig, metrics_path: str, stage: str): + pass diff --git a/ml4cc/tools/evaluation/two_step_minimal.py b/ml4cc/tools/evaluation/two_step_minimal.py index e69de29..3eabbb4 100644 --- a/ml4cc/tools/evaluation/two_step_minimal.py +++ b/ml4cc/tools/evaluation/two_step_minimal.py @@ -0,0 +1,5 @@ +from omegaconf import DictConfig + + +def evaluate_training(cfg: DictConfig, metrics_path: str): + pass diff --git a/ml4cc/tools/general.py b/ml4cc/tools/general.py index a64055a..6c40356 100644 --- a/ml4cc/tools/general.py +++ b/ml4cc/tools/general.py @@ -3,7 +3,7 @@ def print_config(cfg: DictConfig) -> None: - """ Prints the configuration used for the processing + """Prints the configuration used for the processing Parameters: cfg : DictConfig diff --git a/ml4cc/tools/visualization/classification.py b/ml4cc/tools/visualization/classification.py index a0dc049..ff356eb 100644 --- a/ml4cc/tools/visualization/classification.py +++ b/ml4cc/tools/visualization/classification.py @@ -3,11 +3,12 @@ import mplhep as hep import matplotlib.pyplot as plt from sklearn.metrics import roc_curve, roc_auc_score + hep.style.use(hep.styles.CMS) -def plot_roc_curve(truth: np.array, preds: np.array, output_path: str='') -> None: - """ Plots the ROC curve based on the true and predicted values +def plot_roc_curve(truth: np.array, preds: np.array, output_path: str = "") -> None: + """Plots the ROC curve based on the true and predicted values Parameters: truth : np.array @@ -27,14 +28,14 @@ def plot_roc_curve(truth: np.array, preds: np.array, output_path: str='') -> Non plt.ylim(0, 1) plt.xlabel("FPR") plt.ylabel("TPR") - plt.figtext(0.5, 0.5, f'AUC={auc_value:.4f}') - if output_path != '': - plt.savefig(output_path, bbox_inches='tight') + plt.figtext(0.5, 0.5, f"AUC={auc_value:.4f}") + if output_path != "": + plt.savefig(output_path, bbox_inches="tight") plt.close("all") -def plot_classifier_scores_distribution(truth: np.array, preds: np.array, output_path: str='') -> None: - """ Plots the distribution of classifier scores based on the true and predicted values +def plot_classifier_scores_distribution(truth: np.array, preds: np.array, output_path: str = "") -> None: + """Plots the distribution of classifier scores based on the true and predicted values Parameters: truth : np.array @@ -50,18 +51,18 @@ def plot_classifier_scores_distribution(truth: np.array, preds: np.array, output bkg_idx = truth == 0 sig_idx = truth == 1 bins = np.linspace(0, 1, num=25) - plt.hist(preds[bkg_idx], color='r', label='BKG', histtype='step', density=True, bins=bins) - plt.hist(preds[sig_idx], color='b', label='SIG', histtype='step', density=True, bins=bins) + plt.hist(preds[bkg_idx], color="r", label="BKG", histtype="step", density=True, bins=bins) + plt.hist(preds[sig_idx], color="b", label="SIG", histtype="step", density=True, bins=bins) plt.legend() plt.xlabel(r"$\mathcal{D}_p$") plt.ylabel("Count [a.u.]") - if output_path != '': + if output_path != "": plt.savefig(output_path) plt.close("all") -def plot_classification(truth: np.array, preds: np.array, output_dir: str='') -> None: - """ Plots the distribution of classifier scores based on the true and predicted values +def plot_classification(truth: np.array, preds: np.array, output_dir: str = "") -> None: + """Plots the distribution of classifier scores based on the true and predicted values Parameters: truth : np.array diff --git a/ml4cc/tools/visualization/dataset.py b/ml4cc/tools/visualization/dataset.py new file mode 100644 index 0000000..db42727 --- /dev/null +++ b/ml4cc/tools/visualization/dataset.py @@ -0,0 +1,82 @@ +import numpy as np +import mplhep as hep +import matplotlib.pyplot as plt + +hep.style.use(hep.styles.CMS) + + +def collect_energy_wise_info(particle_type_info, peak_type): + plotting_info = {"energy": [], "mean_values": [], "min_values": [], "max_values": [], "stdev": []} + for energy, energy_info in particle_type_info.items(): + if energy_info == {}: + continue + plotting_info["energy"].append(energy) + plotting_info["mean_values"].append(energy_info[f"{peak_type}_peak_info"]["mean"]) + plotting_info["min_values"].append( + energy_info[f"{peak_type}_peak_info"]["mean"] - energy_info[f"{peak_type}_peak_info"]["stdev"] + ) + plotting_info["max_values"].append( + energy_info[f"{peak_type}_peak_info"]["mean"] + energy_info[f"{peak_type}_peak_info"]["stdev"] + ) + plotting_info["stdev"].append(energy_info[f"{peak_type}_peak_info"]["stdev"]) + return plotting_info + + +def visualize_num_peaks(full_info, output_path: str, peak_type="primary", errorband=True): + fig, ax = plt.subplots(figsize=(10, 10)) + p_name_mapping = {"muon": r"$\mu^{\pm}$", "K": r"$K^{\pm}$", "pi": r"$\pi^{\pm}$"} + marker_mapping = {"CEPC": "v", "FCC": "^"} + color_map = {"K": "g", "pi": "r", "muon": "b"} + for experiment_name, experiment_info in full_info.items(): + for particle_type, particle_type_info in experiment_info.items(): + plotting_info = collect_energy_wise_info(particle_type_info, peak_type) + plt.plot( + plotting_info["energy"], + plotting_info["mean_values"], + ls="-", + marker=marker_mapping[experiment_name], + label=f"{experiment_name}: {p_name_mapping[particle_type]}", + color=color_map[particle_type], + ) + if errorband: + ax.fill_between( + x=plotting_info["energy"], + y1=plotting_info["min_values"], + y2=plotting_info["max_values"], + color=color_map[particle_type], + alpha=0.3, + ) + plt.legend() + plt.yscale("log") + plt.xscale("log") + fig.savefig(output_path) + + +def visualize_primary_v_secondary_peaks_2d_histogram( + full_info: dict, experiment: str, particle_type: str, energy: float, output_path: str, max_peaks: int = 28 +): + x = np.array(full_info[experiment][particle_type][energy]["raw_num_primary"]) + y = np.array(full_info[experiment][particle_type][energy]["raw_num_secondary"]) + + fig = plt.figure(figsize=(16, 16)) + grid = plt.GridSpec(4, 4, hspace=0.0, wspace=0.0) + + main_ax = fig.add_subplot(grid[1:, :-1]) + y_hist = fig.add_subplot(grid[1:, -1], sharey=main_ax) + x_hist = fig.add_subplot(grid[0, :-1], sharex=main_ax) + + xbins = ybins = np.linspace(0, max_peaks + 1, max_peaks) + H, x_edges, y_edges = np.histogram2d(x, y, bins=xbins, density=None, weights=None) + hep.hist2dplot(H, x_edges, y_edges, ax=main_ax, cbar=False, cmap="Greys") + + main_ax.set_xlabel("Number of Primary Peaks") + main_ax.set_ylabel("Number of Secondary Peaks") + + x_hist.hist(x, bins=xbins, color="gray", histtype="step") + x_hist.tick_params(top=True, labeltop=False, bottom=True, labelbottom=False) + x_hist.tick_params(left=True, labelleft=False, right=True, labelright=True) + + y_hist.hist(y, bins=ybins, orientation="horizontal", color="gray", histtype="step") + y_hist.tick_params(top=True, labeltop=True, bottom=True, labelbottom=False) + y_hist.tick_params(left=True, labelleft=False, right=True, labelright=False) + plt.savefig(output_path) diff --git a/ml4cc/tools/visualization/losses.py b/ml4cc/tools/visualization/losses.py index d41975d..47cad6b 100644 --- a/ml4cc/tools/visualization/losses.py +++ b/ml4cc/tools/visualization/losses.py @@ -1,16 +1,12 @@ import numpy as np import mplhep as hep import matplotlib.pyplot as plt + hep.style.use(hep.styles.CMS) -def plot_loss_evolution( - val_loss: np.array, - train_loss: np.array, - output_path: str = "", - loss_name: str = "MSE" -): - """ Plots the evolution of train and validation loss. +def plot_loss_evolution(val_loss: np.array, train_loss: np.array, output_path: str = "", loss_name: str = "MSE"): + """Plots the evolution of train and validation loss. Parameters: val_loss : np.array @@ -26,16 +22,16 @@ def plot_loss_evolution( None """ # if multirun case? - plt.plot(val_loss, label="val_loss", color='k') + plt.plot(val_loss, label="val_loss", color="k") if train_loss is not None: - plt.plot(train_loss, label="train_loss", ls="--", color='k') + plt.plot(train_loss, label="train_loss", ls="--", color="k") plt.grid() - plt.yscale('log') - plt.ylabel(f'{loss_name} loss [a.u.]') - plt.xlabel('epoch') + plt.yscale("log") + plt.ylabel(f"{loss_name} loss [a.u.]") + plt.xlabel("epoch") plt.xlim(0, len(val_loss)) plt.legend() plt.savefig(output_path) - if output_path != '': - plt.savefig(output_path, bbox_inches='tight') + if output_path != "": + plt.savefig(output_path, bbox_inches="tight") plt.close("all") diff --git a/ml4cc/tools/visualization/regression.py b/ml4cc/tools/visualization/regression.py index e1327f5..38a55f2 100644 --- a/ml4cc/tools/visualization/regression.py +++ b/ml4cc/tools/visualization/regression.py @@ -2,17 +2,19 @@ import mplhep as hep import boost_histogram as bh import matplotlib.pyplot as plt + hep.style.use(hep.styles.CMS) def calculate_resolution(truth: np.array, preds: np.array) -> np.array: - """ We calculate the resolution as IQR/median as stdev is more affected by outliers""" - ratios = truth/preds + """We calculate the resolution as IQR/median as stdev is more affected by outliers""" + ratios = truth / preds iqr = np.quantile(ratios, 0.75) - np.quantile(ratios, 0.25) median = np.quantile(ratios, 0.5) resolution = iqr / median return resolution, ratios + def to_bh(data, bins, cumulative=False): h1 = bh.Histogram(bh.axis.Variable(bins)) h1.fill(data) @@ -25,23 +27,21 @@ def evaluate_resolution(truth: np.array, preds: np.array, output_path: str) -> N resolution, ratios = calculate_resolution(truth, preds) bins = np.linspace(0.5, 1.5, 101) hep.histplot(to_bh(ratios, bins=bins), ax=plt.gca(), density=True) - plt.axvline(x=1, ls='--') - plt.figtext(0.5, 0.5, f'IQR={resolution:.4f}') + plt.axvline(x=1, ls="--") + plt.figtext(0.5, 0.5, f"IQR={resolution:.4f}") plt.xlabel(r"$n_{cls}^{true}/n_{cls}^{pred}$") - plt.savefig(output_path, bbox_inches='tight') + plt.savefig(output_path, bbox_inches="tight") plt.close("all") def plot_true_pred_distributions(truth: np.array, preds: np.array, output_path: str) -> None: bins = np.linspace( - start=np.min(np.concatenate((truth, preds))), - stop=np.max(np.concatenate((truth, preds))), - num=25 + start=np.min(np.concatenate((truth, preds))), stop=np.max(np.concatenate((truth, preds))), num=25 ) - plt.hist(preds, bins=bins, histtype='step', label="Reconstructed", density=True) - plt.hist(truth, bins=bins, histtype='step', label="Target", density=True) + plt.hist(preds, bins=bins, histtype="step", label="Reconstructed", density=True) + plt.hist(truth, bins=bins, histtype="step", label="Target", density=True) plt.xlabel("Number of primary clusters") plt.ylabel("Number entries [a.u.]") plt.legend() - plt.savefig(output_path, bbox_inches='tight') - plt.close('all') + plt.savefig(output_path, bbox_inches="tight") + plt.close("all") diff --git a/ml4cc/tools/visualization/waveform.py b/ml4cc/tools/visualization/waveform.py index 7d6bf0e..c12d634 100644 --- a/ml4cc/tools/visualization/waveform.py +++ b/ml4cc/tools/visualization/waveform.py @@ -6,19 +6,19 @@ def plot_waveform( - waveform: np.array = None, - height: np.array = None, - time: np.array = None, - figsize: tuple = (16, 9), - x_label: str = "Time", - y_label: str = "Amplitude" + waveform: np.array = None, + height: np.array = None, + time: np.array = None, + figsize: tuple = (16, 9), + x_label: str = "Time", + y_label: str = "Amplitude", ): fig, ax = plt.subplots(figsize=figsize) if waveform is not None: time_ = np.arange(len(waveform)) - ax.plot(time_, waveform, label='waveform') + ax.plot(time_, waveform, label="waveform") elif (height is not None) and (time is not None): - ax.plot(time, height, label='waveform') + ax.plot(time, height, label="waveform") else: raise ValueError("Please provide either the waveform or height and time.") ax.set_xlabel(x_label)