Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

### [Latest]

- Allow configurable virtual dataset creation directory [#117](https://github.com/umami-hep/umami-preprocessing/pull/117)

- Add campaign year reweighting/resampling to Xbb configs [#118](https://github.com/umami-hep/umami-preprocessing/pull/118)

### [v0.3.0](https://github.com/umami-hep/umami-preprocessing/releases/tag/v0.3.0) (23.02.2026)
Expand Down
5 changes: 5 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ Below is an example and a table explaining each setting.
|`pattern`|`str` or `list[str]`| A single pattern or a list of patterns that match h5 files in a downloaded dataset. H5 files matching each pattern will be transparently merged using virtual datasets. | *Required* |
|`equal_jets`|`bool`| Only relevant when providing a list of patterns. If `True`, the same number of jets from each DSID are selected. This is required e.g. for Xbb QCD, where each DSID belongs to a different slice and the resampling would break if you tried to resample with one or more slices missing. If `False`, this is not enforced, allowing for larger numbers of available jets. | `True` |

The virtual dataset files created from wildcard patterns are by default stored alongside the input ntuples.
If you have no write access to the input ntuples directory and would like to collect all VDS files in an accessible directory instead, set `vds_dir` in the global config (see [Global Config](#global-config)).
Each pattern gets its own VDS file named after its DSID directory (e.g. `vds_dir/user.wlai.601589.e8547_..._output_vds.h5`).



### Global Cuts

Expand Down
60 changes: 60 additions & 0 deletions tests/unit/classes/test_components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

from pathlib import Path

# import pytest
from ftag import Cuts, Flavours, Sample
from ftag.mock import get_mock_file

import upp.classes.components as components_module
from upp.classes.components import Component
from upp.classes.region import Region


def make_component(tmp_path: Path, vds_dir: Path | None) -> Component:
    """Create a small b-jet ``Component`` for the reader tests.

    The sample pattern is the first element returned by ``get_mock_file``,
    and both the region cuts and the global cuts are empty.

    Parameters
    ----------
    tmp_path : Path
        Base directory; the component's ``dirname`` is placed below it.
    vds_dir : Path | None
        Value handed to the ``Sample`` as its ``vds_dir``.

    Returns
    -------
    Component
        The freshly constructed component.
    """
    mock_fname = get_mock_file()[0]
    test_sample = Sample(pattern=mock_fname, name="test", vds_dir=vds_dir)

    # Collect the constructor arguments first so the call itself stays short.
    component_kwargs = {
        "region": Region(name="lowpt", cuts=Cuts.empty()),
        "sample": test_sample,
        "flavour": Flavours.bjets,
        "global_cuts": Cuts.empty(),
        "dirname": tmp_path / "components" / "sub",
        "num_jets": 100,
        "num_jets_estimate_available": 0,
        "equal_jets": True,
    }
    return Component(**component_kwargs)


def test_setup_reader_passes_vds_dir(tmp_path, monkeypatch):
    """Check that the sample's vds_dir reaches the H5Reader constructor."""
    expected_dir = tmp_path / "vds"
    component = make_component(tmp_path, vds_dir=expected_dir)

    reader_kwargs = {}

    class _RecordingReader:
        """Stand-in for H5Reader that records its keyword arguments."""

        def __init__(self, **kwargs):
            reader_kwargs.update(kwargs)

    # Replace the reader class that the components module looks up by name.
    monkeypatch.setattr(components_module, "H5Reader", _RecordingReader)
    component.setup_reader(batch_size=100)

    assert "vds_dir" in reader_kwargs
    assert reader_kwargs["vds_dir"] == expected_dir


def test_setup_reader_no_vds_dir(tmp_path, monkeypatch):
    """Check that no vds_dir keyword is passed when the sample has none."""
    component = make_component(tmp_path, vds_dir=None)

    reader_kwargs = {}

    class _RecordingReader:
        """Stand-in for H5Reader that records its keyword arguments."""

        def __init__(self, **kwargs):
            reader_kwargs.update(kwargs)

    # Replace the reader class that the components module looks up by name.
    monkeypatch.setattr(components_module, "H5Reader", _RecordingReader)
    component.setup_reader(batch_size=100)

    assert "vds_dir" not in reader_kwargs
4 changes: 4 additions & 0 deletions upp/classes/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ def setup_reader(
if fname is None:
fname = self.sample.path

if "vds_dir" not in kwargs and self.sample.vds_dir is not None:
kwargs["vds_dir"] = self.sample.vds_dir

self.reader = H5Reader(
fname=fname,
batch_size=batch_size,
Expand Down Expand Up @@ -337,6 +340,7 @@ def from_config(cls, config: PreprocessingConfig) -> Components:
ntuple_dir=config.ntuple_dir,
name=component["sample"]["name"],
skip_checks=config.skip_checks,
vds_dir=config.vds_dir,
)

# Create the Component instances for the different flavours
Expand Down
4 changes: 4 additions & 0 deletions upp/classes/preprocessing_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ class PreprocessingConfig:
Skip checks for the input files. This is used for grid submission
skip_config_copy : bool, optional
Decide, if the config copying is skipped or not. By default False
vds_dir : Path | None, optional
Directory name for creation of virtual datasets. By default None
If none is given, virtual datasets are created next to the input ntuples
"""

config_path: Path
Expand All @@ -142,6 +145,7 @@ class PreprocessingConfig:
num_jets_per_output_file: int | None = None
skip_checks: bool = False
skip_config_copy: bool = False
vds_dir: Path | None = None

def __post_init__(self):
# postprocess paths
Expand Down
1 change: 1 addition & 0 deletions upp/stages/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def plot_resampling_dists(config: PreprocessingConfig, stage: str) -> None:
jets_name=config.jets_name,
shuffle=False,
equal_jets=True,
vds_dir=config.vds_dir,
).load(
{config.jets_name: vars_to_load},
num_jets=config.num_jets_estimate_plotting,
Expand Down
1 change: 1 addition & 0 deletions upp/stages/resampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ def run_on_region(
jets_name=self.jets_name,
equal_jets=equal_jets_flag,
transform=self.transform,
vds_dir=sample.vds_dir,
)

# Define a stream of jets with the cuts for the region and the variables used
Expand Down
1 change: 1 addition & 0 deletions upp/utils/check_input_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ def run_input_sample_check(
fname=config.ntuple_dir / sample,
batch_size=config.batch_size,
jets_name=config.jets_name,
vds_dir=getattr(config, "vds_dir", None),
).num_jets

# Drop the pattern
Expand Down