diff --git a/changelog.md b/changelog.md index 18996fa5..ecc358c6 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,8 @@ ### [Latest] +- Allow configurable virtual dataset creation directory [#117](https://github.com/umami-hep/umami-preprocessing/pull/117) + - Add campaign year reweighting/resampling to Xbb configs [#118](https://github.com/umami-hep/umami-preprocessing/pull/118) ### [v0.3.0](https://github.com/umami-hep/umami-preprocessing/releases/tag/v0.3.0) (23.02.2026) diff --git a/docs/configuration.md b/docs/configuration.md index dc85c1fe..445fe2d0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -40,6 +40,11 @@ Below is an example and a table explaining each setting. |`pattern`|`str` or `list[str]`| A single pattern or a list of pattern that match h5 files in a downloaded dataset. H5 files matching each pattern will be transparently merged using virtual datasets. | *Required* | |`equal_jets`|`bool`| Only relevant when providing a list of patterns. If `True`, the same number of jets from each DSID are selected. This is required for e.g. in Xbb QCD where each DSID belongs to a different slice, and the resampling would break if you tried to resample with one or more slices missing. If `False` this is not enforced, allowing for larger numbers of available jets. | `True` | +The virtual dataset files created from wildcard patterns are by default stored alongside the input ntuples. +If you have no write access to the input ntuples directory and would like to collect all VDS files in an accessible directory instead, set `vds_dir` in the global config (see [Global Config](#global-config)). +Each pattern gets its own VDS file named after its DSID directory (e.g. `vds_dir/user.wlai.601589.e8547_..._output_vds.h5`). 
+ + ### Global Cuts diff --git a/tests/unit/classes/test_components.py b/tests/unit/classes/test_components.py new file mode 100644 index 00000000..cd885f2f --- /dev/null +++ b/tests/unit/classes/test_components.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from pathlib import Path + +# import pytest +from ftag import Cuts, Flavours, Sample +from ftag.mock import get_mock_file + +import upp.classes.components as components_module +from upp.classes.components import Component +from upp.classes.region import Region + + +def make_component(tmp_path: Path, vds_dir: Path | None) -> Component: + fname = get_mock_file()[0] + sample = Sample(pattern=fname, name="test", vds_dir=vds_dir) + return Component( + region=Region(name="lowpt", cuts=Cuts.empty()), + sample=sample, + flavour=Flavours.bjets, + global_cuts=Cuts.empty(), + dirname=tmp_path / "components" / "sub", + num_jets=100, + num_jets_estimate_available=0, + equal_jets=True, + ) + + +def test_setup_reader_passes_vds_dir(tmp_path, monkeypatch): + """setup_reader forwards vds_dir from sample to H5Reader when set.""" + vds_dir = tmp_path / "vds" + comp = make_component(tmp_path, vds_dir=vds_dir) + + captured = {} + + class _H5Reader: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(components_module, "H5Reader", _H5Reader) + comp.setup_reader(batch_size=100) + + assert "vds_dir" in captured + assert captured["vds_dir"] == vds_dir + + +def test_setup_reader_no_vds_dir(tmp_path, monkeypatch): + """setup_reader does not inject vds_dir when sample.vds_dir is None.""" + comp = make_component(tmp_path, vds_dir=None) + + captured = {} + + class _H5Reader: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(components_module, "H5Reader", _H5Reader) + comp.setup_reader(batch_size=100) + + assert "vds_dir" not in captured diff --git a/upp/classes/components.py b/upp/classes/components.py index babd90a4..89638627 100644 --- a/upp/classes/components.py 
+++ b/upp/classes/components.py @@ -86,6 +86,9 @@ def setup_reader( if fname is None: fname = self.sample.path + if "vds_dir" not in kwargs and self.sample.vds_dir is not None: + kwargs["vds_dir"] = self.sample.vds_dir + self.reader = H5Reader( fname=fname, batch_size=batch_size, @@ -337,6 +340,7 @@ def from_config(cls, config: PreprocessingConfig) -> Components: ntuple_dir=config.ntuple_dir, name=component["sample"]["name"], skip_checks=config.skip_checks, + vds_dir=config.vds_dir, ) # Create the Component instances for the different flavours diff --git a/upp/classes/preprocessing_config.py b/upp/classes/preprocessing_config.py index 2c279f08..59ac2fbe 100644 --- a/upp/classes/preprocessing_config.py +++ b/upp/classes/preprocessing_config.py @@ -119,6 +119,9 @@ class PreprocessingConfig: Skip checks for the input files. This is used for grid submission skip_config_copy : bool, optional Decide, if the config copying is skipped or not. By default False + vds_dir : Path | None, optional + Directory name for creation of virtual datasets. 
By default None + If None is given, virtual datasets are created next to input ntuples """ config_path: Path @@ -142,6 +145,7 @@ class PreprocessingConfig: num_jets_per_output_file: int | None = None skip_checks: bool = False skip_config_copy: bool = False + vds_dir: Path | None = None def __post_init__(self): # postprocess paths diff --git a/upp/stages/plot.py b/upp/stages/plot.py index f5d5e7d9..f66b7836 100644 --- a/upp/stages/plot.py +++ b/upp/stages/plot.py @@ -178,6 +178,7 @@ def plot_resampling_dists(config: PreprocessingConfig, stage: str) -> None: jets_name=config.jets_name, shuffle=False, equal_jets=True, + vds_dir=config.vds_dir, ).load( {config.jets_name: vars_to_load}, num_jets=config.num_jets_estimate_plotting, diff --git a/upp/stages/resampling.py b/upp/stages/resampling.py index ff0f243c..b0aa67af 100644 --- a/upp/stages/resampling.py +++ b/upp/stages/resampling.py @@ -332,6 +332,7 @@ def run_on_region( jets_name=self.jets_name, equal_jets=equal_jets_flag, transform=self.transform, + vds_dir=sample.vds_dir, ) # Define a stream of jets with the cuts for the region and the variables used diff --git a/upp/utils/check_input_samples.py b/upp/utils/check_input_samples.py index 2d43ecfa..100e1dc2 100644 --- a/upp/utils/check_input_samples.py +++ b/upp/utils/check_input_samples.py @@ -211,6 +211,7 @@ def run_input_sample_check( fname=config.ntuple_dir / sample, batch_size=config.batch_size, jets_name=config.jets_name, + vds_dir=getattr(config, "vds_dir", None), ).num_jets # Drop the pattern