From 08ec2462ceb63ad8945c7acc29ea3d07b69f7ad4 Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 3 Feb 2026 11:33:43 +0000 Subject: [PATCH 1/9] add vds_dir in config --- upp/classes/preprocessing_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/upp/classes/preprocessing_config.py b/upp/classes/preprocessing_config.py index 2c279f08..ea095290 100644 --- a/upp/classes/preprocessing_config.py +++ b/upp/classes/preprocessing_config.py @@ -142,6 +142,7 @@ class PreprocessingConfig: num_jets_per_output_file: int | None = None skip_checks: bool = False skip_config_copy: bool = False + vds_dir: Path | None = None def __post_init__(self): # postprocess paths From 9f40104e2fec441e88f304d0b5893f8eed15ccc6 Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 3 Feb 2026 11:34:01 +0000 Subject: [PATCH 2/9] linting --- upp/classes/preprocessing_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/upp/classes/preprocessing_config.py b/upp/classes/preprocessing_config.py index ea095290..59ac2fbe 100644 --- a/upp/classes/preprocessing_config.py +++ b/upp/classes/preprocessing_config.py @@ -119,6 +119,9 @@ class PreprocessingConfig: Skip checks for the input files. This is used for grid submission skip_config_copy : bool, optional Decide, if the config copying is skipped or not. By default False + vds_dir : Path | None, optional + Directory name for creation of virtual datasets. 
By default None + If none is given, virtual datasets are created next to input ntuples """ config_path: Path From 50892f74f818f091040a93b48717b2f1f1568fa1 Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 3 Feb 2026 11:36:03 +0000 Subject: [PATCH 3/9] add vds_dir for all stages --- upp/classes/components.py | 4 ++++ upp/stages/plot.py | 1 + upp/stages/resampling.py | 1 + upp/utils/check_input_samples.py | 1 + 4 files changed, 7 insertions(+) diff --git a/upp/classes/components.py b/upp/classes/components.py index babd90a4..6ddacb8d 100644 --- a/upp/classes/components.py +++ b/upp/classes/components.py @@ -85,6 +85,9 @@ def setup_reader( """ if fname is None: fname = self.sample.path + + if "vds_dir" not in kwargs and self.sample.vds_dir is not None: + kwargs["vds_dir"] = self.sample.vds_dir self.reader = H5Reader( fname=fname, @@ -337,6 +340,7 @@ def from_config(cls, config: PreprocessingConfig) -> Components: ntuple_dir=config.ntuple_dir, name=component["sample"]["name"], skip_checks=config.skip_checks, + vds_dir=config.vds_dir, ) # Create the Component instances for the different flavours diff --git a/upp/stages/plot.py b/upp/stages/plot.py index f5d5e7d9..aa2be720 100644 --- a/upp/stages/plot.py +++ b/upp/stages/plot.py @@ -178,6 +178,7 @@ def plot_resampling_dists(config: PreprocessingConfig, stage: str) -> None: jets_name=config.jets_name, shuffle=False, equal_jets=True, + vds_dir=config.vds_dir ).load( {config.jets_name: vars_to_load}, num_jets=config.num_jets_estimate_plotting, diff --git a/upp/stages/resampling.py b/upp/stages/resampling.py index ff0f243c..b0aa67af 100644 --- a/upp/stages/resampling.py +++ b/upp/stages/resampling.py @@ -332,6 +332,7 @@ def run_on_region( jets_name=self.jets_name, equal_jets=equal_jets_flag, transform=self.transform, + vds_dir=sample.vds_dir, ) # Define a stream of jets with the cuts for the region and the variables used diff --git a/upp/utils/check_input_samples.py b/upp/utils/check_input_samples.py index 2d43ecfa..ba6230a3 
100644 --- a/upp/utils/check_input_samples.py +++ b/upp/utils/check_input_samples.py @@ -211,6 +211,7 @@ def run_input_sample_check( fname=config.ntuple_dir / sample, batch_size=config.batch_size, jets_name=config.jets_name, + vds_dir=config.vds_dir, ).num_jets # Drop the pattern From ee7974b126c7c32a7e9bd55556418d9075bfd9b4 Mon Sep 17 00:00:00 2001 From: Wei Lai Date: Tue, 24 Feb 2026 10:58:59 +0000 Subject: [PATCH 4/9] linting --- upp/classes/components.py | 2 +- upp/stages/plot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/upp/classes/components.py b/upp/classes/components.py index 6ddacb8d..89638627 100644 --- a/upp/classes/components.py +++ b/upp/classes/components.py @@ -85,7 +85,7 @@ def setup_reader( """ if fname is None: fname = self.sample.path - + if "vds_dir" not in kwargs and self.sample.vds_dir is not None: kwargs["vds_dir"] = self.sample.vds_dir diff --git a/upp/stages/plot.py b/upp/stages/plot.py index aa2be720..f66b7836 100644 --- a/upp/stages/plot.py +++ b/upp/stages/plot.py @@ -178,7 +178,7 @@ def plot_resampling_dists(config: PreprocessingConfig, stage: str) -> None: jets_name=config.jets_name, shuffle=False, equal_jets=True, - vds_dir=config.vds_dir + vds_dir=config.vds_dir, ).load( {config.jets_name: vars_to_load}, num_jets=config.num_jets_estimate_plotting, From cb327fc1b737c31d3d10d9c23dac3fdaf4687a82 Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 24 Feb 2026 11:07:00 +0000 Subject: [PATCH 5/9] add documentation --- docs/configuration.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index dc85c1fe..445fe2d0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -40,6 +40,11 @@ Below is an example and a table explaining each setting. |`pattern`|`str` or `list[str]`| A single pattern or a list of pattern that match h5 files in a downloaded dataset. H5 files matching each pattern will be transparently merged using virtual datasets. 
| *Required* | |`equal_jets`|`bool`| Only relevant when providing a list of patterns. If `True`, the same number of jets from each DSID are selected. This is required for e.g. in Xbb QCD where each DSID belongs to a different slice, and the resampling would break if you tried to resample with one or more slices missing. If `False` this is not enforced, allowing for larger numbers of available jets. | `True` | +The virtual dataset files created from wildcard patterns are by default stored alongside the input ntuples. +If you have no write access to the input ntuples directory and would like to collect all VDS files in an accessible directory instead, set `vds_dir` in the global config (see [Global Config](#global-config)). +Each pattern gets its own VDS file named after its DSID directory (e.g. `vds_dir/user.wlai.601589.e8547_..._output_vds.h5`). + + ### Global Cuts From dd4391f6f3d20cc3703d9942704b62c706566ba0 Mon Sep 17 00:00:00 2001 From: Wei Lai Date: Tue, 24 Feb 2026 11:14:33 +0000 Subject: [PATCH 6/9] bug fix --- upp/utils/check_input_samples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/upp/utils/check_input_samples.py b/upp/utils/check_input_samples.py index ba6230a3..100e1dc2 100644 --- a/upp/utils/check_input_samples.py +++ b/upp/utils/check_input_samples.py @@ -211,7 +211,7 @@ def run_input_sample_check( fname=config.ntuple_dir / sample, batch_size=config.batch_size, jets_name=config.jets_name, - vds_dir=config.vds_dir, + vds_dir=getattr(config, "vds_dir", None), ).num_jets # Drop the pattern From edaebf1a775da2ea0bd97663632e6fb4493311a5 Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 24 Feb 2026 11:19:55 +0000 Subject: [PATCH 7/9] changelog --- changelog.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/changelog.md b/changelog.md index 18996fa5..ecc358c6 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,8 @@ ### [Latest] +- Allow configurable virtual dataset creation directory 
[#117](https://github.com/umami-hep/umami-preprocessing/pull/117) + - Add campaign year reweighting/resampling to Xbb configs [#118](https://github.com/umami-hep/umami-preprocessing/pull/118) ### [v0.3.0](https://github.com/umami-hep/umami-preprocessing/releases/tag/v0.3.0) (23.02.2026) From 78f028d342da17c35b447e50b15b4ae5d550572a Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 3 Mar 2026 10:12:45 +0000 Subject: [PATCH 8/9] adding unit test for components --- tests/unit/classes/test_components.py | 58 +++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/unit/classes/test_components.py diff --git a/tests/unit/classes/test_components.py b/tests/unit/classes/test_components.py new file mode 100644 index 00000000..b5bbd198 --- /dev/null +++ b/tests/unit/classes/test_components.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from pathlib import Path + +from ftag import Cuts, Flavours, Sample +from ftag.mock import get_mock_file + +from upp.classes.components import Component +from upp.classes.region import Region + + +def make_component(tmp_path: Path, vds_dir: Path | None) -> Component: + fname = get_mock_file()[0] + sample = Sample(pattern=fname, name="test", vds_dir=vds_dir) + return Component( + region=Region(name="lowpt", cuts=Cuts.empty()), + sample=sample, + flavour=Flavours.bjets, + global_cuts=Cuts.empty(), + dirname=tmp_path / "components" / "sub", + num_jets=100, + num_jets_estimate_available=0, + equal_jets=True, + ) + + +def test_setup_reader_passes_vds_dir(tmp_path, monkeypatch): + """setup_reader forwards vds_dir from sample to H5Reader when set.""" + vds_dir = tmp_path / "vds" + comp = make_component(tmp_path, vds_dir=vds_dir) + + captured = {} + + class _H5Reader: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr("upp.classes.components.H5Reader", _H5Reader) + comp.setup_reader(batch_size=100) + + assert "vds_dir" in captured + assert captured["vds_dir"] == vds_dir + + +def 
test_setup_reader_no_vds_dir(tmp_path, monkeypatch): + """setup_reader does not inject vds_dir when sample.vds_dir is None.""" + comp = make_component(tmp_path, vds_dir=None) + + captured = {} + + class _H5Reader: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr("upp.classes.components.H5Reader", _H5Reader) + comp.setup_reader(batch_size=100) + + assert "vds_dir" not in captured From 2d2d767e2f98e7a48c0daececb75f8c698647ef5 Mon Sep 17 00:00:00 2001 From: Wei Date: Tue, 3 Mar 2026 10:53:42 +0000 Subject: [PATCH 9/9] bug fix --- tests/unit/classes/test_components.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/classes/test_components.py b/tests/unit/classes/test_components.py index b5bbd198..cd885f2f 100644 --- a/tests/unit/classes/test_components.py +++ b/tests/unit/classes/test_components.py @@ -2,9 +2,11 @@ from pathlib import Path +# import pytest from ftag import Cuts, Flavours, Sample from ftag.mock import get_mock_file +import upp.classes.components as components_module from upp.classes.components import Component from upp.classes.region import Region @@ -35,7 +37,7 @@ class _H5Reader: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr("upp.classes.components.H5Reader", _H5Reader) + monkeypatch.setattr(components_module, "H5Reader", _H5Reader) comp.setup_reader(batch_size=100) assert "vds_dir" in captured @@ -52,7 +54,7 @@ class _H5Reader: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr("upp.classes.components.H5Reader", _H5Reader) + monkeypatch.setattr(components_module, "H5Reader", _H5Reader) comp.setup_reader(batch_size=100) assert "vds_dir" not in captured