From 4ca68cb6e9f0ecaf5173fbbfc270d6afa4c028bf Mon Sep 17 00:00:00 2001 From: Maarten De Coen Date: Wed, 3 Apr 2024 15:52:02 +0200 Subject: [PATCH 001/119] Update README.md to create the example analysis from the Ghent group's repository --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e5e6fb4ce..524eb30e1 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ To create an analysis using columnflow, it is recommended to start from a predef The following command (no previous git clone required) interactively asks for a handful of names and settings, and creates a minimal, yet fully functioning project structure for you! ```shell -bash -c "$(curl -Ls https://raw.githubusercontent.com/columnflow/columnflow/master/create_analysis.sh)" +bash -c "$(curl -Ls https://gitlab.cern.ch/ghentanalysis/columnflowanalysis/-/raw/columnflow/columnflow/master/create_analysis.sh)" ``` At the end of the setup, you will see further instructions and suggestions to run your first analysis tasks (example below). 
From 5fccc742b16c28d4536f4effc77b42c7400ffe04 Mon Sep 17 00:00:00 2001 From: Maarten De Coen Date: Wed, 3 Apr 2024 16:09:45 +0200 Subject: [PATCH 002/119] Update create_analysis.sh to point to ghent git locations --- create_analysis.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/create_analysis.sh b/create_analysis.sh index 0bfd0b8ec..b44bf45fb 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -257,16 +257,16 @@ create_analysis() { local gh_prefix="https://github.com/" - $( str_lc "${cf_use_ssh}" ) && gh_prefix="git@github.com:" + $( str_lc "${cf_use_ssh}" ) && gh_prefix="ssh://git@gitlab.cern.ch:" mkdir -p modules if ${debug}; then ln -s "${this_dir}" modules/columnflow else - git submodule add -b "${fetch_cf_branch}" "${gh_prefix}columnflow/columnflow.git" modules/columnflow + git submodule add -b "${fetch_cf_branch}" "${gh_prefix}7999/ghentanalysis/columnflowanalysis/columnflow/columnflow.git" modules/columnflow fi if [ "${cf_analysis_flavor}" = "cms_minimal" ]; then - git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix}uhh-cms/cmsdb.git" modules/cmsdb + git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix}7999/ghentanalysis/cmsdb.git" modules/cmsdb fi git submodule update --init --recursive From 32750875ff56cdf908f9e209e3dc6784bf994f50 Mon Sep 17 00:00:00 2001 From: Maarten De Coen Date: Wed, 3 Apr 2024 16:14:05 +0200 Subject: [PATCH 003/119] Update create_analysis.sh: error in location of columnflow on ghent gitlab --- create_analysis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_analysis.sh b/create_analysis.sh index b44bf45fb..495f2be35 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -263,7 +263,7 @@ create_analysis() { if ${debug}; then ln -s "${this_dir}" modules/columnflow else - git submodule add -b "${fetch_cf_branch}" "${gh_prefix}7999/ghentanalysis/columnflowanalysis/columnflow/columnflow.git" modules/columnflow + git submodule add -b "${fetch_cf_branch}" 
"${gh_prefix}7999/ghentanalysis/columnflowanalysis/columnflow.git" modules/columnflow fi if [ "${cf_analysis_flavor}" = "cms_minimal" ]; then git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix}7999/ghentanalysis/cmsdb.git" modules/cmsdb From 10f5b358595512bb7f7d3a91cd0c0957e00d8817 Mon Sep 17 00:00:00 2001 From: Maarten De Coen Date: Wed, 3 Apr 2024 16:24:54 +0200 Subject: [PATCH 004/119] Update .gitmodules with absolute git paths for law and order --- .gitmodules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 6dc2212dd..cc55a2ef4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "modules/law"] path = modules/law - url = ../../riga/law.git + url = https://github.com/riga/law.git [submodule "modules/order"] path = modules/order - url = ../../riga/order.git + url = https://github.com/riga/order.git From 8994af87a75678e5a57b461eb1f90a5711f7fc07 Mon Sep 17 00:00:00 2001 From: Maarten De Coen Date: Wed, 3 Apr 2024 16:41:54 +0200 Subject: [PATCH 005/119] Update law.cfg with t2b MC location --- analysis_templates/cms_minimal/law.cfg | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/analysis_templates/cms_minimal/law.cfg b/analysis_templates/cms_minimal/law.cfg index 764843759..eab65d02f 100644 --- a/analysis_templates/cms_minimal/law.cfg +++ b/analysis_templates/cms_minimal/law.cfg @@ -67,11 +67,11 @@ log_array_function_runtime: False [outputs] # list of all used file systems -wlcg_file_systems: wlcg_fs, wlcg_fs_infn_redirector, wlcg_fs_global_redirector +wlcg_file_systems: wlcg_fs_t2b_redirector, wlcg_fs, wlcg_fs_infn_redirector, wlcg_fs_global_redirector # list of file systems used by columnflow.tasks.external.GetDatasetLFNs.iter_nano_files to # look for the correct fs per nano input file (in that order) -lfn_sources: wlcg_fs_infn_redirector, wlcg_fs_global_redirector +lfn_sources: wlcg_fs_t2b_redirector, wlcg_fs_infn_redirector, wlcg_fs_global_redirector 
# output locations per task family # for local targets : "local[, LOCAL_FS_NAME or STORE_PATH]" @@ -101,6 +101,18 @@ remote_lcg_setup: /cvmfs/grid.cern.ch/centos7-ui-200122/etc/profile.d/setup-c7-u base: / +[wlcg_fs_t2b_redirector] + +# set this to your desired location +base: /pnfs/iihe/cms/ph/sc4 +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 15GB +cache_global_lock: True +cache_mtime_patience: -1 + + [wlcg_fs] # set this to your desired location From f0624fcddd3c47300157d1a7c8b7be0a1d6269d6 Mon Sep 17 00:00:00 2001 From: juvanden Date: Wed, 3 Apr 2024 17:10:40 +0200 Subject: [PATCH 006/119] ghent_template added --- analysis_templates/ghent_template/.flake8 | 10 + .../ghent_template/.gitattributes | 5 + analysis_templates/ghent_template/.gitignore | 38 + analysis_templates/ghent_template/LICENSE | 674 ++++++++++++++++++ analysis_templates/ghent_template/README.md | 9 + .../__cf_module_name__/__init__.py | 8 + .../calibration/__init__.py | 1 + .../__cf_module_name__/calibration/example.py | 50 ++ .../categorization/__init__.py | 1 + .../categorization/example.py | 26 + .../__cf_module_name__/columnflow_patches.py | 39 + .../__cf_module_name__/config/__init__.py | 1 + .../config/analysis___cf_short_name_lc__.py | 375 ++++++++++ .../__cf_module_name__/inference/__init__.py | 1 + .../__cf_module_name__/inference/example.py | 112 +++ .../__cf_module_name__/ml/__init__.py | 1 + .../__cf_module_name__/ml/example.py | 93 +++ .../__cf_module_name__/plotting/__init__ | 1 + .../__cf_module_name__/plotting/example.py | 83 +++ .../__cf_module_name__/production/__init__.py | 1 + .../__cf_module_name__/production/example.py | 102 +++ .../__cf_module_name__/selection/__init__.py | 1 + .../__cf_module_name__/selection/example.py | 166 +++++ .../__cf_module_name__/tasks/__init__.py | 5 + .../__cf_module_name__/tasks/base.py | 12 + .../ghent_template/bin/githooks/post-commit | 18 + 
analysis_templates/ghent_template/law.cfg | 134 ++++ .../ghent_template/sandboxes/example.sh | 18 + .../ghent_template/sandboxes/example.txt | 8 + .../ghent_template/sandboxes/example_dev.sh | 18 + analysis_templates/ghent_template/setup.sh | 162 +++++ .../ghent_template/tests/__init__.py | 18 + .../ghent_template/tests/run_all | 43 ++ .../ghent_template/tests/run_linting | 16 + 34 files changed, 2250 insertions(+) create mode 100644 analysis_templates/ghent_template/.flake8 create mode 100644 analysis_templates/ghent_template/.gitattributes create mode 100644 analysis_templates/ghent_template/.gitignore create mode 100644 analysis_templates/ghent_template/LICENSE create mode 100644 analysis_templates/ghent_template/README.md create mode 100644 analysis_templates/ghent_template/__cf_module_name__/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/calibration/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/calibration/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/categorization/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/categorization/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/inference/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/inference/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/ml/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/ml/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/plotting/__init__ create mode 100644 
analysis_templates/ghent_template/__cf_module_name__/plotting/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/selection/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/selection/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/tasks/__init__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/tasks/base.py create mode 100755 analysis_templates/ghent_template/bin/githooks/post-commit create mode 100644 analysis_templates/ghent_template/law.cfg create mode 100644 analysis_templates/ghent_template/sandboxes/example.sh create mode 100644 analysis_templates/ghent_template/sandboxes/example.txt create mode 100644 analysis_templates/ghent_template/sandboxes/example_dev.sh create mode 100644 analysis_templates/ghent_template/setup.sh create mode 100644 analysis_templates/ghent_template/tests/__init__.py create mode 100755 analysis_templates/ghent_template/tests/run_all create mode 100755 analysis_templates/ghent_template/tests/run_linting diff --git a/analysis_templates/ghent_template/.flake8 b/analysis_templates/ghent_template/.flake8 new file mode 100644 index 000000000..b3b274697 --- /dev/null +++ b/analysis_templates/ghent_template/.flake8 @@ -0,0 +1,10 @@ +[flake8] + +# line length of 100 is recommended, but set it to a forgiving value +max-line-length = 120 + +# codes of errors to ignore +ignore = E128, E306, E402, E722, E731, W504, Q003 + +# enforce double quotes +inline-quotes = double diff --git a/analysis_templates/ghent_template/.gitattributes b/analysis_templates/ghent_template/.gitattributes new file mode 100644 index 000000000..0461245fd --- /dev/null +++ b/analysis_templates/ghent_template/.gitattributes @@ -0,0 +1,5 @@ +*.pdf filter=lfs diff=lfs 
merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text +*.jpg filter=lfs diff=lfs merge=lfs -text +*.jpeg filter=lfs diff=lfs merge=lfs -text +*.root filter=lfs diff=lfs merge=lfs -text diff --git a/analysis_templates/ghent_template/.gitignore b/analysis_templates/ghent_template/.gitignore new file mode 100644 index 000000000..e6043753d --- /dev/null +++ b/analysis_templates/ghent_template/.gitignore @@ -0,0 +1,38 @@ +*.sublime-project +*.sublime-workspace +*.pyc +*.log +*.DS_Store +*.egg-info +*.pkl +*.pdf +*.png +*.root +*.npy +*.npz +*.h5 +*.hdf5 +*.json +*.yaml +*.pb +*.out +*.parquet +.env_*.sh +.env_*.sh.tmp +.coverage +coverage*.xml +requirements_user.txt +__pycache__ +dist +build +static +docs/_build +tmp +store +software +data +.data +.law +.setups +.mypy_cache +.vscode diff --git a/analysis_templates/ghent_template/LICENSE b/analysis_templates/ghent_template/LICENSE new file mode 100644 index 000000000..f288702d2 --- /dev/null +++ b/analysis_templates/ghent_template/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md new file mode 100644 index 000000000..ac945c23e --- /dev/null +++ b/analysis_templates/ghent_template/README.md @@ -0,0 +1,9 @@ +# __cf_analysis_name__ Analysis + + +### Resources + +- [columnflow](https://github.com/uhh-cms/columnflow) +- [law](https://github.com/riga/law) +- [order](https://github.com/riga/order) +- [luigi](https://github.com/spotify/luigi) diff --git a/analysis_templates/ghent_template/__cf_module_name__/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/__init__.py new file mode 100644 index 000000000..32763ecb7 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/__init__.py @@ -0,0 +1,8 @@ +# coding: utf-8 + + +from __cf_module_name__.columnflow_patches import patch_all + + +# apply cf patches once +patch_all() diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/calibration/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/example.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/example.py new file mode 100644 index 000000000..227a3ab03 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/calibration/example.py @@ -0,0 +1,50 @@ +# coding: utf-8 + +""" +Exemplary calibration methods. 
+""" + +from columnflow.calibration import Calibrator, calibrator +from columnflow.production.cms.seeds import deterministic_seeds +from columnflow.util import maybe_import +from columnflow.columnar_util import set_ak_column + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@calibrator( + uses={ + deterministic_seeds, + "Jet.pt", "Jet.mass", + }, + produces={ + deterministic_seeds, + "Jet.pt", "Jet.mass", + "Jet.pt_jec_up", "Jet.mass_jec_up", + "Jet.pt_jec_down", "Jet.mass_jec_down", + }, +) +def example(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + # a) "correct" Jet.pt by scaling four momenta by 1.1 (pt<30) or 0.9 (pt>=30) + # b) add 4 new columns faking the effect of JEC variations + + # add deterministic seeds that could (e.g.) be used for smearings + events = self[deterministic_seeds](events, **kwargs) + + # a) + pt_mask = ak.flatten(events.Jet.pt < 30) + n_jet_pt = np.asarray(ak.flatten(events.Jet.pt)) + n_jet_mass = np.asarray(ak.flatten(events.Jet.mass)) + n_jet_pt[pt_mask] *= 1.1 + n_jet_pt[~pt_mask] *= 0.9 + n_jet_mass[pt_mask] *= 1.1 + n_jet_mass[~pt_mask] *= 0.9 + + # b) + events = set_ak_column(events, "Jet.pt_jec_up", events.Jet.pt * 1.05) + events = set_ak_column(events, "Jet.mass_jec_up", events.Jet.mass * 1.05) + events = set_ak_column(events, "Jet.pt_jec_down", events.Jet.pt * 0.95) + events = set_ak_column(events, "Jet.mass_jec_down", events.Jet.mass * 0.95) + + return events diff --git a/analysis_templates/ghent_template/__cf_module_name__/categorization/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/categorization/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/categorization/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/categorization/example.py b/analysis_templates/ghent_template/__cf_module_name__/categorization/example.py new file mode 100644
index 000000000..0ae64c0a0 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/categorization/example.py @@ -0,0 +1,26 @@ +# coding: utf-8 + +""" +Exemplary categorization methods. +""" + +from columnflow.categorization import Categorizer, categorizer +from columnflow.util import maybe_import + +ak = maybe_import("awkward") + + +# +# categorizer functions used by category definitions +# + +@categorizer(uses={"event"}) +def cat_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]: + # fully inclusive selection + return events, ak.ones_like(events.event) == 1 + + +@categorizer(uses={"Jet.pt"}) +def cat_2j(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]: + # two or more jets + return events, ak.num(events.Jet.pt, axis=1) >= 2 diff --git a/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py b/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py new file mode 100644 index 000000000..4a0eba031 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py @@ -0,0 +1,39 @@ +# coding: utf-8 + +""" +Collection of patches of underlying columnflow tasks.
+""" + +import os + +import law +from columnflow.util import memoize + + +logger = law.logger.get_logger(__name__) + + +@memoize +def patch_bundle_repo_exclude_files(): + from columnflow.tasks.framework.remote import BundleRepo + + # get the relative path to CF_BASE + cf_rel = os.path.relpath(os.environ["CF_BASE"], os.environ["__cf_short_name_uc___BASE"]) + + # amend exclude files to start with the relative path to CF_BASE + exclude_files = [os.path.join(cf_rel, path) for path in BundleRepo.exclude_files] + + # add additional files + exclude_files.extend([ + "docs", "tests", "data", "assets", ".law", ".setups", ".data", ".github", + ]) + + # overwrite them + BundleRepo.exclude_files[:] = exclude_files + + logger.debug("patched exclude_files of cf.BundleRepo") + + +@memoize +def patch_all(): + patch_bundle_repo_exclude_files() diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/config/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py new file mode 100644 index 000000000..5767bfba6 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -0,0 +1,375 @@ +# coding: utf-8 + +""" +Configuration of the __cf_analysis_name__ analysis. 
+""" + +import functools + +import law +import order as od +from scinum import Number + +from columnflow.util import DotDict, maybe_import +from columnflow.columnar_util import EMPTY_FLOAT, ColumnCollection +from columnflow.config_util import ( + get_root_processes_from_campaign, add_shift_aliases, get_shifts_from_sources, add_category, + verify_config_processes, +) + +ak = maybe_import("awkward") + + +# +# the main analysis object +# + +analysis___cf_short_name_lc__ = ana = od.Analysis( + name="analysis___cf_short_name_lc__", + id=1, +) + +# analysis-global versions +# (see cfg.x.versions below for more info) +ana.x.versions = {} + +# files of bash sandboxes that might be required by remote tasks +# (used in cf.HTCondorWorkflow) +ana.x.bash_sandboxes = ["$CF_BASE/sandboxes/cf.sh"] +default_sandbox = law.Sandbox.new(law.config.get("analysis", "default_columnar_sandbox")) +if default_sandbox.sandbox_type == "bash" and default_sandbox.name not in ana.x.bash_sandboxes: + ana.x.bash_sandboxes.append(default_sandbox.name) + +# files of cmssw sandboxes that might be required by remote tasks +# (used in cf.HTCondorWorkflow) +ana.x.cmssw_sandboxes = [ + "$CF_BASE/sandboxes/cmssw_default.sh", +] + +# config groups for conveniently looping over certain configs +# (used in wrapper_factory) +ana.x.config_groups = {} + + +# +# setup configs +# + +# an example config is setup below, based on cms NanoAOD v9 for Run2 2017, focusing on +# ttbar and single top MCs, plus single muon data +# update this config or add additional ones to accommodate the needs of your analysis + +from cmsdb.campaigns.run2_2017_nano_v9 import campaign_run2_2017_nano_v9 + +# copy the campaign +# (creates copies of all linked datasets, processes, etc.
to allow for encapsulated customization) +campaign = campaign_run2_2017_nano_v9.copy() + +# get all root processes +procs = get_root_processes_from_campaign(campaign) + +# create a config by passing the campaign, so id and name will be identical +cfg = ana.add_config(campaign) + +# gather campaign data +year = campaign.x.year + +# add processes we are interested in +process_names = [ + "data", + "tt", + "st", +] +for process_name in process_names: + # add the process + proc = cfg.add_process(procs.get(process_name)) + + # configuration of colors, labels, etc. can happen here + if proc.is_mc: + proc.color1 = (244, 182, 66) if proc.name == "tt" else (244, 93, 66) + +# add datasets we need to study +dataset_names = [ + # data + "data_mu_b", + # backgrounds + "tt_sl_powheg", + # signals + "st_tchannel_t_powheg", +] +for dataset_name in dataset_names: + # add the dataset + dataset = cfg.add_dataset(campaign.get_dataset(dataset_name)) + + # for testing purposes, limit the number of files to 2 + for info in dataset.info.values(): + info.n_files = min(info.n_files, 2) + +# verify that the root process of all datasets is part of any of the registered processes +verify_config_processes(cfg, warn=True) + +# default objects, such as calibrator, selector, producer, ml model, inference model, etc +cfg.x.default_calibrator = "example" +cfg.x.default_selector = "example" +cfg.x.default_producer = "example" +cfg.x.default_ml_model = None +cfg.x.default_inference_model = "example" +cfg.x.default_categories = ("incl",) +cfg.x.default_variables = ("n_jet", "jet1_pt") + + +# process groups for conveniently looping over certain processes +# (used in wrapper_factory and during plotting) +cfg.x.process_groups = {} + +# dataset groups for conveniently looping over certain datasets +# (used in wrapper_factory and during plotting) +cfg.x.dataset_groups = {} + +# category groups for conveniently looping over certain categories +# (used during plotting) +cfg.x.category_groups = {} + +# variable
groups for conveniently looping over certain variables +# (used during plotting) +cfg.x.variable_groups = {} + +# shift groups for conveniently looping over certain shifts +# (used during plotting) +cfg.x.shift_groups = {} + +# general_settings groups for conveniently looping over different values for the general-settings parameter +# (used during plotting) +cfg.x.general_settings_groups = {} + +# process_settings groups for conveniently looping over different values for the process-settings parameter +# (used during plotting) +cfg.x.process_settings_groups = {} + +# variable_settings groups for conveniently looping over different values for the variable-settings parameter +# (used during plotting) +cfg.x.variable_settings_groups = {} + +# custom_style_config groups for conveniently looping over certain style configs +# (used during plotting) +cfg.x.custom_style_config_groups = {} + +# selector step groups for conveniently looping over certain steps +# (used in cutflow tasks) +cfg.x.selector_step_groups = { + "default": ["muon", "jet"], +} + +# calibrator groups for conveniently looping over certain calibrators +# (used during calibration) +cfg.x.calibrator_groups = {} + +# producer groups for conveniently looping over certain producers +# (used during the ProduceColumns task) +cfg.x.producer_groups = {} + +# ml_model groups for conveniently looping over certain ml_models +# (used during the machine learning tasks) +cfg.x.ml_model_groups = {} + + +# custom method and sandbox for determining dataset lfns +cfg.x.get_dataset_lfns = None +cfg.x.get_dataset_lfns_sandbox = None + +# whether to validate the number of obtained LFNs in GetDatasetLFNs +# (currently set to false because the number of files per dataset is truncated to 2) +cfg.x.validate_dataset_lfns = False + +# lumi values in inverse pb +# https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun2?rev=2#Combination_and_correlations +cfg.x.luminosity = Number(41480, { + "lumi_13TeV_2017": 0.02j, + 
"lumi_13TeV_1718": 0.006j, + "lumi_13TeV_correlated": 0.009j, +}) + +# names of muon correction sets and working points +# (used in the muon producer) +cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}_UL") + +# register shifts +cfg.add_shift(name="nominal", id=0) + +# tune shifts are covered by dedicated, varied datasets, so tag the shift as "disjoint_from_nominal" +# (this is currently used to decide whether ML evaluations are done on the full shifted dataset) +cfg.add_shift(name="tune_up", id=1, type="shape", tags={"disjoint_from_nominal"}) +cfg.add_shift(name="tune_down", id=2, type="shape", tags={"disjoint_from_nominal"}) + +# fake jet energy correction shift, with aliases flaged as "selection_dependent", i.e. the aliases +# affect columns that might change the output of the event selection +cfg.add_shift(name="jec_up", id=20, type="shape") +cfg.add_shift(name="jec_down", id=21, type="shape") +add_shift_aliases( + cfg, + "jec", + { + "Jet.pt": "Jet.pt_{name}", + "Jet.mass": "Jet.mass_{name}", + "MET.pt": "MET.pt_{name}", + "MET.phi": "MET.phi_{name}", + }, +) + +# event weights due to muon scale factors +cfg.add_shift(name="mu_up", id=10, type="shape") +cfg.add_shift(name="mu_down", id=11, type="shape") +add_shift_aliases(cfg, "mu", {"muon_weight": "muon_weight_{direction}"}) + +# external files +json_mirror = "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c" +cfg.x.external_files = DotDict.wrap({ + # lumi files + "lumi": { + "golden": ("/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions17/13TeV/Legacy_2017/Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt", "v1"), # noqa + "normtag": ("/afs/cern.ch/user/l/lumipro/public/Normtags/normtag_PHYSICS.json", "v1"), + }, + + # muon scale factors + "muon_sf": (f"{json_mirror}/POG/MUO/{year}_UL/muon_Z.json.gz", "v1"), +}) + +# target file size after MergeReducedEvents in MB +cfg.x.reduced_file_size = 512.0 + +# columns to keep after certain steps 
+cfg.x.keep_columns = DotDict.wrap({ + "cf.ReduceEvents": { + # general event info, mandatory for reading files with coffea + ColumnCollection.MANDATORY_COFFEA, # additional columns can be added as strings, similar to object info + # object info + "Jet.pt", "Jet.eta", "Jet.phi", "Jet.mass", "Jet.btagDeepFlavB", "Jet.hadronFlavour", + "Muon.pt", "Muon.eta", "Muon.phi", "Muon.mass", "Muon.pfRelIso04_all", + "MET.pt", "MET.phi", "MET.significance", "MET.covXX", "MET.covXY", "MET.covYY", + "PV.npvs", + # all columns added during selection using a ColumnCollection flag + ColumnCollection.ALL_FROM_SELECTOR, + }, + "cf.MergeSelectionMasks": { + "cutflow.*", + }, + "cf.UniteColumns": { + "*", + }, +}) + +# event weight columns as keys in an OrderedDict, mapped to shift instances they depend on +get_shifts = functools.partial(get_shifts_from_sources, cfg) +cfg.x.event_weights = DotDict({ + "normalization_weight": [], + "muon_weight": get_shifts("mu"), +}) + +# versions per task family, either referring to strings or to callables receiving the invoking +# task instance and parameters to be passed to the task family +cfg.x.versions = { + # "cf.CalibrateEvents": "prod1", + # "cf.SelectEvents": (lambda cls, inst, params: "prod1" if params.get("selector") == "default" else "dev1"), + # ... +} + +# channels +# (just one for now) +cfg.add_channel(name="mutau", id=1) + +# add categories using the "add_category" tool which adds auto-generated ids +# the "selection" entries refer to names of categorizers, e.g. in categorization/example.py +# note: it is recommended to always add an inclusive category with id=1 or name="incl" which is used +# in various places, e.g.
for the inclusive cutflow plots and the "empty" selector +add_category( + cfg, + id=1, + name="incl", + selection="cat_incl", + label="inclusive", +) +add_category( + cfg, + name="2j", + selection="cat_2j", + label="2 jets", +) + +# add variables +# (the "event", "run" and "lumi" variables are required for some cutflow plotting task, +# and also correspond to the minimal set of columns that coffea's nano scheme requires) +cfg.add_variable( + name="event", + expression="event", + binning=(1, 0.0, 1.0e9), + x_title="Event number", + discrete_x=True, +) +cfg.add_variable( + name="run", + expression="run", + binning=(1, 100000.0, 500000.0), + x_title="Run number", + discrete_x=True, +) +cfg.add_variable( + name="lumi", + expression="luminosityBlock", + binning=(1, 0.0, 5000.0), + x_title="Luminosity block", + discrete_x=True, +) +cfg.add_variable( + name="n_jet", + expression="n_jet", + binning=(11, -0.5, 10.5), + x_title="Number of jets", + discrete_x=True, +) +# pt of all jets in every event +cfg.add_variable( + name="jets_pt", + expression="Jet.pt", + binning=(40, 0.0, 400.0), + unit="GeV", + x_title=r"$p_{T} of all jets$", +) +# pt of the first jet in every event +cfg.add_variable( + name="jet1_pt", # variable name, to be given to the "--variables" argument for the plotting task + expression="Jet.pt[:,0]", # content of the variable + null_value=EMPTY_FLOAT, # value to be given if content not available for event + binning=(40, 0.0, 400.0), # (bins, lower edge, upper edge) + unit="GeV", # unit of the variable, if any + x_title=r"Jet 1 $p_{T}$", # x title of histogram when plotted +) +# eta of the first jet in every event +cfg.add_variable( + name="jet1_eta", + expression="Jet.eta[:,0]", + null_value=EMPTY_FLOAT, + binning=(30, -3.0, 3.0), + x_title=r"Jet 1 $\eta$", +) +cfg.add_variable( + name="ht", + expression=lambda events: ak.sum(events.Jet.pt, axis=1), + binning=(40, 0.0, 800.0), + unit="GeV", + x_title="HT", +) +# weights +cfg.add_variable( + name="mc_weight", 
+ expression="mc_weight", + binning=(200, -10, 10), + x_title="MC weight", +) +# cutflow variables +cfg.add_variable( + name="cf_jet1_pt", + expression="cutflow.jet1_pt", + binning=(40, 0.0, 400.0), + unit="GeV", + x_title=r"Jet 1 $p_{T}$", +) diff --git a/analysis_templates/ghent_template/__cf_module_name__/inference/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/inference/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/inference/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/inference/example.py b/analysis_templates/ghent_template/__cf_module_name__/inference/example.py new file mode 100644 index 000000000..0eec2bb30 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/inference/example.py @@ -0,0 +1,112 @@ +# coding: utf-8 + +""" +Example inference model. +""" + +from columnflow.inference import inference_model, ParameterType, ParameterTransformation + + +@inference_model +def example(self): + + # + # categories + # + + self.add_category( + "cat1", + config_category="incl", + config_variable="jet1_pt", + config_data_datasets=["data_mu_b"], + mc_stats=True, + ) + self.add_category( + "cat2", + config_category="2j", + config_variable="jet1_eta", + # fake data from TT + data_from_processes=["TT"], + mc_stats=True, + ) + + # + # processes + # + + self.add_process( + "ST", + is_signal=True, + config_process="st", + config_mc_datasets=["st_tchannel_t_powheg"], + ) + self.add_process( + "TT", + config_process="tt", + config_mc_datasets=["tt_sl_powheg"], + ) + + # + # parameters + # + + # groups + self.add_parameter_group("experiment") + self.add_parameter_group("theory") + + # lumi + lumi = self.config_inst.x.luminosity + for unc_name in lumi.uncertainties: + self.add_parameter( + unc_name, + type=ParameterType.rate_gauss, + effect=lumi.get(names=unc_name, direction=("down", "up"), 
factor=True), + transformations=[ParameterTransformation.symmetrize], + ) + + # tune uncertainty + self.add_parameter( + "tune", + process="TT", + type=ParameterType.shape, + config_shift_source="tune", + ) + + # muon weight uncertainty + self.add_parameter( + "mu", + process=["ST", "TT"], + type=ParameterType.shape, + config_shift_source="mu", + ) + + # jet energy correction uncertainty + self.add_parameter( + "jec", + process=["ST", "TT"], + type=ParameterType.shape, + config_shift_source="jec", + ) + + # a custom asymmetric uncertainty that is converted from rate to shape + self.add_parameter( + "QCDscale_ttbar", + process="TT", + type=ParameterType.shape, + transformations=[ParameterTransformation.effect_from_rate], + effect=(0.5, 1.1), + ) + + +@inference_model +def example_no_shapes(self): + # same initialization as "example" above + example.init_func.__get__(self, self.__class__)() + + # + # remove all shape parameters + # + + for category_name, process_name, parameter in self.iter_parameters(): + if parameter.type.is_shape or any(trafo.from_shape for trafo in parameter.transformations): + self.remove_parameter(parameter.name, process=process_name, category=category_name) diff --git a/analysis_templates/ghent_template/__cf_module_name__/ml/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/ml/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/ml/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/ml/example.py b/analysis_templates/ghent_template/__cf_module_name__/ml/example.py new file mode 100644 index 000000000..48e7e936c --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/ml/example.py @@ -0,0 +1,93 @@ +# coding: utf-8 + +""" +Test model definition. 
+""" + +from __future__ import annotations + +import law +import order as od + +from columnflow.types import Any +from columnflow.ml import MLModel +from columnflow.util import maybe_import, dev_sandbox +from columnflow.columnar_util import Route, set_ak_column + +ak = maybe_import("awkward") +tf = maybe_import("tensorflow") + +law.contrib.load("tensorflow") + + +class ExampleModel(MLModel): + + # mark the model as accepting only a single config + single_config = True + + def setup(self): + # dynamically add variables for the quantities produced by this model + if f"{self.cls_name}.output" not in self.config_inst.variables: + self.config_inst.add_variable( + name=f"{self.cls_name}.output", + null_value=-1, + binning=(20, -1.0, 1.0), + x_title=f"{self.cls_name} DNN output", + ) + + def sandbox(self, task: law.Task) -> str: + return dev_sandbox("bash::$__cf_short_name_uc___BASE/sandboxes/example.sh") + + def datasets(self, config_inst: od.Config) -> set[od.Dataset]: + return { + config_inst.get_dataset("st_tchannel_t_powheg"), + config_inst.get_dataset("tt_sl_powheg"), + } + + def uses(self, config_inst: od.Config) -> set[Route | str]: + return { + "Jet.pt", "Muon.pt", + } + + def produces(self, config_inst: od.Config) -> set[Route | str]: + return { + f"{self.cls_name}.ouptut", + } + + def output(self, task: law.Task) -> law.FileSystemDirectoryTarget: + return task.target(f"mlmodel_f{task.branch}of{self.folds}", dir=True) + + def open_model(self, target: law.FileSystemDirectoryTarget) -> tf.keras.models.Model: + return target.load(formatter="tf_keras_model") + + def train( + self, + task: law.Task, + input: dict[str, list[dict[str, law.FileSystemFileTarget]]], + output: law.FileSystemDirectoryTarget, + ) -> None: + # define a dummy NN + x = tf.keras.Input(shape=(2,)) + a1 = tf.keras.layers.Dense(10, activation="elu")(x) + y = tf.keras.layers.Dense(2, activation="softmax")(a1) + model = tf.keras.Model(inputs=x, outputs=y) + + # the output is just a single directory 
target + output.dump(model, formatter="tf_keras_model") + + def evaluate( + self, + task: law.Task, + events: ak.Array, + models: list[Any], + fold_indices: ak.Array, + events_used_in_training: bool = False, + ) -> ak.Array: + # fake evaluation + events = set_ak_column(events, f"{self.cls_name}.output", 0.5) + + return events + + +# usable derivations +example = ExampleModel.derive("example", cls_dict={"folds": 2}) diff --git a/analysis_templates/ghent_template/__cf_module_name__/plotting/__init__ b/analysis_templates/ghent_template/__cf_module_name__/plotting/__init__ new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/plotting/__init__ @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/plotting/example.py b/analysis_templates/ghent_template/__cf_module_name__/plotting/example.py new file mode 100644 index 000000000..943d3ce33 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/plotting/example.py @@ -0,0 +1,83 @@ +# coding: utf-8 + +""" +Examples for custom plot functions. 
+""" + +from __future__ import annotations + +from collections import OrderedDict + +from columnflow.util import maybe_import +from columnflow.plotting.plot_util import ( + remove_residual_axis, + apply_variable_settings, + apply_process_settings, +) + +hist = maybe_import("hist") +np = maybe_import("numpy") +mpl = maybe_import("matplotlib") +plt = maybe_import("matplotlib.pyplot") +mplhep = maybe_import("mplhep") +od = maybe_import("order") + + +def my_plot1d_func( + hists: OrderedDict[od.Process, hist.Hist], + config_inst: od.Config, + category_inst: od.Category, + variable_insts: list[od.Variable], + style_config: dict | None = None, + yscale: str | None = "", + process_settings: dict | None = None, + variable_settings: dict | None = None, + example_param: str | float | bool | None = None, + **kwargs, +) -> tuple(plt.Figure, tuple(plt.Axis,)): + """ + This is an exemplary custom plotting function. + + Exemplary task call: + + .. code-block:: bash + law run cf.PlotVariables1D --version v1 --processes st,tt --variables jet1_pt \ + --plot-function __cf_module_name__.plotting.example.my_plot1d_func \ + --general-settings example_param=some_text + """ + # we can add arbitrary parameters via the `general_settings` parameter to access them in the + # plotting function. 
They are automatically parsed either to a bool, float, or string + print(f"The example_param has been set to '{example_param}' (type: {type(example_param)})") + + # call helper function to remove shift axis from histogram + remove_residual_axis(hists, "shift") + + # call helper functions to apply the variable_settings and process_settings + variable_inst = variable_insts[0] + hists = apply_variable_settings(hists, variable_insts, variable_settings) + hists = apply_process_settings(hists, process_settings) + + # use the mplhep CMS stype + plt.style.use(mplhep.style.CMS) + + # create a figure and fill it with content + fig, ax = plt.subplots() + for proc_inst, h in hists.items(): + h.plot1d( + ax=ax, + label=proc_inst.label, + color=proc_inst.color1, + ) + + # styling and parameter implementation (e.g. `yscale`) + ax.set( + yscale=yscale, + ylabel=variable_inst.get_full_y_title(), + xlabel=variable_inst.get_full_x_title(), + xscale="log" if variable_inst.log_x else "linear", + ) + ax.legend() + mplhep.cms.label(ax=ax, fontsize=22, llabel="private work") + + # task expects a figure and a tuple of axes as output + return fig, (ax,) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/production/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/production/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/example.py b/analysis_templates/ghent_template/__cf_module_name__/production/example.py new file mode 100644 index 000000000..b190ac04f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/production/example.py @@ -0,0 +1,102 @@ +# coding: utf-8 + +""" +Column production methods related to higher-level features. 
+""" + + +from columnflow.production import Producer, producer +from columnflow.production.categories import category_ids +from columnflow.production.normalization import normalization_weights +from columnflow.production.cms.seeds import deterministic_seeds +from columnflow.production.cms.mc_weight import mc_weight +from columnflow.production.cms.muon import muon_weights +from columnflow.selection.util import create_collections_from_masks +from columnflow.util import maybe_import +from columnflow.columnar_util import EMPTY_FLOAT, Route, set_ak_column + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@producer( + uses={ + # nano columns + "Jet.pt", + }, + produces={ + # new columns + "ht", "n_jet", + }, +) +def features(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + events = set_ak_column(events, "ht", ak.sum(events.Jet.pt, axis=1)) + events = set_ak_column(events, "n_jet", ak.num(events.Jet.pt, axis=1), value_type=np.int32) + + return events + + +@producer( + uses={ + mc_weight, category_ids, + # nano columns + "Jet.pt", + }, + produces={ + mc_weight, category_ids, + # new columns + "cutflow.jet1_pt", + }, +) +def cutflow_features( + self: Producer, + events: ak.Array, + object_masks: dict[str, dict[str, ak.Array]], + **kwargs, +) -> ak.Array: + if self.dataset_inst.is_mc: + events = self[mc_weight](events, **kwargs) + + # apply object masks and create new collections + reduced_events = create_collections_from_masks(events, object_masks) + + # create category ids per event and add categories back to the + events = self[category_ids](reduced_events, target_events=events, **kwargs) + + # add cutflow columns + events = set_ak_column( + events, + "cutflow.jet1_pt", + Route("Jet.pt[:,0]").apply(events, EMPTY_FLOAT), + ) + + return events + + +@producer( + uses={ + features, category_ids, normalization_weights, muon_weights, deterministic_seeds, + }, + produces={ + features, category_ids, normalization_weights, muon_weights, 
deterministic_seeds, + }, +) +def example(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + # features + events = self[features](events, **kwargs) + + # category ids + events = self[category_ids](events, **kwargs) + + # deterministic seeds + events = self[deterministic_seeds](events, **kwargs) + + # mc-only weights + if self.dataset_inst.is_mc: + # normalization weights + events = self[normalization_weights](events, **kwargs) + + # muon weights + events = self[muon_weights](events, **kwargs) + + return events diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/selection/__init__.py new file mode 100644 index 000000000..57d631c3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/__init__.py @@ -0,0 +1 @@ +# coding: utf-8 diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/example.py b/analysis_templates/ghent_template/__cf_module_name__/selection/example.py new file mode 100644 index 000000000..60e8041e7 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/example.py @@ -0,0 +1,166 @@ +# coding: utf-8 + +""" +Exemplary selection methods. 
+""" + +from collections import defaultdict + +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.selection.stats import increment_stats +from columnflow.selection.util import sorted_indices_from_mask +from columnflow.production.processes import process_ids +from columnflow.production.cms.mc_weight import mc_weight +from columnflow.util import maybe_import + +from __cf_module_name__.production.example import cutflow_features + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +# +# other unexposed selectors +# (not selectable from the command line but used by other, exposed selectors) +# + + +@selector( + uses={"Muon.pt", "Muon.eta"}, +) +def muon_selection( + self: Selector, + events: ak.Array, + **kwargs, +) -> tuple[ak.Array, SelectionResult]: + # example muon selection: exactly one muon + muon_mask = (events.Muon.pt >= 20.0) & (abs(events.Muon.eta) < 2.1) + muon_sel = ak.sum(muon_mask, axis=1) == 1 + + # build and return selection results + # "objects" maps source columns to new columns and selections to be applied on the old columns + # to create them, e.g. {"Muon": {"MySelectedMuon": indices_applied_to_Muon}} + return events, SelectionResult( + steps={ + "muon": muon_sel, + }, + objects={ + "Muon": { + "Muon": muon_mask, + }, + }, + ) + + +@selector( + uses={"Jet.pt", "Jet.eta"}, +) +def jet_selection( + self: Selector, + events: ak.Array, + **kwargs, +) -> tuple[ak.Array, SelectionResult]: + # example jet selection: at least one jet + jet_mask = (events.Jet.pt >= 25.0) & (abs(events.Jet.eta) < 2.4) + jet_sel = ak.sum(jet_mask, axis=1) >= 1 + + # build and return selection results + # "objects" maps source columns to new columns and selections to be applied on the old columns + # to create them, e.g. 
{"Jet": {"MyCustomJetCollection": indices_applied_to_Jet}} + return events, SelectionResult( + steps={ + "jet": jet_sel, + }, + objects={ + "Jet": { + "Jet": sorted_indices_from_mask(jet_mask, events.Jet.pt, ascending=False), + }, + }, + aux={ + "n_jets": ak.sum(jet_mask, axis=1), + }, + ) + + +# +# exposed selectors +# (those that can be invoked from the command line) +# + +@selector( + uses={ + # selectors / producers called within _this_ selector + mc_weight, cutflow_features, process_ids, muon_selection, jet_selection, + increment_stats, + }, + produces={ + # selectors / producers whose newly created columns should be kept + mc_weight, cutflow_features, process_ids, + }, + exposed=True, +) +def example( + self: Selector, + events: ak.Array, + stats: defaultdict, + **kwargs, +) -> tuple[ak.Array, SelectionResult]: + # prepare the selection results that are updated at every step + results = SelectionResult() + + # muon selection + events, muon_results = self[muon_selection](events, **kwargs) + results += muon_results + + # jet selection + events, jet_results = self[jet_selection](events, **kwargs) + results += jet_results + + # combined event selection after all steps + results.event = results.steps.muon & results.steps.jet + + # create process ids + events = self[process_ids](events, **kwargs) + + # add the mc weight + if self.dataset_inst.is_mc: + events = self[mc_weight](events, **kwargs) + + # add cutflow features, passing per-object masks + events = self[cutflow_features](events, results.objects, **kwargs) + + # increment stats + weight_map = { + "num_events": Ellipsis, + "num_events_selected": results.event, + } + group_map = {} + if self.dataset_inst.is_mc: + weight_map = { + **weight_map, + # mc weight for all events + "sum_mc_weight": (events.mc_weight, Ellipsis), + "sum_mc_weight_selected": (events.mc_weight, results.event), + } + group_map = { + # per process + "process": { + "values": events.process_id, + "mask_fn": (lambda v: events.process_id == v), 
+ }, + # per jet multiplicity + "njet": { + "values": results.x.n_jets, + "mask_fn": (lambda v: results.x.n_jets == v), + }, + } + events, results = self[increment_stats]( + events, + results, + stats, + weight_map=weight_map, + group_map=group_map, + **kwargs, + ) + + return events, results diff --git a/analysis_templates/ghent_template/__cf_module_name__/tasks/__init__.py b/analysis_templates/ghent_template/__cf_module_name__/tasks/__init__.py new file mode 100644 index 000000000..07e64c984 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/tasks/__init__.py @@ -0,0 +1,5 @@ +# coding: utf-8 +# flake8: noqa + +# provisioning imports +import __cf_module_name__.tasks.base diff --git a/analysis_templates/ghent_template/__cf_module_name__/tasks/base.py b/analysis_templates/ghent_template/__cf_module_name__/tasks/base.py new file mode 100644 index 000000000..555e01838 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/tasks/base.py @@ -0,0 +1,12 @@ +# coding: utf-8 + +""" +Custom base tasks. +""" + +from columnflow.tasks.framework.base import BaseTask + + +class __cf_short_name_uc__Task(BaseTask): + + task_namespace = "__cf_short_name_lc__" diff --git a/analysis_templates/ghent_template/bin/githooks/post-commit b/analysis_templates/ghent_template/bin/githooks/post-commit new file mode 100755 index 000000000..95aa6ed7e --- /dev/null +++ b/analysis_templates/ghent_template/bin/githooks/post-commit @@ -0,0 +1,18 @@ +#!/bin/sh + +# Custom post-commit hook that runs the linter. + +action() { + # just lint + if [ ! -d "${__cf_short_name_uc___BASE}" ]; then + >&2 echo "__cf_short_name_uc___BASE not setup, skip linting" + elif [ "${__cf_short_name_uc___SKIP_POST_COMMIT}" != "1" ]; then + echo "post-commit linting ..." 
+ bash "${__cf_short_name_uc___BASE}/tests/run_linting" + echo + fi + + # always end successfully + return "0" +} +action "$@" diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg new file mode 100644 index 000000000..764843759 --- /dev/null +++ b/analysis_templates/ghent_template/law.cfg @@ -0,0 +1,134 @@ +[core] + +# inherit from the columnflow configuration file +inherit: $CF_BASE/law.cfg + + +[modules] + +columnflow.tasks.cms.inference +columnflow.tasks.cms.external +__cf_module_name__.tasks + + +[logging] + +law: INFO +luigi-interface: INFO +gfal2: WARNING +columnflow.columnar_util-perf: INFO + + +[analysis] + +default_analysis: __cf_module_name__.config.analysis___cf_short_name_lc__.analysis___cf_short_name_lc__ +default_config: run2_2017_nano_v9 +default_dataset: st_tchannel_t_powheg + +calibration_modules: columnflow.calibration.cms.{jets,met}, __cf_module_name__.calibration.example +selection_modules: columnflow.selection.{empty}, columnflow.selection.cms.{json_filter, met_filters}, __cf_module_name__.selection.example +production_modules: columnflow.production.{categories,normalization,processes}, columnflow.production.cms.{btag,electron,mc_weight,muon,pdf,pileup,scale,seeds}, __cf_module_name__.production.example +categorization_modules: __cf_module_name__.categorization.example +ml_modules: columnflow.ml, __cf_module_name__.ml.example +inference_modules: columnflow.inference, __cf_module_name__.inference.example + +# namespace of all columnflow tasks +cf_task_namespace: cf + +# default sandbox for main tasks with standard packages for columnar processing +default_columnar_sandbox: bash::$CF_BASE/sandboxes/venv_columnar.sh + +# wether or not the ensure_proxy decorator should be skipped, even if used by task's run methods +skip_ensure_proxy: False + +# some remote workflow parameter defaults +htcondor_flavor: $CF_HTCONDOR_FLAVOR +htcondor_share_software: False +slurm_flavor: $CF_SLURM_FLAVOR +slurm_partition: 
$CF_SLURM_PARTITION + +# ChunkedIOHandler defaults +chunked_io_chunk_size: 100000 +chunked_io_pool_size: 2 +chunked_io_debug: False + +# csv list of task families that inherit from ChunkedReaderMixin and whose output arrays should be +# checked (raising an exception) for non-finite values before saving them to disk +check_finite_output: cf.CalibrateEvents, cf.SelectEvents, cf.ProduceColumns + +# csv list of task families that inherit from ChunkedReaderMixin and whose input columns should be +# checked (raising an exception) for overlaps between fields when created a merged input array +check_overlapping_inputs: None + +# whether to log runtimes of array functions by default +log_array_function_runtime: False + + +[outputs] + +# list of all used file systems +wlcg_file_systems: wlcg_fs, wlcg_fs_infn_redirector, wlcg_fs_global_redirector + +# list of file systems used by columnflow.tasks.external.GetDatasetLFNs.iter_nano_files to +# look for the correct fs per nano input file (in that order) +lfn_sources: wlcg_fs_infn_redirector, wlcg_fs_global_redirector + +# output locations per task family +# for local targets : "local[, LOCAL_FS_NAME or STORE_PATH]" +# for remote targets: "wlcg[, WLCG_FS_NAME]" +# (when WLCG_FS_NAME is empty, the tasks' "default_wlcg_fs" attribute is used) +# examples: +# cf.CalibrateEvents: wlcg +# cf.SelectEvents: local + + +[job] + +job_file_dir: $CF_JOB_BASE +job_file_dir_cleanup: False + +# storage element (SE) and output directory on that SE for crab's internal output +# (crab might not even move files there, but it is strictly required for crab's job submission) +crab_storage_element: $CF_CRAB_STORAGE_ELEMENT +crab_base_directory: $CF_CRAB_BASE_DIRECTORY + +# lcg setup file sourced in remote jobs to access gfal tools +remote_lcg_setup: /cvmfs/grid.cern.ch/centos7-ui-200122/etc/profile.d/setup-c7-ui-python3-example.sh + + +[local_fs] + +base: / + + +[wlcg_fs] + +# set this to your desired location +base: 
root://eosuser.cern.ch/eos/user/$CF_CERN_USER_FIRSTCHAR/$CF_CERN_USER/$CF_STORE_NAME +create_file_dir: True +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 50GB + + +[wlcg_fs_infn_redirector] + +base: root://xrootd-cms.infn.it/ +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 15GB +cache_global_lock: True +cache_mtime_patience: -1 + + +[wlcg_fs_global_redirector] + +base: root://cms-xrd-global.cern.ch/ +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 15GB +cache_global_lock: True +cache_mtime_patience: -1 diff --git a/analysis_templates/ghent_template/sandboxes/example.sh b/analysis_templates/ghent_template/sandboxes/example.sh new file mode 100644 index 000000000..ee4b0b84a --- /dev/null +++ b/analysis_templates/ghent_template/sandboxes/example.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Script that sets up a virtual env in $CF_VENV_PATH. +# For more info on functionality and parameters, see the generic setup script _setup_venv.sh. 
+ +action() { + local shell_is_zsh=$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" ) + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # set variables and source the generic venv setup + export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_VENV_NAME="$( basename "${this_file%.sh}" )" + export CF_VENV_REQUIREMENTS="${this_dir}/example.txt" + + source "${CF_BASE}/sandboxes/_setup_venv.sh" "$@" +} +action "$@" diff --git a/analysis_templates/ghent_template/sandboxes/example.txt b/analysis_templates/ghent_template/sandboxes/example.txt new file mode 100644 index 000000000..368f0ab06 --- /dev/null +++ b/analysis_templates/ghent_template/sandboxes/example.txt @@ -0,0 +1,8 @@ +# version 1 + +git+https://github.com/CoffeaTeam/coffea.git@b9356b9#egg=coffea +awkward~=2.0 +dask-awkward~=2023.1 +uproot~=5.0 +tabulate~=0.9 +tensorflow~=2.11 diff --git a/analysis_templates/ghent_template/sandboxes/example_dev.sh b/analysis_templates/ghent_template/sandboxes/example_dev.sh new file mode 100644 index 000000000..37ceb6556 --- /dev/null +++ b/analysis_templates/ghent_template/sandboxes/example_dev.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Script that sets up a virtual env in $CF_VENV_PATH. +# For more info on functionality and parameters, see the generic setup script _setup_venv.sh. 
+ +action() { + local shell_is_zsh=$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" ) + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # set variables and source the generic venv setup + export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_VENV_NAME="$( basename "${this_file%.sh}" )" + export CF_VENV_REQUIREMENTS="${this_dir}/example.txt,${CF_BASE}/sandboxes/dev.txt" + + source "${CF_BASE}/sandboxes/_setup_venv.sh" "$@" +} +action "$@" diff --git a/analysis_templates/ghent_template/setup.sh b/analysis_templates/ghent_template/setup.sh new file mode 100644 index 000000000..379e8169b --- /dev/null +++ b/analysis_templates/ghent_template/setup.sh @@ -0,0 +1,162 @@ +#!/usr/bin/env bash + +setup___cf_short_name_lc__() { + # Runs the project setup, leading to a collection of environment variables starting with either + # - "CF_", for controlling behavior implemented by columnflow, or + # - "__cf_short_name_uc___", for features provided by the analysis repository itself. + # Check the setup.sh in columnflow for documentation of the "CF_" variables. The purpose of all + # "__cf_short_name_uc___" variables is documented below. + # + # The setup also handles the installation of the software stack via virtual environments, and + # optionally an interactive setup where the user can configure certain variables. + # + # + # Arguments: + # 1. The name of the setup. "default" (which is itself the default when no name is set) + # triggers a setup with good defaults, avoiding all queries to the user and the writing of + # a custom setup file. See "interactive_setup()" for more info. + # + # + # Optinally preconfigured environment variables: + # None yet. + # + # + # Variables defined by the setup and potentially required throughout the analysis: + # __cf_short_name_uc___BASE + # The absolute analysis base directory. 
Used to infer file locations relative to it. + # __cf_short_name_uc___SETUP + # A flag that is set to 1 after the setup was successful. + + # prevent repeated setups + if [ "${__cf_short_name_uc___SETUP}" = "1" ]; then + >&2 echo "the __cf_analysis_name__ analysis was already succesfully setup" + >&2 echo "re-running the setup requires a new shell" + return "1" + fi + + + # + # prepare local variables + # + + local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )" + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + local orig="${PWD}" + local setup_name="${1:-default}" + local setup_is_default="false" + [ "${setup_name}" = "default" ] && setup_is_default="true" + + # zsh options + if ${shell_is_zsh}; then + emulate -L bash + setopt globdots + fi + + + # + # global variables + # (__cf_short_name_uc__ = __cf_analysis_name__, CF = columnflow) + # + + # start exporting variables + export __cf_short_name_uc___BASE="${this_dir}" + export CF_BASE="${this_dir}/modules/columnflow" + export CF_REPO_BASE="${__cf_short_name_uc___BASE}" + export CF_REPO_BASE_ALIAS="__cf_short_name_uc___BASE" + export CF_SETUP_NAME="${setup_name}" + + # load cf setup helpers + CF_SKIP_SETUP="1" source "${CF_BASE}/setup.sh" "" || return "$?" + + # interactive setup + if [ "${CF_REMOTE_ENV}" != "1" ]; then + cf_setup_interactive_body() { + # pre-export the CF_FLAVOR which will be cms + export CF_FLAVOR="cms" + + # query common variables + cf_setup_interactive_common_variables + + # query specific variables + # nothing yet ... + } + cf_setup_interactive "${CF_SETUP_NAME}" "${__cf_short_name_uc___BASE}/.setups/${CF_SETUP_NAME}.sh" || return "$?" 
+ fi + + # continue the fixed setup + export CF_CONDA_BASE="${CF_CONDA_BASE:-${CF_SOFTWARE_BASE}/conda}" + export CF_VENV_BASE="${CF_VENV_BASE:-${CF_SOFTWARE_BASE}/venvs}" + export CF_CMSSW_BASE="${CF_CMSSW_BASE:-${CF_SOFTWARE_BASE}/cmssw}" + + + # + # common variables + # + + cf_setup_common_variables || return "$?" + + + # + # minimal local software setup + # + + cf_setup_software_stack "${CF_SETUP_NAME}" || return "$?" + + # ammend paths that are not covered by the central cf setup + export PATH="${__cf_short_name_uc___BASE}/bin:${PATH}" + export PYTHONPATH="${__cf_short_name_uc___BASE}:${__cf_short_name_uc___BASE}/modules/cmsdb:${PYTHONPATH}" + + # initialze submodules + if [ -e "${__cf_short_name_uc___BASE}/.git" ]; then + local m + for m in $( ls -1q "${__cf_short_name_uc___BASE}/modules" ); do + cf_init_submodule "${__cf_short_name_uc___BASE}" "modules/${m}" + done + fi + + + # + # git hooks + # + + cf_setup_git_hooks || return "$?" + + + # + # law setup + # + + export LAW_HOME="${LAW_HOME:-${__cf_short_name_uc___BASE}/.law}" + export LAW_CONFIG_FILE="${LAW_CONFIG_FILE:-${__cf_short_name_uc___BASE}/law.cfg}" + + if which law &> /dev/null; then + # source law's bash completion scipt + source "$( law completion )" "" + + # silently index + law index -q + fi + + # finalize + export __cf_short_name_uc___SETUP="1" +} + +main() { + # Invokes the main action of this script, catches possible error codes and prints a message. + + # run the actual setup + if setup___cf_short_name_lc__ "$@"; then + cf_color green "__cf_analysis_name__ analysis successfully setup" + return "0" + else + local code="$?" 
+ cf_color red "setup failed with code ${code}" + return "${code}" + fi +} + +# entry point +if [ "${__cf_short_name_uc___SKIP_SETUP}" != "1" ]; then + main "$@" +fi diff --git a/analysis_templates/ghent_template/tests/__init__.py b/analysis_templates/ghent_template/tests/__init__.py new file mode 100644 index 000000000..e97ffe8b0 --- /dev/null +++ b/analysis_templates/ghent_template/tests/__init__.py @@ -0,0 +1,18 @@ +# coding: utf-8 +# flake8: noqa + +""" +Entry point for all tests. +""" + +__all__ = [] + +# adjust the path to import the package +import os +import sys +base = os.path.normpath(os.path.join(os.path.abspath(__file__), "../..")) +sys.path.append(base) +import __cf_module_name__ # noqa + +# import all tests +# ... diff --git a/analysis_templates/ghent_template/tests/run_all b/analysis_templates/ghent_template/tests/run_all new file mode 100755 index 000000000..b14cbaad6 --- /dev/null +++ b/analysis_templates/ghent_template/tests/run_all @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# Script that triggers all run_* scripts in this directory with default arguments. By default, the +# process is terminated if a script returns with a non-zero exit code. +# +# Arguments: +# 1. The mode. When "force", all scripts are executed independenlty of non-zero exit codes of +# previous scripts. + +action() { + local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )" + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # get arguments + local mode="${1}" + + # local and global return codes + local ret_global="0" + local ret + + # colored echo helper + cecho() { + local col="${1}" + local msg="${2}" + echo -e "\x1b[0;49;${col}m${msg}\x1b[0m" + } + + # linting + cecho 35 "check linting ..." + bash "${this_dir}/run_linting" + ret="$?" 
+ if [ "${ret}" != "0" ]; then + >&2 cecho 31 "run_linting failed with exit code ${ret}" + [ "${mode}" = "force" ] || return "${ret}" + ret_global="1" + else + cecho 32 "done" + fi + + return "${ret_global}" +} +action "$@" diff --git a/analysis_templates/ghent_template/tests/run_linting b/analysis_templates/ghent_template/tests/run_linting new file mode 100755 index 000000000..c6d174c95 --- /dev/null +++ b/analysis_templates/ghent_template/tests/run_linting @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# Script that runs linting checks on selected files. + +action() { + local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )" + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + local __cf_short_name_lc___dir="$( dirname "${this_dir}" )" + + ( + cd "${__cf_short_name_lc___dir}" && \ + flake8 __cf_module_name__ tests + ) +} +action "$@" From 54cd08910b69cede1770cafc974e7052c46cc0f7 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 3 Apr 2024 17:56:38 +0200 Subject: [PATCH 007/119] README.md updated with columnflow location on GhentAnalysis github --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 524eb30e1..93476800a 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ To create an analysis using columnflow, it is recommended to start from a predef The following command (no previous git clone required) interactively asks for a handful of names and settings, and creates a minimal, yet fully functioning project structure for you! 
```shell -bash -c "$(curl -Ls https://gitlab.cern.ch/ghentanalysis/columnflowanalysis/-/raw/columnflow/columnflow/master/create_analysis.sh)" +bash -c "$(curl -Ls https://raw.githubusercontent.com/GhentAnalysis/columnflow/main/create_analysis.sh)" ``` At the end of the setup, you will see further instructions and suggestions to run your first analysis tasks (example below). From 0ec05ae70a1149c49072975161a17eafada29598 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 3 Apr 2024 18:05:09 +0200 Subject: [PATCH 008/119] create_analysis.sh updated with columnflow location on GhentAnalysis github --- create_analysis.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/create_analysis.sh b/create_analysis.sh index 495f2be35..330718f48 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -17,7 +17,7 @@ create_analysis() { local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" local exec_dir="$( pwd )" - local fetch_cf_branch="master" + local fetch_cf_branch="main" local fetch_cmsdb_branch="master" local debug="${CF_CREATE_ANALYSIS_DEBUG:-false}" @@ -203,8 +203,8 @@ create_analysis() { rm -rf "${exec_dir}/.cf_analysis_setup" mkdir -p "${exec_dir}/.cf_analysis_setup" || return "$?" cd "${exec_dir}/.cf_analysis_setup" - curl -L -s -k "https://github.com/columnflow/columnflow/tarball/${fetch_cf_branch}" | tar -xz || return "$?" - mv columnflow-columnflow-*/"analysis_templates/${cf_analysis_flavor}" "${cf_analysis_base}" || return "$?" + curl -L -s -k "https://github.com/GhentAnalysis/columnflow/tarball/${fetch_cf_branch}" | tar -xz || return "$?" + mv GhentAnalysis-columnflow-*/"analysis_templates/${cf_analysis_flavor}" "${cf_analysis_base}" || return "$?" cd "${cf_analysis_base}" || return "$?" 
rm -rf "${exec_dir}/.cf_analysis_setup" fi @@ -255,18 +255,22 @@ create_analysis() { echo_color cyan "setup submodules" - local gh_prefix="https://github.com/" + local gh_prefix_github="https://github.com/" + local gh_prefix_gitlab="https://gitlab.cern.ch/" + + + $( str_lc "${cf_use_ssh}" ) && gh_prefix_github="git@github.com:" + $( str_lc "${cf_use_ssh}" ) && gh_prefix_gitlab="ssh://git@gitlab.cern.ch:" - $( str_lc "${cf_use_ssh}" ) && gh_prefix="ssh://git@gitlab.cern.ch:" mkdir -p modules if ${debug}; then ln -s "${this_dir}" modules/columnflow else - git submodule add -b "${fetch_cf_branch}" "${gh_prefix}7999/ghentanalysis/columnflowanalysis/columnflow.git" modules/columnflow + git submodule add -b "${fetch_cf_branch}" "${gh_prefix_github}GhentAnalysis/columnflow.git" modules/columnflow fi if [ "${cf_analysis_flavor}" = "cms_minimal" ]; then - git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix}7999/ghentanalysis/cmsdb.git" modules/cmsdb + git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}7999/ghentanalysis/cmsdb.git" modules/cmsdb fi git submodule update --init --recursive From fd907c54e7401d9ae5706443393cc05d72dee2b8 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 3 Apr 2024 19:07:15 +0200 Subject: [PATCH 009/119] add ghent_template to the analysis flavors --- create_analysis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_analysis.sh b/create_analysis.sh index 330718f48..12e4c3714 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -161,7 +161,7 @@ create_analysis() { echo query_input "cf_short_name" "Short name for environment variables, pre- and suffixes" "${cf_module_name}" echo - query_input "cf_analysis_flavor" "The flavor of the analysis to setup" "cms_minimal" "cms_minimal" + query_input "cf_analysis_flavor" "The flavor of the analysis to setup" "ghent_template" "cms_minimal,ghent_template" echo query_input "cf_use_ssh" "Use ssh for git submodules" "True" "True,False" echo From 
53da5d66a777bfe13198d0d465d66856fde194f4 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 3 Apr 2024 19:14:10 +0200 Subject: [PATCH 010/119] bugfix: :7999/ is part of gitlab prefix added submodules for ghent_template --- create_analysis.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/create_analysis.sh b/create_analysis.sh index 12e4c3714..8a8fe8777 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -260,7 +260,7 @@ create_analysis() { $( str_lc "${cf_use_ssh}" ) && gh_prefix_github="git@github.com:" - $( str_lc "${cf_use_ssh}" ) && gh_prefix_gitlab="ssh://git@gitlab.cern.ch:" + $( str_lc "${cf_use_ssh}" ) && gh_prefix_gitlab="ssh://git@gitlab.cern.ch:7999/" mkdir -p modules @@ -270,7 +270,11 @@ create_analysis() { git submodule add -b "${fetch_cf_branch}" "${gh_prefix_github}GhentAnalysis/columnflow.git" modules/columnflow fi if [ "${cf_analysis_flavor}" = "cms_minimal" ]; then - git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}7999/ghentanalysis/cmsdb.git" modules/cmsdb + git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}ghentanalysis/cmsdb.git" modules/cmsdb + fi + if [ "${cf_analysis_flavor}" = "ghent_template" ]; then + git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}cms-nanoAOD/jsonpog-integration.git" modules/jsonpog-integration + git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}ghentanalysis/cmsdb.git" modules/cmsdb fi git submodule update --init --recursive From 7b8fe4b3f94a6fc3387967434a05a67231f8a6a7 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 08:18:19 +0200 Subject: [PATCH 011/119] chunked_io_debug: True for more information on errors occuring inside environments (essential to debugging) --- law.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/law.cfg b/law.cfg index f2285314d..a47d0ba69 100644 --- a/law.cfg +++ b/law.cfg @@ -45,7 +45,7 @@ slurm_partition: $CF_SLURM_PARTITION # ChunkedIOHandler defaults 
chunked_io_chunk_size: 100000 chunked_io_pool_size: 2 -chunked_io_debug: False +chunked_io_debug: True # csv list of task families that inherit from ChunkedReaderMixin and whose output arrays should be # checked (raising an exception) for non-finite values before saving them to disk From 24ca7f8425b0fc49cae341c6d025a64b469107ef Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 08:20:23 +0200 Subject: [PATCH 012/119] Edit the columnflow_patches.py to remove htcondor_flavor and max_runtime --- .../__cf_module_name__/columnflow_patches.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py b/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py index 4a0eba031..75e3673a4 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py +++ b/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py @@ -34,6 +34,18 @@ def patch_bundle_repo_exclude_files(): logger.debug("patched exclude_files of cf.BundleRepo") +@memoize +def patch_htcondor_workflow(): + from columnflow.tasks.framework.remote import HTCondorWorkflow + + # change the max_runtime parameter default + HTCondorWorkflow.max_runtime._default = 0 + logger.debug("patched max_runtime of cf.HTCondorWorkflow") + + HTCondorWorkflow.htcondor_flavor._default = 'NO_STR' + logger.debug("patched flavor of cf.HTCondorWorkflow") + + @memoize def patch_all(): patch_bundle_repo_exclude_files() From 51c162dbe2b1075ad20df6d3c21f1610c365dc36 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 08:22:03 +0200 Subject: [PATCH 013/119] Add the path to t2b servers so that files can be read from there --- analysis_templates/ghent_template/law.cfg | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index 764843759..eab65d02f 100644 --- 
a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg @@ -67,11 +67,11 @@ log_array_function_runtime: False [outputs] # list of all used file systems -wlcg_file_systems: wlcg_fs, wlcg_fs_infn_redirector, wlcg_fs_global_redirector +wlcg_file_systems: wlcg_fs_t2b_redirector, wlcg_fs, wlcg_fs_infn_redirector, wlcg_fs_global_redirector # list of file systems used by columnflow.tasks.external.GetDatasetLFNs.iter_nano_files to # look for the correct fs per nano input file (in that order) -lfn_sources: wlcg_fs_infn_redirector, wlcg_fs_global_redirector +lfn_sources: wlcg_fs_t2b_redirector, wlcg_fs_infn_redirector, wlcg_fs_global_redirector # output locations per task family # for local targets : "local[, LOCAL_FS_NAME or STORE_PATH]" @@ -101,6 +101,18 @@ remote_lcg_setup: /cvmfs/grid.cern.ch/centos7-ui-200122/etc/profile.d/setup-c7-u base: / +[wlcg_fs_t2b_redirector] + +# set this to your desired location +base: /pnfs/iihe/cms/ph/sc4 +use_cache: $CF_WLCG_USE_CACHE +cache_root: $CF_WLCG_CACHE_ROOT +cache_cleanup: $CF_WLCG_CACHE_CLEANUP +cache_max_size: 15GB +cache_global_lock: True +cache_mtime_patience: -1 + + [wlcg_fs] # set this to your desired location From abad1a1bf0573a685296e3deec2f2c8d453ef444 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 09:32:40 +0200 Subject: [PATCH 014/119] add POG json to external files. For Lumi files: normtag from lumi pog github (cloned into the modules), everything else from https://cms-service-dqmdc.web.cern.ch/CAF/certification. 
All other POG jsons from the jsonpog-integration gitlab (cloned into the modules) --- .../config/analysis___cf_short_name_lc__.py | 48 +++++++++++++++++-- analysis_templates/ghent_template/setup.sh | 1 + create_analysis.sh | 1 + 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py index 5767bfba6..60a674ba8 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -73,6 +73,7 @@ # gather campaign data year = campaign.x.year +ecm = campaign.ecm # add processes we are interested in process_names = [ @@ -222,16 +223,53 @@ add_shift_aliases(cfg, "mu", {"muon_weight": "muon_weight_{direction}"}) # external files -json_mirror = "/afs/cern.ch/work/m/mrieger/public/mirrors/jsonpog-integration-9ea86c4c" +json_mirror = "${MODULE_BASE}/jsonpog-integration" +year_short = str(year)[2:] # 20XX > XX +lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm}TeV/" +pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" +runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} cfg.x.external_files = DotDict.wrap({ - # lumi files + # lumi files (golden run 2 only!!) 
"lumi": { - "golden": ("/afs/cern.ch/cms/CAF/CMSCOMM/COMM_DQM/certification/Collisions17/13TeV/Legacy_2017/Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt", "v1"), # noqa - "normtag": ("/afs/cern.ch/user/l/lumipro/public/Normtags/normtag_PHYSICS.json", "v1"), + "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year_short}_GoldenJSON.txt", "v1"), # noqa + "normtag": ("${MODULE_BASE}/Normtags/normtag_PHYSICS.json", "v1"), }, + # jet energy correction + "jet_jerc": (f"{json_mirror}/POG/JME/{year}{corr_postfix}_UL/jet_jerc.json.gz", "v1"), + + # electron scale factors + "electron_sf": (f"{json_mirror}/POG/EGM/{year}{corr_postfix}_UL/electron.json.gz", "v1"), + # muon scale factors - "muon_sf": (f"{json_mirror}/POG/MUO/{year}_UL/muon_Z.json.gz", "v1"), + "muon_sf": (f"{json_mirror}/POG/MUO/{year}{corr_postfix}_UL/muon_Z.json.gz", "v1"), + + # btag scale factor + "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + + # fake rates + "muon_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + "electron_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + + # run 2 only!! 
+ # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa + "pu": { + "json": (f"{pu_reweightin_website}/pileup_latest.txt", "v1"), # noqa + "mc_profile": ( + "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa + "v1"), # noqa + "data_profile": { + "nominal": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-69200ub-99bins.root", "v1"), + # noqa + "minbias_xs_up": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-72400ub-99bins.root", "v1"), + # noqa + "minbias_xs_down": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-66000ub-99bins.root", "v1"), + # noqa + }, + }, }) # target file size after MergeReducedEvents in MB diff --git a/analysis_templates/ghent_template/setup.sh b/analysis_templates/ghent_template/setup.sh index 379e8169b..7184e2d71 100644 --- a/analysis_templates/ghent_template/setup.sh +++ b/analysis_templates/ghent_template/setup.sh @@ -61,6 +61,7 @@ setup___cf_short_name_lc__() { # start exporting variables export __cf_short_name_uc___BASE="${this_dir}" + export MODULE_BASE="${this_dir}/modules" export CF_BASE="${this_dir}/modules/columnflow" export CF_REPO_BASE="${__cf_short_name_uc___BASE}" export CF_REPO_BASE_ALIAS="__cf_short_name_uc___BASE" diff --git a/create_analysis.sh b/create_analysis.sh index 8a8fe8777..9424c208a 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -273,6 +273,7 @@ create_analysis() { git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}ghentanalysis/cmsdb.git" modules/cmsdb fi if [ "${cf_analysis_flavor}" = "ghent_template" ]; then + git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_github}CMS-LUMI-POG/Normtags.git" modules/Normtags git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}cms-nanoAOD/jsonpog-integration.git" 
modules/jsonpog-integration git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}ghentanalysis/cmsdb.git" modules/cmsdb fi From 7d6665bff239d73382ce7e6b32ab54d3b162152d Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 09:42:27 +0200 Subject: [PATCH 015/119] fetch from pog_externals --- create_analysis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_analysis.sh b/create_analysis.sh index 9424c208a..a03886aed 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -17,7 +17,7 @@ create_analysis() { local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" local exec_dir="$( pwd )" - local fetch_cf_branch="main" + local fetch_cf_branch="pog_externals" local fetch_cmsdb_branch="master" local debug="${CF_CREATE_ANALYSIS_DEBUG:-false}" From c078b55a8625571d97ca794912b77d8786324dab Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 09:43:25 +0200 Subject: [PATCH 016/119] fetch from pog_externals (also in readme) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93476800a..3c53a5a2e 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ To create an analysis using columnflow, it is recommended to start from a predef The following command (no previous git clone required) interactively asks for a handful of names and settings, and creates a minimal, yet fully functioning project structure for you! ```shell -bash -c "$(curl -Ls https://raw.githubusercontent.com/GhentAnalysis/columnflow/main/create_analysis.sh)" +bash -c "$(curl -Ls https://raw.githubusercontent.com/GhentAnalysis/columnflow/pog_externals/create_analysis.sh)" ``` At the end of the setup, you will see further instructions and suggestions to run your first analysis tasks (example below). 
From a8e8c122a5a4d492f3ba08fed9d1941f76c5924d Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 11:13:24 +0200 Subject: [PATCH 017/119] auto removal of line 6 "/ada_mnt/ada" --- setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.sh b/setup.sh index 41a019b50..6a66936a8 100644 --- a/setup.sh +++ b/setup.sh @@ -583,6 +583,7 @@ cf_setup_software_stack() { 2>&1 "${CF_CONDA_BASE}/bin/micromamba" shell hook -y --prefix="$PWD" &> micromamba.sh || return "$?" # make the setup file relocatable sed -i -r "s|${CF_CONDA_BASE}|\$\{MAMBA_ROOT_PREFIX\}|" "micromamba.sh" || return "$?" + sed -i -r "6 s|/ada_mnt/ada||" "micromamba.sh" || return "$?" mv "micromamba.sh" "${CF_CONDA_BASE}/etc/profile.d/micromamba.sh" cat << EOF > "${CF_CONDA_BASE}/.mambarc" changeps1: false From 60f6fbb1db7cf539b4324958e14e58c37c1815f3 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 11:13:24 +0200 Subject: [PATCH 018/119] auto removal of line 6 "/ada_mnt/ada" --- setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.sh b/setup.sh index 41a019b50..6a66936a8 100644 --- a/setup.sh +++ b/setup.sh @@ -583,6 +583,7 @@ cf_setup_software_stack() { 2>&1 "${CF_CONDA_BASE}/bin/micromamba" shell hook -y --prefix="$PWD" &> micromamba.sh || return "$?" # make the setup file relocatable sed -i -r "s|${CF_CONDA_BASE}|\$\{MAMBA_ROOT_PREFIX\}|" "micromamba.sh" || return "$?" + sed -i -r "6 s|/ada_mnt/ada||" "micromamba.sh" || return "$?" 
mv "micromamba.sh" "${CF_CONDA_BASE}/etc/profile.d/micromamba.sh" cat << EOF > "${CF_CONDA_BASE}/.mambarc" changeps1: false From 250bcae4cd1020e58048d10249d77e6c7f1344c7 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 11:50:57 +0200 Subject: [PATCH 019/119] removed bugs in external files specifications --- .../config/analysis___cf_short_name_lc__.py | 13 +++++++------ analysis_templates/ghent_template/setup.sh | 1 - 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py index 60a674ba8..53d4234b7 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -74,6 +74,8 @@ # gather campaign data year = campaign.x.year ecm = campaign.ecm +year2 = year % 100 +corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" # add processes we are interested in process_names = [ @@ -223,16 +225,15 @@ add_shift_aliases(cfg, "mu", {"muon_weight": "muon_weight_{direction}"}) # external files -json_mirror = "${MODULE_BASE}/jsonpog-integration" -year_short = str(year)[2:] # 20XX > XX -lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm}TeV/" +json_mirror = "modules/jsonpog-integration" +lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year2}/{ecm}TeV/" pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} cfg.x.external_files = DotDict.wrap({ # lumi files (golden run 2 only!!) 
"lumi": { - "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year_short}_GoldenJSON.txt", "v1"), # noqa - "normtag": ("${MODULE_BASE}/Normtags/normtag_PHYSICS.json", "v1"), + "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year2}_GoldenJSON.txt", "v1"), # noqa + "normtag": ("modules/Normtags/normtag_PHYSICS.json", "v1"), }, # jet energy correction @@ -254,7 +255,7 @@ # run 2 only!! # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa "pu": { - "json": (f"{pu_reweightin_website}/pileup_latest.txt", "v1"), # noqa + "json": (f"{pu_reweighting_site}/pileup_latest.txt", "v1"), # noqa "mc_profile": ( "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa "v1"), # noqa diff --git a/analysis_templates/ghent_template/setup.sh b/analysis_templates/ghent_template/setup.sh index 7184e2d71..379e8169b 100644 --- a/analysis_templates/ghent_template/setup.sh +++ b/analysis_templates/ghent_template/setup.sh @@ -61,7 +61,6 @@ setup___cf_short_name_lc__() { # start exporting variables export __cf_short_name_uc___BASE="${this_dir}" - export MODULE_BASE="${this_dir}/modules" export CF_BASE="${this_dir}/modules/columnflow" export CF_REPO_BASE="${__cf_short_name_uc___BASE}" export CF_REPO_BASE_ALIAS="__cf_short_name_uc___BASE" From dc741173e78137d184f34c1cca7bd0a683b04118 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 11:53:58 +0200 Subject: [PATCH 020/119] added target directory to law.cfg --- analysis_templates/ghent_template/law.cfg | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index eab65d02f..8536f5a72 100644 --- a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg 
@@ -19,6 +19,12 @@ gfal2: WARNING columnflow.columnar_util-perf: INFO +[target] + +tmp_dir: /pnfs/iihe/cms/store/user/$CF_CERN_USER/tmp +tmp_dir_perm: 777 + + [analysis] default_analysis: __cf_module_name__.config.analysis___cf_short_name_lc__.analysis___cf_short_name_lc__ From d27cb879c34718c3ed572f69eddb88a4e0ba4496 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 13:49:08 +0200 Subject: [PATCH 021/119] change to 2018 since tt_sl NanoAOD is not yet available on t2b --- .../config/analysis___cf_short_name_lc__.py | 4 ++-- analysis_templates/ghent_template/law.cfg | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py index 5767bfba6..03a923360 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -59,11 +59,11 @@ # ttbar and single top MCs, plus single muon data # update this config or add additional ones to accomodate the needs of your analysis -from cmsdb.campaigns.run2_2017_nano_v9 import campaign_run2_2017_nano_v9 +from cmsdb.campaigns.run2_2018_nano_v9 import campaign_run2_2018_nano_v9 # copy the campaign # (creates copies of all linked datasets, processes, etc. 
to allow for encapsulated customization) -campaign = campaign_run2_2017_nano_v9.copy() +campaign = campaign_run2_2018_nano_v9.copy() # get all root processes procs = get_root_processes_from_campaign(campaign) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index 8536f5a72..902fed943 100644 --- a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg @@ -28,7 +28,7 @@ tmp_dir_perm: 777 [analysis] default_analysis: __cf_module_name__.config.analysis___cf_short_name_lc__.analysis___cf_short_name_lc__ -default_config: run2_2017_nano_v9 +default_config: run2_2018_nano_v9 default_dataset: st_tchannel_t_powheg calibration_modules: columnflow.calibration.cms.{jets,met}, __cf_module_name__.calibration.example From 21d94ec6dd3b3a5f2792b1dd8f78022eeba7247a Mon Sep 17 00:00:00 2001 From: juvanden Date: Thu, 4 Apr 2024 14:30:20 +0200 Subject: [PATCH 022/119] addition structure ghent_template (to be tested) --- .../analysis/___cf_short_name_lc__.py | 8 + .../analysis/create_analysis.py | 77 ++++ .../__cf_module_name__/calibration/default.py | 74 ++++ .../__cf_module_name__/calibration/example.py | 50 --- .../__cf_module_name__/calibration/jet.py | 11 + .../__cf_module_name__/config/categories.py | 58 +++ .../config/config__cf_short_name_lc__.py | 368 ++++++++++++++++++ .../__cf_module_name__/config/datasets.py | 112 ++++++ .../__cf_module_name__/config/processes.py | 30 ++ .../__cf_module_name__/config/shifts.py | 189 +++++++++ .../__cf_module_name__/config/style.py | 39 ++ .../__cf_module_name__/config/variables.py | 43 ++ .../__cf_module_name__/production/default.py | 60 +++ .../production/normalized_btag.py | 130 +++++++ .../production/normalized_weights.py | 126 ++++++ .../__cf_module_name__/production/weights.py | 186 +++++++++ .../selection/categories.py | 69 ++++ .../__cf_module_name__/selection/default.py | 265 +++++++++++++ .../__cf_module_name__/selection/objects.py | 165 ++++++++ 
.../__cf_module_name__/selection/stats.py | 109 ++++++ .../__cf_module_name__/selection/trigger.py | 94 +++++ columnflow/selection/util.py | 8 + columnflow/util.py | 63 +++ 23 files changed, 2284 insertions(+), 50 deletions(-) create mode 100644 analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/calibration/default.py delete mode 100644 analysis_templates/ghent_template/__cf_module_name__/calibration/example.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/categories.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/datasets.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/processes.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/shifts.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/style.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/variables.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/default.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/normalized_btag.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/normalized_weights.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/weights.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/selection/categories.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/selection/default.py create mode 100644 
analysis_templates/ghent_template/__cf_module_name__/selection/objects.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/selection/stats.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py new file mode 100644 index 000000000..a068e2138 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py @@ -0,0 +1,8 @@ + +""" +Main analysis object for the __cf_short_name_lc__ analysis +""" + +from analysis___cf_short_name_lc__.analysis.create_analysis import create_analysis + +__cf_short_name_lc__ = create_analysis("__cf_short_name_lc__", 3, tags={"is_signal_region"}) diff --git a/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py b/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py new file mode 100644 index 000000000..d648133c2 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py @@ -0,0 +1,77 @@ +# coding: utf-8 + +""" +Configuration of the ___cf_short_name_lc__ analysis. 
+""" + +import os + +import law +import order as od + + +thisdir = os.path.dirname(os.path.abspath(__file__)) + + +def create_analysis( + name, + id, + **kwargs, +) -> od.Analysis: + + # + # the main analysis object + # + + analysis_inst = od.Analysis( + name=name, + id=id, + **kwargs, + ) + + # analysis-global versions + analysis_inst.set_aux("versions", { + }) + + # files of sandboxes that might be required by remote tasks + # (used in cf.HTCondorWorkflow) + analysis_inst.x.bash_sandboxes = [ + "$CF_BASE/sandboxes/cf.sh", + ] + default_sandbox = law.Sandbox.new(law.config.get("analysis", "default_columnar_sandbox")) + if default_sandbox.sandbox_type == "bash" and default_sandbox.name not in analysis_inst.x.bash_sandboxes: + analysis_inst.x.bash_sandboxes.append(default_sandbox.name) + # cmssw sandboxes that should be bundled for remote jobs in case they are needed + analysis_inst.x.cmssw_sandboxes = [ + "$CF_BASE/sandboxes/cmssw_default.sh", + ] + + # config groups for conveniently looping over certain configs + # (used in wrapper_factory) + analysis_inst.set_aux("config_groups", {}) + + # + # import campaigns and load configs + # + + from ___cf_short_name_lc__.config.config____cf_short_name_lc__ import add_config + from cmsdb.campaigns.run2_2018_nano_v9 import campaign_run2_2018_nano_v9 + + # default config + c18 = add_config( # noqa + analysis_inst, + campaign_run2_2018_nano_v9.copy(), + config_name="c18", + config_id=2, + ) + + # config with limited number of files + l18 = add_config( # noqa + analysis_inst, + campaign_run2_2018_nano_v9.copy(), + config_name="l18", + config_id=12, + limit_dataset_files=2, + ) + + return analysis_inst diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py new file mode 100644 index 000000000..8eab79701 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py @@ -0,0 +1,74 @@ +# 
coding: utf-8 + +""" +Calibration methods. +""" + +from columnflow.calibration import Calibrator, calibrator +from columnflow.calibration.cms.jets import jec, jer +from columnflow.production.cms.seeds import deterministic_seeds +from columnflow.util import maybe_import + +from __cf_short_name_lc__.calibration.jet import jec_nominal + +ak = maybe_import("awkward") + + +@calibrator( + uses={deterministic_seeds}, + produces={deterministic_seeds}, +) +def default(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + events = self[deterministic_seeds](events, **kwargs) + + if self.dataset_inst.is_data: + events = self[jec_nominal](events, **kwargs) + else: + events = self[jec](events, **kwargs) + + return events + + +@default.init +def default_init(self: Calibrator) -> None: + if not getattr(self, "dataset_inst", None): + return + + if self.dataset_inst.is_data: + calibrators = {jec_nominal} + else: + calibrators = {jec} + + self.uses |= calibrators + self.produces |= calibrators + + +@calibrator( + uses={deterministic_seeds}, + produces={deterministic_seeds}, +) +def skip_jecunc(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + """ only uses jec_nominal for test purposes """ + events = self[deterministic_seeds](events, **kwargs) + + if self.dataset_inst.is_data: + events = self[jec_nominal](events, **kwargs) + else: + events = self[jec_nominal](events, **kwargs) + events = self[jer](events, **kwargs) + + return events + + +@skip_jecunc.init +def skip_jecunc_init(self: Calibrator) -> None: + if not getattr(self, "dataset_inst", None): + return + + if self.dataset_inst.is_data: + calibrators = {jec_nominal} + else: + calibrators = {jec_nominal, jer} + + self.uses |= calibrators + self.produces |= calibrators diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/example.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/example.py deleted file mode 100644 index 227a3ab03..000000000 --- 
a/analysis_templates/ghent_template/__cf_module_name__/calibration/example.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding: utf-8 - -""" -Exemplary calibration methods. -""" - -from columnflow.calibration import Calibrator, calibrator -from columnflow.production.cms.seeds import deterministic_seeds -from columnflow.util import maybe_import -from columnflow.columnar_util import set_ak_column - -np = maybe_import("numpy") -ak = maybe_import("awkward") - - -@calibrator( - uses={ - deterministic_seeds, - "Jet.pt", "Jet.mass", - }, - produces={ - deterministic_seeds, - "Jet.pt", "Jet.mass", - "Jet.pt_jec_up", "Jet.mass_jec_up", - "Jet.pt_jec_down", "Jet.mass_jec_down", - }, -) -def example(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: - # a) "correct" Jet.pt by scaling four momenta by 1.1 (pt<30) or 0.9 (pt<=30) - # b) add 4 new columns faking the effect of JEC variations - - # add deterministic seeds that could (e.g.) be used for smearings - events = self[deterministic_seeds](events, **kwargs) - - # a) - pt_mask = ak.flatten(events.Jet.pt < 30) - n_jet_pt = np.asarray(ak.flatten(events.Jet.pt)) - n_jet_mass = np.asarray(ak.flatten(events.Jet.mass)) - n_jet_pt[pt_mask] *= 1.1 - n_jet_pt[~pt_mask] *= 0.9 - n_jet_mass[pt_mask] *= 1.1 - n_jet_mass[~pt_mask] *= 0.9 - - # b) - events = set_ak_column(events, "Jet.pt_jec_up", events.Jet.pt * 1.05) - events = set_ak_column(events, "Jet.mass_jec_up", events.Jet.mass * 1.05) - events = set_ak_column(events, "Jet.pt_jec_down", events.Jet.pt * 0.95) - events = set_ak_column(events, "Jet.mass_jec_down", events.Jet.mass * 0.95) - - return events diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py new file mode 100644 index 000000000..0164fda4f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py @@ -0,0 +1,11 @@ +# coding: utf-8 + +""" +Custom jet energy calibration methods that 
disable data uncertainties (for searches). +""" + +from columnflow.calibration.cms.jets import jec + + +# custom jec calibrator that only runs nominal correction +jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) \ No newline at end of file diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/categories.py b/analysis_templates/ghent_template/__cf_module_name__/config/categories.py new file mode 100644 index 000000000..ca9db4fed --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/categories.py @@ -0,0 +1,58 @@ +# coding: utf-8 + +""" +Definition of categories. +""" + +from collections import OrderedDict + +import law + +from columnflow.config_util import create_category_combinations +from columnflow.util import call_once_on_config + +import order as od + +logger = law.logger.get_logger(__name__) + + +@call_once_on_config() +def add_categories_selection(config: od.Config) -> None: + """ + Adds categories to a *config*, that are typically produced in `SelectEvents`. + """ + + config.x.regions = ("incl", "CR_WZ") + config.x.lepton_channels = ("2e", "1e1mu", "2mu") + + config.add_category( + name="incl", + id=1, + selection="catid_selection_incl", + label="Inclusive", + ) + + # add lepton categories defined in ___cf_short_name_lc__.selection.categories to the config + for lepton_channel in config.x.lepton_channels: + + config.add_category( + name="{}_{}".format(region, lepton_channel), + selection=["catid_selection_{}".format(lepton_channel)], + label="{}, {}".format(region, config.x.lepton_channel_labels[lepton_channel]), + ) + + +@call_once_on_config() +def add_categories_production(config: od.Config) -> None: + """ + Adds categories to a *config*, that are typically produced in `ProduceColumns`. 
+ """ + + # + # switch existing categories to different production module + # + + for lepton_channel in config.x.lepton_channels: + + cat_lepton = config.get_category("{}_{}".format(region, lepton_channel)) + cat_lepton.selection = ["catid_{}".format(lepton_channel)] diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py new file mode 100644 index 000000000..b949b2135 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py @@ -0,0 +1,368 @@ +# coding: utf-8 + +""" +Configuration of the __cf_short_name_lc__ analysis. +""" +from __future__ import annotations + +import order as od +from scinum import Number + +from columnflow.util import DotDict, maybe_import +from columnflow.config_util import ( + verify_config_processes, +) + +from __cf_short_name_lc__.config.styling import stylize_processes +from __cf_short_name_lc__.config.datasets import add_datasets, configure_datasets, get_dataset_lfns +from __cf_short_name_lc__.config.processes import add_processes +from __cf_short_name_lc__.config.categories import add_categories_selection +from __cf_short_name_lc__.config.variables import add_feature_variables +from __cf_short_name_lc__.config.shifts import add_shifts +from __cf_short_name_lc__.selection.trigger import add_triggers +from __cf_short_name_lc__.util import four_vec +ak = maybe_import("awkward") + + +def add_config( + analysis: od.Analysis, + campaign: od.Campaign, + config_name: str | None = None, + config_id: int | None = None, + limit_dataset_files: int | None = None, +) -> od.Config: + # validations + assert campaign.x.year in [2016, 2017, 2018] + if campaign.x.year == 2016: + assert campaign.x.vfp in ["pre", "post"] + # gather campaign data + year = campaign.x.year + year2 = year % 100 + corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" + + if year != 
2018: + raise NotImplementedError("For now, only 2018 campaign is fully implemented") + + cfg = analysis.add_config(campaign, name=config_name, id=config_id, tags=analysis.tags) + + year = campaign.x.year + year2 = year % 100 + corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" + cfg.x.year = year + + add_ttz_processes(cfg, campaign) + + add_triggers(cfg, campaign) + add_ttz_datasets(cfg, campaign) + + cfg.x.get_dataset_lfns = get_dataset_lfns + + configure_ttz_datasets(cfg, limit_dataset_files) + + # verify that the root process of all datasets is part of any of the registered processes + verify_config_processes(cfg, warn=True) + + # lumi values in inverse pb + # https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun2?rev=2#Combination_and_correlations + if year == 2016: + cfg.x.luminosity = Number(36310, { + "lumi_13TeV_2016": 0.01j, + "lumi_13TeV_correlated": 0.006j, + }) + elif year == 2017: + cfg.x.luminosity = Number(41480, { + "lumi_13TeV_2017": 0.02j, + "lumi_13TeV_1718": 0.006j, + "lumi_13TeV_correlated": 0.009j, + }) + elif year == 2018: # 2018 + cfg.x.luminosity = Number(59830, { + "lumi_13TeV_2017": 0.015j, + "lumi_13TeV_1718": 0.002j, + "lumi_13TeV_correlated": 0.02j, + }) + + cfg.x.minbias_xs = Number(69.2, 0.046j) + + # jec configuration + # https://twiki.cern.ch/twiki/bin/view/CMS/JECDataMC?rev=201 + jerc_postfix = "APV" if year == 2016 and campaign.x.vfp == "post" else "" + cfg.x.jec = DotDict.wrap({ + "campaign": f"Summer19UL{year2}{jerc_postfix}", + "version": {2016: "V7", 2017: "V5", 2018: "V5"}[year], + "jet_type": "AK4PFchs", + "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], + "levels_for_type1_met": ["L1FastJet"], + "uncertainty_sources": [ + # "AbsoluteStat", + # "AbsoluteScale", + # "AbsoluteSample", + # "AbsoluteFlavMap", + # "AbsoluteMPFBias", + # "Fragmentation", + # "SinglePionECAL", + # "SinglePionHCAL", + # "FlavorQCD", + # "TimePtEta", + # "RelativeJEREC1", + # "RelativeJEREC2", + # 
"RelativeJERHF", + # "RelativePtBB", + # "RelativePtEC1", + # "RelativePtEC2", + # "RelativePtHF", + # "RelativeBal", + # "RelativeSample", + # "RelativeFSR", + # "RelativeStatFSR", + # "RelativeStatEC", + # "RelativeStatHF", + # "PileUpDataMC", + # "PileUpPtRef", + # "PileUpPtBB", + # "PileUpPtEC1", + # "PileUpPtEC2", + # "PileUpPtHF", + # "PileUpMuZero", + # "PileUpEnvelope", + # "SubTotalPileUp", + # "SubTotalRelative", + # "SubTotalPt", + # "SubTotalScale", + # "SubTotalAbsolute", + # "SubTotalMC", + "Total", + # "TotalNoFlavor", + # "TotalNoTime", + # "TotalNoFlavorNoTime", + # "FlavorZJet", + # "FlavorPhotonJet", + # "FlavorPureGluon", + # "FlavorPureQuark", + # "FlavorPureCharm", + # "FlavorPureBottom", + # "TimeRunA", + # "TimeRunB", + # "TimeRunC", + # "TimeRunD", + "CorrelationGroupMPFInSitu", + "CorrelationGroupIntercalibration", + "CorrelationGroupbJES", + "CorrelationGroupFlavor", + "CorrelationGroupUncorrelated", + ], + }) + + # JER + # https://twiki.cern.ch/twiki/bin/view/CMS/JetResolution?rev=107 + cfg.x.jer = DotDict.wrap({ + "campaign": f"Summer19UL{year2}{jerc_postfix}", + "version": "JR" + {2016: "V3", 2017: "V2", 2018: "V2"}[year], + "jet_type": "AK4PFchs", + }) + + # JEC uncertainty sources propagated to btag scale factors + # (names derived from contents in BTV correctionlib file) + cfg.x.btag_sf_jec_sources = [ + "", # total + "Absolute", + "AbsoluteMPFBias", + "AbsoluteScale", + "AbsoluteStat", + f"Absolute_{year}", + "BBEC1", + f"BBEC1_{year}", + "EC2", + f"EC2_{year}", + "FlavorQCD", + "Fragmentation", + "HF", + f"HF_{year}", + "PileUpDataMC", + "PileUpPtBB", + "PileUpPtEC1", + "PileUpPtEC2", + "PileUpPtHF", + "PileUpPtRef", + "RelativeBal", + "RelativeFSR", + "RelativeJEREC1", + "RelativeJEREC2", + "RelativeJERHF", + "RelativePtBB", + "RelativePtEC1", + "RelativePtEC2", + "RelativePtHF", + "RelativeSample", + f"RelativeSample_{year}", + "RelativeStatEC", + "RelativeStatFSR", + "RelativeStatHF", + "SinglePionECAL", + "SinglePionHCAL", + 
"TimePtEta", + ] + + # b-tag working points + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL16preVFP?rev=6 + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL16postVFP?rev=8 + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL17?rev=15 + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL17?rev=17 + btag_key = f"2016{campaign.x.vfp}" if year == 2016 else year + cfg.x.btag_working_points = DotDict.wrap({ + "deepjet": { + "loose": {"2016pre": 0.0508, "2016post": 0.0480, 2017: 0.0532, 2018: 0.0490}[btag_key], + "medium": {"2016pre": 0.2598, "2016post": 0.2489, 2017: 0.3040, 2018: 0.2783}[btag_key], + "tight": {"2016pre": 0.6502, "2016post": 0.6377, 2017: 0.7476, 2018: 0.7100}[btag_key], + }, + "deepcsv": { + "loose": {"2016pre": 0.2027, "2016post": 0.1918, 2017: 0.1355, 2018: 0.1208}[btag_key], + "medium": {"2016pre": 0.6001, "2016post": 0.5847, 2017: 0.4506, 2018: 0.4168}[btag_key], + "tight": {"2016pre": 0.8819, "2016post": 0.8767, 2017: 0.7738, 2018: 0.7665}[btag_key], + }, + }) + cfg.x.btag_sf = ("deepJet_shape", cfg.x.btag_sf_jec_sources) + + # names of electron correction sets and working points + # (used in the electron_sf producer) + cfg.x.electron_sf_names = ("UL-Electron-ID-SF", f"{year}{corr_postfix}", "wp80iso") + cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}{corr_postfix}_UL") + + # external files + json_mirror = "${MODULE_BASE}/jsonpog-integration" + year_short = str(year)[2:] # 20XX > XX + lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm}TeV/" + pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" + runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} + cfg.x.external_files = DotDict.wrap({ + # lumi files (golden run 2 only!!) 
+ "lumi": { + "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year_short}_GoldenJSON.txt", "v1"), # noqa + "normtag": ("${MODULE_BASE}/Normtags/normtag_PHYSICS.json", "v1"), + }, + + # jet energy correction + "jet_jerc": (f"{json_mirror}/POG/JME/{year}{corr_postfix}_UL/jet_jerc.json.gz", "v1"), + + # electron scale factors + "electron_sf": (f"{json_mirror}/POG/EGM/{year}{corr_postfix}_UL/electron.json.gz", "v1"), + + # muon scale factors + "muon_sf": (f"{json_mirror}/POG/MUO/{year}{corr_postfix}_UL/muon_Z.json.gz", "v1"), + + # btag scale factor + "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + + # fake rates + "muon_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + "electron_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + + # run 2 only!! + # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa + "pu": { + "json": (f"{pu_reweightin_website}/pileup_latest.txt", "v1"), # noqa + "mc_profile": ( + "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa + "v1"), # noqa + "data_profile": { + "nominal": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-69200ub-99bins.root", "v1"), + # noqa + "minbias_xs_up": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-72400ub-99bins.root", "v1"), + # noqa + "minbias_xs_down": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-66000ub-99bins.root", "v1"), + # noqa + }, + }, + }) + + # process groups for conveniently looping over certain processs + # (used in wrapper_factory and during plotting) + cfg.x.process_groups = { + "test": ["tt_dl"], + "all": ["tt_dl", "dy", "data"], + "sim": ["tt_dl", "dy"], + } + + # dataset groups 
for conveniently looping over certain datasets + # (used in wrapper_factory and during plotting) + cfg.x.dataset_groups = { + "test": ["tt_dl"], + "all": ["tt_dl", "dy*", "data*"], + "sim": ["tt_dl", "dy*"], + } + + cfg.x.variable_groups = { + "default": ["n_jet"], + } + + # category groups for conveniently looping over certain categories + # (used during plotting) + cfg.x.category_groups = { + "default": ["incl"], + } + + # shift groups for conveniently looping over certain shifts + # (used during plotting) + cfg.x.event_weights = DotDict() + cfg.x.event_weights["normalization_weight"] = [] + add_shifts(cfg) + + cfg.x.shift_groups = { + "jer": ["nominal", "jer_up", "jer_down"], + "btag": ["nominal", "btag*"], + "all": cfg.shifts.names(), + } + + # selector step groups for conveniently looping over certain steps + # (used in cutflow tasks) + cfg.x.selector_step_groups = {} + + # custom method and sandbox for determining dataset lfns + cfg.x.get_dataset_lfns = None + cfg.x.get_dataset_lfns_sandbox = None + + # whether to validate the number of obtained LFNs in GetDatasetLFNs + # (currently set to false because the number of files per dataset is truncated to 2) + cfg.x.validate_dataset_lfns = False + + # columns to keep after certain steps + cfg.x.keep_columns = DotDict.wrap({ + "cf.MergeSelectionMasks": { + "mc_weight", "normalization_weight", "process_id", "category_ids", "cutflow.*", + }, + }) + + cfg.x.keep_columns["cf.ReduceEvents"] = ( + { + # general event information + "run", "luminosityBlock", "event", + # columns added during selection, required in general + "mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed", + # weight-related columns + "pu_weight*", "pdf_weight*", + "murf_envelope_weight*", "mur_weight*", "muf_weight*", + "btag_weight*", + # extra columns + } | four_vec( # Jets + {"Jet"}, + {"btagDeepFlavB", "btagDeepFlavCvB"}, + ) | four_vec( # Leptons + {"Electron", "Muon", } + ) + ) + + cfg.x.default_calibrator = "skip_jecunc" # 
skip jet energy correction up and down variation to save time in running + cfg.x.default_selector = "default" + cfg.x.default_producer = "features" + cfg.x.default_ml_model = None + cfg.x.default_inference_model = "example" + cfg.x.default_variables = ("n_jet",) + + add_categories_selection(cfg) + add_feature_variables(cfg) + stylize_processes(cfg) + + return cfg diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py new file mode 100644 index 000000000..52f6a1e3e --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py @@ -0,0 +1,112 @@ +# coding: utf-8 + +""" +Dataset configuration for the Ghent Columnflow analysis template (based on tt). +""" + +from __future__ import annotations + +import law +import order as od +import cmsdb.processes as procs +from columnflow.tasks.external import GetDatasetLFNs +# import cmsdb.processes as procs + + +logger = law.logger.get_logger(__name__) + + +def add_datasets(config: od.Config, campaign: od.Campaign): + # load custom produced datasets into campaign + get_custom_datasets(campaign) + + # use custom get_dataset_lfns function + config.x.get_dataset_lfns = get_dataset_lfns + + # add datasets we need to study + dataset_names = { + "2018": [ + # data + "data_mumu_a", + "data_mumu_b", + "data_mumu_c", + "data_mumu_d", + + "data_egamma_a", + "data_egamma_b", + "data_egamma_c", + "data_egamma_d", + + "data_muoneg_a", + "data_muoneg_b", + "data_muoneg_c", + "data_muoneg_d", + + "data_mu_a", + "data_mu_b", + "data_mu_c", + "data_mu_d", + + # backgrounds + + # ewk + "dy_lept_m50_ht-100to200_madgraph", + "dy_lept_m50_ht-200to400_madgraph", + "dy_lept_m50_ht-400to600_madgraph", + "dy_lept_m50_ht-600to800_madgraph", + "dy_lept_m50_ht-800to1200_madgraph", + "dy_lept_m50_ht-1200to2500_madgraph", + + # ttbar + + "tt_dl", + ]}[f"{config.x.year}{config.x.corr_postfix}"] + + # loop over all dataset names 
and add them to the config + for dataset_name in dataset_names: + config.add_dataset(campaign.get_dataset(dataset_name)) + + +def configure_datasets(config: od.Config, limit_dataset_files: int | None = None): + + for dataset in config.datasets: + if limit_dataset_files: + # apply optional limit on the max. number of files per dataset + for info in dataset.info.values(): + if info.n_files > limit_dataset_files: + info.n_files = limit_dataset_files + + # adding tag info to datasets for data double counting removal + if dataset.name.startswith("data_egamma"): + dataset.add_tag("EGamma") + elif dataset.name.startswith("data_mumu"): + dataset.add_tag("DoubleMuon") + elif dataset.name.startswith("data_mu_"): + dataset.add_tag("SingleMuon") + elif dataset.name.startswith("data_muoneg"): + dataset.add_tag("MuonEG") + + # for each dataset, select which triggers to require + # (and which to veto to avoid double counting events + # in recorded data) + if dataset.is_data: + prev_triggers = set() + for tag, triggers in config.x.trigger_matrix: + if dataset.has_tag(tag): + dataset.x.require_triggers = triggers + dataset.x.veto_triggers = prev_triggers + break + prev_triggers = prev_triggers | triggers + + elif dataset.is_mc: + dataset.x.require_triggers = config.x.all_triggers + + # add more tag info to datasets + if dataset.name.startswith(("t")): + dataset.x.has_top = True + dataset.add_tag("has_top") + + # example of removing scale, pdf variations for a specific dataset + if dataset.name.startswith(("GluGLuToContinToZZ")): + dataset.add_tag("skip_scale") + dataset.add_tag("skip_pdf") diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py new file mode 100644 index 000000000..686c9a0ea --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py @@ -0,0 +1,30 @@ +# coding: utf-8 + +""" +Configuration of the Run 3 ttZ processes. 
+""" + +import order as od +from columnflow.config_util import get_root_processes_from_campaign + + +def add_processes(config: od.Config, campaign: od.Campaign): + # get all root processes + procs = get_root_processes_from_campaign(campaign) + + config.add_process(procs.n.data) + + config.add_process(procs.n.tt) + + config.add_process(procs.n.dy) + + # How to add new processes: + # Add custom process to encapsulate all background processes: + + bg = config.add_process( + name="background", + id=1, + label="Background", + + ) + bg.add_process(config.get_process("dy")) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/shifts.py b/analysis_templates/ghent_template/__cf_module_name__/config/shifts.py new file mode 100644 index 000000000..afa0aa392 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/shifts.py @@ -0,0 +1,189 @@ +# coding: utf-8 + +""" +Definition of shifts. +""" + +from collections import OrderedDict + +import law +import os +import re +import yaml + +from columnflow.util import DotDict, call_once_on_config + +import order as od + +logger = law.logger.get_logger(__name__) + + +@call_once_on_config() +def add_shifts(config: od.Config) -> None: + """ + Adds categories to a *config*, that are typically produced in `SelectEvents`. 
+ """ + + def add_shift_aliases(shift_source: str, aliases: dict[str], selection_dependent: bool): + + for direction in ["up", "down"]: + shift = config.get_shift(od.Shift.join_name(shift_source, direction)) + # format keys and values + inject_shift = lambda s: re.sub(r"\{([^_])", r"{_\1", s).format(**shift.__dict__) + _aliases = {inject_shift(key): inject_shift(value) for key, value in aliases.items()} + alias_type = "column_aliases_selection_dependent" if selection_dependent else "column_aliases" + # extend existing or register new column aliases + shift.set_aux(alias_type, shift.get_aux(alias_type, {})).update(_aliases) + + config.add_shift(name="nominal", id=0) + config.add_shift(name="minbias_xs_up", id=7, type="shape") + config.add_shift(name="minbias_xs_down", id=8, type="shape") + add_shift_aliases( + "minbias_xs", + { + "pu_weight": "pu_weight_{name}", + "normalized_pu_weight": "normalized_pu_weight_{name}", + }, + selection_dependent=False) + + config.add_shift(name="top_pt_up", id=9, type="shape") + config.add_shift(name="top_pt_down", id=10, type="shape") + add_shift_aliases("top_pt", {"top_pt_weight": "top_pt_weight_{direction}"}, selection_dependent=False) + + # lepton uncertainties + config.add_shift(name="e_sf_up", id=40, type="shape") + config.add_shift(name="e_sf_down", id=41, type="shape") + config.add_shift(name="e_trig_sf_up", id=42, type="shape") + config.add_shift(name="e_trig_sf_down", id=43, type="shape") + add_shift_aliases("e_sf", {"electron_weight": "electron_weight_{direction}"}, selection_dependent=False) + + config.add_shift(name="mu_sf_up", id=50, type="shape") + config.add_shift(name="mu_sf_down", id=51, type="shape") + config.add_shift(name="mu_trig_sf_up", id=52, type="shape") + config.add_shift(name="mu_trig_sf_down", id=53, type="shape") + add_shift_aliases("mu_sf", {"muon_weight": "muon_weight_{direction}"}, selection_dependent=False) + + # b-tagging uncertainties + btag_uncs = [ + "hf", "lf", f"hfstats1_{config.x.year}", 
f"hfstats2_{config.x.year}", + f"lfstats1_{config.x.year}", f"lfstats2_{config.x.year}", "cferr1", "cferr2", + ] + for i, unc in enumerate(btag_uncs): + config.add_shift(name=f"btag_{unc}_up", id=100 + 2 * i, type="shape") + config.add_shift(name=f"btag_{unc}_down", id=101 + 2 * i, type="shape") + add_shift_aliases( + f"btag_{unc}", + { + "normalized_btag_weight": f"normalized_btag_weight_{unc}_" + "{direction}", + "normalized_njet_btag_weight": f"normalized_njet_btag_weight_{unc}_" + "{direction}", + }, + selection_dependent=False, + ) + + config.add_shift(name="mur_up", id=201, type="shape") + config.add_shift(name="mur_down", id=202, type="shape") + config.add_shift(name="muf_up", id=203, type="shape") + config.add_shift(name="muf_down", id=204, type="shape") + config.add_shift(name="murf_envelope_up", id=205, type="shape") + config.add_shift(name="murf_envelope_down", id=206, type="shape") + config.add_shift(name="pdf_up", id=207, type="shape") + config.add_shift(name="pdf_down", id=208, type="shape") + + for unc in ["mur", "muf", "murf_envelope", "pdf"]: + # add_shift_aliases(unc, {f"{unc}_weight": f"{unc}_weight_" + "{direction}"}, selection_dependent=False) + add_shift_aliases( + unc, + {f"normalized_{unc}_weight": f"normalized_{unc}_weight_" + "{direction}"}, + selection_dependent=False, + ) + + all_jec_sources = [ + "AbsoluteStat", + "AbsoluteScale", + "AbsoluteSample", + "AbsoluteFlavMap", + "AbsoluteMPFBias", + "Fragmentation", + "SinglePionECAL", + "SinglePionHCAL", + "FlavorQCD", + "TimePtEta", + "RelativeJEREC1", + "RelativeJEREC2", + "RelativeJERHF", + "RelativePtBB", + "RelativePtEC1", + "RelativePtEC2", + "RelativePtHF", + "RelativeBal", + "RelativeSample", + "RelativeFSR", + "RelativeStatFSR", + "RelativeStatEC", + "RelativeStatHF", + "PileUpDataMC", + "PileUpPtRef", + "PileUpPtBB", + "PileUpPtEC1", + "PileUpPtEC2", + "PileUpPtHF", + "PileUpMuZero", + "PileUpEnvelope", + "SubTotalPileUp", + "SubTotalRelative", + "SubTotalPt", + "SubTotalScale", + 
"SubTotalAbsolute", + "SubTotalMC", + "Total", + "TotalNoFlavor", + "TotalNoTime", + "TotalNoFlavorNoTime", + "FlavorZJet", + "FlavorPhotonJet", + "FlavorPureGluon", + "FlavorPureQuark", + "FlavorPureCharm", + "FlavorPureBottom", + "TimeRunA", + "TimeRunB", + "TimeRunC", + "TimeRunD", + "CorrelationGroupMPFInSitu", + "CorrelationGroupIntercalibration", + "CorrelationGroupbJES", + "CorrelationGroupFlavor", + "CorrelationGroupUncorrelated", + ] + + for jec_source in config.x.jec["uncertainty_sources"]: + idx = all_jec_sources.index(jec_source) + config.add_shift(name=f"jec_{jec_source}_up", id=5000 + 2 * idx, type="shape") + config.add_shift(name=f"jec_{jec_source}_down", id=5001 + 2 * idx, type="shape") + add_shift_aliases( + f"jec_{jec_source}", + {"Jet.pt": "Jet.pt_{name}", "Jet.mass": "Jet.mass_{name}"}, + selection_dependent=True, + ) + + config.add_shift(name="jer_up", id=6000, type="shape", tags={"selection_dependent"}) + config.add_shift(name="jer_down", id=6001, type="shape", tags={"selection_dependent"}) + add_shift_aliases("jer", {"Jet.pt": "Jet.pt_{name}", "Jet.mass": "Jet.mass_{name}"}, selection_dependent=True) + + get_shifts = lambda *keys: sum(([config.get_shift(f"{k}_up"), config.get_shift(f"{k}_down")] for k in keys), []) + + config.x.event_weights["normalized_btag_weight"] = get_shifts(*(f"btag_{unc}" for unc in btag_uncs)) + config.x.event_weights["normalized_pu_weight"] = get_shifts("minbias_xs") + config.x.event_weights["electron_weight"] = get_shifts("e_sf") + config.x.event_weights["muon_weight"] = get_shifts("mu_sf") + + for dataset in config.datasets: + dataset.x.event_weights = DotDict() + if not dataset.has_tag("skip_scale"): + # pdf/scale weights for all non-qcd datasets + dataset.x.event_weights["normalized_murf_envelope_weight"] = get_shifts("murf_envelope") + dataset.x.event_weights["normalized_mur_weight"] = get_shifts("mur") + dataset.x.event_weights["normalized_muf_weight"] = get_shifts("muf") + + if not dataset.has_tag("skip_pdf"): 
+ dataset.x.event_weights["normalized_pdf_weight"] = get_shifts("pdf") diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/style.py b/analysis_templates/ghent_template/__cf_module_name__/config/style.py new file mode 100644 index 000000000..f656d4e3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/style.py @@ -0,0 +1,39 @@ +""" +Collection of helpers for styling, e.g. +- dicitonaries of defaults for variable definition, colors, labels, etc. +- functions to quickly create variable insts in a predefined way +""" + +import order as od + +from columnflow.columnar_util import EMPTY_FLOAT + +# +# Processes +# + +default_process_colors = { + "data": "#000000", # black + "tt": "#cf9fff", # green + "dy_lep": "#377eb8", # blue +} + + +def stylize_processes(config: od.Config) -> None: + """ + Small helper that sets the process insts to analysis-appropriate defaults + For now: only colors and unstacking + Could also include some more defaults (labels, unstack, ...) + """ + + for proc in config.processes: + # set default colors + if color := default_process_colors.get(proc.name, None): + proc.color1 = color + proc.color2 = "#000000" + + config.x.default_legend_cfg = { + "ncol": 2, + "loc": "upper right", + "fontsize": 15, + } diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/variables.py b/analysis_templates/ghent_template/__cf_module_name__/config/variables.py new file mode 100644 index 000000000..42f9ddb1f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/variables.py @@ -0,0 +1,43 @@ +import order as od + +from columnflow.util import maybe_import + +np = maybe_import("numpy") +ak = maybe_import("awkward") + +from columnflow.columnar_util import EMPTY_FLOAT, call_once_on_config + + +@call_once_on_config() +def add_variables(config: od.Config) -> None: + """ + Adds variables to a *config* that are produced as part of the `features` producer. 
+ """ + config.add_variable( + name="event", + expression="event", + binning=(1, 0.0, 1.0e9), + x_title="Event number", + discrete_x=True, + ) + config.add_variable( + name="run", + expression="run", + binning=(1, 100000.0, 500000.0), + x_title="Run number", + discrete_x=True, + ) + config.add_variable( + name="lumi", + expression="luminosityBlock", + binning=(1, 0.0, 5000.0), + x_title="Luminosity block", + discrete_x=True, + ) + config.add_variable( + name="n_jet", + expression="n_jet", + binning=(6, 0.5, 6.5), + x_title="Number of jets", + discrete_x=True, + ) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/default.py b/analysis_templates/ghent_template/__cf_module_name__/production/default.py new file mode 100644 index 000000000..8e85d99cb --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/production/default.py @@ -0,0 +1,60 @@ + +""" +Column production methods related to higher-level features. +""" + +import functools + +from columnflow.production import Producer, producer +from columnflow.util import maybe_import, four_vec +from columnflow.columnar_util import set_ak_column, EMPTY_FLOAT + +from columnflow.production.categories import category_ids + +from __cf_short_name_lc__.production.weights import event_weights +from __cf_short_name_lc__.config.categories import add_categories_production + +np = maybe_import("numpy") +ak = maybe_import("awkward") +coffea = maybe_import("coffea") +maybe_import("coffea.nanoevents.methods.nanoaod") + + +@producer( + uses=({ + category_ids, + event_weights, + } | four_vec( + {"Electron", "Muon", } + ) | four_vec( + {"Jet"}, + {"hadronFlavour"} + ) + ), + produces=({ + category_ids, event_weights, + "ht", "n_jet", "n_electron", "n_muon", "n_bjet"}), +) +def default(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + + # add event weights + if self.dataset_inst.is_mc: + events = self[event_weights](events, **kwargs) + + # (re)produce category i + events = 
self[category_ids](events, **kwargs) + + events = set_ak_column_f32(events, "ht", ak.sum(events.Jet.pt, axis=1)) + events = set_ak_column(events, "n_jet", ak.sum(events.Jet.pt > 0, axis=1)) + events = set_ak_column(events, "n_bjet", ak.sum(events.Jet.btagDeepFlavB >= + self.config_inst.x.btag_working_points.deepjet.medium, axis=1)) + events = set_ak_column(events, "n_electron", ak.sum(events.Electron.pt > 0, axis=1)) + events = set_ak_column(events, "n_muon", ak.sum(events.Muon.pt > 0, axis=1)) + + return events + + +@features.init +def features_init(self: Producer) -> None: + # add categories to config + add_categories_production(self.config_inst) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/normalized_btag.py b/analysis_templates/ghent_template/__cf_module_name__/production/normalized_btag.py new file mode 100644 index 000000000..b25f74b7b --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/production/normalized_btag.py @@ -0,0 +1,130 @@ +# coding: utf-8 + +""" +Producers for phase-space normalized btag scale factor weights. 
+""" + +from __future__ import annotations + +from columnflow.production import Producer, producer +from columnflow.production.cms.btag import btag_weights +from columnflow.util import maybe_import, safe_div, InsertableDict +from columnflow.columnar_util import set_ak_column + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@producer( + uses={ + btag_weights.PRODUCES, "process_id", "Jet.pt", + }, + # produced columns are defined in the init function below + mc_only=True, +) +def normalized_btag_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + + for weight_name in self[btag_weights].produces: + if not weight_name.startswith("btag_weight"): + continue + + # create a weight vectors starting with ones for both weight variations, i.e., + # nomalization per pid and normalization per pid and jet multiplicity + norm_weight_per_pid = np.ones(len(events), dtype=np.float32) + norm_weight_per_pid_njet = np.ones(len(events), dtype=np.float32) + + # fill weights with a new mask per unique process id (mostly just one) + for pid in self.unique_process_ids: + pid_mask = events.process_id == pid + # single value + norm_weight_per_pid[pid_mask] = self.ratio_per_pid[weight_name][pid] + # lookup table + n_jets = ak.num(events[pid_mask].Jet.pt, axis=1) + norm_weight_per_pid_njet[pid_mask] = self.ratio_per_pid_njet[weight_name][pid][n_jets] + + # multiply with actual weight + norm_weight_per_pid = norm_weight_per_pid * events[weight_name] + norm_weight_per_pid_njet = norm_weight_per_pid_njet * events[weight_name] + + # store them + events = set_ak_column(events, f"normalized_{weight_name}", norm_weight_per_pid) + events = set_ak_column(events, f"normalized_njet_{weight_name}", norm_weight_per_pid_njet) + + return events + + +@normalized_btag_weights.init +def normalized_btag_weights_init(self: Producer) -> None: + for weight_name in self[btag_weights].produces: + if not weight_name.startswith("btag_weight"): + continue + + 
self.produces.add(f"normalized_{weight_name}") + self.produces.add(f"normalized_njet_{weight_name}") + + +@normalized_btag_weights.requires +def normalized_btag_weights_requires(self: Producer, reqs: dict) -> None: + from columnflow.tasks.selection import MergeSelectionStats + reqs["selection_stats"] = MergeSelectionStats.req( + self.task, + tree_index=0, + branch=-1, + _exclude=MergeSelectionStats.exclude_params_forest_merge, + ) + + +@normalized_btag_weights.setup +def normalized_btag_weights_setup(self: Producer, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: + # load the selection stats + stats = inputs["selection_stats"]["collection"][0]["stats"].load(formatter="json") + + # get the unique process ids in that dataset + key = "sum_mc_weight_selected_no_bjet_per_process_and_njet" + self.unique_process_ids = list(map(int, stats[key].keys())) + + # get the maximum numbers of jets + max_n_jets = max(map(int, sum((list(d.keys()) for d in stats[key].values()), []))) + + # helper to get numerators and denominators + def numerator_per_pid(pid): + key = "sum_mc_weight_selected_no_bjet_per_process" + return stats[key].get(str(pid), 0.0) + + def denominator_per_pid(weight_name, pid): + key = f"sum_mc_weight_{weight_name}_selected_no_bjet_per_process" + return stats[key].get(str(pid), 0.0) + + def numerator_per_pid_njet(pid, n_jets): + key = "sum_mc_weight_selected_no_bjet_per_process_and_njet" + d = stats[key].get(str(pid), {}) + return d.get(str(n_jets), 0.0) + + def denominator_per_pid_njet(weight_name, pid, n_jets): + key = f"sum_mc_weight_{weight_name}_selected_no_bjet_per_process_and_njet" + d = stats[key].get(str(pid), {}) + return d.get(str(n_jets), 0.0) + + # extract the ratio per weight and pid + self.ratio_per_pid = { + weight_name: { + pid: safe_div(numerator_per_pid(pid), denominator_per_pid(weight_name, pid)) + for pid in self.unique_process_ids + } + for weight_name in self[btag_weights].produces + if weight_name.startswith("btag_weight") 
+ } + + # extract the ratio per weight, pid and also the jet multiplicity, using the latter as in index + # for a lookup table (since it naturally starts at 0) + self.ratio_per_pid_njet = { + weight_name: { + pid: np.array([ + safe_div(numerator_per_pid_njet(pid, n_jets), denominator_per_pid_njet(weight_name, pid, n_jets)) + for n_jets in range(max_n_jets + 1) + ]) + for pid in self.unique_process_ids + } + for weight_name in self[btag_weights].produces + if weight_name.startswith("btag_weight") + } \ No newline at end of file diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/normalized_weights.py b/analysis_templates/ghent_template/__cf_module_name__/production/normalized_weights.py new file mode 100644 index 000000000..d3129b348 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/production/normalized_weights.py @@ -0,0 +1,126 @@ +# coding: utf-8 + +""" +Column production methods related to generic event weights. +""" + +from typing import Iterable, Callable + +import law + +from columnflow.production import Producer, producer +from columnflow.util import maybe_import, safe_div, InsertableDict +from columnflow.columnar_util import set_ak_column + +ak = maybe_import("awkward") +np = maybe_import("numpy") + + +logger = law.logger.get_logger(__name__) + + +def normalized_weight_factory( + producer_name: str, + weight_producers: Iterable[Producer], + **kwargs, +) -> Callable: + + @producer( + uses=set(weight_producers) | set().union(*[w.produces for w in weight_producers]) | {"process_id"}, + cls_name=producer_name, + mc_only=True, + # skip the checking existence of used/produced columns because not all columns are there + check_used_columns=False, + check_produced_columns=False, + # remaining produced columns are defined in the init function below + ) + def normalized_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + + # check existence of requested weights to normalize and run producer if missing + 
missing_weights = self.weight_names.difference(events.fields) + + if missing_weights: + # try to produce missing weights + for prod in self.weight_producers: + if ( + self[prod].produced_columns.difference(events.fields) and + self[prod].used_columns.intersection(events.fields) + ): + logger.info(f"Rerun producer {self[prod].cls_name}") + events = self[prod](events, **kwargs) + + # Create normalized weight columns if possible + if not_reproduced := missing_weights.difference(events.fields): + logger.info(f"Weight columns {not_reproduced} could not be reproduced") + + + for weight_name in self.weight_names.intersection(events.fields): + # create a weight vector starting with ones + norm_weight_per_pid = np.ones(len(events), dtype=np.float32) + + # fill weights with a new mask per unique process id (mostly just one) + for pid in self.unique_process_ids: + pid_mask = events.process_id == pid + norm_weight_per_pid[pid_mask] = self.ratio_per_pid[weight_name][pid] + + # multiply with actual weight + norm_weight_per_pid = norm_weight_per_pid * events[weight_name] + + # store it + norm_weight_per_pid = ak.values_astype(norm_weight_per_pid, np.float32) + events = set_ak_column(events, f"normalized_{weight_name}", norm_weight_per_pid) + + + return events + + @normalized_weight.init + def normalized_weight_init(self: Producer) -> None: + self.weight_producers = weight_producers + + # resolve weight names + self.weight_names = set() + for col in self.used_columns: + col = col.string_nano_column + if "weight" in col and "normalized" not in col and "btag" not in col: + self.weight_names.add(col) + + self.produces |= set(f"normalized_{weight_name}" for weight_name in self.weight_names) + + @normalized_weight.requires + def normalized_weight_requires(self: Producer, reqs: dict) -> None: + from columnflow.tasks.selection import MergeSelectionStats + reqs["selection_stats"] = MergeSelectionStats.req( + self.task, + tree_index=0, + branch=-1, + 
_exclude=MergeSelectionStats.exclude_params_forest_merge, + ) + + @normalized_weight.setup + def normalized_weight_setup(self: Producer, reqs: dict, inputs: dict, reader_targets: InsertableDict) -> None: + # load the selection stats + stats = inputs["selection_stats"]["collection"][0]["stats"].load(formatter="json") + + # get the unique process ids in that dataset + key = "sum_mc_weight_per_process" + self.unique_process_ids = list(map(int, stats[key].keys())) + + # helper to get numerators and denominators + def numerator_per_pid(pid): + key = "sum_mc_weight_per_process" + return stats[key].get(str(pid), 0.0) + + def denominator_per_pid(weight_name, pid): + key = f"sum_mc_weight_{weight_name}_per_process" + return stats[key].get(str(pid), 0.0) + + # extract the ratio per weight and pid + self.ratio_per_pid = { + weight_name: { + pid: safe_div(numerator_per_pid(pid), denominator_per_pid(weight_name, pid)) + for pid in self.unique_process_ids + } + for weight_name in self.weight_names + } + + return normalized_weight \ No newline at end of file diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/weights.py b/analysis_templates/ghent_template/__cf_module_name__/production/weights.py new file mode 100644 index 000000000..08e5cbed2 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/production/weights.py @@ -0,0 +1,186 @@ +# coding: utf-8 + +""" +Column production methods related to generic event weights. 
+""" + +from columnflow.util import maybe_import +from columnflow.columnar_util import set_ak_column, has_ak_column, Route +from columnflow.selection import SelectionResult +from columnflow.production import Producer, producer +from columnflow.production.cms.pileup import pu_weight +from columnflow.production.normalization import normalization_weights +from columnflow.production.cms.electron import electron_weights +from columnflow.production.cms.muon import muon_weights +from columnflow.production.cms.btag import btag_weights +from columnflow.production.cms.scale import murmuf_weights, murmuf_envelope_weights +from columnflow.production.cms.pdf import pdf_weights +from __cf_short_name_lc__.production.normalized_weights import normalized_weight_factory +from __cf_short_name_lc__.production.normalized_btag import normalized_btag_weights + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@producer( + produces={"event_weight"}, + mc_only=True, +) +def event_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + """ + Producer that calculates the 'final' event weight (as done in cf.CreateHistograms) + """ + weight = ak.Array(np.ones(len(events))) + if self.dataset_inst.is_mc: + for column in self.config_inst.x.event_weights: + weight = weight * Route(column).apply(events) + for column in self.dataset_inst.x("event_weights", []): + if has_ak_column(events, column): + weight = weight * Route(column).apply(events) + else: + self.logger.warning_once( + f"missing_dataset_weight_{column}", + f"weight '{column}' for dataset {self.dataset_inst.name} not found", + ) + + events = set_ak_column(events, "event_weight", weight) + + return events + + +@event_weight.init +def event_weight_init(self: Producer) -> None: + if not getattr(self, "dataset_inst", None): + return + + self.uses |= set(self.config_inst.x.event_weights.keys()) + self.uses |= set(self.dataset_inst.x("event_weights", {}).keys()) + + +@producer( + uses={pu_weight, btag_weights + }, + # 
don't save btag_weights to save storage space, since we can reproduce them in ProduceColumns + produces={pu_weight}, + mc_only=True, +) +def event_weights_to_normalize(self: Producer, events: ak.Array, results: SelectionResult, **kwargs) -> ak.Array: + """ + Wrapper of several event weight producers that are typically called as part of SelectEvents + since it is required to normalize them before applying certain event selections. + """ + + # compute pu weights + + events = self[pu_weight](events, **kwargs) + + # compute btag SF weights (for renormalization tasks) + events = self[btag_weights](events, jet_mask=results.aux["jet_mask"], **kwargs) + + # skip scale/pdf weights for some datasets (missing columns) + if not self.dataset_inst.has_tag("skip_scale"): + # compute scale weights + events = self[murmuf_envelope_weights](events, **kwargs) + + # read out mur and weights + events = self[murmuf_weights](events, **kwargs) + + if not self.dataset_inst.has_tag("skip_pdf"): + # compute pdf weights + events = self[pdf_weights]( + events, + outlier_action="remove", + outlier_log_mode="warning", + **kwargs, + ) + + return events + + +@event_weights_to_normalize.init +def event_weights_to_normalize_init(self) -> None: + if not getattr(self, "dataset_inst", None): + return + + if not self.dataset_inst.has_tag("skip_scale"): + self.uses |= {murmuf_envelope_weights, murmuf_weights} + self.produces |= {murmuf_envelope_weights, murmuf_weights} + + if not self.dataset_inst.has_tag("skip_pdf"): + self.uses |= {pdf_weights} + self.produces |= {pdf_weights} + + +normalized_scale_weights = normalized_weight_factory( + producer_name="normalized_scale_weights", + weight_producers={murmuf_envelope_weights, murmuf_weights}, +) + +normalized_pdf_weights = normalized_weight_factory( + producer_name="normalized_pdf_weights", + weight_producers={pdf_weights}, +) + +normalized_pu_weights = normalized_weight_factory( + producer_name="normalized_pu_weights", + weight_producers={pu_weight}, +) + 
+ +@producer( + uses={ + normalization_weights, electron_weights, muon_weights, btag_weights, + normalized_btag_weights, + normalized_pu_weights, + event_weight, + }, + produces={ + normalization_weights, electron_weights, muon_weights, + normalized_btag_weights, + normalized_pu_weights, + event_weight, + }, + mc_only=True, +) +def event_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + """ + Wrapper of several event weight producers that are typically called in ProduceColumns. + """ + # compute normalization weights + + events = self[normalization_weights](events, **kwargs) + + # compute btag SF weights + events = self[btag_weights](events, **kwargs) + # compute electron and muon SF weights + events = self[electron_weights](events, **kwargs) + events = self[muon_weights](events, **kwargs) + + # normalize event weights using stats + events = self[normalized_btag_weights](events, **kwargs) + events = self[normalized_pu_weights](events, **kwargs) + + if not self.dataset_inst.has_tag("skip_scale"): + events = self[normalized_scale_weights](events, **kwargs) + + if not self.dataset_inst.has_tag("skip_pdf"): + events = self[normalized_pdf_weights](events, **kwargs) + + # calculate the full event weight for plotting purposes + events = self[event_weight](events, **kwargs) + + return events + + +@event_weights.init +def event_weights_init(self: Producer) -> None: + if not getattr(self, "dataset_inst", None): + return + + if not self.dataset_inst.has_tag("skip_scale"): + self.uses |= {normalized_scale_weights} + self.produces |= {normalized_scale_weights} + + if not self.dataset_inst.has_tag("skip_pdf"): + self.uses |= {normalized_pdf_weights} + self.produces |= {normalized_pdf_weights} diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/categories.py b/analysis_templates/ghent_template/__cf_module_name__/selection/categories.py new file mode 100644 index 000000000..b49236b92 --- /dev/null +++ 
b/analysis_templates/ghent_template/__cf_module_name__/selection/categories.py @@ -0,0 +1,69 @@ +""" +Selection methods defining categories based on selection step results. +""" + +from columnflow.util import maybe_import +from columnflow.categorization import Categorizer, categorizer +from columnflow.selection import SelectionResult + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@categorizer(uses={"event"}, call_force=True) +def catid_selection_incl(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]: + mask = ak.ones_like(events.event) > 0 + return events, mask + +# +# Categorizer called as part of cf.SelectEvents +# + + +@categorizer(uses={"event"}, call_force=True) +def catid_selection_2e( + self: Categorizer, events: ak.Array, results: SelectionResult, **kwargs, +) -> tuple[ak.Array, ak.Array]: + mask = ((ak.num(results.objects.Electron.Electron, axis=-1) == 2) & + (ak.num(results.objects.Muon.Muon, axis=-1) == 0)) + return events, mask + + +@categorizer(uses={"event"}, call_force=True) +def catid_selection_1e1mu( + self: Categorizer, events: ak.Array, results: SelectionResult, **kwargs, +) -> tuple[ak.Array, ak.Array]: + mask = ((ak.num(results.objects.Electron.Electron, axis=-1) == 1) & + (ak.num(results.objects.Muon.Muon, axis=-1) == 1)) + return events, mask + + +@categorizer(uses={"event"}, call_force=True) +def catid_selection_2mu( + self: Categorizer, events: ak.Array, results: SelectionResult, **kwargs, +) -> tuple[ak.Array, ak.Array]: + mask = ((ak.num(results.objects.Electron.Electron, axis=-1) == 0) & + (ak.num(results.objects.Muon.Muon, axis=-1) == 2)) + return events, mask + +# +# Categorizer called as part of cf.ProduceColumns +# + + +@categorizer(uses={"Electron.pt", "Muon.pt"}, call_force=True) +def catid_2e(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]: + mask = ((ak.sum(events.Electron.pt > 0, axis=-1) == 2) & (ak.sum(events.Muon.pt > 0, axis=-1) == 0)) + return events, 
mask + + +@categorizer(uses={"Electron.pt", "Muon.pt"}, call_force=True) +def catid_1e1mu(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]: + mask = ((ak.sum(events.Electron.pt > 0, axis=-1) == 1) & (ak.sum(events.Muon.pt > 0, axis=-1) == 1)) + return events, mask + + +@categorizer(uses={"Electron.pt", "Muon.pt"}, call_force=True) +def catid_2mu(self: Categorizer, events: ak.Array, **kwargs) -> tuple[ak.Array, ak.Array]: + mask = ((ak.sum(events.Electron.pt > 0, axis=-1) == 0) & (ak.sum(events.Muon.pt > 0, axis=-1) == 2)) + return events, mask diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py new file mode 100644 index 000000000..d2ce3e0e7 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -0,0 +1,265 @@ +# coding: utf-8 + +""" +Selection modules for ttz. +""" + +from collections import defaultdict +from typing import Tuple + +import law + +from columnflow.util import maybe_import, four_vec +from columnflow.columnar_util import set_ak_column +from columnflow.production.util import attach_coffea_behavior + +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.selection.util import masked_sorted_indices + +from columnflow.production.cms.mc_weight import mc_weight +from columnflow.production.categories import category_ids +from columnflow.production.processes import process_ids + +from __cf_short_name_lc__.production.weights import event_weights_to_normalize +from __cf_short_name_lc__.selection.objects import electron_object, muon_object, jet_object +from __cf_short_name_lc__.selection.cutflow_features import cutflow_features +from __cf_short_name_lc__.selection.stats import ttz_increment_stats +from __cf_short_name_lc__.selection.trigger import trigger_selection + +np = maybe_import("numpy") +ak = maybe_import("awkward") +coffea = 
maybe_import("coffea") +maybe_import("coffea.nanoevents.methods.nanoaod") + +logger = law.logger.get_logger(__name__) + + +def TetraVec(arr: ak.Array) -> ak.Array: + TetraVec = ak.zip({"pt": arr.pt, "eta": arr.eta, "phi": arr.phi, "mass": arr.mass}, + with_name="PtEtaPhiMLorentzVector", + behavior=coffea.nanoevents.methods.vector.behavior) + return TetraVec + + +@selector( + uses={ + process_ids, attach_coffea_behavior, + mc_weight + }, + produces={ + process_ids, attach_coffea_behavior, + mc_weight + }, + exposed=False, +) +def pre_selection( + self: Selector, + events: ak.Array, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + + if self.dataset_inst.is_mc: + events = self[mc_weight](events, **kwargs) + + # create process ids together with custom ttz definition (and future wz 0b, 1b, >2b definition) + events = self[process_ids](events, **kwargs) + # ensure coffea behavior + events = self[attach_coffea_behavior](events, **kwargs) + + results = SelectionResult() + return events, results + + +@selector( + uses=four_vec( + ("Electron", "Muon"), + ), + triggers=None +) +def lepton_selection( + self: Selector, + events: ak.Array, + results: SelectionResult, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + + # apply the object selection from results + electron = (events.Electron[results.objects.Electron.Electron]) + muon = (events.Muon[results.objects.Muon.Muon]) + + # create new object: leptons + leptons = ak.concatenate([muon, electron], axis=-1) + leptons = leptons[ak.argsort(leptons.pt, axis=-1, ascending=False)] + + # required for pt cuts and Z-cuts on masks + fill_with = { + "pt": -999, "eta": -999, "phi": -999, "charge": -999, + "pdgId": -999, "mass": -999, "e_idx": -999, "mu_idx": -999, + "sip3d": -999 + } + leptons = ak.fill_none(ak.pad_none(leptons, 2, axis=-1), fill_with) + + # construct the Z-boson candidate mask + mll = (TetraVec(leptons[:, 0]) + TetraVec(leptons[:, 1])).mass + z_mask = ( + (leptons[:, 
0].charge != leptons[:, 1].charge) & + (abs(leptons[:, 0].pdgId) == abs(leptons[:, 1].pdgId)) & + (abs(mll - 91) < 15) + ) + + lepton_mask = ( + (leptons.pt[:, 0] > 30) & + (leptons.pt[:, 1] > 20) & + (~z_mask) & # no Z-boson peak leptons + (ak.all(lepton.tight, axis=-1)) # all loose leptons in the event must be tight + ) + + # Electron and Muon indices corresponding to lepton selection + empty_events = ak.zeros_like(1 * events.event, dtype=np.uint16) + empty_indices = empty_events[..., None][..., :0] + e_indices = ak.where(lepton_mask, leptons.e_idx, empty_indices) + mu_indices = ak.where(lepton_mask, leptons.mu_idx, empty_indices) + e_indices_l = ak.drop_none(e_indices) + mu_indices_l = ak.drop_none(mu_indices) + + # loose indices on electron and muon + e_indices = masked_sorted_indices(e_mask_tight, electron.pt) + mu_indices = masked_sorted_indices(mu_mask_tight, muon.pt) + + return events, SelectionResult( + steps={ + "Lepton": lepton_mask, + }, + objects={}, + aux={ + # save the selected lepton for the duration of the selection + # multiplication of a coffea particle with 1 yields the lorentz vector + "lepton": leptons, + }, + ) + + +@selector( + uses=(four_vec("Jet", ("btagDeepFlavB"))), + exposed=False, +) +def jet_selection( + self: Selector, + events: ak.Array, + results: SelectionResult, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + + jet = (events.Jet[results.objects.Jet.Jet]) + lepton = results.aux.lepton + + bjet_mask_medium = -(jet.btagDeepFlavB >= self.config_inst.x.btag_working_points.deepjet.medium) + + jet_mask = ( + (ak.sum(bjet_mask_medium) >= 1) + ) + + return events, SelectionResult( + steps={ + "Jet": jet_mask, + }, + objects={}, + ) + + +@selector( + uses={ + category_ids, ttz_increment_stats + }, + produces={ + category_ids, ttz_increment_stats + }, + exposed=False, +) +def post_selection( + self: Selector, + events: ak.Array, + results: SelectionResult, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, 
SelectionResult]: + # build categories + events = self[category_ids](events, results=results, **kwargs) + # add cutflow features + if self.config_inst.x("do_cutflow_features", False): + events = self[cutflow_features](events, results=results, **kwargs) + + # produce event weights + if self.dataset_inst.is_mc: + events = self[event_weights_to_normalize](events, results=results, **kwargs) + + # increment stats + self[ttz_increment_stats](events, results, stats, **kwargs) + + return events, results + + +@post_selection.init +def post_selection_init(self: Selector) -> None: + if self.config_inst.x("do_cutflow_features", False): + self.uses.add(cutflow_features) + self.produces.add(cutflow_features) + + if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data: + return + + self.uses.add(event_weights_to_normalize) + self.produces.add(event_weights_to_normalize) + + +@selector( + uses={ + pre_selection, post_selection, trigger_selection, + lepton_selection, jet_selection, lepton_gen_features + }, + produces={ + pre_selection, post_selection, trigger_selection, + lepton_selection, jet_selection, lepton_gen_features + }, + exposed=True, +) +def default( + self: Selector, + events: ak.Array, + stats: defaultdict, + **kwargs, +) -> tuple[ak.Array, SelectionResult]: + # prepare the selection results that are updated at every step + + # lepton selection + events, results = self[pre_selection](events, stats, **kwargs) + + # apply trigger selection (with double counting removal for data) + events, trigger_results = self[trigger_selection](events, **kwargs) + results += trigger_results + + events, muon_results = self[muon_object](events, stats, **kwargs) + results += muon_object_results + + events, electron_results = self[electron_object](events, results, stats, **kwargs) + results += electron_object_results + + events, jet_results = self[jet_object](events, results, stats, **kwargs) + results += jet_object_results + + events, lepton_selection_results = 
self[lepton_selection](events, results, stats, **kwargs) + results += lepton_selection_results + + events, jet_selection_results = self[jet_selection](events, results, stats, **kwargs) + results += jet_selection_results + + # combined event selection after all steps + results.event = results.steps.Trigger & results.steps.Lepton & results.steps.Jet + + # add cutflow features, passing per-object masks + events, results = self[post_selection](events, results, stats, **kwargs) + + return events, results diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py new file mode 100644 index 000000000..6d1bbf206 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -0,0 +1,165 @@ +# coding: utf-8 + +""" +Selection modules for object selection of Muon, Electron, and Jet. +""" + +from collections import defaultdict +from typing import Tuple + +import law + +from columnflow.util import maybe_import +from columnflow.columnar_util import set_ak_column +from columnflow.production.util import attach_coffea_behavior +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.selection.util import masked_sorted_indices + + +def masked_sorted_indices(mask: ak.Array, sort_var: ak.Array, ascending: bool = False) -> ak.Array: + """ + Helper function to obtain the correct indices of an object mask + """ + indices = ak.argsort(sort_var, axis=-1, ascending=ascending) + return indices[mask[indices]] + + +@selector( + uses=four_vec( + ("Muon"), + ("sip3d", "dxy", "dz", "miniPFRelIso_all", "tightId") + ), + produces={"Muon.tight"}, + triggers=None +) +def muon_object( + self: Selector, + events: ak.Array, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + + muon = (events.Muon) + + # loose object electron mask + mu_mask_loose = ( + (abs(muon.eta) < 2.4) & + (muon.pt > 10.) 
& + (muon.miniPFRelIso_all < 0.4) & + (muon.sip3d < 8) & + (abs(muon.dxy) < 0.05) & + (abs(muon.dz) < 0.1) + ) + + # tight object muon mask (tight cutbased ID) + mu_mask_tight = ( + (mu_mask) & + (muon.tightId) + ) + + events = set_ak_column(events, "Muon.tight", mu_mask_tight, value_type=bool) + + return events, SelectionResult( + steps={}, + objects={ + "Muon": { + "Muon": masked_sorted_indices(mu_mask, muon.pt) + } + }, + ) + + +@selector( + uses=four_vec( + ("Electron"), + ("sip3d", "charge", "isPFcand", "dxy", "dz", "miniPFRelIso_all", "mvaFall17V2Iso_WP90", "tightCharge", + "lostHits", "convVeto") + ) | four_vec( + ("Muon"), + ), + produces={"Electron.tight"}, + triggers=None +) +def electron_object( + self: Selector, + events: ak.Array, + results: SelectionResult, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + + electron = (events.Electron) + # add muon loose selection to veto electrons that coincide with muons + muon = (events.Muon[results.objects.Muon.Muon]) + + # loose object electron mask + e_mask = ( + (abs(electron.eta) < 2.5) & + (electron.pt > 15) & + (electron.miniPFRelIso_all < 0.4) & + (electron.sip3d < 8) & + (abs(electron.dxy) < 0.05) & + (abs(electron.dz) < 0.1) & + (electron.losthist < 2) & + (electron.isPFcand) & + (electron.convVeto) & + (electron.tightCharge > 1) & + # remove electrons that have tight muon close to it + (ak.is_none(electron.nearest(muon, threshold=0.05), axis=-1)) + ) + # tight object electron mask (mvaFall17 WP90) + e_mask_tight = ( + (e_mask) & + (electron.mvaFall17V2Iso_WP90) + ) + + events = set_ak_column(events, "Electron.tight", e_mask_tight, value_type=bool) + + return events, SelectionResult( + steps={}, + objects={ + "Electron": { + "Electron": masked_sorted_indices(e_mask, electron.pt) + } + }, + ) + + +@selector( + uses=(four_vec({"Electron", "Muon"}) | four_vec("Jet", ("jetId", "btagDeepFlavB"))), + exposed=False, +) +def jet_object( + self: Selector, + events: ak.Array, + results: 
SelectionResult, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + + jet = (events.Jet) + muon = (events.Muon)[results.objects.Muon.Muon] + electron = (events.Electron)[results.objects.Electron.Electron] + + dR_mask = ( + (ak.is_none(jet.nearest(muon, threshold=0.4), axis=-1)) & + (ak.is_none(jet.nearest(electron, threshold=0.4), axis=-1)) + ) + + jet_mask = ( + (jet.pt > 30) & + (abs(jet.eta) < 2.5) & + (jet.jetId >= 2) & + (dR_mask) + ) + + jet_indices = masked_sorted_indices(jet_mask, events.Jet.pt) + + return events, SelectionResult( + steps={}, + objects={ + "Jet": { + "Jet": jet_indices, + }, + }, + ) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py new file mode 100644 index 000000000..3edf2a4c2 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py @@ -0,0 +1,109 @@ +# coding: utf-8 + +""" +Stat-related methods. 
+""" +from __future__ import annotations + +import functools + +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.selection.stats import increment_stats +from columnflow.production import Producer, producer +from columnflow.production.cms.btag import btag_weights +from __cf_short_name_lc__.production.weights import event_weights_to_normalize + +from columnflow.util import maybe_import +from columnflow.ml import MLModel + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@selector( + uses={increment_stats, btag_weights, event_weights_to_normalize}, +) +def __cf_short_name_lc___increment_stats( + self: Selector, + events: ak.Array, + results: SelectionResult, + stats: dict, + **kwargs, +) -> ak.Array: + # collect important information from the results + event_mask = results.event + event_mask_no_bjet = results.event # currently no b-jet selection, will change to a result without b jet selection + n_jets = results.x.n_jets + + # weight map definition + weight_map = { + # "num" operations + "num_events": Ellipsis, # all events + "num_events_selected": event_mask, # selected events only + "num_events_selected_no_bjet": event_mask_no_bjet, + } + + if self.dataset_inst.is_mc: + weight_map["num_negative_weights"] = (events.mc_weight < 0) + # "sum" operations + weight_map["sum_mc_weight"] = events.mc_weight # weights of all events + weight_map["sum_mc_weight_selected"] = (events.mc_weight, event_mask) # weights of selected events + weight_map["sum_mc_weight_no_bjet"] = (events.mc_weight, event_mask_no_bjet) + weight_map["sum_mc_weight_selected_no_bjet"] = (events.mc_weight, event_mask_no_bjet) + + weight_columns = list( + set(self[event_weights_to_normalize].produced_columns) | + set(self[btag_weights].produced_columns), + ) + weight_columns = sorted([col.string_nano_column for col in weight_columns]) + + # mc weight times correction weight (with variations) without any selection + for name in weight_columns: + if "weight" 
not in name: + # skip non-weight columns here + continue + + weight_map[f"sum_mc_weight_{name}"] = (events.mc_weight * events[name], Ellipsis) + + # weights for selected events + weight_map[f"sum_mc_weight_{name}_selected"] = (events.mc_weight * events[name], event_mask) + + if name.startswith("btag_weight"): + # weights for selected events, excluding the bjet selection + weight_map[f"sum_mc_weight_{name}_selected_no_bjet"] = ( + (events.mc_weight * events[name], event_mask_no_bjet) + ) + + group_map = { + "process": { + "values": events.process_id, + "mask_fn": (lambda v: events.process_id == v), + }, + "njet": { + "values": results.x.n_jets, + "mask_fn": (lambda v: n_jets == v), + }, + } + + group_combinations = [("process", "njet")] + + self[increment_stats]( + events, + results, + stats, + weight_map=weight_map, + group_map=group_map, + group_combinations=group_combinations, + **kwargs, + ) + + return events + + +@__cf_short_name_lc__increment_stats.init +def __cf_short_name_lc___increment_stats_init(self: Selector) -> None: + if not getattr(self, "dataset_inst", None): + return + + if self.dataset_inst.is_mc: + self.uses |= {"mc_weight"} diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py b/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py new file mode 100644 index 000000000..7c4604ae2 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import annotations + +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.util import maybe_import + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +def add_triggers(cfg: od.Config, campaign: od.Campaign): + cfg.x.trigger_matrix = [ + ( + "EGamma", { + "Ele32_WPTight_Gsf", + "Ele115_CaloIdVT_GsfTrkIdT", + "Ele23_Ele12_CaloIdL_TrackIdL_IsoVL", + "DoubleEle25_CaloIdL_MW", + "Ele16_Ele12_Ele8_CaloIdL_TrackIdL", + }, + ), + ( + 
"DoubleMuon", { + "Mu37_TkMu27", + "Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8", + }, + ), + ( + "MuonEG", { + "Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ", + "Mu8_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ", + "Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ", + "Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL", + "Mu27_Ele37_CaloIdL_MW", + "Mu37_Ele27_CaloIdL_MW", + }, + ), + ( + "SingleMuon", { + "IsoMu24", + "IsoMu27", + "Mu50", + "OldMu100", + "TkMu100", + }, + ), + ] + + cfg.x.all_triggers = { + trigger + for _, triggers in cfg.x.trigger_matrix + for trigger in triggers + } + + +@selector +def trigger_selection( + self: Selector, + events: ak.Array, + **kwargs, +) -> tuple[ak.Array, SelectionResult]: + + # start with an all-false mask + sel_trigger = ak.Array(np.zeros(len(events), dtype=bool)) + veto_trigger = ak.Array(np.zeros(len(events), dtype=bool)) + + # pick events that passed one of the required triggers + + for trigger in self.dataset_inst.x("require_triggers", []): + sel_trigger = sel_trigger | events.HLT[trigger] + + # but reject events that also passed one of the triggers to veto + for trigger in self.dataset_inst.x("veto_triggers", []): + veto_trigger = veto_trigger & ~events.HLT[trigger] + sel_trigger = sel_trigger & ~events.HLT[trigger] + + return events, SelectionResult( + steps={ + "Trigger": sel_trigger, "VetoTrigger": veto_trigger + }, + ) + + +@trigger_selection.init +def trigger_selection_init(self: Selector) -> None: + # return immediately if config object has not been loaded yet + if not getattr(self, "config_inst", None): + return + + # add HLT trigger bits to uses + self.uses |= { + f"HLT.{trigger}" + for trigger in self.config_inst.x.all_triggers + } diff --git a/columnflow/selection/util.py b/columnflow/selection/util.py index 69618caed..c8123e1fb 100644 --- a/columnflow/selection/util.py +++ b/columnflow/selection/util.py @@ -80,3 +80,11 @@ def create_collections_from_masks( events = set_ak_column(events, dst_name, dst_collection) return 
events + + +def masked_sorted_indices(mask: ak.Array, sort_var: ak.Array, ascending: bool = False) -> ak.Array: + """ + Helper function to obtain the correct indices of an object mask + """ + indices = ak.argsort(sort_var, axis=-1, ascending=ascending) + return indices[mask[indices]] diff --git a/columnflow/util.py b/columnflow/util.py index 0dec79fea..843e22cb9 100644 --- a/columnflow/util.py +++ b/columnflow/util.py @@ -30,6 +30,7 @@ import multiprocessing.pool from functools import wraps from collections import OrderedDict +from typing import Hashable, Iterable, Callable import law from law.util import InsertableDict # noqa @@ -521,6 +522,68 @@ def get_source_code(obj: Any, indent: str | int = None) -> str: return code +def call_once_on_config(include_hash=False): + """ + Parametrized decorator to ensure that function *func* is only called once for the config *config* + """ + def outer(func): + def inner(config, *args, **kwargs): + tag = f"{func.__name__}_called" + if include_hash: + tag += f"_{func.__hash__()}" + + if config.has_tag(tag): + return + + config.add_tag(tag) + return func(config, *args, **kwargs) + return inner + return outer + + +def four_vec( + collections: str | Iterable[str], + columns: str | Iterable[str] | None = None, + skip_defaults: bool = False, +) -> set[str]: + """ + Helper to quickly get a set of 4-vector component string for all collections in *collections*. + Additional columns can be added wih the optional *columns* parameter. + + Example: + + .. 
code-block:: python + + four_vec("Jet", "jetId") + # -> {"Jet.pt", "Jet.eta", "Jet.phi", "Jet.mass", "Jet.jetId"} + + four_vec({"Electron", "Muon"}) + # -> { + "Electron.pt", "Electron.eta", "Electron.phi", "Electron.mass", + "Muon.pt", "Muon.eta", "Muon.phi", "Muon.mass", + } + """ + # make sure *collections* is a set + collections = law.util.make_set(collections) + + # transform *columns* to a set and add the default 4-vector components + columns = law.util.make_set(columns) if columns else set() + default_columns = {"pt", "eta", "phi", "mass"} + if not skip_defaults: + columns |= default_columns + + outp = set( + f"{obj}.{var}" + for obj in collections + for var in columns + ) + + # manually remove MET eta and mass + outp = outp.difference({"MET.eta", "MET.mass"}) + + return outp + + class DotDict(OrderedDict): """ Subclass of *OrderedDict* that provides read and write access to items via attributes by From cf9ef144527c345632578b928a32b04235f44fc7 Mon Sep 17 00:00:00 2001 From: juvanden Date: Thu, 4 Apr 2024 14:43:50 +0200 Subject: [PATCH 023/119] changed to law.cfg (added save location and modules) --- analysis_templates/ghent_template/law.cfg | 25 ++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index 902fed943..6cf59f4f8 100644 --- a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg @@ -28,12 +28,12 @@ tmp_dir_perm: 777 [analysis] default_analysis: __cf_module_name__.config.analysis___cf_short_name_lc__.analysis___cf_short_name_lc__ -default_config: run2_2018_nano_v9 +default_config: l18 default_dataset: st_tchannel_t_powheg -calibration_modules: columnflow.calibration.cms.{jets,met}, __cf_module_name__.calibration.example -selection_modules: columnflow.selection.{empty}, columnflow.selection.cms.{json_filter, met_filters}, __cf_module_name__.selection.example -production_modules: 
columnflow.production.{categories,normalization,processes}, columnflow.production.cms.{btag,electron,mc_weight,muon,pdf,pileup,scale,seeds}, __cf_module_name__.production.example +calibration_modules: columnflow.calibration.cms.{jets,met}, __cf_module_name__.calibration.{default,jet} +selection_modules: columnflow.selection.{empty}, columnflow.selection.cms.{json_filter, met_filters}, __cf_module_name__.selection.{default,categories,stats,trigger} +production_modules: columnflow.production.{categories,normalization,processes}, columnflow.production.cms.{btag,electron,mc_weight,muon,pdf,pileup,scale,seeds}, __cf_module_name__.production.{weights,features,categories} categorization_modules: __cf_module_name__.categorization.example ml_modules: columnflow.ml, __cf_module_name__.ml.example inference_modules: columnflow.inference, __cf_module_name__.inference.example @@ -79,6 +79,8 @@ wlcg_file_systems: wlcg_fs_t2b_redirector, wlcg_fs, wlcg_fs_infn_redirector, wlc # look for the correct fs per nano input file (in that order) lfn_sources: wlcg_fs_t2b_redirector, wlcg_fs_infn_redirector, wlcg_fs_global_redirector +shared_location: /pnfs/iihe/cms/store/user/$CF_CERN_USER/columnflow/data/__cf_module_name__ + # output locations per task family # for local targets : "local[, LOCAL_FS_NAME or STORE_PATH]" # for remote targets: "wlcg[, WLCG_FS_NAME]" @@ -86,6 +88,19 @@ lfn_sources: wlcg_fs_t2b_redirector, wlcg_fs_infn_redirector, wlcg_fs_global_red # examples: # cf.CalibrateEvents: wlcg # cf.SelectEvents: local +cf.GetDatasetLFNs: local, %(shared_location)s +cf.CalibrateEvents: local, %(shared_location)s +cf.CreatePileupWeights: local, %(shared_location)s +cf.SelectEvents: local, %(shared_location)s +cf.MergeSelectionStats: local, %(shared_location)s +cf.MergeSelectionMasks: local, %(shared_location)s +cf.ReduceEvents: local, %(shared_location)s +cf.MergeReductionStats: local, %(shared_location)s +cf.MergeReducedEvents: local, %(shared_location)s +cf.ProduceColumns: local, 
%(shared_location)s +cf.CreateHistograms: local, %(shared_location)s +cf.MergeHistograms: local, %(shared_location)s + [job] @@ -149,4 +164,4 @@ cache_root: $CF_WLCG_CACHE_ROOT cache_cleanup: $CF_WLCG_CACHE_CLEANUP cache_max_size: 15GB cache_global_lock: True -cache_mtime_patience: -1 +cache_mtime_patience: -1 \ No newline at end of file From 754a317ec0525c3e14af6b9d3eb89d288c300a78 Mon Sep 17 00:00:00 2001 From: juvanden Date: Thu, 4 Apr 2024 15:06:18 +0200 Subject: [PATCH 024/119] fix awkward import and add cutflow_features --- .../config/config__cf_short_name_lc__.py | 2 +- .../production/cutflow_features.py | 55 +++++++++++++++++++ .../__cf_module_name__/selection/default.py | 12 +++- .../__cf_module_name__/selection/objects.py | 4 +- 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py index b949b2135..e79c4f457 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py @@ -356,7 +356,7 @@ def add_config( cfg.x.default_calibrator = "skip_jecunc" # skip jet energy correction up and down variation to save time in running cfg.x.default_selector = "default" - cfg.x.default_producer = "features" + cfg.x.default_producer = "default" cfg.x.default_ml_model = None cfg.x.default_inference_model = "example" cfg.x.default_variables = ("n_jet",) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py b/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py new file mode 100644 index 000000000..6f493526c --- /dev/null +++ 
b/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py @@ -0,0 +1,55 @@ +# coding: utf-8 + +""" +Column production methods for cutflow features. +""" + + +from columnflow.production import Producer, producer +from columnflow.production.categories import category_ids +from columnflow.production.cms.mc_weight import mc_weight +from columnflow.selection.util import create_collections_from_masks +from columnflow.util import maybe_import +from columnflow.columnar_util import EMPTY_FLOAT, Route, set_ak_column + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +producer( + uses={ + mc_weight, category_ids, + # nano columns + "Jet.pt", + }, + produces={ + mc_weight, category_ids, + # new columns + "cutflow.jet1_pt", + }, +) + + +def cutflow_features( + self: Producer, + events: ak.Array, + object_masks: dict[str, dict[str, ak.Array]], + **kwargs, +) -> ak.Array: + if self.dataset_inst.is_mc: + events = self[mc_weight](events, **kwargs) + + # apply object masks and create new collections + reduced_events = create_collections_from_masks(events, object_masks) + + # create category ids per event and add categories back to the + events = self[category_ids](reduced_events, target_events=events, **kwargs) + + # add cutflow columns + events = set_ak_column( + events, + "cutflow.jet1_pt", + Route("Jet.pt[:,0]").apply(events, EMPTY_FLOAT), + ) + + return events diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index d2ce3e0e7..b9feda897 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -1,7 +1,7 @@ # coding: utf-8 """ -Selection modules for ttz. +Selection modules for __cf_short_name_lc__. 
""" from collections import defaultdict @@ -21,8 +21,9 @@ from columnflow.production.processes import process_ids from __cf_short_name_lc__.production.weights import event_weights_to_normalize +from __cf_short_name_lc__.production.cutflow_features import cutflow_features + from __cf_short_name_lc__.selection.objects import electron_object, muon_object, jet_object -from __cf_short_name_lc__.selection.cutflow_features import cutflow_features from __cf_short_name_lc__.selection.stats import ttz_increment_stats from __cf_short_name_lc__.selection.trigger import trigger_selection @@ -241,22 +242,27 @@ def default( events, trigger_results = self[trigger_selection](events, **kwargs) results += trigger_results + # apply muon object selection events, muon_results = self[muon_object](events, stats, **kwargs) results += muon_object_results + # apply electron object selection events, electron_results = self[electron_object](events, results, stats, **kwargs) results += electron_object_results + # apply jet object selection events, jet_results = self[jet_object](events, results, stats, **kwargs) results += jet_object_results + # apply lepton event selection events, lepton_selection_results = self[lepton_selection](events, results, stats, **kwargs) results += lepton_selection_results + # apply jet event selection events, jet_selection_results = self[jet_selection](events, results, stats, **kwargs) results += jet_selection_results - # combined event selection after all steps + # combine event selection after all steps results.event = results.steps.Trigger & results.steps.Lepton & results.steps.Jet # add cutflow features, passing per-object masks diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index 6d1bbf206..86a2c349e 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ 
b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -9,12 +9,14 @@ import law -from columnflow.util import maybe_import +from columnflow.util import maybe_import, four_vec from columnflow.columnar_util import set_ak_column from columnflow.production.util import attach_coffea_behavior from columnflow.selection import Selector, SelectionResult, selector from columnflow.selection.util import masked_sorted_indices +ak = maybe_import("awkward") + def masked_sorted_indices(mask: ak.Array, sort_var: ak.Array, ascending: bool = False) -> ak.Array: """ From 0606baaca130fc64547c4c49b7c795868bacb194 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 15:19:57 +0200 Subject: [PATCH 025/119] golden json names are incosistent over the years :( --- .../config/analysis___cf_short_name_lc__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py index 34605c3ae..5e3b70500 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -228,11 +228,15 @@ json_mirror = "modules/jsonpog-integration" lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year2}/{ecm}TeV/" pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" -runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} +goldenjsons = { + 2016: f"Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt", + 2017: f"Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt", + 2018: f"Cert_314472-325175_13TeV_Legacy2018_Collisions18_JSON.txt", +} cfg.x.external_files = DotDict.wrap({ # lumi files (golden run 2 only!!) 
"lumi": { - "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year2}_GoldenJSON.txt", "v1"), # noqa + "golden": (f"{lumi_cert_site}/Legacy_{year}/{goldenjsons[year]}", "v1"), "normtag": ("modules/Normtags/normtag_PHYSICS.json", "v1"), }, From 1bbcf729d9fc194b0f7de5c0c56319934f6fd27f Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 15:20:25 +0200 Subject: [PATCH 026/119] ecm in names cannot have trailing zeros --- .../config/analysis___cf_short_name_lc__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py index 5e3b70500..cad9f3f5c 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -226,7 +226,7 @@ # external files json_mirror = "modules/jsonpog-integration" -lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year2}/{ecm}TeV/" +lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year2}/{ecm:g}TeV/" pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" goldenjsons = { 2016: f"Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt", @@ -265,13 +265,13 @@ "v1"), # noqa "data_profile": { "nominal": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-69200ub-99bins.root", "v1"), + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-69200ub-99bins.root", "v1"), # noqa "minbias_xs_up": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-72400ub-99bins.root", "v1"), + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-72400ub-99bins.root", "v1"), # noqa "minbias_xs_down": ( - 
f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-66000ub-99bins.root", "v1"), + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-66000ub-99bins.root", "v1"), # noqa }, }, From d3c67b4134a14efc0f51adfd3ed752a6118ee66c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 4 Apr 2024 15:20:40 +0200 Subject: [PATCH 027/119] remove dummy lepton fakerates --- .../config/analysis___cf_short_name_lc__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py index cad9f3f5c..c529a10f7 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py @@ -252,10 +252,6 @@ # btag scale factor "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - # fake rates - "muon_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - "electron_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - # run 2 only!! 
# files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa "pu": { From 66cea48a82de973930566e56f2d4a11882c92f8b Mon Sep 17 00:00:00 2001 From: juvanden Date: Thu, 4 Apr 2024 15:24:36 +0200 Subject: [PATCH 028/119] change default dataset to tt_dl_powheg --- analysis_templates/ghent_template/law.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index 6cf59f4f8..58c984abd 100644 --- a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg @@ -29,7 +29,7 @@ tmp_dir_perm: 777 default_analysis: __cf_module_name__.config.analysis___cf_short_name_lc__.analysis___cf_short_name_lc__ default_config: l18 -default_dataset: st_tchannel_t_powheg +default_dataset: tt_sl_powheg calibration_modules: columnflow.calibration.cms.{jets,met}, __cf_module_name__.calibration.{default,jet} selection_modules: columnflow.selection.{empty}, columnflow.selection.cms.{json_filter, met_filters}, __cf_module_name__.selection.{default,categories,stats,trigger} From 87f6940934ab7c40d0e00777bbe12f9122843c4d Mon Sep 17 00:00:00 2001 From: juvanden Date: Thu, 4 Apr 2024 15:33:15 +0200 Subject: [PATCH 029/119] fixed naming increment_stats --- .../__cf_module_name__/selection/default.py | 8 ++++---- .../ghent_template/__cf_module_name__/selection/stats.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index b9feda897..7d9cdb82a 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -24,7 +24,7 @@ from __cf_short_name_lc__.production.cutflow_features import cutflow_features from __cf_short_name_lc__.selection.objects 
import electron_object, muon_object, jet_object -from __cf_short_name_lc__.selection.stats import ttz_increment_stats +from __cf_short_name_lc__.selection.stats import __cf_short_name_lc___increment_stats from __cf_short_name_lc__.selection.trigger import trigger_selection np = maybe_import("numpy") @@ -173,10 +173,10 @@ def jet_selection( @selector( uses={ - category_ids, ttz_increment_stats + category_ids, __cf_short_name_lc___increment_stats }, produces={ - category_ids, ttz_increment_stats + category_ids, __cf_short_name_lc___increment_stats }, exposed=False, ) @@ -198,7 +198,7 @@ def post_selection( events = self[event_weights_to_normalize](events, results=results, **kwargs) # increment stats - self[ttz_increment_stats](events, results, stats, **kwargs) + self[__cf_short_name_lc___increment_stats](events, results, stats, **kwargs) return events, results diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py index 3edf2a4c2..6ce74484c 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py @@ -100,7 +100,7 @@ def __cf_short_name_lc___increment_stats( return events -@__cf_short_name_lc__increment_stats.init +@__cf_short_name_lc___increment_stats.init def __cf_short_name_lc___increment_stats_init(self: Selector) -> None: if not getattr(self, "dataset_inst", None): return From 622fbbfd8d29cfe1b99aeb4296a073288c33f73f Mon Sep 17 00:00:00 2001 From: juvanden Date: Thu, 4 Apr 2024 16:12:50 +0200 Subject: [PATCH 030/119] bug fixes to template --- .../analysis/___cf_short_name_lc__.py | 8 - .../analysis/create_analysis.py | 2 +- .../__cf_module_name__/config/categories.py | 7 +- .../config/config__cf_short_name_lc__.py | 368 ------------------ .../__cf_module_name__/config/datasets.py | 8 +- .../__cf_module_name__/config/processes.py | 2 +- 
.../__cf_module_name__/config/style.py | 39 -- .../__cf_module_name__/config/variables.py | 4 +- .../__cf_module_name__/selection/default.py | 4 +- analysis_templates/ghent_template/law.cfg | 2 +- 10 files changed, 13 insertions(+), 431 deletions(-) delete mode 100644 analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py delete mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py delete mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/style.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py deleted file mode 100644 index a068e2138..000000000 --- a/analysis_templates/ghent_template/__cf_module_name__/analysis/___cf_short_name_lc__.py +++ /dev/null @@ -1,8 +0,0 @@ - -""" -Main analysis object for the __cf_short_name_lc__ analysis -""" - -from analysis___cf_short_name_lc__.analysis.create_analysis import create_analysis - -__cf_short_name_lc__ = create_analysis("__cf_short_name_lc__", 3, tags={"is_signal_region"}) diff --git a/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py b/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py index d648133c2..27715fbcb 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py +++ b/analysis_templates/ghent_template/__cf_module_name__/analysis/create_analysis.py @@ -54,7 +54,7 @@ def create_analysis( # import campaigns and load configs # - from ___cf_short_name_lc__.config.config____cf_short_name_lc__ import add_config + from __cf_short_name_lc__.config.config___cf_short_name_lc__ import add_config from cmsdb.campaigns.run2_2018_nano_v9 import campaign_run2_2018_nano_v9 # default config diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/categories.py 
b/analysis_templates/ghent_template/__cf_module_name__/config/categories.py index ca9db4fed..f51f38de9 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/categories.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/categories.py @@ -24,6 +24,7 @@ def add_categories_selection(config: od.Config) -> None: config.x.regions = ("incl", "CR_WZ") config.x.lepton_channels = ("2e", "1e1mu", "2mu") + config.x.lepton_channel_labels = {"2e": "$ee$", "1e1mu": "$e\mu$", "2mu": "$\mu\mu$"} config.add_category( name="incl", @@ -36,9 +37,9 @@ def add_categories_selection(config: od.Config) -> None: for lepton_channel in config.x.lepton_channels: config.add_category( - name="{}_{}".format(region, lepton_channel), + name=lepton_channel, selection=["catid_selection_{}".format(lepton_channel)], - label="{}, {}".format(region, config.x.lepton_channel_labels[lepton_channel]), + label=config.x.lepton_channel_labels[lepton_channel], ) @@ -54,5 +55,5 @@ def add_categories_production(config: od.Config) -> None: for lepton_channel in config.x.lepton_channels: - cat_lepton = config.get_category("{}_{}".format(region, lepton_channel)) + cat_lepton = config.get_category(lepton_channel) cat_lepton.selection = ["catid_{}".format(lepton_channel)] diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py deleted file mode 100644 index e79c4f457..000000000 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config__cf_short_name_lc__.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding: utf-8 - -""" -Configuration of the __cf_short_name_lc__ analysis. 
-""" -from __future__ import annotations - -import order as od -from scinum import Number - -from columnflow.util import DotDict, maybe_import -from columnflow.config_util import ( - verify_config_processes, -) - -from __cf_short_name_lc__.config.styling import stylize_processes -from __cf_short_name_lc__.config.datasets import add_datasets, configure_datasets, get_dataset_lfns -from __cf_short_name_lc__.config.processes import add_processes -from __cf_short_name_lc__.config.categories import add_categories_selection -from __cf_short_name_lc__.config.variables import add_feature_variables -from __cf_short_name_lc__.config.shifts import add_shifts -from __cf_short_name_lc__.selection.trigger import add_triggers -from __cf_short_name_lc__.util import four_vec -ak = maybe_import("awkward") - - -def add_config( - analysis: od.Analysis, - campaign: od.Campaign, - config_name: str | None = None, - config_id: int | None = None, - limit_dataset_files: int | None = None, -) -> od.Config: - # validations - assert campaign.x.year in [2016, 2017, 2018] - if campaign.x.year == 2016: - assert campaign.x.vfp in ["pre", "post"] - # gather campaign data - year = campaign.x.year - year2 = year % 100 - corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" - - if year != 2018: - raise NotImplementedError("For now, only 2018 campaign is fully implemented") - - cfg = analysis.add_config(campaign, name=config_name, id=config_id, tags=analysis.tags) - - year = campaign.x.year - year2 = year % 100 - corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" - cfg.x.year = year - - add_ttz_processes(cfg, campaign) - - add_triggers(cfg, campaign) - add_ttz_datasets(cfg, campaign) - - cfg.x.get_dataset_lfns = get_dataset_lfns - - configure_ttz_datasets(cfg, limit_dataset_files) - - # verify that the root process of all datasets is part of any of the registered processes - verify_config_processes(cfg, warn=True) - - # lumi values in inverse pb - # 
https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun2?rev=2#Combination_and_correlations - if year == 2016: - cfg.x.luminosity = Number(36310, { - "lumi_13TeV_2016": 0.01j, - "lumi_13TeV_correlated": 0.006j, - }) - elif year == 2017: - cfg.x.luminosity = Number(41480, { - "lumi_13TeV_2017": 0.02j, - "lumi_13TeV_1718": 0.006j, - "lumi_13TeV_correlated": 0.009j, - }) - elif year == 2018: # 2018 - cfg.x.luminosity = Number(59830, { - "lumi_13TeV_2017": 0.015j, - "lumi_13TeV_1718": 0.002j, - "lumi_13TeV_correlated": 0.02j, - }) - - cfg.x.minbias_xs = Number(69.2, 0.046j) - - # jec configuration - # https://twiki.cern.ch/twiki/bin/view/CMS/JECDataMC?rev=201 - jerc_postfix = "APV" if year == 2016 and campaign.x.vfp == "post" else "" - cfg.x.jec = DotDict.wrap({ - "campaign": f"Summer19UL{year2}{jerc_postfix}", - "version": {2016: "V7", 2017: "V5", 2018: "V5"}[year], - "jet_type": "AK4PFchs", - "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], - "levels_for_type1_met": ["L1FastJet"], - "uncertainty_sources": [ - # "AbsoluteStat", - # "AbsoluteScale", - # "AbsoluteSample", - # "AbsoluteFlavMap", - # "AbsoluteMPFBias", - # "Fragmentation", - # "SinglePionECAL", - # "SinglePionHCAL", - # "FlavorQCD", - # "TimePtEta", - # "RelativeJEREC1", - # "RelativeJEREC2", - # "RelativeJERHF", - # "RelativePtBB", - # "RelativePtEC1", - # "RelativePtEC2", - # "RelativePtHF", - # "RelativeBal", - # "RelativeSample", - # "RelativeFSR", - # "RelativeStatFSR", - # "RelativeStatEC", - # "RelativeStatHF", - # "PileUpDataMC", - # "PileUpPtRef", - # "PileUpPtBB", - # "PileUpPtEC1", - # "PileUpPtEC2", - # "PileUpPtHF", - # "PileUpMuZero", - # "PileUpEnvelope", - # "SubTotalPileUp", - # "SubTotalRelative", - # "SubTotalPt", - # "SubTotalScale", - # "SubTotalAbsolute", - # "SubTotalMC", - "Total", - # "TotalNoFlavor", - # "TotalNoTime", - # "TotalNoFlavorNoTime", - # "FlavorZJet", - # "FlavorPhotonJet", - # "FlavorPureGluon", - # "FlavorPureQuark", - # 
"FlavorPureCharm", - # "FlavorPureBottom", - # "TimeRunA", - # "TimeRunB", - # "TimeRunC", - # "TimeRunD", - "CorrelationGroupMPFInSitu", - "CorrelationGroupIntercalibration", - "CorrelationGroupbJES", - "CorrelationGroupFlavor", - "CorrelationGroupUncorrelated", - ], - }) - - # JER - # https://twiki.cern.ch/twiki/bin/view/CMS/JetResolution?rev=107 - cfg.x.jer = DotDict.wrap({ - "campaign": f"Summer19UL{year2}{jerc_postfix}", - "version": "JR" + {2016: "V3", 2017: "V2", 2018: "V2"}[year], - "jet_type": "AK4PFchs", - }) - - # JEC uncertainty sources propagated to btag scale factors - # (names derived from contents in BTV correctionlib file) - cfg.x.btag_sf_jec_sources = [ - "", # total - "Absolute", - "AbsoluteMPFBias", - "AbsoluteScale", - "AbsoluteStat", - f"Absolute_{year}", - "BBEC1", - f"BBEC1_{year}", - "EC2", - f"EC2_{year}", - "FlavorQCD", - "Fragmentation", - "HF", - f"HF_{year}", - "PileUpDataMC", - "PileUpPtBB", - "PileUpPtEC1", - "PileUpPtEC2", - "PileUpPtHF", - "PileUpPtRef", - "RelativeBal", - "RelativeFSR", - "RelativeJEREC1", - "RelativeJEREC2", - "RelativeJERHF", - "RelativePtBB", - "RelativePtEC1", - "RelativePtEC2", - "RelativePtHF", - "RelativeSample", - f"RelativeSample_{year}", - "RelativeStatEC", - "RelativeStatFSR", - "RelativeStatHF", - "SinglePionECAL", - "SinglePionHCAL", - "TimePtEta", - ] - - # b-tag working points - # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL16preVFP?rev=6 - # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL16postVFP?rev=8 - # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL17?rev=15 - # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL17?rev=17 - btag_key = f"2016{campaign.x.vfp}" if year == 2016 else year - cfg.x.btag_working_points = DotDict.wrap({ - "deepjet": { - "loose": {"2016pre": 0.0508, "2016post": 0.0480, 2017: 0.0532, 2018: 0.0490}[btag_key], - "medium": {"2016pre": 0.2598, "2016post": 0.2489, 2017: 0.3040, 2018: 0.2783}[btag_key], - 
"tight": {"2016pre": 0.6502, "2016post": 0.6377, 2017: 0.7476, 2018: 0.7100}[btag_key], - }, - "deepcsv": { - "loose": {"2016pre": 0.2027, "2016post": 0.1918, 2017: 0.1355, 2018: 0.1208}[btag_key], - "medium": {"2016pre": 0.6001, "2016post": 0.5847, 2017: 0.4506, 2018: 0.4168}[btag_key], - "tight": {"2016pre": 0.8819, "2016post": 0.8767, 2017: 0.7738, 2018: 0.7665}[btag_key], - }, - }) - cfg.x.btag_sf = ("deepJet_shape", cfg.x.btag_sf_jec_sources) - - # names of electron correction sets and working points - # (used in the electron_sf producer) - cfg.x.electron_sf_names = ("UL-Electron-ID-SF", f"{year}{corr_postfix}", "wp80iso") - cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}{corr_postfix}_UL") - - # external files - json_mirror = "${MODULE_BASE}/jsonpog-integration" - year_short = str(year)[2:] # 20XX > XX - lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm}TeV/" - pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" - runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} - cfg.x.external_files = DotDict.wrap({ - # lumi files (golden run 2 only!!) 
- "lumi": { - "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year_short}_GoldenJSON.txt", "v1"), # noqa - "normtag": ("${MODULE_BASE}/Normtags/normtag_PHYSICS.json", "v1"), - }, - - # jet energy correction - "jet_jerc": (f"{json_mirror}/POG/JME/{year}{corr_postfix}_UL/jet_jerc.json.gz", "v1"), - - # electron scale factors - "electron_sf": (f"{json_mirror}/POG/EGM/{year}{corr_postfix}_UL/electron.json.gz", "v1"), - - # muon scale factors - "muon_sf": (f"{json_mirror}/POG/MUO/{year}{corr_postfix}_UL/muon_Z.json.gz", "v1"), - - # btag scale factor - "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - - # fake rates - "muon_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - "electron_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - - # run 2 only!! - # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa - "pu": { - "json": (f"{pu_reweightin_website}/pileup_latest.txt", "v1"), # noqa - "mc_profile": ( - "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa - "v1"), # noqa - "data_profile": { - "nominal": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-69200ub-99bins.root", "v1"), - # noqa - "minbias_xs_up": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-72400ub-99bins.root", "v1"), - # noqa - "minbias_xs_down": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-66000ub-99bins.root", "v1"), - # noqa - }, - }, - }) - - # process groups for conveniently looping over certain processs - # (used in wrapper_factory and during plotting) - cfg.x.process_groups = { - "test": ["tt_dl"], - "all": ["tt_dl", "dy", "data"], - "sim": ["tt_dl", "dy"], - } - - # dataset groups 
for conveniently looping over certain datasets - # (used in wrapper_factory and during plotting) - cfg.x.dataset_groups = { - "test": ["tt_dl"], - "all": ["tt_dl", "dy*", "data*"], - "sim": ["tt_dl", "dy*"], - } - - cfg.x.variable_groups = { - "default": ["n_jet"], - } - - # category groups for conveniently looping over certain categories - # (used during plotting) - cfg.x.category_groups = { - "default": ["incl"], - } - - # shift groups for conveniently looping over certain shifts - # (used during plotting) - cfg.x.event_weights = DotDict() - cfg.x.event_weights["normalization_weight"] = [] - add_shifts(cfg) - - cfg.x.shift_groups = { - "jer": ["nominal", "jer_up", "jer_down"], - "btag": ["nominal", "btag*"], - "all": cfg.shifts.names(), - } - - # selector step groups for conveniently looping over certain steps - # (used in cutflow tasks) - cfg.x.selector_step_groups = {} - - # custom method and sandbox for determining dataset lfns - cfg.x.get_dataset_lfns = None - cfg.x.get_dataset_lfns_sandbox = None - - # whether to validate the number of obtained LFNs in GetDatasetLFNs - # (currently set to false because the number of files per dataset is truncated to 2) - cfg.x.validate_dataset_lfns = False - - # columns to keep after certain steps - cfg.x.keep_columns = DotDict.wrap({ - "cf.MergeSelectionMasks": { - "mc_weight", "normalization_weight", "process_id", "category_ids", "cutflow.*", - }, - }) - - cfg.x.keep_columns["cf.ReduceEvents"] = ( - { - # general event information - "run", "luminosityBlock", "event", - # columns added during selection, required in general - "mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed", - # weight-related columns - "pu_weight*", "pdf_weight*", - "murf_envelope_weight*", "mur_weight*", "muf_weight*", - "btag_weight*", - # extra columns - } | four_vec( # Jets - {"Jet"}, - {"btagDeepFlavB", "btagDeepFlavCvB"}, - ) | four_vec( # Leptons - {"Electron", "Muon", } - ) - ) - - cfg.x.default_calibrator = "skip_jecunc" # 
skip jet energy correction up and down variation to save time in running - cfg.x.default_selector = "default" - cfg.x.default_producer = "default" - cfg.x.default_ml_model = None - cfg.x.default_inference_model = "example" - cfg.x.default_variables = ("n_jet",) - - add_categories_selection(cfg) - add_feature_variables(cfg) - stylize_processes(cfg) - - return cfg diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py index 52f6a1e3e..4e7cd772c 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py @@ -17,11 +17,6 @@ def add_datasets(config: od.Config, campaign: od.Campaign): - # load custom produced datasets into campaign - get_custom_datasets(campaign) - - # use custom get_dataset_lfns function - config.x.get_dataset_lfns = get_dataset_lfns # add datasets we need to study dataset_names = { @@ -59,7 +54,8 @@ def add_datasets(config: od.Config, campaign: od.Campaign): # ttbar - "tt_dl", + "tt_dl_powheg", + "tt_sl_powheg" ]}[f"{config.x.year}{config.x.corr_postfix}"] # loop over all dataset names and add them to the config diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py index 686c9a0ea..452e7326e 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py @@ -23,7 +23,7 @@ def add_processes(config: od.Config, campaign: od.Campaign): bg = config.add_process( name="background", - id=1, + id=9999, label="Background", ) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/style.py b/analysis_templates/ghent_template/__cf_module_name__/config/style.py deleted file mode 100644 index f656d4e3f..000000000 --- 
a/analysis_templates/ghent_template/__cf_module_name__/config/style.py +++ /dev/null @@ -1,39 +0,0 @@ -""" -Collection of helpers for styling, e.g. -- dicitonaries of defaults for variable definition, colors, labels, etc. -- functions to quickly create variable insts in a predefined way -""" - -import order as od - -from columnflow.columnar_util import EMPTY_FLOAT - -# -# Processes -# - -default_process_colors = { - "data": "#000000", # black - "tt": "#cf9fff", # green - "dy_lep": "#377eb8", # blue -} - - -def stylize_processes(config: od.Config) -> None: - """ - Small helper that sets the process insts to analysis-appropriate defaults - For now: only colors and unstacking - Could also include some more defaults (labels, unstack, ...) - """ - - for proc in config.processes: - # set default colors - if color := default_process_colors.get(proc.name, None): - proc.color1 = color - proc.color2 = "#000000" - - config.x.default_legend_cfg = { - "ncol": 2, - "loc": "upper right", - "fontsize": 15, - } diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/variables.py b/analysis_templates/ghent_template/__cf_module_name__/config/variables.py index 42f9ddb1f..2917fb459 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/variables.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/variables.py @@ -1,11 +1,11 @@ import order as od -from columnflow.util import maybe_import +from columnflow.util import maybe_import, call_once_on_config np = maybe_import("numpy") ak = maybe_import("awkward") -from columnflow.columnar_util import EMPTY_FLOAT, call_once_on_config +from columnflow.columnar_util import EMPTY_FLOAT @call_once_on_config() diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index 7d9cdb82a..60072a617 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ 
b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -219,11 +219,11 @@ def post_selection_init(self: Selector) -> None: @selector( uses={ pre_selection, post_selection, trigger_selection, - lepton_selection, jet_selection, lepton_gen_features + lepton_selection, jet_selection, }, produces={ pre_selection, post_selection, trigger_selection, - lepton_selection, jet_selection, lepton_gen_features + lepton_selection, jet_selection, }, exposed=True, ) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index 58c984abd..25693d148 100644 --- a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg @@ -27,7 +27,7 @@ tmp_dir_perm: 777 [analysis] -default_analysis: __cf_module_name__.config.analysis___cf_short_name_lc__.analysis___cf_short_name_lc__ +default_analysis: __cf_module_name__.analysis.__cf_short_name_lc__.__cf_short_name_lc__ default_config: l18 default_dataset: tt_sl_powheg From 6ba299bdc5b81219fdd32bac99b2687f263352e7 Mon Sep 17 00:00:00 2001 From: juvanden Date: Fri, 5 Apr 2024 09:22:57 +0200 Subject: [PATCH 031/119] name changes to folders --- .../analysis/__cf_short_name_lc__.py | 8 + .../config/config___cf_short_name_lc__.py | 367 ++++++++++++++++++ .../__cf_module_name__/config/styling.py | 39 ++ 3 files changed, 414 insertions(+) create mode 100644 analysis_templates/ghent_template/__cf_module_name__/analysis/__cf_short_name_lc__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/styling.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/analysis/__cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/analysis/__cf_short_name_lc__.py new file mode 100644 index 000000000..b619736c7 --- /dev/null +++ 
b/analysis_templates/ghent_template/__cf_module_name__/analysis/__cf_short_name_lc__.py @@ -0,0 +1,8 @@ + +""" +Main analysis object for the __cf_short_name_lc__ analysis +""" + +from __cf_short_name_lc__.analysis.create_analysis import create_analysis + +__cf_short_name_lc__ = create_analysis("__cf_short_name_lc__", 3, tags={"is_signal_region"}) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py new file mode 100644 index 000000000..9f101818b --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -0,0 +1,367 @@ +# coding: utf-8 + +""" +Configuration of the __cf_short_name_lc__ analysis. +""" +from __future__ import annotations + +import order as od +from scinum import Number + +from columnflow.util import DotDict, maybe_import, four_vec +from columnflow.config_util import ( + verify_config_processes, +) + +from __cf_short_name_lc__.config.styling import stylize_processes +from __cf_short_name_lc__.config.datasets import add_datasets, configure_datasets +from __cf_short_name_lc__.config.processes import add_processes +from __cf_short_name_lc__.config.categories import add_categories_selection +from __cf_short_name_lc__.config.variables import add_variables +from __cf_short_name_lc__.config.shifts import add_shifts +from __cf_short_name_lc__.selection.trigger import add_triggers + +ak = maybe_import("awkward") + + +def add_config( + analysis: od.Analysis, + campaign: od.Campaign, + config_name: str | None = None, + config_id: int | None = None, + limit_dataset_files: int | None = None, +) -> od.Config: + # validations + assert campaign.x.year in [2016, 2017, 2018] # only run 2 implemented + if campaign.x.year == 2016: + assert campaign.x.vfp in ["pre", "post"] + + # only 2018 fully implemented + if year != 2018: + raise NotImplementedError("For now, only 2018 
campaign is fully implemented") + + cfg = analysis.add_config(campaign, name=config_name, id=config_id, tags=analysis.tags) + + year = campaign.x.year + year2 = year % 100 + corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" + ecm = campaign.ecm + + cfg.x.year = year + cfg.x.year2 = year2 + cfg.x.corr_postfix = corr_postfix + cfg.x.ecm = ecm + + add_processes(cfg, campaign) + + add_triggers(cfg, campaign) + add_datasets(cfg, campaign) + configure_datasets(cfg, limit_dataset_files) + + # verify that the root process of all datasets is part of any of the registered processes + verify_config_processes(cfg, warn=True) + + # lumi values in inverse pb + # https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun2?rev=2#Combination_and_correlations + if year == 2016: + cfg.x.luminosity = Number(36310, { + "lumi_13TeV_2016": 0.01j, + "lumi_13TeV_correlated": 0.006j, + }) + elif year == 2017: + cfg.x.luminosity = Number(41480, { + "lumi_13TeV_2017": 0.02j, + "lumi_13TeV_1718": 0.006j, + "lumi_13TeV_correlated": 0.009j, + }) + elif year == 2018: # 2018 + cfg.x.luminosity = Number(59830, { + "lumi_13TeV_2017": 0.015j, + "lumi_13TeV_1718": 0.002j, + "lumi_13TeV_correlated": 0.02j, + }) + + cfg.x.minbias_xs = Number(69.2, 0.046j) + + # jec configuration + # https://twiki.cern.ch/twiki/bin/view/CMS/JECDataMC?rev=201 + jerc_postfix = "APV" if year == 2016 and campaign.x.vfp == "post" else "" + cfg.x.jec = DotDict.wrap({ + "campaign": f"Summer19UL{year2}{jerc_postfix}", + "version": {2016: "V7", 2017: "V5", 2018: "V5"}[year], + "jet_type": "AK4PFchs", + "levels": ["L1FastJet", "L2Relative", "L2L3Residual", "L3Absolute"], + "levels_for_type1_met": ["L1FastJet"], + "uncertainty_sources": [ + # "AbsoluteStat", + # "AbsoluteScale", + # "AbsoluteSample", + # "AbsoluteFlavMap", + # "AbsoluteMPFBias", + # "Fragmentation", + # "SinglePionECAL", + # "SinglePionHCAL", + # "FlavorQCD", + # "TimePtEta", + # "RelativeJEREC1", + # "RelativeJEREC2", + # "RelativeJERHF", + # 
"RelativePtBB", + # "RelativePtEC1", + # "RelativePtEC2", + # "RelativePtHF", + # "RelativeBal", + # "RelativeSample", + # "RelativeFSR", + # "RelativeStatFSR", + # "RelativeStatEC", + # "RelativeStatHF", + # "PileUpDataMC", + # "PileUpPtRef", + # "PileUpPtBB", + # "PileUpPtEC1", + # "PileUpPtEC2", + # "PileUpPtHF", + # "PileUpMuZero", + # "PileUpEnvelope", + # "SubTotalPileUp", + # "SubTotalRelative", + # "SubTotalPt", + # "SubTotalScale", + # "SubTotalAbsolute", + # "SubTotalMC", + "Total", + # "TotalNoFlavor", + # "TotalNoTime", + # "TotalNoFlavorNoTime", + # "FlavorZJet", + # "FlavorPhotonJet", + # "FlavorPureGluon", + # "FlavorPureQuark", + # "FlavorPureCharm", + # "FlavorPureBottom", + # "TimeRunA", + # "TimeRunB", + # "TimeRunC", + # "TimeRunD", + "CorrelationGroupMPFInSitu", + "CorrelationGroupIntercalibration", + "CorrelationGroupbJES", + "CorrelationGroupFlavor", + "CorrelationGroupUncorrelated", + ], + }) + + # JER + # https://twiki.cern.ch/twiki/bin/view/CMS/JetResolution?rev=107 + cfg.x.jer = DotDict.wrap({ + "campaign": f"Summer19UL{year2}{jerc_postfix}", + "version": "JR" + {2016: "V3", 2017: "V2", 2018: "V2"}[year], + "jet_type": "AK4PFchs", + }) + + # JEC uncertainty sources propagated to btag scale factors + # (names derived from contents in BTV correctionlib file) + cfg.x.btag_sf_jec_sources = [ + "", # total + "Absolute", + "AbsoluteMPFBias", + "AbsoluteScale", + "AbsoluteStat", + f"Absolute_{year}", + "BBEC1", + f"BBEC1_{year}", + "EC2", + f"EC2_{year}", + "FlavorQCD", + "Fragmentation", + "HF", + f"HF_{year}", + "PileUpDataMC", + "PileUpPtBB", + "PileUpPtEC1", + "PileUpPtEC2", + "PileUpPtHF", + "PileUpPtRef", + "RelativeBal", + "RelativeFSR", + "RelativeJEREC1", + "RelativeJEREC2", + "RelativeJERHF", + "RelativePtBB", + "RelativePtEC1", + "RelativePtEC2", + "RelativePtHF", + "RelativeSample", + f"RelativeSample_{year}", + "RelativeStatEC", + "RelativeStatFSR", + "RelativeStatHF", + "SinglePionECAL", + "SinglePionHCAL", + "TimePtEta", + ] + + # 
b-tag working points + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL16preVFP?rev=6 + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL16postVFP?rev=8 + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL17?rev=15 + # https://twiki.cern.ch/twiki/bin/view/CMS/BtagRecommendation106XUL17?rev=17 + btag_key = f"2016{campaign.x.vfp}" if year == 2016 else year + cfg.x.btag_working_points = DotDict.wrap({ + "deepjet": { + "loose": {"2016pre": 0.0508, "2016post": 0.0480, 2017: 0.0532, 2018: 0.0490}[btag_key], + "medium": {"2016pre": 0.2598, "2016post": 0.2489, 2017: 0.3040, 2018: 0.2783}[btag_key], + "tight": {"2016pre": 0.6502, "2016post": 0.6377, 2017: 0.7476, 2018: 0.7100}[btag_key], + }, + "deepcsv": { + "loose": {"2016pre": 0.2027, "2016post": 0.1918, 2017: 0.1355, 2018: 0.1208}[btag_key], + "medium": {"2016pre": 0.6001, "2016post": 0.5847, 2017: 0.4506, 2018: 0.4168}[btag_key], + "tight": {"2016pre": 0.8819, "2016post": 0.8767, 2017: 0.7738, 2018: 0.7665}[btag_key], + }, + }) + cfg.x.btag_sf = ("deepJet_shape", cfg.x.btag_sf_jec_sources) + + # names of electron correction sets and working points + # (used in the electron_sf producer) + cfg.x.electron_sf_names = ("UL-Electron-ID-SF", f"{year}{corr_postfix}", "wp80iso") + cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}{corr_postfix}_UL") + + # external files + json_mirror = "${MODULE_BASE}/jsonpog-integration" + year_short = str(year)[2:] # 20XX > XX + lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm}TeV/" + pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" + runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} + cfg.x.external_files = DotDict.wrap({ + # lumi files (golden run 2 only!!) 
+ "lumi": { + "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year_short}_GoldenJSON.txt", "v1"), # noqa + "normtag": ("${MODULE_BASE}/Normtags/normtag_PHYSICS.json", "v1"), + }, + + # jet energy correction + "jet_jerc": (f"{json_mirror}/POG/JME/{year}{corr_postfix}_UL/jet_jerc.json.gz", "v1"), + + # electron scale factors + "electron_sf": (f"{json_mirror}/POG/EGM/{year}{corr_postfix}_UL/electron.json.gz", "v1"), + + # muon scale factors + "muon_sf": (f"{json_mirror}/POG/MUO/{year}{corr_postfix}_UL/muon_Z.json.gz", "v1"), + + # btag scale factor + "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + + # fake rates + "muon_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + "electron_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), + + # run 2 only!! + # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa + "pu": { + "json": (f"{pu_reweighting_site}/pileup_latest.txt", "v1"), # noqa + "mc_profile": ( + "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa + "v1"), # noqa + "data_profile": { + "nominal": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-69200ub-99bins.root", "v1"), + # noqa + "minbias_xs_up": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-72400ub-99bins.root", "v1"), + # noqa + "minbias_xs_down": ( + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-66000ub-99bins.root", "v1"), + # noqa + }, + }, + }) + + # process groups for conveniently looping over certain processs + # (used in wrapper_factory and during plotting) + cfg.x.process_groups = { + "test": ["tt_dl"], + "all": ["tt_dl", "dy", "data"], + "sim": ["tt_dl", "dy"], + } + + # dataset groups 
for conveniently looping over certain datasets + # (used in wrapper_factory and during plotting) + cfg.x.dataset_groups = { + "test": ["tt_dl"], + "all": ["tt_dl", "dy*", "data*"], + "sim": ["tt_dl", "dy*"], + } + + cfg.x.variable_groups = { + "default": ["n_jet"], + } + + # category groups for conveniently looping over certain categories + # (used during plotting) + cfg.x.category_groups = { + "default": ["incl"], + } + + # shift groups for conveniently looping over certain shifts + # (used during plotting) + cfg.x.event_weights = DotDict() + cfg.x.event_weights["normalization_weight"] = [] + add_shifts(cfg) + + cfg.x.shift_groups = { + "jer": ["nominal", "jer_up", "jer_down"], + "btag": ["nominal", "btag*"], + "all": cfg.shifts.names(), + } + + # selector step groups for conveniently looping over certain steps + # (used in cutflow tasks) + cfg.x.selector_step_groups = {} + + # custom method and sandbox for determining dataset lfns + cfg.x.get_dataset_lfns = None + cfg.x.get_dataset_lfns_sandbox = None + + # whether to validate the number of obtained LFNs in GetDatasetLFNs + # (currently set to false because the number of files per dataset is truncated to 2) + cfg.x.validate_dataset_lfns = False + + # columns to keep after certain steps + cfg.x.keep_columns = DotDict.wrap({ + "cf.MergeSelectionMasks": { + "mc_weight", "normalization_weight", "process_id", "category_ids", "cutflow.*", + }, + }) + + cfg.x.keep_columns["cf.ReduceEvents"] = ( + { + # general event information + "run", "luminosityBlock", "event", + # columns added during selection, required in general + "mc_weight", "PV.npvs", "process_id", "category_ids", "deterministic_seed", + # weight-related columns + "pu_weight*", "pdf_weight*", + "murf_envelope_weight*", "mur_weight*", "muf_weight*", + "btag_weight*", + # extra columns + } | four_vec( # Jets + {"Jet"}, + {"btagDeepFlavB", "btagDeepFlavCvB"}, + ) | four_vec( # Leptons + {"Electron", "Muon", } + ) + ) + + cfg.x.default_calibrator = "skip_jecunc" # 
skip jet energy correction up and down variation to save time in running + cfg.x.default_selector = "default" + cfg.x.default_producer = "default" + cfg.x.default_ml_model = None + cfg.x.default_inference_model = "example" + cfg.x.default_variables = ("n_jet",) + + add_categories_selection(cfg) + add_variables(cfg) + stylize_processes(cfg) + + return cfg diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/styling.py b/analysis_templates/ghent_template/__cf_module_name__/config/styling.py new file mode 100644 index 000000000..f656d4e3f --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/styling.py @@ -0,0 +1,39 @@ +""" +Collection of helpers for styling, e.g. +- dicitonaries of defaults for variable definition, colors, labels, etc. +- functions to quickly create variable insts in a predefined way +""" + +import order as od + +from columnflow.columnar_util import EMPTY_FLOAT + +# +# Processes +# + +default_process_colors = { + "data": "#000000", # black + "tt": "#cf9fff", # green + "dy_lep": "#377eb8", # blue +} + + +def stylize_processes(config: od.Config) -> None: + """ + Small helper that sets the process insts to analysis-appropriate defaults + For now: only colors and unstacking + Could also include some more defaults (labels, unstack, ...) 
+ """ + + for proc in config.processes: + # set default colors + if color := default_process_colors.get(proc.name, None): + proc.color1 = color + proc.color2 = "#000000" + + config.x.default_legend_cfg = { + "ncol": 2, + "loc": "upper right", + "fontsize": 15, + } From 60b4e1e24ab21eb9613fb3e44c1331b562ffb6f9 Mon Sep 17 00:00:00 2001 From: juvanden Date: Fri, 5 Apr 2024 10:32:00 +0200 Subject: [PATCH 032/119] add patch_htcondor_workflow to patch_all() --- .../ghent_template/__cf_module_name__/columnflow_patches.py | 1 + 1 file changed, 1 insertion(+) diff --git a/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py b/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py index 75e3673a4..bea1a91e3 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py +++ b/analysis_templates/ghent_template/__cf_module_name__/columnflow_patches.py @@ -49,3 +49,4 @@ def patch_htcondor_workflow(): @memoize def patch_all(): patch_bundle_repo_exclude_files() + patch_htcondor_workflow() From 195f1805e164ed5019648fa364cd9556d3951fbe Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 10:44:02 +0200 Subject: [PATCH 033/119] corrected mistakes in exertal file specification --- .../config/config___cf_short_name_lc__.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py index 9f101818b..cd50f6b39 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -228,16 +228,20 @@ def add_config( cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}{corr_postfix}_UL") # external files - json_mirror = "${MODULE_BASE}/jsonpog-integration" + 
json_mirror = "modules/jsonpog-integration" year_short = str(year)[2:] # 20XX > XX - lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm}TeV/" + lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm:g}TeV" pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" - runs = {2016: "271036-284044", 2017: "294927-306462", 2018: "314472-325175"} + goldenjsons = { + 2016: f"Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt", + 2017: f"Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt", + 2018: f"Cert_314472-325175_13TeV_Legacy2018_Collisions18_JSON.txt", + } cfg.x.external_files = DotDict.wrap({ # lumi files (golden run 2 only!!) "lumi": { - "golden": (f"{lumi_cert_site}/Legacy_{year}/Cert_{runs[year]}_{ecm}TeV_UL{year}_Collisions{year_short}_GoldenJSON.txt", "v1"), # noqa - "normtag": ("${MODULE_BASE}/Normtags/normtag_PHYSICS.json", "v1"), + "golden": (f"{lumi_cert_site}/Legacy_{year}/{goldenjsons[year]}", "v1"), + "normtag": ("modules/Normtags/normtag_PHYSICS.json", "v1"), }, # jet energy correction @@ -252,10 +256,6 @@ def add_config( # btag scale factor "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - # fake rates - "muon_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - "electron_fakerate": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - # run 2 only!! 
# files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa "pu": { @@ -265,13 +265,13 @@ def add_config( "v1"), # noqa "data_profile": { "nominal": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-69200ub-99bins.root", "v1"), + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-69200ub-99bins.root", "v1"), # noqa "minbias_xs_up": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-72400ub-99bins.root", "v1"), + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-72400ub-99bins.root", "v1"), # noqa "minbias_xs_down": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm}tev-{year}-66000ub-99bins.root", "v1"), + f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-66000ub-99bins.root", "v1"), # noqa }, }, From 625e07166184fcd5841ef0102e4a6178a9b64dc2 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 10:48:19 +0200 Subject: [PATCH 034/119] removed analysis___cf_short_name_lc__.py in favour of config___cf_short_name_lc__.py --- .../config/analysis___cf_short_name_lc__.py | 414 ------------------ 1 file changed, 414 deletions(-) delete mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py deleted file mode 100644 index c529a10f7..000000000 --- a/analysis_templates/ghent_template/__cf_module_name__/config/analysis___cf_short_name_lc__.py +++ /dev/null @@ -1,414 +0,0 @@ -# coding: utf-8 - -""" -Configuration of the __cf_analysis_name__ analysis. 
-""" - -import functools - -import law -import order as od -from scinum import Number - -from columnflow.util import DotDict, maybe_import -from columnflow.columnar_util import EMPTY_FLOAT, ColumnCollection -from columnflow.config_util import ( - get_root_processes_from_campaign, add_shift_aliases, get_shifts_from_sources, add_category, - verify_config_processes, -) - -ak = maybe_import("awkward") - - -# -# the main analysis object -# - -analysis___cf_short_name_lc__ = ana = od.Analysis( - name="analysis___cf_short_name_lc__", - id=1, -) - -# analysis-global versions -# (see cfg.x.versions below for more info) -ana.x.versions = {} - -# files of bash sandboxes that might be required by remote tasks -# (used in cf.HTCondorWorkflow) -ana.x.bash_sandboxes = ["$CF_BASE/sandboxes/cf.sh"] -default_sandbox = law.Sandbox.new(law.config.get("analysis", "default_columnar_sandbox")) -if default_sandbox.sandbox_type == "bash" and default_sandbox.name not in ana.x.bash_sandboxes: - ana.x.bash_sandboxes.append(default_sandbox.name) - -# files of cmssw sandboxes that might be required by remote tasks -# (used in cf.HTCondorWorkflow) -ana.x.cmssw_sandboxes = [ - "$CF_BASE/sandboxes/cmssw_default.sh", -] - -# config groups for conveniently looping over certain configs -# (used in wrapper_factory) -ana.x.config_groups = {} - - -# -# setup configs -# - -# an example config is setup below, based on cms NanoAOD v9 for Run2 2017, focussing on -# ttbar and single top MCs, plus single muon data -# update this config or add additional ones to accomodate the needs of your analysis - -from cmsdb.campaigns.run2_2018_nano_v9 import campaign_run2_2018_nano_v9 - -# copy the campaign -# (creates copies of all linked datasets, processes, etc. 
to allow for encapsulated customization) -campaign = campaign_run2_2018_nano_v9.copy() - -# get all root processes -procs = get_root_processes_from_campaign(campaign) - -# create a config by passing the campaign, so id and name will be identical -cfg = ana.add_config(campaign) - -# gather campaign data -year = campaign.x.year -ecm = campaign.ecm -year2 = year % 100 -corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" - -# add processes we are interested in -process_names = [ - "data", - "tt", - "st", -] -for process_name in process_names: - # add the process - proc = cfg.add_process(procs.get(process_name)) - - # configuration of colors, labels, etc. can happen here - if proc.is_mc: - proc.color1 = (244, 182, 66) if proc.name == "tt" else (244, 93, 66) - -# add datasets we need to study -dataset_names = [ - # data - "data_mu_b", - # backgrounds - "tt_sl_powheg", - # signals - "st_tchannel_t_powheg", -] -for dataset_name in dataset_names: - # add the dataset - dataset = cfg.add_dataset(campaign.get_dataset(dataset_name)) - - # for testing purposes, limit the number of files to 2 - for info in dataset.info.values(): - info.n_files = min(info.n_files, 2) - -# verify that the root process of all datasets is part of any of the registered processes -verify_config_processes(cfg, warn=True) - -# default objects, such as calibrator, selector, producer, ml model, inference model, etc -cfg.x.default_calibrator = "example" -cfg.x.default_selector = "example" -cfg.x.default_producer = "example" -cfg.x.default_ml_model = None -cfg.x.default_inference_model = "example" -cfg.x.default_categories = ("incl",) -cfg.x.default_variables = ("n_jet", "jet1_pt") - - -# process groups for conveniently looping over certain processs -# (used in wrapper_factory and during plotting) -cfg.x.process_groups = {} - -# dataset groups for conveniently looping over certain datasets -# (used in wrapper_factory and during plotting) -cfg.x.dataset_groups = {} - -# category groups for 
conveniently looping over certain categories -# (used during plotting) -cfg.x.category_groups = {} - -# variable groups for conveniently looping over certain variables -# (used during plotting) -cfg.x.variable_groups = {} - -# shift groups for conveniently looping over certain shifts -# (used during plotting) -cfg.x.shift_groups = {} - -# general_settings groups for conveniently looping over different values for the general-settings parameter -# (used during plotting) -cfg.x.general_settings_groups = {} - -# process_settings groups for conveniently looping over different values for the process-settings parameter -# (used during plotting) -cfg.x.process_settings_groups = {} - -# variable_settings groups for conveniently looping over different values for the variable-settings parameter -# (used during plotting) -cfg.x.variable_settings_groups = {} - -# custom_style_config groups for conveniently looping over certain style configs -# (used during plotting) -cfg.x.custom_style_config_groups = {} - -# selector step groups for conveniently looping over certain steps -# (used in cutflow tasks) -cfg.x.selector_step_groups = { - "default": ["muon", "jet"], -} - -# calibrator groups for conveniently looping over certain calibrators -# (used during calibration) -cfg.x.calibrator_groups = {} - -# producer groups for conveniently looping over certain producers -# (used during the ProduceColumns task) -cfg.x.producer_groups = {} - -# ml_model groups for conveniently looping over certain ml_models -# (used during the machine learning tasks) -cfg.x.ml_model_groups = {} - - -# custom method and sandbox for determining dataset lfns -cfg.x.get_dataset_lfns = None -cfg.x.get_dataset_lfns_sandbox = None - -# whether to validate the number of obtained LFNs in GetDatasetLFNs -# (currently set to false because the number of files per dataset is truncated to 2) -cfg.x.validate_dataset_lfns = False - -# lumi values in inverse pb -# 
https://twiki.cern.ch/twiki/bin/view/CMS/LumiRecommendationsRun2?rev=2#Combination_and_correlations -cfg.x.luminosity = Number(41480, { - "lumi_13TeV_2017": 0.02j, - "lumi_13TeV_1718": 0.006j, - "lumi_13TeV_correlated": 0.009j, -}) - -# names of muon correction sets and working points -# (used in the muon producer) -cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}_UL") - -# register shifts -cfg.add_shift(name="nominal", id=0) - -# tune shifts are covered by dedicated, varied datasets, so tag the shift as "disjoint_from_nominal" -# (this is currently used to decide whether ML evaluations are done on the full shifted dataset) -cfg.add_shift(name="tune_up", id=1, type="shape", tags={"disjoint_from_nominal"}) -cfg.add_shift(name="tune_down", id=2, type="shape", tags={"disjoint_from_nominal"}) - -# fake jet energy correction shift, with aliases flaged as "selection_dependent", i.e. the aliases -# affect columns that might change the output of the event selection -cfg.add_shift(name="jec_up", id=20, type="shape") -cfg.add_shift(name="jec_down", id=21, type="shape") -add_shift_aliases( - cfg, - "jec", - { - "Jet.pt": "Jet.pt_{name}", - "Jet.mass": "Jet.mass_{name}", - "MET.pt": "MET.pt_{name}", - "MET.phi": "MET.phi_{name}", - }, -) - -# event weights due to muon scale factors -cfg.add_shift(name="mu_up", id=10, type="shape") -cfg.add_shift(name="mu_down", id=11, type="shape") -add_shift_aliases(cfg, "mu", {"muon_weight": "muon_weight_{direction}"}) - -# external files -json_mirror = "modules/jsonpog-integration" -lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year2}/{ecm:g}TeV/" -pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" -goldenjsons = { - 2016: f"Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt", - 2017: f"Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt", - 2018: f"Cert_314472-325175_13TeV_Legacy2018_Collisions18_JSON.txt", -} -cfg.x.external_files = DotDict.wrap({ - # 
lumi files (golden run 2 only!!) - "lumi": { - "golden": (f"{lumi_cert_site}/Legacy_{year}/{goldenjsons[year]}", "v1"), - "normtag": ("modules/Normtags/normtag_PHYSICS.json", "v1"), - }, - - # jet energy correction - "jet_jerc": (f"{json_mirror}/POG/JME/{year}{corr_postfix}_UL/jet_jerc.json.gz", "v1"), - - # electron scale factors - "electron_sf": (f"{json_mirror}/POG/EGM/{year}{corr_postfix}_UL/electron.json.gz", "v1"), - - # muon scale factors - "muon_sf": (f"{json_mirror}/POG/MUO/{year}{corr_postfix}_UL/muon_Z.json.gz", "v1"), - - # btag scale factor - "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - - # run 2 only!! - # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa - "pu": { - "json": (f"{pu_reweighting_site}/pileup_latest.txt", "v1"), # noqa - "mc_profile": ( - "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa - "v1"), # noqa - "data_profile": { - "nominal": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-69200ub-99bins.root", "v1"), - # noqa - "minbias_xs_up": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-72400ub-99bins.root", "v1"), - # noqa - "minbias_xs_down": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-66000ub-99bins.root", "v1"), - # noqa - }, - }, -}) - -# target file size after MergeReducedEvents in MB -cfg.x.reduced_file_size = 512.0 - -# columns to keep after certain steps -cfg.x.keep_columns = DotDict.wrap({ - "cf.ReduceEvents": { - # general event info, mandatory for reading files with coffea - ColumnCollection.MANDATORY_COFFEA, # additional columns can be added as strings, similar to object info - # object info - "Jet.pt", "Jet.eta", "Jet.phi", "Jet.mass", "Jet.btagDeepFlavB", "Jet.hadronFlavour", - "Muon.pt", "Muon.eta", 
"Muon.phi", "Muon.mass", "Muon.pfRelIso04_all", - "MET.pt", "MET.phi", "MET.significance", "MET.covXX", "MET.covXY", "MET.covYY", - "PV.npvs", - # all columns added during selection using a ColumnCollection flag - ColumnCollection.ALL_FROM_SELECTOR, - }, - "cf.MergeSelectionMasks": { - "cutflow.*", - }, - "cf.UniteColumns": { - "*", - }, -}) - -# event weight columns as keys in an OrderedDict, mapped to shift instances they depend on -get_shifts = functools.partial(get_shifts_from_sources, cfg) -cfg.x.event_weights = DotDict({ - "normalization_weight": [], - "muon_weight": get_shifts("mu"), -}) - -# versions per task family, either referring to strings or to callables receving the invoking -# task instance and parameters to be passed to the task family -cfg.x.versions = { - # "cf.CalibrateEvents": "prod1", - # "cf.SelectEvents": (lambda cls, inst, params: "prod1" if params.get("selector") == "default" else "dev1"), - # ... -} - -# channels -# (just one for now) -cfg.add_channel(name="mutau", id=1) - -# add categories using the "add_category" tool which adds auto-generated ids -# the "selection" entries refer to names of categorizers, e.g. in categorization/example.py -# note: it is recommended to always add an inclusive category with id=1 or name="incl" which is used -# in various places, e.g. 
for the inclusive cutflow plots and the "empty" selector -add_category( - cfg, - id=1, - name="incl", - selection="cat_incl", - label="inclusive", -) -add_category( - cfg, - name="2j", - selection="cat_2j", - label="2 jets", -) - -# add variables -# (the "event", "run" and "lumi" variables are required for some cutflow plotting task, -# and also correspond to the minimal set of columns that coffea's nano scheme requires) -cfg.add_variable( - name="event", - expression="event", - binning=(1, 0.0, 1.0e9), - x_title="Event number", - discrete_x=True, -) -cfg.add_variable( - name="run", - expression="run", - binning=(1, 100000.0, 500000.0), - x_title="Run number", - discrete_x=True, -) -cfg.add_variable( - name="lumi", - expression="luminosityBlock", - binning=(1, 0.0, 5000.0), - x_title="Luminosity block", - discrete_x=True, -) -cfg.add_variable( - name="n_jet", - expression="n_jet", - binning=(11, -0.5, 10.5), - x_title="Number of jets", - discrete_x=True, -) -# pt of all jets in every event -cfg.add_variable( - name="jets_pt", - expression="Jet.pt", - binning=(40, 0.0, 400.0), - unit="GeV", - x_title=r"$p_{T} of all jets$", -) -# pt of the first jet in every event -cfg.add_variable( - name="jet1_pt", # variable name, to be given to the "--variables" argument for the plotting task - expression="Jet.pt[:,0]", # content of the variable - null_value=EMPTY_FLOAT, # value to be given if content not available for event - binning=(40, 0.0, 400.0), # (bins, lower edge, upper edge) - unit="GeV", # unit of the variable, if any - x_title=r"Jet 1 $p_{T}$", # x title of histogram when plotted -) -# eta of the first jet in every event -cfg.add_variable( - name="jet1_eta", - expression="Jet.eta[:,0]", - null_value=EMPTY_FLOAT, - binning=(30, -3.0, 3.0), - x_title=r"Jet 1 $\eta$", -) -cfg.add_variable( - name="ht", - expression=lambda events: ak.sum(events.Jet.pt, axis=1), - binning=(40, 0.0, 800.0), - unit="GeV", - x_title="HT", -) -# weights -cfg.add_variable( - name="mc_weight", 
- expression="mc_weight", - binning=(200, -10, 10), - x_title="MC weight", -) -# cutflow variables -cfg.add_variable( - name="cf_jet1_pt", - expression="cutflow.jet1_pt", - binning=(40, 0.0, 400.0), - unit="GeV", - x_title=r"Jet 1 $p_{T}$", -) From bce6d88113167235697988859192a7ea891e418d Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 11:06:12 +0200 Subject: [PATCH 035/119] remove jsonpog-integration as submodule, use cvmfs instead --- .../__cf_module_name__/config/config___cf_short_name_lc__.py | 2 +- create_analysis.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py index cd50f6b39..fef167f8f 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -228,7 +228,7 @@ def add_config( cfg.x.muon_sf_names = ("NUM_TightRelIso_DEN_TightIDandIPCut", f"{year}{corr_postfix}_UL") # external files - json_mirror = "modules/jsonpog-integration" + json_mirror = "/cvmfs/cms.cern.ch/rsync/cms-nanoAOD/jsonpog-integration" year_short = str(year)[2:] # 20XX > XX lumi_cert_site = f"https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions{year_short}/{ecm:g}TeV" pu_reweighting_site = f"{lumi_cert_site}/PileUp/UltraLegacy" diff --git a/create_analysis.sh b/create_analysis.sh index a03886aed..bd3b69b01 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -274,7 +274,6 @@ create_analysis() { fi if [ "${cf_analysis_flavor}" = "ghent_template" ]; then git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_github}CMS-LUMI-POG/Normtags.git" modules/Normtags - git submodule add -b "${fetch_cmsdb_branch}" "${gh_prefix_gitlab}cms-nanoAOD/jsonpog-integration.git" modules/jsonpog-integration git submodule add -b 
"${fetch_cmsdb_branch}" "${gh_prefix_gitlab}ghentanalysis/cmsdb.git" modules/cmsdb fi From 75e6b92f2ec9acd527da950954c531778388d800 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:26:11 +0200 Subject: [PATCH 036/119] Update create_analysis.sh use main branch when creating new analysis --- create_analysis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/create_analysis.sh b/create_analysis.sh index bd3b69b01..dd8a8ddd9 100755 --- a/create_analysis.sh +++ b/create_analysis.sh @@ -17,7 +17,7 @@ create_analysis() { local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" local exec_dir="$( pwd )" - local fetch_cf_branch="pog_externals" + local fetch_cf_branch="main" local fetch_cmsdb_branch="master" local debug="${CF_CREATE_ANALYSIS_DEBUG:-false}" From 91fcb440622705fc3c1fc6be5f6394fffdde90f7 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Fri, 5 Apr 2024 11:29:27 +0200 Subject: [PATCH 037/119] Update README.md get main branch with curl --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c53a5a2e..93476800a 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ To create an analysis using columnflow, it is recommended to start from a predef The following command (no previous git clone required) interactively asks for a handful of names and settings, and creates a minimal, yet fully functioning project structure for you! ```shell -bash -c "$(curl -Ls https://raw.githubusercontent.com/GhentAnalysis/columnflow/pog_externals/create_analysis.sh)" +bash -c "$(curl -Ls https://raw.githubusercontent.com/GhentAnalysis/columnflow/main/create_analysis.sh)" ``` At the end of the setup, you will see further instructions and suggestions to run your first analysis tasks (example below). 
From e82982d5e4dcc4556552e0622b1003095831856c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 12:16:05 +0200 Subject: [PATCH 038/119] pile up scale factors are now provided centrally by the POGs, no in situ calculation anymore --- .../config/config___cf_short_name_lc__.py | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py index fef167f8f..838aa8b32 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -256,25 +256,8 @@ def add_config( # btag scale factor "btag_sf_corr": (f"{json_mirror}/POG/BTV/{year}{corr_postfix}_UL/btagging.json.gz", "v1"), - # run 2 only!! - # files from https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJSONFileforData?rev=44#Pileup_JSON_Files_For_Run_II # noqa - "pu": { - "json": (f"{pu_reweighting_site}/pileup_latest.txt", "v1"), # noqa - "mc_profile": ( - "https://raw.githubusercontent.com/cms-sw/cmssw/435f0b04c0e318c1036a6b95eb169181bbbe8344/SimGeneral/MixingModule/python/mix_2018_25ns_UltraLegacy_PoissonOOTPU_cfi.py", # noqa - "v1"), # noqa - "data_profile": { - "nominal": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-69200ub-99bins.root", "v1"), - # noqa - "minbias_xs_up": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-72400ub-99bins.root", "v1"), - # noqa - "minbias_xs_down": ( - f"{pu_reweighting_site}/PileupHistogram-goldenJSON-{ecm:g}tev-{year}-66000ub-99bins.root", "v1"), - # noqa - }, - }, + # Pile up scale factor + "pu_sf": (f"{json_mirror}/POG/LUM/{year}{corr_postfix}_UL/puWeights.json.gz", "v1") }) # process groups for conveniently looping over certain processs From 
08baba53bfde9c990be7e608c16c483d66b071ea Mon Sep 17 00:00:00 2001 From: juvanden Date: Fri, 5 Apr 2024 12:24:05 +0200 Subject: [PATCH 039/119] addition object_selection function --- .../__cf_module_name__/selection/default.py | 22 +++++--------- .../__cf_module_name__/selection/objects.py | 30 ++++++++++++++++--- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index 60072a617..4b830756a 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -23,7 +23,7 @@ from __cf_short_name_lc__.production.weights import event_weights_to_normalize from __cf_short_name_lc__.production.cutflow_features import cutflow_features -from __cf_short_name_lc__.selection.objects import electron_object, muon_object, jet_object +from __cf_short_name_lc__.selection.objects import object_selection from __cf_short_name_lc__.selection.stats import __cf_short_name_lc___increment_stats from __cf_short_name_lc__.selection.trigger import trigger_selection @@ -218,12 +218,12 @@ def post_selection_init(self: Selector) -> None: @selector( uses={ - pre_selection, post_selection, trigger_selection, - lepton_selection, jet_selection, + pre_selection, post_selection, + object_selection, trigger_selection, lepton_selection, jet_selection, }, produces={ - pre_selection, post_selection, trigger_selection, - lepton_selection, jet_selection, + pre_selection, post_selection, + object_selection, trigger_selection, lepton_selection, jet_selection, }, exposed=True, ) @@ -243,16 +243,8 @@ def default( results += trigger_results # apply muon object selection - events, muon_results = self[muon_object](events, stats, **kwargs) - results += muon_object_results - - # apply electron object selection - events, electron_results = 
self[electron_object](events, results, stats, **kwargs) - results += electron_object_results - - # apply jet object selection - events, jet_results = self[jet_object](events, results, stats, **kwargs) - results += jet_object_results + events, object_results = self[object_selection](events, stats, **kwargs) + results += object_results # apply lepton event selection events, lepton_selection_results = self[lepton_selection](events, results, stats, **kwargs) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index 86a2c349e..e1e6517e5 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -30,8 +30,7 @@ def masked_sorted_indices(mask: ak.Array, sort_var: ak.Array, ascending: bool = uses=four_vec( ("Muon"), ("sip3d", "dxy", "dz", "miniPFRelIso_all", "tightId") - ), - produces={"Muon.tight"}, + ) | {"event"}, triggers=None ) def muon_object( @@ -62,7 +61,7 @@ def muon_object( events = set_ak_column(events, "Muon.tight", mu_mask_tight, value_type=bool) return events, SelectionResult( - steps={}, + steps={ak.ones_like(events.event, value_type=bool)}, objects={ "Muon": { "Muon": masked_sorted_indices(mu_mask, muon.pt) @@ -79,7 +78,6 @@ def muon_object( ) | four_vec( ("Muon"), ), - produces={"Electron.tight"}, triggers=None ) def electron_object( @@ -165,3 +163,27 @@ def jet_object( }, }, ) + + +@selector( + uses=(muon_object, electron_object, jet_object), + exposed=False, +) +def object_selection( + self: Selector, + events: ak.Array, + stats: defaultdict, + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + # apply muon object selection + events, results = self[muon_object](events, stats, **kwargs) + + # apply electron object selection + events, electron_results = self[electron_object](events, results, stats, **kwargs) + results += electron_results 
+ + # apply jet object selection + events, jet_results = self[jet_object](events, results, stats, **kwargs) + results += jet_results + + return events, results From 4a7378b2ca38178bddaf773a489aaeb994945d3c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 15:30:31 +0200 Subject: [PATCH 040/119] remove bugs in selections --- .../__cf_module_name__/selection/default.py | 47 ++++++------------- .../__cf_module_name__/selection/objects.py | 9 ++-- 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index 4b830756a..ea232cbdd 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -74,7 +74,7 @@ def pre_selection( @selector( uses=four_vec( - ("Electron", "Muon"), + ("Electron", "Muon"), ("charge", "pdgId", "tight"), ), triggers=None ) @@ -91,44 +91,31 @@ def lepton_selection( muon = (events.Muon[results.objects.Muon.Muon]) # create new object: leptons - leptons = ak.concatenate([muon, electron], axis=-1) - leptons = leptons[ak.argsort(leptons.pt, axis=-1, ascending=False)] + lepton = ak.concatenate([muon, electron], axis=-1) + lepton = lepton[ak.argsort(lepton.pt, axis=-1, ascending=False)] # required for pt cuts and Z-cuts on masks fill_with = { "pt": -999, "eta": -999, "phi": -999, "charge": -999, - "pdgId": -999, "mass": -999, "e_idx": -999, "mu_idx": -999, - "sip3d": -999 + "pdgId": -999, "mass": -999, "sip3d": -999, 'tight': False, } - leptons = ak.fill_none(ak.pad_none(leptons, 2, axis=-1), fill_with) + lepton = ak.fill_none(ak.pad_none(lepton, 2, axis=-1), fill_with) # construct the Z-boson candidate mask - mll = (TetraVec(leptons[:, 0]) + TetraVec(leptons[:, 1])).mass + mll = (TetraVec(lepton[:, 0]) + TetraVec(lepton[:, 1])).mass z_mask = ( - (leptons[:, 0].charge != leptons[:, 
1].charge) & - (abs(leptons[:, 0].pdgId) == abs(leptons[:, 1].pdgId)) & + (lepton[:, 0].charge != lepton[:, 1].charge) & + (abs(lepton[:, 0].pdgId) == abs(lepton[:, 1].pdgId)) & (abs(mll - 91) < 15) ) lepton_mask = ( - (leptons.pt[:, 0] > 30) & - (leptons.pt[:, 1] > 20) & + (lepton.pt[:, 0] > 30) & + (lepton.pt[:, 1] > 20) & (~z_mask) & # no Z-boson peak leptons (ak.all(lepton.tight, axis=-1)) # all loose leptons in the event must be tight ) - # Electron and Muon indices corresponding to lepton selection - empty_events = ak.zeros_like(1 * events.event, dtype=np.uint16) - empty_indices = empty_events[..., None][..., :0] - e_indices = ak.where(lepton_mask, leptons.e_idx, empty_indices) - mu_indices = ak.where(lepton_mask, leptons.mu_idx, empty_indices) - e_indices_l = ak.drop_none(e_indices) - mu_indices_l = ak.drop_none(mu_indices) - - # loose indices on electron and muon - e_indices = masked_sorted_indices(e_mask_tight, electron.pt) - mu_indices = masked_sorted_indices(mu_mask_tight, muon.pt) - return events, SelectionResult( steps={ "Lepton": lepton_mask, @@ -137,7 +124,7 @@ def lepton_selection( aux={ # save the selected lepton for the duration of the selection # multiplication of a coffea particle with 1 yields the lorentz vector - "lepton": leptons, + "lepton": lepton, }, ) @@ -155,19 +142,15 @@ def jet_selection( ) -> Tuple[ak.Array, SelectionResult]: jet = (events.Jet[results.objects.Jet.Jet]) - lepton = results.aux.lepton - bjet_mask_medium = -(jet.btagDeepFlavB >= self.config_inst.x.btag_working_points.deepjet.medium) + bjet_mask_medium = (jet.btagDeepFlavB >= self.config_inst.x.btag_working_points.deepjet.medium) - jet_mask = ( - (ak.sum(bjet_mask_medium) >= 1) - ) + jet_event_mask = (ak.sum(bjet_mask_medium, axis=-1) >= 1) return events, SelectionResult( steps={ - "Jet": jet_mask, + "Jet": jet_event_mask, }, - objects={}, ) @@ -242,7 +225,7 @@ def default( events, trigger_results = self[trigger_selection](events, **kwargs) results += trigger_results - # 
apply muon object selection + # apply object selection events, object_results = self[object_selection](events, stats, **kwargs) results += object_results diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index e1e6517e5..694e9c738 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -43,7 +43,7 @@ def muon_object( muon = (events.Muon) # loose object electron mask - mu_mask_loose = ( + mu_mask = ( (abs(muon.eta) < 2.4) & (muon.pt > 10.) & (muon.miniPFRelIso_all < 0.4) & @@ -54,7 +54,7 @@ def muon_object( # tight object muon mask (tight cutbased ID) mu_mask_tight = ( - (mu_mask) & + (mu_mask_loose) & (muon.tightId) ) @@ -100,7 +100,7 @@ def electron_object( (electron.sip3d < 8) & (abs(electron.dxy) < 0.05) & (abs(electron.dz) < 0.1) & - (electron.losthist < 2) & + (electron.lostHits < 2) & (electron.isPFcand) & (electron.convVeto) & (electron.tightCharge > 1) & @@ -162,6 +162,9 @@ def jet_object( "Jet": jet_indices, }, }, + aux={ + "jet_mask": jet_mask, + }, ) From 2ab94a8d477bbc9cc127b13086cc6337546b4b6c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 15:49:15 +0200 Subject: [PATCH 041/119] added n_jets to auxiliaries --- .../ghent_template/__cf_module_name__/selection/objects.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index 694e9c738..5fb43b656 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -154,6 +154,7 @@ def jet_object( ) jet_indices = masked_sorted_indices(jet_mask, events.Jet.pt) + n_jets = ak.sum(jet_mask, axis=-1) return events, 
SelectionResult( steps={}, @@ -164,6 +165,7 @@ def jet_object( }, aux={ "jet_mask": jet_mask, + "n_jets": n_jets, }, ) From a7a016690a33ca62db148dd214715fab43666123 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 15:51:57 +0200 Subject: [PATCH 042/119] removed btag scalefactors for now (we want wp rather then shape correction) --- .../__cf_module_name__/production/weights.py | 373 +++++++++--------- 1 file changed, 187 insertions(+), 186 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/weights.py b/analysis_templates/ghent_template/__cf_module_name__/production/weights.py index 08e5cbed2..202b443d3 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/production/weights.py +++ b/analysis_templates/ghent_template/__cf_module_name__/production/weights.py @@ -1,186 +1,187 @@ -# coding: utf-8 - -""" -Column production methods related to generic event weights. -""" - -from columnflow.util import maybe_import -from columnflow.columnar_util import set_ak_column, has_ak_column, Route -from columnflow.selection import SelectionResult -from columnflow.production import Producer, producer -from columnflow.production.cms.pileup import pu_weight -from columnflow.production.normalization import normalization_weights -from columnflow.production.cms.electron import electron_weights -from columnflow.production.cms.muon import muon_weights -from columnflow.production.cms.btag import btag_weights -from columnflow.production.cms.scale import murmuf_weights, murmuf_envelope_weights -from columnflow.production.cms.pdf import pdf_weights -from __cf_short_name_lc__.production.normalized_weights import normalized_weight_factory -from __cf_short_name_lc__.production.normalized_btag import normalized_btag_weights - -np = maybe_import("numpy") -ak = maybe_import("awkward") - - -@producer( - produces={"event_weight"}, - mc_only=True, -) -def event_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - """ - Producer 
that calculates the 'final' event weight (as done in cf.CreateHistograms) - """ - weight = ak.Array(np.ones(len(events))) - if self.dataset_inst.is_mc: - for column in self.config_inst.x.event_weights: - weight = weight * Route(column).apply(events) - for column in self.dataset_inst.x("event_weights", []): - if has_ak_column(events, column): - weight = weight * Route(column).apply(events) - else: - self.logger.warning_once( - f"missing_dataset_weight_{column}", - f"weight '{column}' for dataset {self.dataset_inst.name} not found", - ) - - events = set_ak_column(events, "event_weight", weight) - - return events - - -@event_weight.init -def event_weight_init(self: Producer) -> None: - if not getattr(self, "dataset_inst", None): - return - - self.uses |= set(self.config_inst.x.event_weights.keys()) - self.uses |= set(self.dataset_inst.x("event_weights", {}).keys()) - - -@producer( - uses={pu_weight, btag_weights - }, - # don't save btag_weights to save storage space, since we can reproduce them in ProduceColumns - produces={pu_weight}, - mc_only=True, -) -def event_weights_to_normalize(self: Producer, events: ak.Array, results: SelectionResult, **kwargs) -> ak.Array: - """ - Wrapper of several event weight producers that are typically called as part of SelectEvents - since it is required to normalize them before applying certain event selections. 
- """ - - # compute pu weights - - events = self[pu_weight](events, **kwargs) - - # compute btag SF weights (for renormalization tasks) - events = self[btag_weights](events, jet_mask=results.aux["jet_mask"], **kwargs) - - # skip scale/pdf weights for some datasets (missing columns) - if not self.dataset_inst.has_tag("skip_scale"): - # compute scale weights - events = self[murmuf_envelope_weights](events, **kwargs) - - # read out mur and weights - events = self[murmuf_weights](events, **kwargs) - - if not self.dataset_inst.has_tag("skip_pdf"): - # compute pdf weights - events = self[pdf_weights]( - events, - outlier_action="remove", - outlier_log_mode="warning", - **kwargs, - ) - - return events - - -@event_weights_to_normalize.init -def event_weights_to_normalize_init(self) -> None: - if not getattr(self, "dataset_inst", None): - return - - if not self.dataset_inst.has_tag("skip_scale"): - self.uses |= {murmuf_envelope_weights, murmuf_weights} - self.produces |= {murmuf_envelope_weights, murmuf_weights} - - if not self.dataset_inst.has_tag("skip_pdf"): - self.uses |= {pdf_weights} - self.produces |= {pdf_weights} - - -normalized_scale_weights = normalized_weight_factory( - producer_name="normalized_scale_weights", - weight_producers={murmuf_envelope_weights, murmuf_weights}, -) - -normalized_pdf_weights = normalized_weight_factory( - producer_name="normalized_pdf_weights", - weight_producers={pdf_weights}, -) - -normalized_pu_weights = normalized_weight_factory( - producer_name="normalized_pu_weights", - weight_producers={pu_weight}, -) - - -@producer( - uses={ - normalization_weights, electron_weights, muon_weights, btag_weights, - normalized_btag_weights, - normalized_pu_weights, - event_weight, - }, - produces={ - normalization_weights, electron_weights, muon_weights, - normalized_btag_weights, - normalized_pu_weights, - event_weight, - }, - mc_only=True, -) -def event_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array: - """ - Wrapper of several 
event weight producers that are typically called in ProduceColumns. - """ - # compute normalization weights - - events = self[normalization_weights](events, **kwargs) - - # compute btag SF weights - events = self[btag_weights](events, **kwargs) - # compute electron and muon SF weights - events = self[electron_weights](events, **kwargs) - events = self[muon_weights](events, **kwargs) - - # normalize event weights using stats - events = self[normalized_btag_weights](events, **kwargs) - events = self[normalized_pu_weights](events, **kwargs) - - if not self.dataset_inst.has_tag("skip_scale"): - events = self[normalized_scale_weights](events, **kwargs) - - if not self.dataset_inst.has_tag("skip_pdf"): - events = self[normalized_pdf_weights](events, **kwargs) - - # calculate the full event weight for plotting purposes - events = self[event_weight](events, **kwargs) - - return events - - -@event_weights.init -def event_weights_init(self: Producer) -> None: - if not getattr(self, "dataset_inst", None): - return - - if not self.dataset_inst.has_tag("skip_scale"): - self.uses |= {normalized_scale_weights} - self.produces |= {normalized_scale_weights} - - if not self.dataset_inst.has_tag("skip_pdf"): - self.uses |= {normalized_pdf_weights} - self.produces |= {normalized_pdf_weights} +# coding: utf-8 + +""" +Column production methods related to generic event weights. 
+""" + +from columnflow.util import maybe_import +from columnflow.columnar_util import set_ak_column, has_ak_column, Route +from columnflow.selection import SelectionResult +from columnflow.production import Producer, producer +from columnflow.production.cms.pileup import pu_weight +from columnflow.production.normalization import normalization_weights +from columnflow.production.cms.electron import electron_weights +from columnflow.production.cms.muon import muon_weights +from columnflow.production.cms.btag import btag_weights +from columnflow.production.cms.scale import murmuf_weights, murmuf_envelope_weights +from columnflow.production.cms.pdf import pdf_weights +from __cf_short_name_lc__.production.normalized_weights import normalized_weight_factory +from __cf_short_name_lc__.production.normalized_btag import normalized_btag_weights + +np = maybe_import("numpy") +ak = maybe_import("awkward") + + +@producer( + produces={"event_weight"}, + mc_only=True, +) +def event_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + """ + Producer that calculates the 'final' event weight (as done in cf.CreateHistograms) + """ + weight = ak.Array(np.ones(len(events))) + if self.dataset_inst.is_mc: + for column in self.config_inst.x.event_weights: + weight = weight * Route(column).apply(events) + for column in self.dataset_inst.x("event_weights", []): + if has_ak_column(events, column): + weight = weight * Route(column).apply(events) + else: + self.logger.warning_once( + f"missing_dataset_weight_{column}", + f"weight '{column}' for dataset {self.dataset_inst.name} not found", + ) + + events = set_ak_column(events, "event_weight", weight) + + return events + + +@event_weight.init +def event_weight_init(self: Producer) -> None: + if not getattr(self, "dataset_inst", None): + return + + self.uses |= set(self.config_inst.x.event_weights.keys()) + self.uses |= set(self.dataset_inst.x("event_weights", {}).keys()) + + +@producer( + uses={pu_weight, btag_weights + }, + # 
don't save btag_weights to save storage space, since we can reproduce them in ProduceColumns + produces={pu_weight}, + mc_only=True, +) +def event_weights_to_normalize(self: Producer, events: ak.Array, results: SelectionResult, **kwargs) -> ak.Array: + """ + Wrapper of several event weight producers that are typically called as part of SelectEvents + since it is required to normalize them before applying certain event selections. + """ + + # compute pu weights + + events = self[pu_weight](events, **kwargs) + + # TODO: compute btag SF weights (for renormalization tasks) + # btag_weights works for btagging shape corrections. Not for wp. + # events = self[btag_weights](events, jet_mask=results.aux["jet_mask"], **kwargs) + + # skip scale/pdf weights for some datasets (missing columns) + if not self.dataset_inst.has_tag("skip_scale"): + # compute scale weights + events = self[murmuf_envelope_weights](events, **kwargs) + + # read out mur and weights + events = self[murmuf_weights](events, **kwargs) + + if not self.dataset_inst.has_tag("skip_pdf"): + # compute pdf weights + events = self[pdf_weights]( + events, + outlier_action="remove", + outlier_log_mode="warning", + **kwargs, + ) + + return events + + +@event_weights_to_normalize.init +def event_weights_to_normalize_init(self) -> None: + if not getattr(self, "dataset_inst", None): + return + + if not self.dataset_inst.has_tag("skip_scale"): + self.uses |= {murmuf_envelope_weights, murmuf_weights} + self.produces |= {murmuf_envelope_weights, murmuf_weights} + + if not self.dataset_inst.has_tag("skip_pdf"): + self.uses |= {pdf_weights} + self.produces |= {pdf_weights} + + +normalized_scale_weights = normalized_weight_factory( + producer_name="normalized_scale_weights", + weight_producers={murmuf_envelope_weights, murmuf_weights}, +) + +normalized_pdf_weights = normalized_weight_factory( + producer_name="normalized_pdf_weights", + weight_producers={pdf_weights}, +) + +normalized_pu_weights = normalized_weight_factory( + 
producer_name="normalized_pu_weights", + weight_producers={pu_weight}, +) + + +@producer( + uses={ + normalization_weights, electron_weights, muon_weights, btag_weights, + normalized_btag_weights, + normalized_pu_weights, + event_weight, + }, + produces={ + normalization_weights, electron_weights, muon_weights, + normalized_btag_weights, + normalized_pu_weights, + event_weight, + }, + mc_only=True, +) +def event_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array: + """ + Wrapper of several event weight producers that are typically called in ProduceColumns. + """ + # compute normalization weights + + events = self[normalization_weights](events, **kwargs) + + # compute btag SF weights + events = self[btag_weights](events, **kwargs) + # compute electron and muon SF weights + events = self[electron_weights](events, **kwargs) + events = self[muon_weights](events, **kwargs) + + # normalize event weights using stats + events = self[normalized_btag_weights](events, **kwargs) + events = self[normalized_pu_weights](events, **kwargs) + + if not self.dataset_inst.has_tag("skip_scale"): + events = self[normalized_scale_weights](events, **kwargs) + + if not self.dataset_inst.has_tag("skip_pdf"): + events = self[normalized_pdf_weights](events, **kwargs) + + # calculate the full event weight for plotting purposes + events = self[event_weight](events, **kwargs) + + return events + + +@event_weights.init +def event_weights_init(self: Producer) -> None: + if not getattr(self, "dataset_inst", None): + return + + if not self.dataset_inst.has_tag("skip_scale"): + self.uses |= {normalized_scale_weights} + self.produces |= {normalized_scale_weights} + + if not self.dataset_inst.has_tag("skip_pdf"): + self.uses |= {normalized_pdf_weights} + self.produces |= {normalized_pdf_weights} From bc457453d7bdd531a860e095b76dee2bcac208ad Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 15:52:14 +0200 Subject: [PATCH 043/119] added all used function to uses --- 
.../__cf_module_name__/production/weights.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/weights.py b/analysis_templates/ghent_template/__cf_module_name__/production/weights.py index 202b443d3..e435ae266 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/production/weights.py +++ b/analysis_templates/ghent_template/__cf_module_name__/production/weights.py @@ -58,8 +58,13 @@ def event_weight_init(self: Producer) -> None: @producer( - uses={pu_weight, btag_weights - }, + uses={ + pu_weight, + # btag_weights, # TODO: compute btag SF weights (this one applies shape correction) + murmuf_envelope_weights, + murmuf_weights, + pdf_weights + }, # don't save btag_weights to save storage space, since we can reproduce them in ProduceColumns produces={pu_weight}, mc_only=True, From 53a955e19ef1ff0858d028ca4b82fdb8aa376bf8 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 15:53:10 +0200 Subject: [PATCH 044/119] removed btagging stats necessary for b-tagging shape correction (we're interested in wp corrections, not yet implemented) --- .../__cf_module_name__/selection/stats.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py index 6ce74484c..8e2b0db92 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py @@ -21,7 +21,10 @@ @selector( - uses={increment_stats, btag_weights, event_weights_to_normalize}, + uses={ + increment_stats, + event_weights_to_normalize + }, ) def __cf_short_name_lc___increment_stats( self: Selector, @@ -32,7 +35,6 @@ def __cf_short_name_lc___increment_stats( ) -> ak.Array: # collect important information from the results event_mask = results.event - 
event_mask_no_bjet = results.event # currently no b-jet selection, will change to a result without b jet selection n_jets = results.x.n_jets # weight map definition @@ -40,7 +42,6 @@ def __cf_short_name_lc___increment_stats( # "num" operations "num_events": Ellipsis, # all events "num_events_selected": event_mask, # selected events only - "num_events_selected_no_bjet": event_mask_no_bjet, } if self.dataset_inst.is_mc: @@ -48,12 +49,9 @@ def __cf_short_name_lc___increment_stats( # "sum" operations weight_map["sum_mc_weight"] = events.mc_weight # weights of all events weight_map["sum_mc_weight_selected"] = (events.mc_weight, event_mask) # weights of selected events - weight_map["sum_mc_weight_no_bjet"] = (events.mc_weight, event_mask_no_bjet) - weight_map["sum_mc_weight_selected_no_bjet"] = (events.mc_weight, event_mask_no_bjet) weight_columns = list( - set(self[event_weights_to_normalize].produced_columns) | - set(self[btag_weights].produced_columns), + set(self[event_weights_to_normalize].produced_columns) ) weight_columns = sorted([col.string_nano_column for col in weight_columns]) @@ -68,12 +66,6 @@ def __cf_short_name_lc___increment_stats( # weights for selected events weight_map[f"sum_mc_weight_{name}_selected"] = (events.mc_weight * events[name], event_mask) - if name.startswith("btag_weight"): - # weights for selected events, excluding the bjet selection - weight_map[f"sum_mc_weight_{name}_selected_no_bjet"] = ( - (events.mc_weight * events[name], event_mask_no_bjet) - ) - group_map = { "process": { "values": events.process_id, From a23ca6e07d20fff0145ce68390ea0fb24647b071 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 16:22:48 +0200 Subject: [PATCH 045/119] year used before definition --- .../config/config___cf_short_name_lc__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py 
b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py index 838aa8b32..632cf8b6e 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -32,8 +32,9 @@ def add_config( limit_dataset_files: int | None = None, ) -> od.Config: # validations - assert campaign.x.year in [2016, 2017, 2018] # only run 2 implemented - if campaign.x.year == 2016: + year = campaign.x.year + assert year in [2016, 2017, 2018] # only run 2 implemented + if year == 2016: assert campaign.x.vfp in ["pre", "post"] # only 2018 fully implemented @@ -42,7 +43,6 @@ def add_config( cfg = analysis.add_config(campaign, name=config_name, id=config_id, tags=analysis.tags) - year = campaign.x.year year2 = year % 100 corr_postfix = f"{campaign.x.vfp}VFP" if year == 2016 else "" ecm = campaign.ecm From bc24ba2c29c425fa4a9ac161953a38f1247df445 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 16:44:54 +0200 Subject: [PATCH 046/119] updated readme.md for Ghent specific context --- README.md | 61 ++++++++----------------------------------------------- 1 file changed, 8 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 93476800a..98c6b2968 100644 --- a/README.md +++ b/README.md @@ -33,25 +33,17 @@ Backend for columnar, fully orchestrated HEP analyses with pure Python, [law](https://github.com/riga/law) and [order](https://github.com/riga/order). -Original source hosted at [GitHub](https://github.com/columnflow/columnflow). +This project is for use within the Ghent CMS group. Original source hosted at [GitHub](https://github.com/columnflow/columnflow). -## Note on current development - -This project is currently in a beta phase. 
-The project setup, suggested workflows, definitions of particular tasks, and the signatures of various helper classes and functions are mostly frozen but could still be subject to changes in the near future. -At this point (July 2023), four large-scale analyses based upon columnflow are being developed, and in the process, help test and verify various aspects of its core. -The first released version is expected in the fall of 2023. -However, if you would like to join early on, contribute or just give it a spin, feel free to get in touch! - -![Columnflow analytics](https://repobeats.axiom.co/api/embed/b6ebc5ba41019de55eb48e195eecb438890442c8.svg "Columnflow analytics") +![Alt](https://repobeats.axiom.co/api/embed/8cca127835f18d377e3a691220ae296ac9c80d49.svg "Columnflow Ghent analytics") @@ -59,8 +51,9 @@ However, if you would like to join early on, contribute or just give it a spin, ## Quickstart -To create an analysis using columnflow, it is recommended to start from a predefined template (located in [analysis_templates](https://github.com/columnflow/columnflow/tree/master/analysis_templates)). +To create an analysis using columnflow, it is recommended to start from a predefined template (located in [analysis_templates](https://github.com/GhentAnalysis/columnflow/tree/main/analysis_templates)). The following command (no previous git clone required) interactively asks for a handful of names and settings, and creates a minimal, yet fully functioning project structure for you! +The 'cms_minimal' flavor corresponds to the template provided by columnflow itself. 'Ghent_template' provides a more extensive example. ```shell bash -c "$(curl -Ls https://raw.githubusercontent.com/GhentAnalysis/columnflow/main/create_analysis.sh)" @@ -108,52 +101,14 @@ Setup successfull! 
The next steps are: For a better overview of the tasks that are triggered by the commands below, checkout the current (yet stylized) [task graph](https://github.com/columnflow/columnflow/wiki#default-task-graph). -## Projects using columnflow - -- [hh2bbtautau](https://github.com/uhh-cms/hh2bbtautau): HH → bb𝜏𝜏 analysis with CMS. -- [hh2bbww](https://github.com/uhh-cms/hh2bbww): HH → bbWW analysis with CMS. -- [topmass](https://github.com/uhh-cms/topmass): Top quark mass measurement with CMS. -- [mttbar](https://github.com/uhh-cms/mttbar): Search for heavy resonances in ttbar events with CMS. -- [analysis playground](https://github.com/uhh-cms/analysis_playground): A testing playground for HEP analyses. - - -## Contributors - - - - - - - - - - - - - - - - - - - - - - - -
Marcel Rieger
Marcel Rieger

💻 👀 📖 ⚠️
Mathis Frahm
Mathis Frahm

💻 👀
Daniel Savoiu
Daniel Savoiu

💻 👀
pkausw
pkausw

💻 👀
nprouvost
nprouvost

💻 ⚠️
Bogdan-Wiederspan
Bogdan-Wiederspan

💻 ⚠️
Tobias Kramer
Tobias Kramer

💻 👀
Matthias Schroeder
Matthias Schroeder

💻
Johannes Lange
Johannes Lange

💻
BalduinLetzer
BalduinLetzer

💻
JanekMoels
JanekMoels

🤔
haddadanas
haddadanas

💻
- - - - - - -This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. +## Other projects and original developers +You can find a list of other projects using columnflow on the [original github](https://github.com/columnflow/columnflow). +The main contributors to columnflow are also listed there. ## Development - Source hosted at [GitHub](https://github.com/columnflow/columnflow) -- Report issues, questions, feature requests on [GitHub Issues](https://github.com/columnflow/columnflow/issues) +- Report issues, questions, feature requests for columnflow to [GitHub Issues](https://github.com/columnflow/columnflow/issues) From e8a05753faac5ab1d9d54723cf5d26c0f9c16170 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 16:53:01 +0200 Subject: [PATCH 047/119] updated readme.md for Ghent template --- analysis_templates/ghent_template/README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index ac945c23e..efb92252b 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -1,9 +1,19 @@ # __cf_analysis_name__ Analysis +# Object Definition -### Resources +## Electrons + +Defined in [selection/objects.py:electron_object](selection/objects.py). 
+ +# Calibration + +# Event selection + +# Resources - [columnflow](https://github.com/uhh-cms/columnflow) - [law](https://github.com/riga/law) - [order](https://github.com/riga/order) - [luigi](https://github.com/spotify/luigi) + From 193642b781fd616a2e2070720d5bc51374704e25 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 16:55:56 +0200 Subject: [PATCH 048/119] updated readme.md for Ghent template --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index efb92252b..20a521ef7 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -4,7 +4,7 @@ ## Electrons -Defined in [selection/objects.py:electron_object](selection/objects.py). +Defined in [selection/objects.py:electron_object](__cf_analysis_name__/selection/objects.py). # Calibration From 111197cf97cde4b2165749e057f440c47a6e2c53 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Fri, 5 Apr 2024 16:57:10 +0200 Subject: [PATCH 049/119] Update README.md --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 20a521ef7..017c189f5 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -4,7 +4,7 @@ ## Electrons -Defined in [selection/objects.py:electron_object](__cf_analysis_name__/selection/objects.py). +Defined in [selection/objects.py:electron_object](./__cf_analysis_name__/selection/objects.py). 
# Calibration From c6a4a106686ed99c97bca019d2263bf3b08c62a4 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Fri, 5 Apr 2024 16:58:18 +0200 Subject: [PATCH 050/119] Update README.md --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 017c189f5..14340d57d 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -4,7 +4,7 @@ ## Electrons -Defined in [selection/objects.py:electron_object](./__cf_analysis_name__/selection/objects.py). +Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/objects.py). # Calibration From daf6985ec374cdccdea24bf6190173fc103f0e01 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:06:46 +0200 Subject: [PATCH 051/119] math trial README.md --- analysis_templates/ghent_template/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 20a521ef7..e858032d5 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -6,6 +6,9 @@ Defined in [selection/objects.py:electron_object](__cf_analysis_name__/selection/objects.py). 
+$\sqrt{3x-1}+(1+x)^2$ + + # Calibration # Event selection From 991e99867567ae8c3efd1914dc1e729cebefa825 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:27:51 +0200 Subject: [PATCH 052/119] electron object selection --- analysis_templates/ghent_template/README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 6dab84171..e8341886f 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -6,8 +6,16 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/objects.py). -$\sqrt{3x-1}+(1+x)^2$ - +- $\abs{eta} < 2.5$ +- $p_T > 15$ +- $miniPFRelIso_all < 0.4$ +- $sip3d < 8$ +- &dxy < 0.05$ +- $lostHits < 2$ +- isPFcand +- convVeto +- $tightCharge > 1$ +- without a tight muon closeby ($\\Delta R < 0.05$) # Calibration From f56f51806cbaeb998412a0fe00410d2a44d34a7c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:29:58 +0200 Subject: [PATCH 053/119] electron object selection --- analysis_templates/ghent_template/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index e8341886f..dff58e019 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -10,8 +10,9 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - $p_T > 15$ - $miniPFRelIso_all < 0.4$ - $sip3d < 8$ -- &dxy < 0.05$ -- $lostHits < 2$ +- $d_{xy} < 0.05$ +- $d_z < 0.1$ +- $\texttt{lostHits} < 2$ - isPFcand - convVeto - $tightCharge > 1$ From 7c7708e032831f7bbdc54f9f2d23209f5c5da4af Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:32:03 +0200 Subject: [PATCH 054/119] electron object selection --- analysis_templates/ghent_template/README.md | 14 +++++++------- 1 file 
changed, 7 insertions(+), 7 deletions(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index dff58e019..36e17ae7c 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -6,16 +6,16 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/objects.py). -- $\abs{eta} < 2.5$ +- $|eta| < 2.5$ - $p_T > 15$ -- $miniPFRelIso_all < 0.4$ -- $sip3d < 8$ +- $\texttt{miniPFRelIso_all} < 0.4$ +- $\texttt{sip3d} < 8$ - $d_{xy} < 0.05$ - $d_z < 0.1$ -- $\texttt{lostHits} < 2$ -- isPFcand -- convVeto -- $tightCharge > 1$ +- $lostHits} < 2$ +- is a PF candidate +- with conversion veto applied +- $\texttt{tightCharge} > 1$ - without a tight muon closeby ($\\Delta R < 0.05$) # Calibration From 78923d75bc603e7030888a728dc9c1725a0cd409 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:33:15 +0200 Subject: [PATCH 055/119] electron object selection --- analysis_templates/ghent_template/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 36e17ae7c..a57c2aaf6 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -8,11 +8,11 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - $|eta| < 2.5$ - $p_T > 15$ -- $\texttt{miniPFRelIso_all} < 0.4$ +- $\texttt{miniPFRelIso\_all} < 0.4$ - $\texttt{sip3d} < 8$ - $d_{xy} < 0.05$ - $d_z < 0.1$ -- $lostHits} < 2$ +- $\texttt{lostHits} < 2$ - is a PF candidate - with conversion veto applied - $\texttt{tightCharge} > 1$ From a5aeb337d71383c87f471adf19c56f03515ff009 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:33:46 +0200 Subject: [PATCH 056/119] electron object selection --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index a57c2aaf6..3d4304f9a 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -8,7 +8,7 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - $|eta| < 2.5$ - $p_T > 15$ -- $\texttt{miniPFRelIso\_all} < 0.4$ +- $\texttt{miniPFRelIso}_\texttt{all} < 0.4$ - $\texttt{sip3d} < 8$ - $d_{xy} < 0.05$ - $d_z < 0.1$ From 12b8db807a8c3270375026ecb121ef10e52305d3 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:34:20 +0200 Subject: [PATCH 057/119] electron object selection --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 3d4304f9a..b10fc22c5 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -12,7 +12,7 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - $\texttt{sip3d} < 8$ - $d_{xy} < 0.05$ - $d_z < 0.1$ -- $\texttt{lostHits} < 2$ +- at most one lost hit - is a PF candidate - with conversion veto applied - $\texttt{tightCharge} > 1$ From ccec828d4724c6231b83c38107738b79f1128106 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:34:58 +0200 Subject: [PATCH 058/119] electron object selection --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index b10fc22c5..95601dd22 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -8,7 +8,7 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - $|eta| < 2.5$ - $p_T > 15$ -- $\texttt{miniPFRelIso}_\texttt{all} < 0.4$ +- 
\texttt{miniPFRelIso_all}$ < 0.4$ - $\texttt{sip3d} < 8$ - $d_{xy} < 0.05$ - $d_z < 0.1$ From 8e2f2a579b1d2968c235ee5ea9fd7d6e5e30ba1e Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:36:34 +0200 Subject: [PATCH 059/119] electron object selection --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 95601dd22..f5059e6ef 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -8,7 +8,7 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - $|eta| < 2.5$ - $p_T > 15$ -- \texttt{miniPFRelIso_all}$ < 0.4$ +- $\texttt{miniPFRelIso all} < 0.4$ - $\texttt{sip3d} < 8$ - $d_{xy} < 0.05$ - $d_z < 0.1$ From faf0c0e152267c5923c4d8bcf12b1671bb751ee3 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 5 Apr 2024 17:36:57 +0200 Subject: [PATCH 060/119] electron object selection --- analysis_templates/ghent_template/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index f5059e6ef..0bf9784fe 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -6,7 +6,7 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/objects.py). 
-- $|eta| < 2.5$ +- $|\eta| < 2.5$ - $p_T > 15$ - $\texttt{miniPFRelIso all} < 0.4$ - $\texttt{sip3d} < 8$ From 6d588faf3ff4aa36ddb23e3b78031f1331e08098 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 13:43:21 +0200 Subject: [PATCH 061/119] removed redefinition of jec_nominal (already defined in columnflow.calibration.cms.jets) --- .../__cf_module_name__/calibration/default.py | 4 +--- .../__cf_module_name__/calibration/jet.py | 11 ----------- 2 files changed, 1 insertion(+), 14 deletions(-) delete mode 100644 analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py index 8eab79701..91071e4a7 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py @@ -5,12 +5,10 @@ """ from columnflow.calibration import Calibrator, calibrator -from columnflow.calibration.cms.jets import jec, jer +from columnflow.calibration.cms.jets import jec, jer, jec_nominal from columnflow.production.cms.seeds import deterministic_seeds from columnflow.util import maybe_import -from __cf_short_name_lc__.calibration.jet import jec_nominal - ak = maybe_import("awkward") diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py deleted file mode 100644 index 0164fda4f..000000000 --- a/analysis_templates/ghent_template/__cf_module_name__/calibration/jet.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding: utf-8 - -""" -Custom jet energy calibration methods that disable data uncertainties (for searches). 
-""" - -from columnflow.calibration.cms.jets import jec - - -# custom jec calibrator that only runs nominal correction -jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []}) \ No newline at end of file From 33f68a9dc90e9a809c98eed1cc12cccd68c6229e Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 13:53:29 +0200 Subject: [PATCH 062/119] fixed syntax mistake in decorator --- .../__cf_module_name__/production/cutflow_features.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py b/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py index 6f493526c..592e81839 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py +++ b/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py @@ -16,7 +16,7 @@ ak = maybe_import("awkward") -producer( +@producer( uses={ mc_weight, category_ids, # nano columns @@ -28,8 +28,6 @@ "cutflow.jet1_pt", }, ) - - def cutflow_features( self: Producer, events: ak.Array, From eda448ced72314ed40d366f1b5481fc8f023ef77 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 14:29:24 +0200 Subject: [PATCH 063/119] removed CR_WZ from regions (ttZ specific) --- .../ghent_template/__cf_module_name__/config/categories.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/categories.py b/analysis_templates/ghent_template/__cf_module_name__/config/categories.py index f51f38de9..6e3649f02 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/categories.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/categories.py @@ -22,7 +22,7 @@ def add_categories_selection(config: od.Config) -> None: Adds categories to a *config*, that are typically produced in `SelectEvents`. 
""" - config.x.regions = ("incl", "CR_WZ") + config.x.regions = ("incl") config.x.lepton_channels = ("2e", "1e1mu", "2mu") config.x.lepton_channel_labels = {"2e": "$ee$", "1e1mu": "$e\mu$", "2mu": "$\mu\mu$"} From 09bd65f357a81b744220e31cd75ced5b5963a6e1 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 15:15:51 +0200 Subject: [PATCH 064/119] mu_mask_loose > mu_mask --- .../ghent_template/__cf_module_name__/selection/objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index 5fb43b656..cd4f3a049 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -54,7 +54,7 @@ def muon_object( # tight object muon mask (tight cutbased ID) mu_mask_tight = ( - (mu_mask_loose) & + (mu_mask) & (muon.tightId) ) From 4863e1b31abb4509264891e55e844628002adaf2 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 9 Apr 2024 15:21:04 +0200 Subject: [PATCH 065/119] Template readme (#2) Update template README.md and added note on uses of init decorator --- analysis_templates/ghent_template/README.md | 66 ++++++++++++++++++- .../__cf_module_name__/calibration/default.py | 5 ++ .../__cf_module_name__/selection/default.py | 2 +- .../__cf_module_name__/selection/objects.py | 2 +- 4 files changed, 71 insertions(+), 4 deletions(-) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 0bf9784fe..0adaff97f 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -2,9 +2,25 @@ # Object Definition +All objects collected in [selection/objects.py:object_selection](__cf_module_name__/selection/objects.py#L177). 
+ +## Muons + +Defined in [selection/objects.py:muon_object](__cf_module_name__/selection/objects.py#L36). + +- $|\eta| < 2.4$ +- $p_T > 10$ +- $\texttt{miniPFRelIso all} < 0.4$ +- $\texttt{sip3d} < 8$ +- $d_{xy} < 0.05$ +- $d_z < 0.1$ + +Tight muons additionally require: +- $\texttt{tightId}$ + ## Electrons -Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/objects.py). +Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/objects.py#L83). - $|\eta| < 2.5$ - $p_T > 15$ @@ -16,12 +32,58 @@ Defined in [selection/objects.py:electron_object](__cf_module_name__/selection/o - is a PF candidate - with conversion veto applied - $\texttt{tightCharge} > 1$ -- without a tight muon closeby ($\\Delta R < 0.05$) +- without a muon closeby ($\\Delta R < 0.05$) + +## Jets + +Defined in [selection/objects.py:jet_object](__cf_module_name__/selection/objects.py#L132). + +- ak4 Jets (standard Jet collection in NanoAOD) +- $|\eta| < 2.5$ +- $p_T > 30$ +- $\texttt{jetId} \\ge 2$ +- not containing a muon or lepton ($\\Delta R < 0.4$) + # Calibration +Currently only the JEC and JER corrections are implemented. Two procedures are defined: +- Full JEC uncertainties, no JER: [calibration/default.py:default](__cf_module_name__/calibration/default.py#L21). +- Only nominal JEC, but also JER: [calibration/default.py:skip_jecunc](__cf_module_name__/calibration/default.py#L50). + +The applied procedure can be specified at +[config/config___cf_short_name_lc__.py:cfg.x.default_calibrator](__cf_module_name__/config/config___cf_short_name_lc__.py#L339). + + # Event selection +The aim is to select $t\overline{t}$ events. +Full default selection flow collected in [selection/default.py:default](__cf_module_name__/selection/default.py#L213). 
+Different selections can be defined by writing a similar function, and changing the configuration at [config/config___cf_short_name_lc__.py:cfg.x.default_selector](__cf_module_name__/config/config___cf_short_name_lc__.py#L340). + + +- triggers applied in [selection/trigger.py:default](__cf_module_name__/selection/trigger.py#L57) + - listed in [selection/trigger.py:add_triggers](__cf_module_name__/selection/trigger.py#L11) +- lepton selection applied in [selection/default.py:lepton_selection](__cf_module_name__/selection/default.py#L81). + - remove Z resonance (same flavour, opposite sign, $|m_{\ell\ell} - 91| < 15$) + - leading lepton $p_T > 30$ + - subleading lepton $p_T > 20$ + - all leptons in the event should be tight +- jet selection applied in [selection/default.py:jet_selection](__cf_module_name__/selection/default.py#L136). + - one b-tagged jet + +Note that selections are calculated as masks but not yet applied. + +# Categories / channels + +Four channels are defined in the configuration file, described in [config/categories.py](__cf_module_name__/config/categories.py) and implemented in [categorization/example.py](__cf_module_name__/categorization/example.py). 
+ +- $ee$ [selection/categories.py:catid_selection_2e](__cf_module_name__/selection/categories.py#L24) +- $e\mu$ [selection/categories.py:catid_selection_1e1mu](__cf_module_name__/selection/categories.py#L33) +- $\mu\mu$ [selection/categories.py:catid_selection_2mu](__cf_module_name__/selection/categories.py#L42) +- inclusive [selection/categories.py:catid_selection_incl](__cf_module_name__/selection/categories.py#L14) + + # Resources - [columnflow](https://github.com/uhh-cms/columnflow) diff --git a/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py b/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py index 91071e4a7..7854b3f78 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/calibration/default.py @@ -27,6 +27,10 @@ def default(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: return events +# NOTE: +# the function together with its @default.init decorator allows to customise the initialization of the +# calibration function as performed by the @calibrator decorator. Here, we extend the uses={...} and produces={...} +# sets dynamically, because what is used and produced depends on whether we are processing MC or data. 
@default.init def default_init(self: Calibrator) -> None: if not getattr(self, "dataset_inst", None): @@ -58,6 +62,7 @@ def skip_jecunc(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: return events +# NOTE: see default_init @skip_jecunc.init def skip_jecunc_init(self: Calibrator) -> None: if not getattr(self, "dataset_inst", None): diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index ea232cbdd..66ba2ec7c 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -63,7 +63,7 @@ def pre_selection( if self.dataset_inst.is_mc: events = self[mc_weight](events, **kwargs) - # create process ids together with custom ttz definition (and future wz 0b, 1b, >2b definition) + # create process ids events = self[process_ids](events, **kwargs) # ensure coffea behavior events = self[attach_coffea_behavior](events, **kwargs) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index cd4f3a049..75ad7f972 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -104,7 +104,7 @@ def electron_object( (electron.isPFcand) & (electron.convVeto) & (electron.tightCharge > 1) & - # remove electrons that have tight muon close to it + # remove electrons that have muon close to it (ak.is_none(electron.nearest(muon, threshold=0.05), axis=-1)) ) # tight object electron mask (mvaFall17 WP90) From 45cc07ecf662f987b1a5f6b1281923ebde483a5d Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 15:42:40 +0200 Subject: [PATCH 066/119] removed unncessary array stored in steps in muon selection --- 
.../ghent_template/__cf_module_name__/selection/objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py index 75ad7f972..b9d7e2e09 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/objects.py @@ -61,7 +61,7 @@ def muon_object( events = set_ak_column(events, "Muon.tight", mu_mask_tight, value_type=bool) return events, SelectionResult( - steps={ak.ones_like(events.event, value_type=bool)}, + steps={}, objects={ "Muon": { "Muon": masked_sorted_indices(mu_mask, muon.pt) From 3f26b32bc3f68b44bcb032a3a2384feafad6bbff Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 16:31:06 +0200 Subject: [PATCH 067/119] tt_dl > tt_dl_powheg for datasets --- .../config/config___cf_short_name_lc__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py index 632cf8b6e..c6acff188 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -271,9 +271,9 @@ def add_config( # dataset groups for conveniently looping over certain datasets # (used in wrapper_factory and during plotting) cfg.x.dataset_groups = { - "test": ["tt_dl"], - "all": ["tt_dl", "dy*", "data*"], - "sim": ["tt_dl", "dy*"], + "test": ["tt_dl_powheg"], + "all": ["tt_dl_powheg", "dy*", "data*"], + "sim": ["tt_dl_powheg", "dy*"], } cfg.x.variable_groups = { From f6a823298f988ff553ac33bc66083ae38d8d91d9 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 16:53:52 +0200 Subject: [PATCH 068/119] 
shorted dataset listing --- .../__cf_module_name__/config/datasets.py | 32 ++++--------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py index 4e7cd772c..8e5b3bda4 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py @@ -22,38 +22,18 @@ def add_datasets(config: od.Config, campaign: od.Campaign): dataset_names = { "2018": [ # data - "data_mumu_a", - "data_mumu_b", - "data_mumu_c", - "data_mumu_d", - - "data_egamma_a", - "data_egamma_b", - "data_egamma_c", - "data_egamma_d", - - "data_muoneg_a", - "data_muoneg_b", - "data_muoneg_c", - "data_muoneg_d", - - "data_mu_a", - "data_mu_b", - "data_mu_c", - "data_mu_d", + *[f"data_mumu_{era}" for era in ['a', 'b', 'c', 'd']], + *[f"data_egamma_{era}" for era in ['a', 'b', 'c', 'd']], + *[f"data_muoneg_{era}" for era in ['a', 'b', 'c', 'd']], + *[f"data_mu_{era}" for era in ['a', 'b', 'c', 'd']], # backgrounds # ewk - "dy_lept_m50_ht-100to200_madgraph", - "dy_lept_m50_ht-200to400_madgraph", - "dy_lept_m50_ht-400to600_madgraph", - "dy_lept_m50_ht-600to800_madgraph", - "dy_lept_m50_ht-800to1200_madgraph", - "dy_lept_m50_ht-1200to2500_madgraph", + *["dy_lept_m50_ht-100to200_madgraph" for htr in ['100to200', '200to400', '400to600', + '600to800', '800to1200', '1200to2500']], # ttbar - "tt_dl_powheg", "tt_sl_powheg" ]}[f"{config.x.year}{config.x.corr_postfix}"] From ecf13c292b021cae516fdde4f5ec735643087096 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:01:19 +0200 Subject: [PATCH 069/119] Update README.md --- analysis_templates/ghent_template/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/analysis_templates/ghent_template/README.md 
b/analysis_templates/ghent_template/README.md index 0adaff97f..d37ec1bf3 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -1,5 +1,18 @@ # __cf_analysis_name__ Analysis +# Datasets and processes + +All processes and datasets are defined in the [cmsdb gitlab](https://gitlab.cern.ch/ghentanalysis/cmsdb/-/blob/master/cmsdb/). +Processes are added to the analysis in the file [config/processes.py](__cf_module_name__/config/processes.py). +Data- and MC-sets are added to the analysis in the file [config/datasets.py](__cf_module_name__/config/datasets.py). + +The datasets (for ERA=a,b,c,d) are: +- data_mu_ERA corresponding to SingleMuon +- data_mumu_ERA corresponding to DoubleMuon +- data_muoneg_ERA corresponding to MuonEG +- data_egamma_ERA corresponding to EGamma + + # Object Definition All objects collected in [selection/objects.py:object_selection](__cf_module_name__/selection/objects.py#L177). From 6adcc6d6ca75f6c4690d382e0d34a575b22b00f4 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:09:55 +0200 Subject: [PATCH 070/119] Update README.md --- analysis_templates/ghent_template/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index d37ec1bf3..5d5466085 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -12,6 +12,9 @@ The datasets (for ERA=a,b,c,d) are: - data_muoneg_ERA corresponding to MuonEG - data_egamma_ERA corresponding to EGamma +The MC processes with corresponding datasets are +- ttbar with corresponding datasets tt_sl_powheg and tt_dl_powheg +- dy (Drell-Yan) with corresponding datasets dy_lept_m50_ht-RANGE_madgraph # Object Definition From 06d03876d3ad98eebe6a2b38aa66df2fdb3e6330 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 9 Apr 2024 17:10:44 +0200 Subject: [PATCH 
071/119] shorted dataset listing (correction) --- .../ghent_template/__cf_module_name__/config/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py index 8e5b3bda4..2e7b24627 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/datasets.py @@ -30,8 +30,8 @@ def add_datasets(config: od.Config, campaign: od.Campaign): # backgrounds # ewk - *["dy_lept_m50_ht-100to200_madgraph" for htr in ['100to200', '200to400', '400to600', - '600to800', '800to1200', '1200to2500']], + *[f"dy_lept_m50_ht-{htr}_madgraph" for htr in ['100to200', '200to400', '400to600', + '600to800', '800to1200', '1200to2500']], # ttbar "tt_dl_powheg", From 8def566abbb328a88ed349d62744165cb0587a7b Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:20:13 +0200 Subject: [PATCH 072/119] Update README.md --- analysis_templates/ghent_template/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analysis_templates/ghent_template/README.md b/analysis_templates/ghent_template/README.md index 5d5466085..85fe25852 100644 --- a/analysis_templates/ghent_template/README.md +++ b/analysis_templates/ghent_template/README.md @@ -5,6 +5,7 @@ All processes and datasets are defined in the [cmsdb gitlab](https://gitlab.cern.ch/ghentanalysis/cmsdb/-/blob/master/cmsdb/). Processes are added to the analysis in the file [config/processes.py](__cf_module_name__/config/processes.py). Data- and MC-sets are added to the analysis in the file [config/datasets.py](__cf_module_name__/config/datasets.py). +Note the difference between a process and a dataset. A process can correspond to multiple datasets. The other way around is currently not possible. 
The datasets (for ERA=a,b,c,d) are: - data_mu_ERA corresponding to SingleMuon @@ -16,6 +17,9 @@ The MC processes with corresponding datasets are - ttbar with corresponding datasets tt_sl_powheg and tt_dl_powheg - dy (Drell-Yan) with corresponding datasets dy_lept_m50_ht-RANGE_madgraph +The analysis can be run over only selected datasets using the --datasets argument. Groupings of datasets are defined in [config/config___cf_short_name_lc__.py](__cf_module_name__/config/config___cf_short_name_lc__.py). A similar scheme exists for processes. + + # Object Definition All objects collected in [selection/objects.py:object_selection](__cf_module_name__/selection/objects.py#L177). From 645360dd5aa35612adac7c287897f863d1c36b48 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 10 Apr 2024 09:19:48 +0200 Subject: [PATCH 073/119] added missing background xsec --- .../__cf_module_name__/config/processes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py index 452e7326e..203e767b4 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py @@ -20,11 +20,12 @@ def add_processes(config: od.Config, campaign: od.Campaign): # How to add new processes: # Add custom process to encapsulate all background processes: - - bg = config.add_process( + bg_processes = ['dy'] + config.add_process( name="background", - id=9999, + id=9999, # cannot collide with ids defined in cmsdb though label="Background", - + xsecs = {13: sum([config.get_process(bg).get_xsec(13) for bg in bg_processes])} ) - bg.add_process(config.get_process("dy")) + for bg in bg_processes: + bg.add_process(config.get_process(bg)) From bd900af1702fa7b8ceee4d473bf0c2190170f082 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 10 Apr 2024 09:25:14 +0200 Subject: [PATCH
074/119] added missing background xsec: bugfix --- .../ghent_template/__cf_module_name__/config/processes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py index 203e767b4..368e740fe 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py @@ -21,11 +21,11 @@ def add_processes(config: od.Config, campaign: od.Campaign): # How to add new processes: # Add custom process to encapsulate all background processes: bg_processes = ['dy'] - config.add_process( + background = config.add_process( name="background", id=9999, # cannot collide with ids defined in cmsdb though label="Background", xsecs = {13: sum([config.get_process(bg).get_xsec(13) for bg in bg_processes])} ) for bg in bg_processes: - bg.add_process(config.get_process(bg)) + background.add_process(config.get_process(bg)) From 1aab215dade599feeda93d08bb5506a4b5909070 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Wed, 10 Apr 2024 17:09:07 +0200 Subject: [PATCH 075/119] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 98c6b2968..738e51a2a 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Setup successfull! 
The next steps are: Suggestions for tasks to run: - a) Run the 'calibration -> selection -> reduction' pipeline for the first file of the + a) Run the 'calibration -> selection -> reduction' pipeline for the first file (--branch 0) of the default dataset using the default calibrator and default selector (enter the command below and 'tab-tab' to see all arguments or add --help for help) > law run cf.ReduceEvents --version dev1 --branch 0 From 30b0916a442a690756b10b0d4aca6b8b09ad38e6 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 11 Apr 2024 11:32:37 +0200 Subject: [PATCH 076/119] file for columnar function defined in Ghent CMS group (added TetraVec to it) --- columnflow/columnar_util_Ghent.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 columnflow/columnar_util_Ghent.py diff --git a/columnflow/columnar_util_Ghent.py b/columnflow/columnar_util_Ghent.py new file mode 100644 index 000000000..521c23fd0 --- /dev/null +++ b/columnflow/columnar_util_Ghent.py @@ -0,0 +1,28 @@ +# coding: utf-8 + +""" +Helpers and utilities for working with columnar libraries (Ghent cms group) +""" + +from __future__ import annotations + +__all__ = [ + "TetraVec" +] + +from columnflow.util import maybe_import + +ak = maybe_import("awkward") +coffea = maybe_import("coffea") + + +def TetraVec(arr: ak.Array) -> ak.Array: + """ + create a Lorentz four-vector from an awkward array with pt, eta, phi, and mass fields + """ + for field in ["pt", "eta", "phi", "mass"]: + assert field in arr.fields, f"Provided array is missing {field} field" + TetraVec = ak.zip({"pt": arr.pt, "eta": arr.eta, "phi": arr.phi, "mass": arr.mass}, + with_name="PtEtaPhiMLorentzVector", + behavior=coffea.nanoevents.methods.vector.behavior) + return TetraVec \ No newline at end of file From 9eb4f57d9aec116d324022125390006916f79a2c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 11 Apr 2024 11:52:02 +0200 Subject: [PATCH 077/119] wlcg_fs point to lxplus nanoaod ---
analysis_templates/cms_minimal/law.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/cms_minimal/law.cfg b/analysis_templates/cms_minimal/law.cfg index eab65d02f..0dd6ff0ce 100644 --- a/analysis_templates/cms_minimal/law.cfg +++ b/analysis_templates/cms_minimal/law.cfg @@ -116,7 +116,7 @@ cache_mtime_patience: -1 [wlcg_fs] # set this to your desired location -base: root://eosuser.cern.ch/eos/user/$CF_CERN_USER_FIRSTCHAR/$CF_CERN_USER/$CF_STORE_NAME +base: root://eosuser.cern.ch//eos/cms create_file_dir: True use_cache: $CF_WLCG_USE_CACHE cache_root: $CF_WLCG_CACHE_ROOT From e830221188c9896f0f61d18b07a838b3c2af7fb8 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 12 Apr 2024 15:27:21 +0200 Subject: [PATCH 078/119] directories in columnflow to share ghent made code --- columnflow/calibration/cmsGhent/__init__.py | 0 columnflow/categorization/cmsGhent/__init__.py | 0 columnflow/inference/cmsGhent/__init__.py | 0 columnflow/ml/cmsGhent/__init__.py | 0 columnflow/plotting/cmsGhent/__init__.py | 0 columnflow/production/cmsGhent/__init__.py | 0 columnflow/selection/cmsGhent/__init__.py | 0 columnflow/tasks/cmsGhent/__init__.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 columnflow/calibration/cmsGhent/__init__.py create mode 100644 columnflow/categorization/cmsGhent/__init__.py create mode 100644 columnflow/inference/cmsGhent/__init__.py create mode 100644 columnflow/ml/cmsGhent/__init__.py create mode 100644 columnflow/plotting/cmsGhent/__init__.py create mode 100644 columnflow/production/cmsGhent/__init__.py create mode 100644 columnflow/selection/cmsGhent/__init__.py create mode 100644 columnflow/tasks/cmsGhent/__init__.py diff --git a/columnflow/calibration/cmsGhent/__init__.py b/columnflow/calibration/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/columnflow/categorization/cmsGhent/__init__.py b/columnflow/categorization/cmsGhent/__init__.py new file mode 100644 
index 000000000..e69de29bb diff --git a/columnflow/inference/cmsGhent/__init__.py b/columnflow/inference/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/columnflow/ml/cmsGhent/__init__.py b/columnflow/ml/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/columnflow/plotting/cmsGhent/__init__.py b/columnflow/plotting/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/columnflow/production/cmsGhent/__init__.py b/columnflow/production/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/columnflow/selection/cmsGhent/__init__.py b/columnflow/selection/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/columnflow/tasks/cmsGhent/__init__.py b/columnflow/tasks/cmsGhent/__init__.py new file mode 100644 index 000000000..e69de29bb From 341b7216be7a4eb7a2d862f1f3102078ebf36cf2 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:01:41 +0200 Subject: [PATCH 079/119] adapted cutflow plot allowing for fractional cutflows (#4) --- columnflow/tasks/cmsGhent/cutflow.py | 107 +++++++++++++++++++++++++++ law.cfg | 3 +- 2 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 columnflow/tasks/cmsGhent/cutflow.py diff --git a/columnflow/tasks/cmsGhent/cutflow.py b/columnflow/tasks/cmsGhent/cutflow.py new file mode 100644 index 000000000..bcc585c10 --- /dev/null +++ b/columnflow/tasks/cmsGhent/cutflow.py @@ -0,0 +1,107 @@ + +from collections import OrderedDict + +from ..cutflow import * +from columnflow.tasks.framework.decorators import view_output_plots + +import luigi +import law +from columnflow.util import maybe_import + +np = maybe_import("numpy") + +PlotCutflow.relative = luigi.BoolParameter( + default=False, + significant=False, + description="name of the variable to use for obtaining event counts; default: 'False'", + ) + +@law.decorator.log 
+@view_output_plots +def PlotCutflow_run(self): + import hist + + # prepare config objects + category_inst = self.config_inst.get_category(self.branch_data) + leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] + process_insts = list(map(self.config_inst.get_process, self.processes)) + sub_process_insts = { + proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] + for proc in process_insts + } + + # histogram data per process + hists = {} + + with self.publish_step(f"plotting cutflow in {category_inst.name}"): + for dataset, inp in self.input().items(): + dataset_inst = self.config_inst.get_dataset(dataset) + h_in = inp[self.variable].load(formatter="pickle") + + # sanity checks + n_shifts = len(h_in.axes["shift"]) + if n_shifts != 1: + raise Exception(f"shift axis is supposed to only contain 1 bin, found {n_shifts}") + + # loop and extract one histogram per process + for process_inst in process_insts: + # skip when the dataset is already known to not contain any sub process + if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): + continue + + # work on a copy + h = h_in.copy() + + # axis selections + h = h[{ + "process": [ + hist.loc(p.id) + for p in sub_process_insts[process_inst] + if p.id in h.axes["process"] + ], + "category": [ + hist.loc(c.id) + for c in leaf_category_insts + if c.id in h.axes["category"] + ], + }] + + # axis reductions + h = h[{"process": sum, "category": sum, self.variable: sum}] + + # add the histogram + if process_inst in hists: + hists[process_inst] += h + else: + hists[process_inst] = h + + # there should be hists to plot + if not hists: + raise Exception("no histograms found to plot") + + total = sum(hists.values()).values() if self.relative else np.ones(len(self.selector_steps)) + + # sort hists by process order + hists = OrderedDict( + (process_inst.copy_shallow(), hists[process_inst] / total) + for process_inst in sorted(hists, key=process_insts.index) + ) + + # call the 
plot function + fig, _ = self.call_plot_func( + self.plot_function, + hists=hists, + config_inst=self.config_inst, + category_inst=category_inst.copy_shallow(), + **self.get_plot_parameters(), + ) + + # save the plot + for outp in self.output()["plots"]: + outp.dump(fig, formatter="mpl") + + +PlotCutflow.run = PlotCutflow_run + + + diff --git a/law.cfg b/law.cfg index a47d0ba69..c9f5c20e7 100644 --- a/law.cfg +++ b/law.cfg @@ -11,7 +11,8 @@ columnflow.tasks.union columnflow.tasks.histograms columnflow.tasks.plotting columnflow.tasks.yields -columnflow.tasks.cutflow +columnflow.tasks.cmsGhent.cutflow + [analysis] From 344e5051296630c27d9273c9e8ffc69f08343461 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Wed, 17 Apr 2024 16:42:32 +0200 Subject: [PATCH 080/119] fixed bux for non relative Cutflowplot --- columnflow/tasks/cmsGhent/cutflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/columnflow/tasks/cmsGhent/cutflow.py b/columnflow/tasks/cmsGhent/cutflow.py index bcc585c10..c90ebf293 100644 --- a/columnflow/tasks/cmsGhent/cutflow.py +++ b/columnflow/tasks/cmsGhent/cutflow.py @@ -78,8 +78,8 @@ def PlotCutflow_run(self): # there should be hists to plot if not hists: raise Exception("no histograms found to plot") - - total = sum(hists.values()).values() if self.relative else np.ones(len(self.selector_steps)) + + total = sum(hists.values()).values() if self.relative else np.ones((len(self.selector_steps) + 1, 1)) # sort hists by process order hists = OrderedDict( From fee6afbe3198d3c3a06c23ae7f30822faf16564a Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:07:32 +0200 Subject: [PATCH 081/119] missing order import --- .../ghent_template/__cf_module_name__/selection/trigger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py b/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py index 
7c4604ae2..38a9075ba 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/trigger.py @@ -1,5 +1,6 @@ # coding: utf-8 from __future__ import annotations +import order as od from columnflow.selection import Selector, SelectionResult, selector from columnflow.util import maybe_import From 6a8cfea3686df67e7a6754f2cff5a49cf944950c Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:10:29 +0200 Subject: [PATCH 082/119] reference to non-existing "features" function (should be "default") --- .../ghent_template/__cf_module_name__/production/default.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/default.py b/analysis_templates/ghent_template/__cf_module_name__/production/default.py index 8e85d99cb..4f4d099ed 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/production/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/production/default.py @@ -54,7 +54,7 @@ def default(self: Producer, events: ak.Array, **kwargs) -> ak.Array: return events -@features.init -def features_init(self: Producer) -> None: +@default.init +def default_init(self: Producer) -> None: # add categories to config add_categories_production(self.config_inst) From a1e45726d0d935ca8c22c4f0d17306e872b6c4fd Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:11:43 +0200 Subject: [PATCH 083/119] removed usage of undefined set_ak_column_f32, meant as short-hand for value_type=np.float32 --- .../ghent_template/__cf_module_name__/production/default.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/default.py 
b/analysis_templates/ghent_template/__cf_module_name__/production/default.py index 4f4d099ed..2bc269bb8 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/production/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/production/default.py @@ -44,7 +44,7 @@ def default(self: Producer, events: ak.Array, **kwargs) -> ak.Array: # (re)produce category i events = self[category_ids](events, **kwargs) - events = set_ak_column_f32(events, "ht", ak.sum(events.Jet.pt, axis=1)) + events = set_ak_column(events, "ht", ak.sum(events.Jet.pt, axis=1), value_type=np.float32) events = set_ak_column(events, "n_jet", ak.sum(events.Jet.pt > 0, axis=1)) events = set_ak_column(events, "n_bjet", ak.sum(events.Jet.btagDeepFlavB >= self.config_inst.x.btag_working_points.deepjet.medium, axis=1)) From a3e45f92ac3b28fd487a798989f323c7d2243f19 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:14:35 +0200 Subject: [PATCH 084/119] don't hard code ecm (take from campaign instead) --- .../ghent_template/__cf_module_name__/config/processes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py index 368e740fe..a737a7c66 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/processes.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/processes.py @@ -25,7 +25,7 @@ def add_processes(config: od.Config, campaign: od.Campaign): name="background", id=9999, # cannot collide with ids defined in cmsdb though label="Background", - xsecs = {13: sum([config.get_process(bg).get_xsec(13) for bg in bg_processes])} + xsecs = {campaign.ecm: sum([config.get_process(bg).get_xsec(campaign.ecm) for bg in bg_processes])} ) for bg in bg_processes: background.add_process(config.get_process(bg)) From 
fe4bb06ebfaf6f4b3d4e1ab406308da3b9b76e5a Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:24:06 +0200 Subject: [PATCH 085/119] cutflow_features should take SelectionResult --- .../__cf_module_name__/production/cutflow_features.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py b/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py index 592e81839..977eb1959 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py +++ b/analysis_templates/ghent_template/__cf_module_name__/production/cutflow_features.py @@ -4,7 +4,7 @@ Column production methods for cutflow features. """ - +from columnflow.selection import SelectionResult from columnflow.production import Producer, producer from columnflow.production.categories import category_ids from columnflow.production.cms.mc_weight import mc_weight @@ -31,14 +31,14 @@ def cutflow_features( self: Producer, events: ak.Array, - object_masks: dict[str, dict[str, ak.Array]], + results: SelectionResult, **kwargs, ) -> ak.Array: if self.dataset_inst.is_mc: events = self[mc_weight](events, **kwargs) # apply object masks and create new collections - reduced_events = create_collections_from_masks(events, object_masks) + reduced_events = create_collections_from_masks(events, results.objects) # create category ids per event and add categories back to the events = self[category_ids](reduced_events, target_events=events, **kwargs) From 75b8ca2ac31fcb3104f53f18449bb26ce23ff23d Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Wed, 24 Apr 2024 11:09:37 +0200 Subject: [PATCH 086/119] bugfixes and new tasks (#5) * task to make cutflow tables * task to read all used NanoAOD datasets --- columnflow/tasks/cmsGhent/config.py | 46 +++++ 
columnflow/tasks/cmsGhent/cutflow.py | 257 ++++++++++++++++++++++++++- 2 files changed, 300 insertions(+), 3 deletions(-) create mode 100644 columnflow/tasks/cmsGhent/config.py diff --git a/columnflow/tasks/cmsGhent/config.py b/columnflow/tasks/cmsGhent/config.py new file mode 100644 index 000000000..d336db636 --- /dev/null +++ b/columnflow/tasks/cmsGhent/config.py @@ -0,0 +1,46 @@ +import law +import luigi +from columnflow.tasks.framework.base import ConfigTask +from columnflow.tasks.framework.mixins import DatasetsProcessesMixin + +import json + + +class ReadDataSets(DatasetsProcessesMixin, ConfigTask): + + shifts = luigi.BoolParameter( + default=False, + significant=False, + description="when True, print the shifted datasets, not the nominal", + ) + + def output(self) -> law.target.file.FileSystemFileTarget: + """ + Creates a target file for the final .json file containing the list of datasets + + """ + return self.target(("shifts" if self.shifts else "nominal") + ".json") + + def complete(self): + return self.output().exists() + + def run(self): + process_dataset_map = {p: [] for p in self.processes} + + for dt in self.datasets: + dt = self.config_inst.get_dataset(dt) + datasets = [] + process = list(dt.processes)[0] + for p in self.processes: + p_inst = self.config_inst.get_process(p) + if p_inst.has_process(process) or p_inst == process: + datasets = process_dataset_map[p] + datasets_loc: dict = dt.info.copy() + nominal = datasets_loc.pop('nominal') + if self.shifts: + for shift in datasets_loc.values(): + datasets.extend(shift.keys) + else: + datasets.extend(nominal.keys) + + self.output().dump(process_dataset_map, indent=2) diff --git a/columnflow/tasks/cmsGhent/cutflow.py b/columnflow/tasks/cmsGhent/cutflow.py index c90ebf293..f2a99c394 100644 --- a/columnflow/tasks/cmsGhent/cutflow.py +++ b/columnflow/tasks/cmsGhent/cutflow.py @@ -1,12 +1,17 @@ -from collections import OrderedDict +from collections import defaultdict +from scinum import Number + from 
..cutflow import * from columnflow.tasks.framework.decorators import view_output_plots +from columnflow.tasks.framework.mixins import ( + CalibratorsMixin, SelectorStepsMixin, CategoriesMixin, DatasetsProcessesMixin +) import luigi import law -from columnflow.util import maybe_import +from columnflow.util import maybe_import, DotDict, dev_sandbox, try_int np = maybe_import("numpy") @@ -78,7 +83,7 @@ def PlotCutflow_run(self): # there should be hists to plot if not hists: raise Exception("no histograms found to plot") - + total = sum(hists.values()).values() if self.relative else np.ones((len(self.selector_steps) + 1, 1)) # sort hists by process order @@ -104,4 +109,250 @@ def PlotCutflow_run(self): PlotCutflow.run = PlotCutflow_run +class CreateCutflowTable( + DatasetsProcessesMixin, + CategoriesMixin, + SelectorStepsMixin, + CalibratorsMixin, + law.LocalWorkflow, + RemoteWorkflow, +): + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + table_format = luigi.Parameter( + default="fancy_grid", + significant=False, + description="format of the yield table; accepts all formats of the tabulate package; default: fancy_grid. " + "See https://github.com/astanin/python-tabulate/blob/master/README.md?plain=1#L147", + ) + number_format = luigi.Parameter( + default="pdg", + significant=False, + description="rounding format of each number in the yield table; accepts all formats " + "understood by scinum.Number.str(), e.g. 
'pdg', 'publication', '%%.1f' or an integer " + "(number of signficant digits); default: pdg", + ) + skip_uncertainties = luigi.BoolParameter( + default=False, + significant=False, + description="when True, uncertainties are not displayed in the table; default: False", + ) + normalize_yields = luigi.ChoiceParameter( + choices=(law.NO_STR, "per_process", "per_step", "all"), + default=law.NO_STR, + significant=False, + description="string parameter to define the normalization of the yields; " + "choices: '', per_process, per_category, all; empty default", + ) + output_suffix = luigi.Parameter( + default=law.NO_STR, + description="Adds a suffix to the output name of the yields table; empty default", + ) + + selector_steps_order_sensitive = True + + # upstream requirements + reqs = Requirements( + RemoteWorkflow.reqs, + CreateCutflowHistograms=CreateCutflowHistograms, + ) + + def create_branch_map(self): + # one category per branch + if not self.categories: + raise Exception( + f"{self.__class__.__name__} task cannot build branch map when no category is " + "set", + ) + + return list(self.categories) + + def workflow_requires(self): + reqs = super().workflow_requires() + + reqs["hists"] = [ + self.reqs.CreateCutflowHistograms.req( + self, + dataset=d, + variables=("event",), + _exclude={"branches"}, + ) + for d in self.datasets + ] + return reqs + + def requires(self): + return { + d: self.reqs.CreateCutflowHistograms.req( + self, + branch=0, + dataset=d, + variables=("event",), + ) + for d in self.datasets + } + + @classmethod + def resolve_param_values(cls, params): + params = super().resolve_param_values(params) + + if "number_format" in params and try_int(params["number_format"]): + # convert 'number_format' in integer if possible + params["number_format"] = int(params["number_format"]) + + return params + + def output(self): + suffix = "" + if self.output_suffix and self.output_suffix != law.NO_STR: + suffix = f"__{self.output_suffix}" + + return { + "table": 
self.target(f"table__proc_{self.processes_repr}__steps_{self.branch_data}{suffix}.txt"), + "yields": self.target(f"yields__proc_{self.processes_repr}__steps_{self.branch_data}{suffix}.json"), + } + + @law.decorator.log + def run(self): + import hist + from tabulate import tabulate + + inputs = self.input() + outputs = self.output() + + category_inst = self.config_inst.get_category(self.branch_data) + leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] + process_insts = list(map(self.config_inst.get_process, self.processes)) + sub_process_insts = { + proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] + for proc in process_insts + } + + # histogram data per process + hists = {} + + with self.publish_step(f"Creating cutflow table in {category_inst.name}"): + for dataset, inp in inputs.items(): + dataset_inst = self.config_inst.get_dataset(dataset) + + # load the histogram of the variable named "event" + h_in = inp["event"].load(formatter="pickle") + + # sanity checks + n_shifts = len(h_in.axes["shift"]) + if n_shifts != 1: + raise Exception(f"shift axis is supposed to only contain 1 bin, found {n_shifts}") + + # loop and extract one histogram per process + for process_inst in process_insts: + # skip when the dataset is already known to not contain any sub process + if not any(map(dataset_inst.has_process, sub_process_insts[process_inst])): + continue + + # work on a copy + h = h_in.copy() + + # axis selections + h = h[{ + "process": [ + hist.loc(p.id) + for p in sub_process_insts[process_inst] + if p.id in h.axes["process"] + ], + "category": [ + hist.loc(c.id) + for c in leaf_category_insts + if c.id in h.axes["category"] + ], + }] + + # axis reductions + h = h[{"shift": sum, "process": sum, "category": sum, "event": sum}] + + # add the histogram + if process_inst in hists: + hists[process_inst] += h + else: + hists[process_inst] = h + + # there should be hists to plot + if not hists: + raise Exception("no histograms found to 
plot") + + # sort hists by process order + hists = OrderedDict( + (process_inst, hists[process_inst]) + for process_inst in sorted(hists, key=process_insts.index) + ) + + yields, processes = defaultdict(list), [] + + # read out yields per step and per process + for process_inst, h in hists.items(): + processes.append(process_inst) + + for step in self.selector_steps: + h_step = h[{"step": [step]}] + h_step = h_step[{"step": sum}] + value = Number(h_step.value) + if not self.skip_uncertainties: + # set a unique uncertainty name for correct propagation below + value.set_uncertainty( + f"mcstat_{process_inst.name}_{step}", + np.sqrt(h_step.variance), + ) + yields[step].append(value) + + # obtain normalizaton factors + norm_factors = 1 + if self.normalize_yields == "all": + norm_factors = sum( + sum(step_yields) + for step_yields in yields.values() + ) + elif self.normalize_yields == "per_process": + norm_factors = [ + sum(yields[step][i] for step in yields.keys()) + for i in range(len(yields[self.selector_steps[0]])) + ] + elif self.normalize_yields == "per_step": + norm_factors = { + step: sum(step_yields) + for step, step_yields in yields.items() + } + + # initialize dicts + yields_str = defaultdict(list, {"Process": [proc.label for proc in processes]}) + raw_yields = defaultdict(dict, {}) + + # apply normalization and format + for step, step_yields in yields.items(): + for i, value in enumerate(step_yields): + # get correct norm factor per category and process + if self.normalize_yields == "per_process": + norm_factor = norm_factors[i] + elif self.normalize_yields == "per_step": + norm_factor = norm_factors[step] + else: + norm_factor = norm_factors + + raw_yield = (value / norm_factor).nominal + raw_yields[step][processes[i].name] = raw_yield + + # format yields into strings + yield_str = (value / norm_factor).str( + combine_uncs="all", + format=self.number_format, + style="latex" if "latex" in self.table_format else "plain", + ) + if "latex" in self.table_format: 
+ yield_str = f"${yield_str}$" + yields_str[step].append(yield_str) + + # create, print and save the yield table + yield_table = tabulate(yields_str, headers="keys", tablefmt=self.table_format) + self.publish_message(yield_table) + + outputs["table"].dump(yield_table, formatter="text") + outputs["yields"].dump(raw_yields, formatter="json") From 1f149adf43a6590e3f600ffa97824c46fac921db Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 25 Apr 2024 17:18:41 +0200 Subject: [PATCH 087/119] added veto producer and apply it in the EventSelection task. Ignore vetoed events when checking for finite results --- columnflow/production/veto.py | 76 +++++++++++++++++++++++++++++++++++ columnflow/tasks/selection.py | 31 +++++++++++++- law.cfg | 2 +- 3 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 columnflow/production/veto.py diff --git a/columnflow/production/veto.py b/columnflow/production/veto.py new file mode 100644 index 000000000..d6fbddcb8 --- /dev/null +++ b/columnflow/production/veto.py @@ -0,0 +1,76 @@ +from collections import defaultdict + +from columnflow.production import Producer, producer +from columnflow.util import maybe_import, InsertableDict +from columnflow.columnar_util import set_ak_column +from law import LocalFileTarget + +ak = maybe_import("awkward") +np = maybe_import("numpy") + + +@producer( + uses=("event", "run", "luminosityBlock"), + exposed=False, + get_veto_file=(lambda self, external_files: external_files.veto), +) +def veto_events( + self: Producer, + events: ak.Array, + file: LocalFileTarget = None, + **kwargs, +) -> ak.Array: + """ + Produces a mask vetoing certain events from being processed. Outputs a SelectionResult + with attributes *veto* (containing a mask selecting the vetoed events) and with the *event* + attribute initialized with a mask selecting non-vetoed events. If *file* is provided, it checks only + events contained within this file, or events not designated to any file. 
+ + The events that are vetoed need to be specified from ``config_inst``, + which must contain the keyword ``veto`` in the auxiliary information. This can look + like this: + + .. code-block:: python + + # cfg is the current config instance + cfg.x.veto = config.x.veto = { + "dy_lep_m10to50_amcatnlo" : [ + { + "event": 33098036, + "luminosityBlock": 20170, + "run": 1, + ** optionally ** + "file": "/store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-10to50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/50000/296CA60E-0122-2F4F-8B04-17DCF5E3E062.root" # noqa + } + ] + } + + """ + + veto = np.full_like(events.event, False, dtype=bool) + for veto_event in self.veto_list: + if file is None or "file" not in veto_event or file.path == veto_event["file"]: + veto = veto | ( + (events.event == veto_event['event']) & + (events.run == veto_event['run']) & + (events.luminosityBlock == veto_event['luminosityBlock']) + ) + + events = set_ak_column(events, "veto", veto) + + return events + + +@veto_events.setup +def veto_events_setup( + self: Producer, + reqs: dict, + inputs: dict, + reader_targets: InsertableDict, +) -> None: + """ + Loads the event veto file from the external files bundle and saves them in the + py:attr:`veto_list` attribute for simpler access in the actual callable. 
+ """ + veto_dict = self.config_inst.aux.get("veto", {}) + self.veto_list = veto_dict.get(self.dataset_inst.name, []) diff --git a/columnflow/tasks/selection.py b/columnflow/tasks/selection.py index 482043438..479721154 100644 --- a/columnflow/tasks/selection.py +++ b/columnflow/tasks/selection.py @@ -44,6 +44,14 @@ class SelectEvents( # strategy for handling missing source columns when adding aliases on event chunks missing_column_alias_strategy = "original" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # store the normalization weight producer for MC + self.veto_producer: Producer = Producer.get_cls("veto_events")( + inst_dict=self.get_producer_kwargs(self), + ) + def workflow_requires(self): reqs = super().workflow_requires() @@ -63,6 +71,9 @@ def workflow_requires(self): # add selector dependent requirements reqs["selector"] = self.selector_inst.run_requires() + # add veto selector dependent requirements + reqs["veto"] = self.veto_producer.run_requires() + return reqs def requires(self): @@ -78,6 +89,9 @@ def requires(self): # add selector dependent requirements reqs["selector"] = self.selector_inst.run_requires() + # add veto selector dependent requirements + reqs["veto"] = self.veto_producer.run_requires() + return reqs def output(self): @@ -132,9 +146,13 @@ def run(self): # get shift dependent aliases aliases = self.local_shift_inst.x("column_aliases", {}) + # setup the veto producer + self.veto_producer.run_setup(self.requires()["veto"], self.input()["veto"]) + # define columns that need to be read read_columns = set(map(Route, mandatory_coffea_columns)) read_columns |= self.selector_inst.used_columns + read_columns |= self.veto_producer.used_columns read_columns |= set(map(Route, aliases.values())) # define columns that will be written @@ -172,6 +190,9 @@ def run(self): # insert additional columns events = update_ak_array(events, *cols) + # add veto + events = self.veto_producer(events, file=input_file) + # add aliases 
events = add_ak_aliases( events, @@ -195,7 +216,8 @@ def run(self): # optional check for finite values if self.check_finite_output: - self.raise_if_not_finite(results_array) + # ignore vetoed events when checking for finite values + self.raise_if_not_finite(results_array[~events.veto]) # save results as parquet via a thread in the same pool chunk = tmp_dir.child(f"res_{lfn_index}_{pos.index}.parquet", type="f") @@ -204,11 +226,16 @@ def run(self): # remove columns if write_columns: + + # store veto in variable before filtering + veto = events.veto + events = route_filter(events) # optional check for finite values if self.check_finite_output: - self.raise_if_not_finite(events) + # ignore vetoed events when checking for finite values + self.raise_if_not_finite(events[~veto]) # save additional columns as parquet via a thread in the same pool chunk = tmp_dir.child(f"cols_{lfn_index}_{pos.index}.parquet", type="f") diff --git a/law.cfg b/law.cfg index c9f5c20e7..cb47c47af 100644 --- a/law.cfg +++ b/law.cfg @@ -21,7 +21,7 @@ default_analysis: columnflow.example_config.analysis_st.analysis_st default_config: run2_pp_2018 default_dataset: st_tchannel_t -production_modules: columnflow.production.{categories,processes,normalization} +production_modules: columnflow.production.{categories,processes,normalization,veto} calibration_modules: columnflow.calibration selection_modules: columnflow.selection.{empty} categorization_modules: columnflow.categorization From 23010b27ad5bb354e03360b71ba4f2e6346c2bae Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 25 Apr 2024 17:29:19 +0200 Subject: [PATCH 088/119] use veto in template: - included in event mask - exclude vetoed events from stats - add vetoes to config --- .../config/config___cf_short_name_lc__.py | 2 ++ .../__cf_module_name__/config/veto.py | 16 ++++++++++++++++ .../__cf_module_name__/selection/default.py | 15 +++++++++------ .../__cf_module_name__/selection/stats.py | 14 +++++++++----- 
analysis_templates/ghent_template/law.cfg | 2 +- 5 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 analysis_templates/ghent_template/__cf_module_name__/config/veto.py diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py index c6acff188..831b24263 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py +++ b/analysis_templates/ghent_template/__cf_module_name__/config/config___cf_short_name_lc__.py @@ -16,6 +16,7 @@ from __cf_short_name_lc__.config.styling import stylize_processes from __cf_short_name_lc__.config.datasets import add_datasets, configure_datasets from __cf_short_name_lc__.config.processes import add_processes +from __cf_short_name_lc__.config.veto import add_vetoes from __cf_short_name_lc__.config.categories import add_categories_selection from __cf_short_name_lc__.config.variables import add_variables from __cf_short_name_lc__.config.shifts import add_shifts @@ -56,6 +57,7 @@ def add_config( add_triggers(cfg, campaign) add_datasets(cfg, campaign) + add_vetoes(cfg) configure_datasets(cfg, limit_dataset_files) # verify that the root process of all datasets is part of any of the registered processes diff --git a/analysis_templates/ghent_template/__cf_module_name__/config/veto.py b/analysis_templates/ghent_template/__cf_module_name__/config/veto.py new file mode 100644 index 000000000..8313a6eb6 --- /dev/null +++ b/analysis_templates/ghent_template/__cf_module_name__/config/veto.py @@ -0,0 +1,16 @@ +import order as od + +from columnflow.util import call_once_on_config + +@call_once_on_config() +def add_vetoes(config: od.Config) -> None: + config.x.veto = { + 'dy_lep_m10to50_amcatnlo': [ + { + "event": 33098036, + "luminosityBlock": 20170, + "run": 1, + "file": 
"/store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-10to50_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/50000/296CA60E-0122-2F4F-8B04-17DCF5E3E062.root", # noqa + } + ] + } diff --git a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py index 66ba2ec7c..39887f517 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/default.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/default.py @@ -10,7 +10,7 @@ import law from columnflow.util import maybe_import, four_vec -from columnflow.columnar_util import set_ak_column +from columnflow.columnar_util import set_ak_column, optional_column, has_ak_column from columnflow.production.util import attach_coffea_behavior from columnflow.selection import Selector, SelectionResult, selector @@ -44,12 +44,10 @@ def TetraVec(arr: ak.Array) -> ak.Array: @selector( uses={ - process_ids, attach_coffea_behavior, - mc_weight + process_ids, attach_coffea_behavior, mc_weight, optional_column("veto"), }, produces={ - process_ids, attach_coffea_behavior, - mc_weight + process_ids, attach_coffea_behavior, mc_weight }, exposed=False, ) @@ -69,6 +67,7 @@ def pre_selection( events = self[attach_coffea_behavior](events, **kwargs) results = SelectionResult() + results.event = ~events.veto if has_ak_column(events, "veto") else ak.full_like(events.mc_weight, True, dtype=bool) return events, results @@ -238,7 +237,11 @@ def default( results += jet_selection_results # combine event selection after all steps - results.event = results.steps.Trigger & results.steps.Lepton & results.steps.Jet + results.event = (results.event & + results.steps.Trigger & + results.steps.Lepton & + results.steps.Jet & + results.steps.Bjet) # add cutflow features, passing per-object masks events, results = self[post_selection](events, results, stats, **kwargs) diff --git 
a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py index 8e2b0db92..dbabb7219 100644 --- a/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py +++ b/analysis_templates/ghent_template/__cf_module_name__/selection/stats.py @@ -14,6 +14,7 @@ from __cf_short_name_lc__.production.weights import event_weights_to_normalize from columnflow.util import maybe_import +from columnflow.columnar_util import optional_column, has_ak_column from columnflow.ml import MLModel np = maybe_import("numpy") @@ -23,8 +24,9 @@ @selector( uses={ increment_stats, - event_weights_to_normalize - }, + event_weights_to_normalize, + optional_column("veto"), + }, ) def __cf_short_name_lc___increment_stats( self: Selector, @@ -34,6 +36,7 @@ def __cf_short_name_lc___increment_stats( **kwargs, ) -> ak.Array: # collect important information from the results + unvetoed_mask = ~events.veto if has_ak_column(events, "veto") else Ellipsis event_mask = results.event n_jets = results.x.n_jets @@ -45,9 +48,10 @@ def __cf_short_name_lc___increment_stats( } if self.dataset_inst.is_mc: - weight_map["num_negative_weights"] = (events.mc_weight < 0) + weight_map["num_negative_weights"] = (events.mc_weight < 0) & \ + (True if unvetoed_mask is Ellipsis else unvetoed_mask) # "sum" operations - weight_map["sum_mc_weight"] = events.mc_weight # weights of all events + weight_map["sum_mc_weight"] = (events.mc_weight, unvetoed_mask) # weights of all events weight_map["sum_mc_weight_selected"] = (events.mc_weight, event_mask) # weights of selected events weight_columns = list( @@ -61,7 +65,7 @@ def __cf_short_name_lc___increment_stats( # skip non-weight columns here continue - weight_map[f"sum_mc_weight_{name}"] = (events.mc_weight * events[name], Ellipsis) + weight_map[f"sum_mc_weight_{name}"] = (events.mc_weight * events[name], unvetoed_mask) # weights for selected events 
weight_map[f"sum_mc_weight_{name}_selected"] = (events.mc_weight * events[name], event_mask) diff --git a/analysis_templates/ghent_template/law.cfg b/analysis_templates/ghent_template/law.cfg index 25693d148..7948523ba 100644 --- a/analysis_templates/ghent_template/law.cfg +++ b/analysis_templates/ghent_template/law.cfg @@ -33,7 +33,7 @@ default_dataset: tt_sl_powheg calibration_modules: columnflow.calibration.cms.{jets,met}, __cf_module_name__.calibration.{default,jet} selection_modules: columnflow.selection.{empty}, columnflow.selection.cms.{json_filter, met_filters}, __cf_module_name__.selection.{default,categories,stats,trigger} -production_modules: columnflow.production.{categories,normalization,processes}, columnflow.production.cms.{btag,electron,mc_weight,muon,pdf,pileup,scale,seeds}, __cf_module_name__.production.{weights,features,categories} +production_modules: columnflow.production.{categories,normalization,processes,veto}, columnflow.production.cms.{btag,electron,mc_weight,muon,pdf,pileup,scale,seeds}, __cf_module_name__.production.{weights,features,categories} categorization_modules: __cf_module_name__.categorization.example ml_modules: columnflow.ml, __cf_module_name__.ml.example inference_modules: columnflow.inference, __cf_module_name__.inference.example From 63969417585efdf0cee10102bd2e38cb5520d348 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 25 Apr 2024 17:56:53 +0200 Subject: [PATCH 089/119] write veto after performing selection task --- columnflow/production/veto.py | 3 ++- columnflow/tasks/selection.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/columnflow/production/veto.py b/columnflow/production/veto.py index d6fbddcb8..9f7e6a2f7 100644 --- a/columnflow/production/veto.py +++ b/columnflow/production/veto.py @@ -10,7 +10,8 @@ @producer( - uses=("event", "run", "luminosityBlock"), + uses={"event", "run", "luminosityBlock"}, + produces={"veto"}, exposed=False, get_veto_file=(lambda self, external_files: 
external_files.veto), ) diff --git a/columnflow/tasks/selection.py b/columnflow/tasks/selection.py index 479721154..433e12efb 100644 --- a/columnflow/tasks/selection.py +++ b/columnflow/tasks/selection.py @@ -158,6 +158,7 @@ def run(self): # define columns that will be written write_columns = set(map(Route, mandatory_coffea_columns)) write_columns |= self.selector_inst.produced_columns + write_columns |= self.veto_producer.produced_columns route_filter = RouteFilter(write_columns) # let the lfn_task prepare the nano file (basically determine a good pfn) From de550252f9f49fb4e03a31641259de69e89a5afe Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 26 Apr 2024 13:15:18 +0200 Subject: [PATCH 090/119] add ReadDataSets to law.cfg --- law.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/law.cfg b/law.cfg index cb47c47af..aeeb86637 100644 --- a/law.cfg +++ b/law.cfg @@ -12,6 +12,7 @@ columnflow.tasks.histograms columnflow.tasks.plotting columnflow.tasks.yields columnflow.tasks.cmsGhent.cutflow +columnflow.tasks.cmsGhent.config From 33efcfac8cc47113df9dc45d423bb078132d0079 Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Wed, 15 May 2024 17:46:33 +0200 Subject: [PATCH 091/119] Cutflow dev (#8) * new cutflow workflow. 
Shared code between MergeHistograms and MergeCutflowHistograms * allow to specify no steps and have only inclusive * add additional normalization options --- columnflow/tasks/cmsGhent/cutflow.py | 121 +++---------------- columnflow/tasks/cutflow.py | 167 +++++++++++++++++++++------ columnflow/tasks/framework/mixins.py | 91 ++++++++++++++- columnflow/tasks/histograms.py | 78 +------------ 4 files changed, 234 insertions(+), 223 deletions(-) diff --git a/columnflow/tasks/cmsGhent/cutflow.py b/columnflow/tasks/cmsGhent/cutflow.py index f2a99c394..08c0edf79 100644 --- a/columnflow/tasks/cmsGhent/cutflow.py +++ b/columnflow/tasks/cmsGhent/cutflow.py @@ -15,99 +15,6 @@ np = maybe_import("numpy") -PlotCutflow.relative = luigi.BoolParameter( - default=False, - significant=False, - description="name of the variable to use for obtaining event counts; default: 'False'", - ) - -@law.decorator.log -@view_output_plots -def PlotCutflow_run(self): - import hist - - # prepare config objects - category_inst = self.config_inst.get_category(self.branch_data) - leaf_category_insts = category_inst.get_leaf_categories() or [category_inst] - process_insts = list(map(self.config_inst.get_process, self.processes)) - sub_process_insts = { - proc: [sub for sub, _, _ in proc.walk_processes(include_self=True)] - for proc in process_insts - } - - # histogram data per process - hists = {} - - with self.publish_step(f"plotting cutflow in {category_inst.name}"): - for dataset, inp in self.input().items(): - dataset_inst = self.config_inst.get_dataset(dataset) - h_in = inp[self.variable].load(formatter="pickle") - - # sanity checks - n_shifts = len(h_in.axes["shift"]) - if n_shifts != 1: - raise Exception(f"shift axis is supposed to only contain 1 bin, found {n_shifts}") - - # loop and extract one histogram per process - for process_inst in process_insts: - # skip when the dataset is already known to not contain any sub process - if not any(map(dataset_inst.has_process, 
sub_process_insts[process_inst])): - continue - - # work on a copy - h = h_in.copy() - - # axis selections - h = h[{ - "process": [ - hist.loc(p.id) - for p in sub_process_insts[process_inst] - if p.id in h.axes["process"] - ], - "category": [ - hist.loc(c.id) - for c in leaf_category_insts - if c.id in h.axes["category"] - ], - }] - - # axis reductions - h = h[{"process": sum, "category": sum, self.variable: sum}] - - # add the histogram - if process_inst in hists: - hists[process_inst] += h - else: - hists[process_inst] = h - - # there should be hists to plot - if not hists: - raise Exception("no histograms found to plot") - - total = sum(hists.values()).values() if self.relative else np.ones((len(self.selector_steps) + 1, 1)) - - # sort hists by process order - hists = OrderedDict( - (process_inst.copy_shallow(), hists[process_inst] / total) - for process_inst in sorted(hists, key=process_insts.index) - ) - - # call the plot function - fig, _ = self.call_plot_func( - self.plot_function, - hists=hists, - config_inst=self.config_inst, - category_inst=category_inst.copy_shallow(), - **self.get_plot_parameters(), - ) - - # save the plot - for outp in self.output()["plots"]: - outp.dump(fig, formatter="mpl") - - -PlotCutflow.run = PlotCutflow_run - class CreateCutflowTable( DatasetsProcessesMixin, @@ -138,11 +45,11 @@ class CreateCutflowTable( description="when True, uncertainties are not displayed in the table; default: False", ) normalize_yields = luigi.ChoiceParameter( - choices=(law.NO_STR, "per_process", "per_step", "all"), + choices=(law.NO_STR, "per_process", "per_step", "per_process_100", "per_step_100", "all_100"), default=law.NO_STR, significant=False, description="string parameter to define the normalization of the yields; " - "choices: '', per_process, per_category, all; empty default", + "choices: '', per_process, per_category, all; Append 100 to express as percentage; empty default", ) output_suffix = luigi.Parameter( default=law.NO_STR, @@ -154,7 +61,7 
@@ class CreateCutflowTable( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - CreateCutflowHistograms=CreateCutflowHistograms, + MergeCutflowHistograms=MergeCutflowHistograms, ) def create_branch_map(self): @@ -171,7 +78,7 @@ def workflow_requires(self): reqs = super().workflow_requires() reqs["hists"] = [ - self.reqs.CreateCutflowHistograms.req( + self.reqs.MergeCutflowHistograms.req( self, dataset=d, variables=("event",), @@ -183,7 +90,7 @@ def workflow_requires(self): def requires(self): return { - d: self.reqs.CreateCutflowHistograms.req( + d: self.reqs.MergeCutflowHistograms.req( self, branch=0, dataset=d, @@ -236,7 +143,7 @@ def run(self): dataset_inst = self.config_inst.get_dataset(dataset) # load the histogram of the variable named "event" - h_in = inp["event"].load(formatter="pickle") + h_in = inp["hists"]["event"].load(formatter="pickle") # sanity checks n_shifts = len(h_in.axes["shift"]) @@ -304,20 +211,20 @@ def run(self): yields[step].append(value) # obtain normalizaton factors - norm_factors = 1 + norm_factors = 0.01 if '100' in self.normalize_yields else 1 if self.normalize_yields == "all": - norm_factors = sum( + norm_factors *= sum( sum(step_yields) for step_yields in yields.values() ) - elif self.normalize_yields == "per_process": + elif self.normalize_yields.startswith("per_process"): norm_factors = [ - sum(yields[step][i] for step in yields.keys()) + norm_factors * sum(yields[step][i] for step in yields.keys()) for i in range(len(yields[self.selector_steps[0]])) ] - elif self.normalize_yields == "per_step": + elif self.normalize_yields.startswith("per_step"): norm_factors = { - step: sum(step_yields) + step: norm_factors * sum(step_yields) for step, step_yields in yields.items() } @@ -329,9 +236,9 @@ def run(self): for step, step_yields in yields.items(): for i, value in enumerate(step_yields): # get correct norm factor per category and process - if self.normalize_yields == "per_process": + if 
self.normalize_yields.startswith("per_process"): norm_factor = norm_factors[i] - elif self.normalize_yields == "per_step": + elif self.normalize_yields.startswith("per_step"): norm_factor = norm_factors[step] else: norm_factor = norm_factors diff --git a/columnflow/tasks/cutflow.py b/columnflow/tasks/cutflow.py index a75f0721d..261cf7488 100644 --- a/columnflow/tasks/cutflow.py +++ b/columnflow/tasks/cutflow.py @@ -15,15 +15,20 @@ Requirements, AnalysisTask, DatasetTask, ShiftTask, wrapper_factory, ) from columnflow.tasks.framework.mixins import ( - CalibratorsMixin, SelectorStepsMixin, VariablesMixin, CategoriesMixin, ChunkedIOMixin, + CalibratorsMixin, SelectorStepsMixin, VariablesMixin, CategoriesMixin, ChunkedIOMixin, MergeHistogramMixin ) from columnflow.tasks.framework.plotting import ( PlotBase, PlotBase1D, PlotBase2D, ProcessPlotSettingMixin, VariablePlotSettingMixin, ) from columnflow.tasks.framework.decorators import view_output_plots from columnflow.tasks.framework.remote import RemoteWorkflow -from columnflow.tasks.selection import MergeSelectionMasks -from columnflow.util import DotDict, dev_sandbox +from columnflow.tasks.external import GetDatasetLFNs +from columnflow.tasks.selection import SelectEvents +from columnflow.tasks.calibration import CalibrateEvents +from columnflow.production import Producer +from columnflow.util import DotDict, dev_sandbox, maybe_import + +np = maybe_import("numpy") class CreateCutflowHistograms( @@ -46,32 +51,64 @@ class CreateCutflowHistograms( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - MergeSelectionMasks=MergeSelectionMasks, + GetDatasetLFNs=GetDatasetLFNs, + CalibrateEvents=CalibrateEvents, + SelectEvents=SelectEvents, ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # store the normalization weight producer for MC + self.norm_weight_producer = None + if self.dataset_inst.is_mc: + self.norm_weight_producer = Producer.get_cls("normalization_weights")( + 
inst_dict=self.get_producer_kwargs(self), + ) + # strategy for handling missing source columns when adding aliases on event chunks missing_column_alias_strategy = "original" - def create_branch_map(self): - # dummy branch map - return [None] - def workflow_requires(self): reqs = super().workflow_requires() + reqs["lfns"] = self.reqs.GetDatasetLFNs.req(self) + if not self.pilot: + reqs["calibrations"] = [ + self.reqs.CalibrateEvents.req(self, calibrator=calibrator_inst.cls_name) + for calibrator_inst in self.calibrator_insts + if calibrator_inst.produced_columns + ] + reqs["selection"] = self.reqs.SelectEvents.req(self) + else: + # pass-through pilot workflow requirements of upstream task + t = self.reqs.SelectEvents.req(self) + reqs = law.util.merge_dicts(reqs, t.workflow_requires(), inplace=True) - reqs["selection"] = self.reqs.MergeSelectionMasks.req(self, tree_index=0, _exclude={"branches"}) + if self.dataset_inst.is_mc: + reqs["normalization"] = self.norm_weight_producer.run_requires() return reqs def requires(self): - return { - "selection": self.reqs.MergeSelectionMasks.req(self, tree_index=0, branch=0), + reqs = { + "lfns": self.reqs.GetDatasetLFNs.req(self), + "calibrations": [ + self.reqs.CalibrateEvents.req(self, calibrator=calibrator_inst.cls_name) + for calibrator_inst in self.calibrator_insts + if calibrator_inst.produced_columns + ], + "selection": self.reqs.SelectEvents.req(self), } + if self.dataset_inst.is_mc: + reqs["normalization"] = self.norm_weight_producer.run_requires() + + return reqs + + # TODO: CreateHistograms has a @MergeReducedEventsUser.maybe_dummy here def output(self): return { - var: self.target(f"cutflow_hist__{var}.pickle") - for var in self.variables + "hists": self.target(f"histograms__vars_{self.variables_repr}__{self.branch}.pickle"), } @law.decorator.log @@ -81,10 +118,11 @@ def run(self): import hist import numpy as np import awkward as ak - from columnflow.columnar_util import Route, add_ak_aliases + from 
columnflow.columnar_util import Route, add_ak_aliases, mandatory_coffea_columns, update_ak_array # prepare inputs and outputs inputs = self.input() + lfn_task = self.requires()["lfns"] # create a temp dir for saving intermediate files tmp_dir = law.LocalDirectoryTarget(is_tmp=True) @@ -93,10 +131,17 @@ def run(self): # get shift dependent aliases aliases = self.local_shift_inst.x("column_aliases", {}) + # setup the normalization weights producer + if self.dataset_inst.is_mc: + self.norm_weight_producer.run_setup( + self.requires()["normalization"], + self.input()["normalization"], + ) + # define columns that need to be read read_columns = {"category_ids", "process_id"} | set(aliases.values()) if self.dataset_inst.is_mc: - read_columns |= {"normalization_weight"} + read_columns |= self.norm_weight_producer.used_columns read_columns = {Route(c) for c in read_columns} # define steps @@ -120,7 +165,9 @@ def run(self): expressions[variable_inst.name] = expr # prepare columns to load - load_columns = {("events" + route) for route in read_columns} | {Route("steps.*")} + load_columns = read_columns | set(mandatory_coffea_columns) + load_nano_columns = {("events" + route) for route in read_columns} | set(mandatory_coffea_columns) + load_sel_columns = {Route("steps.*")} # prepare histograms histograms = {} @@ -147,16 +194,35 @@ def prepare_hists(steps): # enable weights and store it histograms[var_key] = h.Weight() - for arr, pos in self.iter_chunked_io( - inputs["selection"]["masks"].path, - source_type="awkward_parquet", - read_columns=load_columns, + # let the lfn_task prepare the nano file (basically determine a good pfn) + [(lfn_index, input_file)] = lfn_task.iter_nano_files(self) + + # open the input file with uproot + with self.publish_step("load and open ..."): + nano_file = input_file.load(formatter="uproot") + + input_paths = [nano_file] + input_paths.append(inputs["selection"]["results"].path) + input_paths.extend([inp["columns"].path for inp in 
inputs["calibrations"]]) + if self.selector_inst.produced_columns: + input_paths.append(inputs["selection"]["columns"].path) + + for (events, sel, *diffs), pos in self.iter_chunked_io( + input_paths, + source_type=["coffea_root"] + (len(input_paths) - 1) * ["awkward_parquet"], + read_columns=[load_nano_columns, load_sel_columns] + (len(input_paths) - 2) * [load_columns], ): - events = arr.events + + # add the calibrated diffs and potentially new columns + events = update_ak_array(events, *diffs) + + # add normalization weight + if self.dataset_inst.is_mc: + events = self.norm_weight_producer(events) # overwrite steps if not defined yet - if not steps: - steps = arr.steps.fields + if steps is None: + steps = sel.steps.fields # prepare histograms and exprepssions once if not histograms: @@ -203,12 +269,12 @@ def get_point(mask=Ellipsis): # fill all other steps mask = True for step in steps: - if step not in arr.steps.fields: + if step not in sel.steps.fields: raise ValueError( f"step '{step}' is not defined by selector {self.selector}", ) # incrementally update the mask and fill the point - mask = mask & arr.steps[step] + mask = mask & sel.steps[step] fill_kwargs = get_point(mask) arrays = ak.flatten(ak.cartesian(fill_kwargs)) histograms[var_key].fill( @@ -217,8 +283,7 @@ def get_point(mask=Ellipsis): ) # dump the histograms - for var_key in histograms.keys(): - self.output()[var_key].dump(histograms[var_key], formatter="pickle") + self.output()["hists"].dump(histograms, formatter="pickle") CreateCutflowHistogramsWrapper = wrapper_factory( @@ -228,6 +293,30 @@ def get_point(mask=Ellipsis): ) +class MergeCutflowHistograms( + MergeHistogramMixin, + SelectorStepsMixin, + CalibratorsMixin, + DatasetTask, + RemoteWorkflow, +): + sandbox = dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) + + # upstream requirements + reqs = Requirements( + RemoteWorkflow.reqs, + CreateHistograms=CreateCutflowHistograms, + ) + + selector_steps_order_sensitive = True + 
+MergeCutflowHistogramsWrapper = wrapper_factory( + base_cls=AnalysisTask, + require_cls=MergeCutflowHistograms, + enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], +) + + class PlotCutflowBase( SelectorStepsMixin, CategoriesMixin, @@ -246,7 +335,7 @@ class PlotCutflowBase( # upstream requirements reqs = Requirements( RemoteWorkflow.reqs, - CreateCutflowHistograms=CreateCutflowHistograms, + MergeCutflowHistograms=MergeCutflowHistograms, ) def store_parts(self): @@ -273,6 +362,12 @@ class PlotCutflow( f"default: '{CreateCutflowHistograms.default_variables[0]}'", ) + relative = luigi.BoolParameter( + default=False, + significant=False, + description="plot cutflow as fraction of total at each step", + ) + # upstream requirements reqs = Requirements( PlotCutflowBase.reqs, @@ -293,7 +388,7 @@ def workflow_requires(self): reqs = super().workflow_requires() reqs["hists"] = [ - self.reqs.CreateCutflowHistograms.req( + self.reqs.MergeCutflowHistograms.req( self, dataset=d, variables=(self.variable,), @@ -305,7 +400,7 @@ def workflow_requires(self): def requires(self): return { - d: self.reqs.CreateCutflowHistograms.req( + d: self.reqs.MergeCutflowHistograms.req( self, branch=0, dataset=d, @@ -340,7 +435,7 @@ def run(self): with self.publish_step(f"plotting cutflow in {category_inst.name}"): for dataset, inp in self.input().items(): dataset_inst = self.config_inst.get_dataset(dataset) - h_in = inp[self.variable].load(formatter="pickle") + h_in = inp["hists"][self.variable].load(formatter="pickle") # sanity checks n_shifts = len(h_in.axes["shift"]) @@ -383,9 +478,11 @@ def run(self): if not hists: raise Exception("no histograms found to plot") + total = sum(hists.values()).values() if self.relative else np.ones((len(self.selector_steps) + 1, 1)) + # sort hists by process order hists = OrderedDict( - (process_inst.copy_shallow(), hists[process_inst]) + (process_inst.copy_shallow(), hists[process_inst] / total) for process_inst in 
sorted(hists, key=process_insts.index) ) @@ -463,14 +560,14 @@ def create_branch_map(self): def workflow_requires(self): reqs = super().workflow_requires() reqs["hists"] = [ - self.reqs.CreateCutflowHistograms.req(self, dataset=d, _exclude={"branches"}) + self.reqs.MergeCutflowHistograms.req(self, dataset=d, _exclude={"branches"}) for d in self.datasets ] return reqs def requires(self): return { - d: self.reqs.CreateCutflowHistograms.req(self, dataset=d, branch=0) + d: self.reqs.MergeCutflowHistograms.req(self, dataset=d, branch=0) for d in self.datasets } @@ -507,7 +604,7 @@ def run(self): with self.publish_step(f"plotting {self.branch_data.variable} in {category_inst.name}"): for dataset, inp in self.input().items(): dataset_inst = self.config_inst.get_dataset(dataset) - h_in = inp[self.branch_data.variable].load(formatter="pickle") + h_in = inp["hists"][self.branch_data.variable].load(formatter="pickle") # sanity checks n_shifts = len(h_in.axes["shift"]) diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py index 0861cc0d3..dcfa0b732 100644 --- a/columnflow/tasks/framework/mixins.py +++ b/columnflow/tasks/framework/mixins.py @@ -587,9 +587,9 @@ class SelectorStepsMixin(SelectorMixin): """ selector_steps = law.CSVParameter( - default=(), - description="a subset of steps of the selector to apply; uses all steps when empty; " - "empty default", + default=None, + description="a subset of steps of the selector to apply; uses all steps when None; " + "None default", brace_expand=True, parse_empty=True, ) @@ -631,6 +631,9 @@ def resolve_param_values(cls, params: dict[str, Any]) -> dict[str, Any]: if not cls.selector_steps_order_sensitive and "selector_steps" in params: params["selector_steps"] = tuple(sorted(params["selector_steps"])) + if "selector_steps" in params and params["selector_steps"] == (None,): + params["selector_steps"] = None + return params @classmethod @@ -667,8 +670,8 @@ def store_parts(self) -> 
law.util.InsertableDict: steps = self.selector_steps if not self.selector_steps_order_sensitive: steps = sorted(steps) - if steps: - parts["selector"] += "__steps_" + "_".join(steps) + if steps is not None: + parts["selector"] += ("__steps_" + "_".join(steps) if steps else "__inclusive") return parts @@ -2226,3 +2229,81 @@ def iter_chunked_io(self, *args, **kwargs): # eager, overly cautious gc del handler gc.collect() + + +class MergeHistogramMixin( + VariablesMixin, + law.LocalWorkflow, + ): + only_missing = luigi.BoolParameter( + default=False, + description="when True, identify missing variables first and only require histograms of " + "missing ones; default: False", + ) + remove_previous = luigi.BoolParameter( + default=False, + significant=False, + description="when True, remove particlar input histograms after merging; default: False", + ) + + def create_branch_map(self): + # create a dummy branch map so that this task could be submitted as a job + return {0: None} + + def workflow_requires(self): + reqs = super().workflow_requires() + + reqs["hists"] = self.as_branch().requires() + + return reqs + + def requires(self): + # optional dynamic behavior: determine not yet created variables and require only those + prefer_cli = {"variables"} + variables = self.variables + if self.only_missing: + prefer_cli.clear() + missing = self.output().count(existing=False, keys=True)[1] + variables = tuple(sorted(missing, key=variables.index)) + + if not variables: + return [] + + return self.reqs.CreateHistograms.req( + self, + branch=-1, + variables=tuple(variables), + _exclude={"branches"}, + _prefer_cli=prefer_cli, + ) + + def output(self): + return {"hists": law.SiblingFileCollection({ + variable_name: self.target(f"hist__{variable_name}.pickle") + for variable_name in self.variables + })} + + @law.decorator.log + def run(self): + # preare inputs and outputs + inputs = self.input()["collection"] + outputs = self.output() + + # load input histograms + hists = [ + 
inp["hists"].load(formatter="pickle") + for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) + ] + + # create a separate file per output variable + variable_names = list(hists[0].keys()) + for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): + self.publish_message(f"merging histograms for '{variable_name}'") + + variable_hists = [h[variable_name] for h in hists] + merged = sum(variable_hists[1:], variable_hists[0].copy()) + outputs["hists"][variable_name].dump(merged, formatter="pickle") + + # optionally remove inputs + if self.remove_previous: + inputs.remove() diff --git a/columnflow/tasks/histograms.py b/columnflow/tasks/histograms.py index 322500249..da93b8db0 100644 --- a/columnflow/tasks/histograms.py +++ b/columnflow/tasks/histograms.py @@ -12,7 +12,7 @@ from columnflow.tasks.framework.base import Requirements, AnalysisTask, DatasetTask, wrapper_factory from columnflow.tasks.framework.mixins import ( CalibratorsMixin, SelectorStepsMixin, ProducersMixin, MLModelsMixin, VariablesMixin, - ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, + ShiftSourcesMixin, WeightProducerMixin, ChunkedIOMixin, MergeHistogramMixin ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.reduction import MergeReducedEventsUser, MergeReducedEvents @@ -268,27 +268,15 @@ def expr(events, *args, **kwargs): class MergeHistograms( - VariablesMixin, + MergeHistogramMixin, WeightProducerMixin, MLModelsMixin, ProducersMixin, SelectorStepsMixin, CalibratorsMixin, DatasetTask, - law.LocalWorkflow, RemoteWorkflow, ): - only_missing = luigi.BoolParameter( - default=False, - description="when True, identify missing variables first and only require histograms of " - "missing ones; default: False", - ) - remove_previous = luigi.BoolParameter( - default=False, - significant=False, - description="when True, remove particlar input histograms after merging; default: False", - ) - sandbox = 
dev_sandbox(law.config.get("analysis", "default_columnar_sandbox")) # upstream requirements @@ -297,68 +285,6 @@ class MergeHistograms( CreateHistograms=CreateHistograms, ) - def create_branch_map(self): - # create a dummy branch map so that this task could be submitted as a job - return {0: None} - - def workflow_requires(self): - reqs = super().workflow_requires() - - reqs["hists"] = self.as_branch().requires() - - return reqs - - def requires(self): - # optional dynamic behavior: determine not yet created variables and require only those - prefer_cli = {"variables"} - variables = self.variables - if self.only_missing: - prefer_cli.clear() - missing = self.output().count(existing=False, keys=True)[1] - variables = tuple(sorted(missing, key=variables.index)) - - if not variables: - return [] - - return self.reqs.CreateHistograms.req( - self, - branch=-1, - variables=tuple(variables), - _exclude={"branches"}, - _prefer_cli=prefer_cli, - ) - - def output(self): - return {"hists": law.SiblingFileCollection({ - variable_name: self.target(f"hist__{variable_name}.pickle") - for variable_name in self.variables - })} - - @law.decorator.log - def run(self): - # preare inputs and outputs - inputs = self.input()["collection"] - outputs = self.output() - - # load input histograms - hists = [ - inp["hists"].load(formatter="pickle") - for inp in self.iter_progress(inputs.targets.values(), len(inputs), reach=(0, 50)) - ] - - # create a separate file per output variable - variable_names = list(hists[0].keys()) - for variable_name in self.iter_progress(variable_names, len(variable_names), reach=(50, 100)): - self.publish_message(f"merging histograms for '{variable_name}'") - - variable_hists = [h[variable_name] for h in hists] - merged = sum(variable_hists[1:], variable_hists[0].copy()) - outputs["hists"][variable_name].dump(merged, formatter="pickle") - - # optionally remove inputs - if self.remove_previous: - inputs.remove() - MergeHistogramsWrapper = wrapper_factory( 
base_cls=AnalysisTask, From 3eb4506e9c134f7e20b854875c0dbcf813a441f1 Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Fri, 17 May 2024 10:43:22 +0200 Subject: [PATCH 092/119] centering discrete x values --- columnflow/plotting/plot_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/columnflow/plotting/plot_util.py b/columnflow/plotting/plot_util.py index afa238bea..c55012d6d 100644 --- a/columnflow/plotting/plot_util.py +++ b/columnflow/plotting/plot_util.py @@ -202,8 +202,9 @@ def prepare_style_config( # disable minor ticks based on variable_inst if variable_inst.discrete_x: - # TODO: find sth better than plain bin edges or possibly memory intense range(*xlim) - style_config["ax_cfg"]["xticks"] = variable_inst.bin_edges + # TODO: options for very large ranges, or non-uniform discrete x + tx = range(int(xlim[0]), int(xlim[1]+1)) + style_config["ax_cfg"]["xticks"] = tx style_config["ax_cfg"]["minorxticks"] = [] if variable_inst.discrete_y: style_config["ax_cfg"]["minoryticks"] = [] From 75b0ca7a45c304e0d629a2a1d545ecda45662e1c Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 17 May 2024 17:17:53 +0200 Subject: [PATCH 093/119] implementation top lepton MVA --- columnflow/calibration/cmsGhent/lepton_mva.py | 148 ++++++++++++ .../production/cmsGhent/gen_features.py | 217 ++++++++++++++++++ .../selection/cmsGhent/lepton_mva_cuts.py | 88 +++++++ sandboxes/venv_lepton_mva.sh | 18 ++ sandboxes/venv_lepton_mva.txt | 5 + sandboxes/venv_lepton_mva_dev.sh | 18 ++ 6 files changed, 494 insertions(+) create mode 100644 columnflow/calibration/cmsGhent/lepton_mva.py create mode 100644 columnflow/production/cmsGhent/gen_features.py create mode 100644 columnflow/selection/cmsGhent/lepton_mva_cuts.py create mode 100644 sandboxes/venv_lepton_mva.sh create mode 100644 sandboxes/venv_lepton_mva.txt create mode 100644 sandboxes/venv_lepton_mva_dev.sh diff --git a/columnflow/calibration/cmsGhent/lepton_mva.py 
b/columnflow/calibration/cmsGhent/lepton_mva.py new file mode 100644 index 000000000..90e38647f --- /dev/null +++ b/columnflow/calibration/cmsGhent/lepton_mva.py @@ -0,0 +1,148 @@ +""" +Code to add lepton MVA to NanoAOD +""" + +from collections import OrderedDict + +from columnflow.calibration import Calibrator, calibrator +from columnflow.production import Producer, producer +from columnflow.util import maybe_import +from columnflow.columnar_util import set_ak_column, InsertableDict +from columnflow.columnar_util_Ghent import TetraVec +from columnflow.tasks.external import BundleExternalFiles + +np = maybe_import("numpy") +ak = maybe_import("awkward") +coffea = maybe_import("coffea") +maybe_import("coffea.nanoevents.methods.nanoaod") + + +@producer( + uses={ + f"{lep}.{p}" + for lep in ["Muon", "Electron"] + for p in ["pt", "eta", "miniPFRelIso_all", "miniPFRelIso_chg", "jetRelIso", "dxy", "dz", "jetIdx", + "jetNDauCharged", "jetPtRelv2", "pfRelIso03_all", "sip3d"] + } | {"Jet.btagDeepFlavB", "Electron.mvaFall17V2noIso", "Muon.segmentComp"}, + produces={ + f"{lep}.{p}" + for lep in ["Muon", "Electron"] + for p in ["abseta", "miniPFRelIso_neutral", "jetPtRatio", "jetBTagDeepFlavor", "log_absdxy", "log_absdz"] + }, +) +def lepton_mva_inputs_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + """ + collects all inputs to the TOP lepton MVA (v1) and makes the necessary transformations + """ + for lepton_name in ["Muon", "Electron"]: + lepton = events[lepton_name] + matched_jet = lepton.jetIdx + is_matched = matched_jet != -1 + + # replace jetRelIso by the equivalent used in the MVA + # if no matched jet, jetRelIso == pfRelIso04_all in NanoAOD, but MVA assumes then zero + events = set_ak_column(events, f"{lepton_name}.jetPtRatio", 1. 
/ (lepton.jetRelIso + 1.)) + + # matched deepJet score of closest jet if any (zero otherwise) + btag_values = ak.pad_none(events.Jet.btagDeepFlavB, target=1)[matched_jet] + events = set_ak_column(events, f"{lepton_name}.jetBTagDeepFlavor", ak.where(is_matched, btag_values, 0.)) + + # impact parameters in log + for impact in ["dxy", "dz"]: + events = set_ak_column(events, f"{lepton_name}.log_abs" + impact, np.log(np.abs(lepton[impact]))) + + # Relative mini-isolation with neutral PF objects + events = set_ak_column(events, f"{lepton_name}.miniPFRelIso_neutral", lepton.miniPFRelIso_all - lepton.miniPFRelIso_chg) + + # absolute eta + events = set_ak_column(events, f"{lepton_name}.abseta", np.abs(lepton.eta)) + + return events + + +_shared_mva_inputs = [ + "pt", + "eta", + "jetNDauCharged", + "miniPFRelIso_chg", + "miniPFRelIso_neutral", + "jetPtRelv2", + "jetPtRatio", + "pfRelIso03_all", + "jetBTagDeepFlavor", + "sip3d", + "log_absdxy", + "log_absdz", +] + +lepton_mva_inputs = { + "Electron": [*_shared_mva_inputs, "mvaFall17V2noIso"], # add "lost hits" for version 2 + "Muon": [*_shared_mva_inputs, "segmentComp"], + "Lepton": _shared_mva_inputs +} + + +@calibrator( + uses={lepton_mva_inputs_producer}, + produces={"Electron.mvaTOP", "Muon.mvaTOP"}, + sandbox="bash::$SINGLETOP_BASE/sandboxes/venv_lepton_mva.sh", +) +def lepton_mva_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Array: + """ + Produces the TOP lepton MVA (v1) scores. + Requires an external file in the config under ``lepton_mva.weights``: + + .. code-block:: python + + cfg.x.external_files = DotDict.wrap({ + "lepton_mva": + "weights": { + "Muon": f"YOURDIRECTORY/mu_TOPUL18_XGB.weights.bin", + "Electron": f"YOURDIRECTORY/weights/el_TOPUL18_XGB.weights.bin", + }, + }) + """ + events = self[lepton_mva_inputs_producer](events) + for lepton in ["Muon", "Electron"]: + features = [events[lepton][p] for p in lepton_mva_inputs[lepton]] + # set None values (e.g. 
when there is no matched jet) to zero + features = ak.fill_none(features, 0.) + # flatten into a numpy array of shape (ninstances, nfeatures) + counts = ak.num(features[0]) + features = np.transpose(np.array(ak.flatten(features, axis=2))) + # make c-contiguous (rows are stored as contiguous blocks of memory.) + features = np.ascontiguousarray(features) + # call xgboost predictor + scores = self.mva[lepton].inplace_predict(features) + # unflatten into an awkward array + scores = ak.unflatten(scores, counts) + # set the scores as an additional field for muons + events = set_ak_column(events, f"{lepton}.mvaTOP", scores) + + return events + + +@lepton_mva_producer.requires +def lepton_mva_producer_requires(self: Calibrator, reqs: dict) -> None: + if "external_files" in reqs: + return + reqs["external_files"] = BundleExternalFiles.req(self.task) + + +@lepton_mva_producer.setup +def lepton_mva_producer_setup( + self: Calibrator, + reqs: dict, + inputs: dict, + reader_targets: InsertableDict, +) -> None: + bundle = reqs["external_files"] + + # create the xgboost predictor + import xgboost + + self.mva = {} + + for lepton in ["Electron", "Muon"]: + self.mva[lepton] = xgboost.Booster() + self.mva[lepton].load_model(bundle.files.lepton_mva["weights"][lepton].path) diff --git a/columnflow/production/cmsGhent/gen_features.py b/columnflow/production/cmsGhent/gen_features.py new file mode 100644 index 000000000..7b2a4a296 --- /dev/null +++ b/columnflow/production/cmsGhent/gen_features.py @@ -0,0 +1,217 @@ +from collections import defaultdict +from typing import Tuple + +import law + +from columnflow.util import maybe_import, four_vec +from columnflow.columnar_util import set_ak_column +from columnflow.production import Producer, producer +from columnflow.columnar_util_Ghent import TetraVec + +np = maybe_import("numpy") +ak = maybe_import("awkward") +coffea = maybe_import("coffea") + + +def _geometric_matching(particles1: ak.Array, particles2: ak.Array) -> (ak.Array, ak.Array): + 
""" + Returns two awkward arrays. + First contains that for each particle in **particles** the closest particle in the same event in **particles2**. + Second tells you whether the found closest particle is contained within a cone of 0.2. + """ + particles1, particles2 = ak.unzip(ak.cartesian([particles1, particles2], axis=1, nested=True)) + dr = particles1.delta_r(particles2) + drmin_idx = ak.argmin(dr, axis=-1, keepdims=True) + drmin = ak.flatten(dr[drmin_idx], axis=2) + closest_match = ak.flatten(particles2[drmin_idx], axis=2) + return closest_match, ak.fill_none(drmin < 0.2, False) + + +# map of the status flag name to the corresponding bit in statusFlags +_statusmap = ({ + "isPrompt": 0, + "isDecayedLeptonHadron": 1, + "isTauDecayProduct": 2, + "isPromptTauDecayProduct": 3, + "isDirectTauDecayProduct": 4, + "isDirectPromptTauDecayProduct": 5, + "isDirectHadronDecayProduct": 6, + "isHardProcess": 7, + "fromHardProcess": 8, + "isHardProcessTauDecayProduct": 9, + "isDirectHardProcessTauDecayProduct": 10, + "fromHardProcessBeforeFSR": 11, + "isFirstCopy": 12, + "isLastCopy": 13, + "isLastCopyBeforeFSR": 14, + }) + +# status flags that should be present for a prompt genparticle +_prompt_status = ["isPrompt", "isDirectPromptTauDecayProduct", "isHardProcess", + "fromHardProcess", "fromHardProcessBeforeFSR"] + + +@producer( + uses=four_vec( + ("Electron", "Muon"), + ("pdgId", "genPartIdx")) | + four_vec( + ("GenPart"), + ("pdgId", "status", "statusFlags") + ), + produces=four_vec( + {"Electron", "Muon"}, + {"isPrompt", "matchPdgId", "isChargeFlip"} + ), + mc_only=True, + exposed=False, +) +def lepton_gen_features( + self: Producer, + events: ak.Array, + **kwargs, +) -> ak.Array: + + genpart = events.GenPart + + for name, abs_pdgId in (("Electron", 11), ("Muon", 13)): + + lepton = events[name] + + # first check if already has a matched gen particle (include charge matching) + is_nanoAOD_matched = (lepton.genPartIdx >= 0) + is_nanoAOD_charge_matched = is_nanoAOD_matched 
& (lepton.pdgId == genpart.pdgId[lepton.genPartIdx]) + matched_genpart = genpart[lepton.genPartIdx] + + # if this fails apply geometric matching to stable leptons and photons + + # select stable gen particles + stable_genpart = genpart[genpart.status == 1] + + # first look for closest mathing generator lepton within cone of 0.2 + gen_abs_pdgId = abs(stable_genpart.pdgId) + geom_match_lepton, lepton_within_cone = _geometric_matching(lepton, stable_genpart[gen_abs_pdgId == abs_pdgId]) + + # if not within cone of 0.2, allow for a photon match + geom_match_photon, photon_within_cone = _geometric_matching(lepton, stable_genpart[gen_abs_pdgId == 22]) + + # finally apply hierarchy to determine matched gen particle + match = ak.Array(ak.zeros_like(geom_match_photon)) + match = ak.where(photon_within_cone, geom_match_photon, match) + match = ak.where(lepton_within_cone, geom_match_lepton, match) + match = ak.where(is_nanoAOD_charge_matched, matched_genpart, match) + + # check for matched gen particle if it fulfills all status flags for being prompt + match_isPrompt = False + for status in _prompt_status: + match_isPrompt = match_isPrompt | (match.statusFlags & (1 << _statusmap[status]) != 0) + + valid_match = is_nanoAOD_matched | lepton_within_cone | photon_within_cone + match_pdgId = (match.pdgId == lepton.pdgId) & valid_match + is_chargeflip = (match.pdgId == -lepton.pdgId) & valid_match + + events = set_ak_column(events, f"{name}.isPrompt", ak.fill_none(match_isPrompt, False, axis=-1)) + events = set_ak_column(events, f"{name}.matchPdgId", ak.fill_none(match_pdgId, False, axis=-1)) + events = set_ak_column(events, f"{name}.isChargeFlip", ak.fill_none(is_chargeflip, False, axis=-1)) + + return events + + +@producer( + uses=four_vec( + ("Electron", "Muon"), + ("pdgId", "genPartIdx")) | + four_vec( + ("GenPart"), + ("pdgId", "status", "statusFlags") + ), + produces=four_vec( + {"Electron", "Muon"}, + {"isPromptJules", "matchPdgIdJules", "isChargeFlipJules"} + ), + 
mc_only=True, + exposed=False, +) +def lepton_gen_features_jules( + self: Producer, + events: ak.Array, + **kwargs, +) -> ak.Array: + + electron = (events.Electron) + muon = (events.Muon) + genpart = (events.GenPart) + + statusmap = ({ + "isPrompt": 0, + "isDecayedLeptonHadron": 1, + "isTauDecayProduct": 2, + "isPromptTauDecayProduct": 3, + "isDirectTauDecayProduct": 4, + "isDirectPromptTauDecayProduct": 5, + "isDirectHadronDecayProduct": 6, + "isHardProcess": 7, + "fromHardProcess": 8, + "isHardProcessTauDecayProduct": 9, + "isDirectHardProcessTauDecayProduct": 10, + "fromHardProcessBeforeFSR": 11, + "isFirstCopy": 12, + "isLastCopy": 13, + "isLastCopyBeforeFSR": 14, + }) + + def has_statusFlag(gen, statusFlag): + return (gen.statusFlags & (1 << statusFlag) != 0) + + for name, lepton, in (("Electron", electron), ("Muon", muon)): + # leptons in [x,y,:] are identical, genparts i, [x,y,:] are all genparts (needed to remove genparts that do not have the same pdgId) + _lepton, _genpart = ak.unzip(ak.cartesian([lepton, genpart], axis=1, nested=True)) + + # mask to match lepton pdgId with gen particle before looking at nearest generator particle + pdgId_mask = (abs(_lepton.pdgId) == abs(_genpart.pdgId)) + + status_mask = ak.where(abs(_genpart.pdgId) == 15, (_genpart.status == 2) & + (has_statusFlag(_genpart, 13)), _genpart.status == 1) + + # reduced gen particle list of possible matching candidates + _genpart_allowphoton = _genpart[(pdgId_mask | (abs(_genpart.pdgId) == 22)) & status_mask] + _genpart = _genpart[pdgId_mask & status_mask] + + dr = ak.min(lepton.delta_r(_genpart), axis=-1) + dr_allowphoton = ak.min(lepton.delta_r(_genpart_allowphoton), axis=-1) + + # take closest gen particle as match (with pdgId and status mask aplied on the gen particle) + custom_match = ak.flatten(_genpart[ak.argmin(lepton.delta_r(_genpart), axis=-1, keepdims=True)], axis=2) + custom_allowphoton = ak.flatten(_genpart_allowphoton[ak.argmin( + lepton.delta_r(_genpart_allowphoton), axis=-1, 
keepdims=True)], axis=2) + + # if delta r > 0.2, check for match with gen photons + cond = ak.fill_none((dr > 0.2), True) + custom_match = ak.where(cond, custom_allowphoton, custom_match) + + # First check if lepton ahs a designated gen particle, if not use custom match + cond = ak.fill_none((lepton.genPartIdx >= 0) & ( + genpart[lepton.genPartIdx].pdgId == lepton.pdgId), True) + gen_match = ak.where(cond, genpart[lepton.genPartIdx], custom_match) + cond = ak.fill_none((lepton.genPartIdx >= 0) & ( + genpart[lepton.genPartIdx].pdgId == lepton.pdgId), True) + valid_match = ak.where(cond, True, dr_allowphoton < 0.2) + + # if delta r still > 0.2, there is no valid custom match! + match_isPrompt = ( + (has_statusFlag(gen_match, statusmap["isPrompt"])) | + (has_statusFlag(gen_match, statusmap["isDirectPromptTauDecayProduct"])) | + (has_statusFlag(gen_match, statusmap["isHardProcess"])) | + (has_statusFlag(gen_match, statusmap["fromHardProcess"])) | + (has_statusFlag(gen_match, statusmap["fromHardProcessBeforeFSR"])) + ) & (valid_match) + + matchPdgId = (gen_match.pdgId == lepton.pdgId) & valid_match + + is_chargeflip = (gen_match.pdgId == -lepton.pdgId) & valid_match + + events = set_ak_column(events, f"{name}.isPromptJules", ak.fill_none(match_isPrompt, 0, axis=-1)) + events = set_ak_column(events, f"{name}.matchPdgIdJules", ak.fill_none(matchPdgId, 0, axis=-1)) + events = set_ak_column(events, f"{name}.isChargeFlipJules", ak.fill_none(is_chargeflip, 0, axis=-1)) + + return events diff --git a/columnflow/selection/cmsGhent/lepton_mva_cuts.py b/columnflow/selection/cmsGhent/lepton_mva_cuts.py new file mode 100644 index 000000000..072ccb220 --- /dev/null +++ b/columnflow/selection/cmsGhent/lepton_mva_cuts.py @@ -0,0 +1,88 @@ +# coding: utf-8 + +""" +Selection modules for object selection of Muon, Electron, and Jet. 
+""" + +from collections import defaultdict +from typing import Tuple, Literal, Dict + +import law + +from columnflow.util import maybe_import, four_vec +from columnflow.columnar_util import set_ak_column, optional_column +from columnflow.production.util import attach_coffea_behavior +from columnflow.selection import Selector, SelectionResult, selector +from columnflow.selection.util import masked_sorted_indices + +ak = maybe_import("awkward") + + +@selector( + uses=( + four_vec({"Electron", "Muon"}, {"dxy", "dz", "sip3d", "miniPFRelIso_all"}) | + {"Electron.lostHits", "Electron.deltaEtaSC", "Muon.mediumId"} | + optional_column("Electron.mvaTOP", "Muon.mvaTOP") + ), +) +def lepton_mva_object( + self: Selector, + events: ak.Array, + working_point: 'Dict[Listeral["Muon", "Electron"], str] | str'="veto", + **kwargs, +) -> Tuple[ak.Array, SelectionResult]: + """ + The following cuts are the cuts that are required to be able to use the lepton MVA. Leptons that are + passing these cuts are referred to as "veto" leptons. + No additional cuts should be applied for the available scale factors to apply, except on p_T and eta. + + :param events: Array containing events in the NanoAOD format + :param working_point: name of the working_point or dict mapping leptons to working points to apply to the muons + and electrons outputted in the SelectionResult + :return: Tuple containing the events array and a :py:class:`~columnflow.selection.SelectionResult` + with selected Muon and Electron objects passing **working_point**. 
The event array has extra Muon and Electron + boolean fields for the veto definition, as well as the TOP mva working points if the mvaTOP field is present in the + event.Muon and event.Electron fields + + """ + if isinstance(working_point, str): + working_point = {l: working_point for l in ["Muon", "Electron"]} + if set(working_point.values()) != {"veto"}: + assert working_point in self.config_inst.x.top_mva_wps + assert "mvaTOP" in events.Electron.fields + assert "mvaTOP" in events.Muon.fields + + # conditions differing for muons and leptons + ele, mu = events.Electron, events.Muon + ele_absetaSC = abs(ele.eta + ele.deltaEtaSC) + masks = { + "Electron": (abs(ele.eta) < 2.5) & (ele.lostHits < 2) & ((ele_absetaSC > 1.5560) | (ele_absetaSC < 1.4442)), + "Muon": (abs(events.Muon.eta) < 2.4) & events.Muon.mediumId, + } + + # conditions shared for muons and leptons + for lepton_name in masks: + lepton = events[lepton_name] + veto_mask = masks[lepton_name] & ( + (lepton.pt > 10) & + (lepton.miniPFRelIso_all < 0.4) & + (lepton.sip3d < 8) & + (lepton.dz < 0.1) & + (lepton.dxy < 0.05) + ) + events = set_ak_column(events, f"{lepton_name}.veto", veto_mask) + if "mvaTOP" in lepton.fields: + wps = self.config_inst.x.top_mva_wps + for wp in wps: + events = set_ak_column(events, f"{lepton_name}.{wp}", + events[lepton_name]["veto"] & + (lepton.mvaTOP > wps[wp]) + ) + return events, SelectionResult( + steps={}, + objects={ + lep: + {lep: masked_sorted_indices(events[lep][working_point[lep]], events[lep].pt)} + for lep in ["Muon", "Electrion"] + }, + ) diff --git a/sandboxes/venv_lepton_mva.sh b/sandboxes/venv_lepton_mva.sh new file mode 100644 index 000000000..aa9e74ba9 --- /dev/null +++ b/sandboxes/venv_lepton_mva.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Script that sets up a virtual env in $CF_VENV_PATH. +# For more info on functionality and parameters, see the generic setup script _setup_venv.sh. 
+ +action() { + local shell_is_zsh=$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" ) + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # set variables and source the generic venv setup + export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_VENV_NAME="$( basename "${this_file%.sh}" )" + export CF_VENV_REQUIREMENTS="${this_dir}/venv_lepton_mva.txt" + + source "${CF_BASE}/sandboxes/_setup_venv.sh" "$@" +} +action "$@" diff --git a/sandboxes/venv_lepton_mva.txt b/sandboxes/venv_lepton_mva.txt new file mode 100644 index 000000000..ac791087d --- /dev/null +++ b/sandboxes/venv_lepton_mva.txt @@ -0,0 +1,5 @@ +# version 1 + +-r ../modules/columnflow/sandboxes/columnar.txt + +xgboost==2.0.2 diff --git a/sandboxes/venv_lepton_mva_dev.sh b/sandboxes/venv_lepton_mva_dev.sh new file mode 100644 index 000000000..51f121991 --- /dev/null +++ b/sandboxes/venv_lepton_mva_dev.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +# Script that sets up a virtual env in $CF_VENV_PATH. +# For more info on functionality and parameters, see the generic setup script _setup_venv.sh. 
+ +action() { + local shell_is_zsh=$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" ) + local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )" + local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )" + + # set variables and source the generic venv setup + export CF_SANDBOX_FILE="${CF_SANDBOX_FILE:-${this_file}}" + export CF_VENV_NAME="$( basename "${this_file%.sh}" )" + export CF_VENV_REQUIREMENTS="${this_dir}/venv_lepton_mva.txt,${CF_BASE}/sandboxes/dev.txt" + + source "${CF_BASE}/sandboxes/_setup_venv.sh" "$@" +} +action "$@" From 53f870b1d8295ec9a466f3ef479764393799eb5b Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Fri, 17 May 2024 17:26:15 +0200 Subject: [PATCH 094/119] Cutflow dev (#10) allow to remove initial steps in cutflow plots --- columnflow/tasks/cutflow.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/columnflow/tasks/cutflow.py b/columnflow/tasks/cutflow.py index 261cf7488..141490b84 100644 --- a/columnflow/tasks/cutflow.py +++ b/columnflow/tasks/cutflow.py @@ -368,6 +368,12 @@ class PlotCutflow( description="plot cutflow as fraction of total at each step", ) + skip_initial = luigi.BoolParameter( + default=False, + significant=False, + description="do not plot the event selection before applying any steps", + ) + # upstream requirements reqs = Requirements( PlotCutflowBase.reqs, @@ -468,6 +474,9 @@ def run(self): # axis reductions h = h[{"process": sum, "category": sum, self.variable: sum}] + if self.skip_initial: + h = h[{"step": self.selector_steps}] + # add the histogram if process_inst in hists: hists[process_inst] += h From a3b3e7bfa659f03dba1b280cc5f3015e57fa94e3 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Fri, 17 May 2024 17:56:50 +0200 Subject: [PATCH 095/119] add environment requirement to lepton_mva_producer description --- columnflow/calibration/cmsGhent/lepton_mva.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/columnflow/calibration/cmsGhent/lepton_mva.py b/columnflow/calibration/cmsGhent/lepton_mva.py index 90e38647f..05e41924d 100644 --- a/columnflow/calibration/cmsGhent/lepton_mva.py +++ b/columnflow/calibration/cmsGhent/lepton_mva.py @@ -101,6 +101,14 @@ def lepton_mva_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Arra "Electron": f"YOURDIRECTORY/weights/el_TOPUL18_XGB.weights.bin", }, }) + + Requires adding the environment venv_lepton_mva which included xgboost to the analysis or config. E.g. + + analysis_inst.x.bash_sandboxes = [ + "$CF_BASE/sandboxes/cf.sh", + "$CF_BASE/sandboxes/venv_lepton_mva.sh", + ] + """ events = self[lepton_mva_inputs_producer](events) for lepton in ["Muon", "Electron"]: From 4c943f6485197e4e563e937209a5540bb12e622a Mon Sep 17 00:00:00 2001 From: Maarten De Coen <52047931+maadcoen@users.noreply.github.com> Date: Tue, 21 May 2024 10:45:06 +0200 Subject: [PATCH 096/119] Update gen_features.py remove implementation from Jules --- .../production/cmsGhent/gen_features.py | 100 ------------------ 1 file changed, 100 deletions(-) diff --git a/columnflow/production/cmsGhent/gen_features.py b/columnflow/production/cmsGhent/gen_features.py index 7b2a4a296..b89b660f4 100644 --- a/columnflow/production/cmsGhent/gen_features.py +++ b/columnflow/production/cmsGhent/gen_features.py @@ -115,103 +115,3 @@ def lepton_gen_features( events = set_ak_column(events, f"{name}.isChargeFlip", ak.fill_none(is_chargeflip, False, axis=-1)) return events - - -@producer( - uses=four_vec( - ("Electron", "Muon"), - ("pdgId", "genPartIdx")) | - four_vec( - ("GenPart"), - ("pdgId", "status", "statusFlags") - ), - produces=four_vec( - {"Electron", "Muon"}, - {"isPromptJules", "matchPdgIdJules", "isChargeFlipJules"} - ), - mc_only=True, - exposed=False, -) -def lepton_gen_features_jules( - self: Producer, - events: ak.Array, - **kwargs, -) -> ak.Array: - - electron = (events.Electron) - muon = (events.Muon) - genpart = (events.GenPart) - - statusmap = ({ - 
"isPrompt": 0, - "isDecayedLeptonHadron": 1, - "isTauDecayProduct": 2, - "isPromptTauDecayProduct": 3, - "isDirectTauDecayProduct": 4, - "isDirectPromptTauDecayProduct": 5, - "isDirectHadronDecayProduct": 6, - "isHardProcess": 7, - "fromHardProcess": 8, - "isHardProcessTauDecayProduct": 9, - "isDirectHardProcessTauDecayProduct": 10, - "fromHardProcessBeforeFSR": 11, - "isFirstCopy": 12, - "isLastCopy": 13, - "isLastCopyBeforeFSR": 14, - }) - - def has_statusFlag(gen, statusFlag): - return (gen.statusFlags & (1 << statusFlag) != 0) - - for name, lepton, in (("Electron", electron), ("Muon", muon)): - # leptons in [x,y,:] are identical, genparts i, [x,y,:] are all genparts (needed to remove genparts that do not have the same pdgId) - _lepton, _genpart = ak.unzip(ak.cartesian([lepton, genpart], axis=1, nested=True)) - - # mask to match lepton pdgId with gen particle before looking at nearest generator particle - pdgId_mask = (abs(_lepton.pdgId) == abs(_genpart.pdgId)) - - status_mask = ak.where(abs(_genpart.pdgId) == 15, (_genpart.status == 2) & - (has_statusFlag(_genpart, 13)), _genpart.status == 1) - - # reduced gen particle list of possible matching candidates - _genpart_allowphoton = _genpart[(pdgId_mask | (abs(_genpart.pdgId) == 22)) & status_mask] - _genpart = _genpart[pdgId_mask & status_mask] - - dr = ak.min(lepton.delta_r(_genpart), axis=-1) - dr_allowphoton = ak.min(lepton.delta_r(_genpart_allowphoton), axis=-1) - - # take closest gen particle as match (with pdgId and status mask aplied on the gen particle) - custom_match = ak.flatten(_genpart[ak.argmin(lepton.delta_r(_genpart), axis=-1, keepdims=True)], axis=2) - custom_allowphoton = ak.flatten(_genpart_allowphoton[ak.argmin( - lepton.delta_r(_genpart_allowphoton), axis=-1, keepdims=True)], axis=2) - - # if delta r > 0.2, check for match with gen photons - cond = ak.fill_none((dr > 0.2), True) - custom_match = ak.where(cond, custom_allowphoton, custom_match) - - # First check if lepton ahs a designated gen 
particle, if not use custom match - cond = ak.fill_none((lepton.genPartIdx >= 0) & ( - genpart[lepton.genPartIdx].pdgId == lepton.pdgId), True) - gen_match = ak.where(cond, genpart[lepton.genPartIdx], custom_match) - cond = ak.fill_none((lepton.genPartIdx >= 0) & ( - genpart[lepton.genPartIdx].pdgId == lepton.pdgId), True) - valid_match = ak.where(cond, True, dr_allowphoton < 0.2) - - # if delta r still > 0.2, there is no valid custom match! - match_isPrompt = ( - (has_statusFlag(gen_match, statusmap["isPrompt"])) | - (has_statusFlag(gen_match, statusmap["isDirectPromptTauDecayProduct"])) | - (has_statusFlag(gen_match, statusmap["isHardProcess"])) | - (has_statusFlag(gen_match, statusmap["fromHardProcess"])) | - (has_statusFlag(gen_match, statusmap["fromHardProcessBeforeFSR"])) - ) & (valid_match) - - matchPdgId = (gen_match.pdgId == lepton.pdgId) & valid_match - - is_chargeflip = (gen_match.pdgId == -lepton.pdgId) & valid_match - - events = set_ak_column(events, f"{name}.isPromptJules", ak.fill_none(match_isPrompt, 0, axis=-1)) - events = set_ak_column(events, f"{name}.matchPdgIdJules", ak.fill_none(matchPdgId, 0, axis=-1)) - events = set_ak_column(events, f"{name}.isChargeFlipJules", ak.fill_none(is_chargeflip, 0, axis=-1)) - - return events From 921ef00951355db14824fa3a060cee87ade34307 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 21 May 2024 12:07:54 +0200 Subject: [PATCH 097/119] check whether has fields as attributes, not in fields (because might be implemented as method) --- columnflow/columnar_util_Ghent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/columnflow/columnar_util_Ghent.py b/columnflow/columnar_util_Ghent.py index 521c23fd0..e5c1bef45 100644 --- a/columnflow/columnar_util_Ghent.py +++ b/columnflow/columnar_util_Ghent.py @@ -21,8 +21,8 @@ def TetraVec(arr: ak.Array) -> ak.Array: create a Lorentz for fector from an awkward array with pt, eta, phi, and mass fields """ for field in ["pt", "eta", "phi", "mass"]: - assert 
field in arr.fields, f"Provided array is missing {field} field" + assert hasattr(arr, field), f"Provided array is missing {field} field" TetraVec = ak.zip({"pt": arr.pt, "eta": arr.eta, "phi": arr.phi, "mass": arr.mass}, with_name="PtEtaPhiMLorentzVector", behavior=coffea.nanoevents.methods.vector.behavior) - return TetraVec \ No newline at end of file + return TetraVec From c923119cc8062850f79f06cbf0e0fc0ab698a3ab Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 21 May 2024 15:27:23 +0200 Subject: [PATCH 098/119] don't sort in case selector steps are None --- columnflow/tasks/framework/mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py index dcfa0b732..ec53fd096 100644 --- a/columnflow/tasks/framework/mixins.py +++ b/columnflow/tasks/framework/mixins.py @@ -668,7 +668,7 @@ def store_parts(self) -> law.util.InsertableDict: parts = super().store_parts() steps = self.selector_steps - if not self.selector_steps_order_sensitive: + if not self.selector_steps_order_sensitive and steps is not None: steps = sorted(steps) if steps is not None: parts["selector"] += ("__steps_" + "_".join(steps) if steps else "__inclusive") From 50bf63c2ae46e3283d44994d4e20f4eab9a3d3fd Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 21 May 2024 15:27:43 +0200 Subject: [PATCH 099/119] point to correct location of inherited file --- sandboxes/venv_lepton_mva.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sandboxes/venv_lepton_mva.txt b/sandboxes/venv_lepton_mva.txt index ac791087d..44f9cfb16 100644 --- a/sandboxes/venv_lepton_mva.txt +++ b/sandboxes/venv_lepton_mva.txt @@ -1,5 +1,5 @@ # version 1 --r ../modules/columnflow/sandboxes/columnar.txt +-r columnar.txt xgboost==2.0.2 From 4891c5f3a16232cbb47864d43cb45864457281a9 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 21 May 2024 15:41:00 +0200 Subject: [PATCH 100/119] typo removed --- 
columnflow/selection/cmsGhent/lepton_mva_cuts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/columnflow/selection/cmsGhent/lepton_mva_cuts.py b/columnflow/selection/cmsGhent/lepton_mva_cuts.py index 072ccb220..f85ca42a7 100644 --- a/columnflow/selection/cmsGhent/lepton_mva_cuts.py +++ b/columnflow/selection/cmsGhent/lepton_mva_cuts.py @@ -83,6 +83,6 @@ def lepton_mva_object( objects={ lep: {lep: masked_sorted_indices(events[lep][working_point[lep]], events[lep].pt)} - for lep in ["Muon", "Electrion"] + for lep in ["Muon", "Electron"] }, ) From c40925e3d177a2b4d750a3dd448c59ce4045f6b0 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Thu, 2 May 2024 10:56:21 +0200 Subject: [PATCH 101/119] add producers parameter to ProduceColumnsWrapper --- columnflow/tasks/production.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/columnflow/tasks/production.py b/columnflow/tasks/production.py index 1bc430bee..a53e055bb 100644 --- a/columnflow/tasks/production.py +++ b/columnflow/tasks/production.py @@ -3,12 +3,13 @@ """ Tasks related to producing new columns. 
""" +import itertools import law from columnflow.tasks.framework.base import Requirements, AnalysisTask, wrapper_factory from columnflow.tasks.framework.mixins import ( - CalibratorsMixin, SelectorStepsMixin, ProducerMixin, ChunkedIOMixin, + CalibratorsMixin, SelectorStepsMixin, ProducerMixin, ChunkedIOMixin, ProducersMixin, ) from columnflow.tasks.framework.remote import RemoteWorkflow from columnflow.tasks.reduction import MergeReducedEventsUser, MergeReducedEvents @@ -165,8 +166,25 @@ def run(self): ) -ProduceColumnsWrapper = wrapper_factory( +ProduceColumnsWrapperBase = wrapper_factory( base_cls=AnalysisTask, require_cls=ProduceColumns, enable=["configs", "skip_configs", "datasets", "skip_datasets", "shifts", "skip_shifts"], ) +ProduceColumnsWrapperBase.exclude_index = True + + +class ProduceColumnsWrapper( + ProduceColumnsWrapperBase, + ProducersMixin, + +): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # add the producers parameter + self.wrapper_fields.extend(["producer"]) + + combined_parameters = itertools.product(self.wrapper_parameters, self.producers) + combined_parameters = [params_tuple + (producer,) for params_tuple, producer in combined_parameters] + self.wrapper_parameters = combined_parameters From 9490820b6a613436c6c973b631c6b1c63b51075d Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Mon, 13 May 2024 17:44:22 +0200 Subject: [PATCH 102/119] customize muon_weights output column names --- columnflow/production/cms/muon.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/columnflow/production/cms/muon.py b/columnflow/production/cms/muon.py index 79436a5b6..3b96dcf54 100644 --- a/columnflow/production/cms/muon.py +++ b/columnflow/production/cms/muon.py @@ -18,15 +18,14 @@ uses={ "Muon.pt", "Muon.eta", }, - produces={ - "muon_weight", "muon_weight_up", "muon_weight_down", - }, + # produces in the init # only run on mc mc_only=True, # function to determine the correction file 
get_muon_file=(lambda self, external_files: external_files.muon_sf), # function to determine the muon weight config get_muon_config=(lambda self: self.config_inst.x.muon_sf_names), + weight_name=(lambda self: "muon_weight"), ) def muon_weights( self: Producer, @@ -83,7 +82,6 @@ def muon_weights( "ValType": syst, # syst key in 2017 } inputs = [variable_map_syst[inp.name] for inp in self.muon_sf_corrector.inputs] - sf_flat = self.muon_sf_corrector(*inputs) # add the correct layout to it @@ -93,7 +91,7 @@ def muon_weights( weight = ak.prod(sf, axis=1, mask_identity=False) # store it - events = set_ak_column(events, f"muon_weight{postfix}", weight, value_type=np.float32) + events = set_ak_column(events, f"{self.weight_name()}{postfix}", weight, value_type=np.float32) return events @@ -128,3 +126,9 @@ def muon_weights_setup( # check versions if self.muon_sf_corrector.version not in (1,): raise Exception(f"unsuppprted muon sf corrector version {self.muon_sf_corrector.version}") + + +@muon_weights.init +def muon_weights_init(self: Producer, **kwargs) -> None: + weight_name = self.weight_name() + self.produces |= {weight_name, f"{weight_name}_up", f"{weight_name}_down"} From 6b3d76f3d88673636305d2522ceb218daeaacdae Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Tue, 14 May 2024 10:13:22 +0200 Subject: [PATCH 103/119] allow customizing supported versions --- columnflow/production/cms/muon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/columnflow/production/cms/muon.py b/columnflow/production/cms/muon.py index 3b96dcf54..69e3d85b1 100644 --- a/columnflow/production/cms/muon.py +++ b/columnflow/production/cms/muon.py @@ -26,6 +26,7 @@ # function to determine the muon weight config get_muon_config=(lambda self: self.config_inst.x.muon_sf_names), weight_name=(lambda self: "muon_weight"), + supported_versions=(1, 2), ) def muon_weights( self: Producer, @@ -124,7 +125,7 @@ def muon_weights_setup( self.muon_sf_corrector = correction_set[corrector_name] # 
check versions - if self.muon_sf_corrector.version not in (1,): + if self.supported_versions and self.muon_sf_corrector.version not in self.supported_versions: raise Exception(f"unsuppprted muon sf corrector version {self.muon_sf_corrector.version}") From b3c7cdd789227d0d20c996c5a8627ab72474d156 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 17 May 2024 08:30:08 +0200 Subject: [PATCH 104/119] change muon weight_name attribute to simple string --- columnflow/production/cms/muon.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/columnflow/production/cms/muon.py b/columnflow/production/cms/muon.py index 69e3d85b1..c702b4339 100644 --- a/columnflow/production/cms/muon.py +++ b/columnflow/production/cms/muon.py @@ -25,7 +25,7 @@ get_muon_file=(lambda self, external_files: external_files.muon_sf), # function to determine the muon weight config get_muon_config=(lambda self: self.config_inst.x.muon_sf_names), - weight_name=(lambda self: "muon_weight"), + weight_name="muon_weight", supported_versions=(1, 2), ) def muon_weights( @@ -92,7 +92,7 @@ def muon_weights( weight = ak.prod(sf, axis=1, mask_identity=False) # store it - events = set_ak_column(events, f"{self.weight_name()}{postfix}", weight, value_type=np.float32) + events = set_ak_column(events, f"{self.weight_name}{postfix}", weight, value_type=np.float32) return events @@ -131,5 +131,5 @@ def muon_weights_setup( @muon_weights.init def muon_weights_init(self: Producer, **kwargs) -> None: - weight_name = self.weight_name() + weight_name = self.weight_name self.produces |= {weight_name, f"{weight_name}_up", f"{weight_name}_down"} From 32cf8bfc4e07d00d32f338a63a8901b29e342796 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 19 Apr 2024 12:01:28 +0200 Subject: [PATCH 105/119] add parsing of tuples for Settings and MultiSettings --- columnflow/tasks/framework/parameters.py | 11 ++++++++++- tests/test_task_parameters.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 1 
deletion(-) diff --git a/columnflow/tasks/framework/parameters.py b/columnflow/tasks/framework/parameters.py index 405b786a8..bb3d8833d 100644 --- a/columnflow/tasks/framework/parameters.py +++ b/columnflow/tasks/framework/parameters.py @@ -31,13 +31,22 @@ class SettingsParameter(law.CSVParameter): def parse_setting(cls, setting: str) -> tuple[str, float | bool | str]: pair = setting.split("=", 1) key, value = pair if len(pair) == 2 else (pair[0], "True") + if ";" in value: + # split by semicolon and parse each value + value = tuple(cls.parse_value(v) for v in value.split(";")) + else: + value = cls.parse_value(value) + return (key, value) + + @classmethod + def parse_value(cls, value): if try_float(value): value = float(value) elif value.lower() == "true": value = True elif value.lower() == "false": value = False - return (key, value) + return value @classmethod def serialize_setting(cls, name: str, value: str) -> str: diff --git a/tests/test_task_parameters.py b/tests/test_task_parameters.py index b4f444cbf..b5f17f05b 100644 --- a/tests/test_task_parameters.py +++ b/tests/test_task_parameters.py @@ -23,6 +23,11 @@ def test_settings_parameter(self): p.parse("param1=10,param2,param3=text,param4=false"), {"param1": 10.0, "param2": True, "param3": "text", "param4": False}, ) + self.assertEqual( + # parsing of semicolon separated values + p.parse("param1=1;2;3;4,param2=a;b;true;false"), + {"param1": (1, 2, 3, 4), "param2": ("a", "b", True, False)}, + ) self.assertEqual( # if a parameter is set multiple times, prioritize last one p.parse("A=1,B,A=2"), @@ -46,6 +51,14 @@ def test_multi_settings_parameter(self): p.parse("obj1,k1=10,k2,k3=text:obj2,k4=false"), {"obj1": {"k1": 10.0, "k2": True, "k3": "text"}, "obj2": {"k4": False}}, ) + self.assertEqual( + # parsing of semicolon separated values + p.parse("obj1,k1=1;2;3;4,k2=a;b;true;false:obj2,k3=5;6;x;y"), + { + "obj1": {"k1": (1, 2, 3, 4), "k2": ("a", "b", True, False)}, + "obj2": {"k3": (5, 6, "x", "y")}, + }, + ) 
self.assertEqual( # providing the same key twice results in once combined dict p.parse("tt,A=2:st,A=2:tt,B=True"), From 6ca46f60b7a4e3cebc1a8431f419a0028aba7ef4 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 19 Apr 2024 12:13:29 +0200 Subject: [PATCH 106/119] allow parsing complex numbers --- columnflow/tasks/framework/parameters.py | 4 +++- columnflow/util.py | 13 ++++++++++++- tests/test_task_parameters.py | 8 ++++---- tests/test_util.py | 9 ++++++++- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/columnflow/tasks/framework/parameters.py b/columnflow/tasks/framework/parameters.py index bb3d8833d..3c4501e4e 100644 --- a/columnflow/tasks/framework/parameters.py +++ b/columnflow/tasks/framework/parameters.py @@ -8,7 +8,7 @@ import law -from columnflow.util import try_float, DotDict +from columnflow.util import try_float, try_complex, DotDict class SettingsParameter(law.CSVParameter): @@ -42,6 +42,8 @@ def parse_setting(cls, setting: str) -> tuple[str, float | bool | str]: def parse_value(cls, value): if try_float(value): value = float(value) + elif try_complex(value): + value = complex(value) elif value.lower() == "true": value = True elif value.lower() == "false": diff --git a/columnflow/util.py b/columnflow/util.py index 843e22cb9..48b6badec 100644 --- a/columnflow/util.py +++ b/columnflow/util.py @@ -10,7 +10,7 @@ "UNSET", "maybe_import", "import_plt", "import_ROOT", "import_file", "create_random_name", "expand_path", "real_path", "ensure_dir", "wget", "call_thread", "call_proc", "ensure_proxy", "dev_sandbox", - "safe_div", "try_float", "try_int", "is_pattern", "is_regex", "pattern_matcher", + "safe_div", "try_float", "try_complex", "try_int", "is_pattern", "is_regex", "pattern_matcher", "dict_add_strict", "get_source_code", "DotDict", "MockModule", "FunctionArgs", "ClassPropertyDescriptor", "classproperty", "DerivableMeta", "Derivable", @@ -413,6 +413,17 @@ def try_float(f: Any) -> bool: return False +def try_complex(f: Any) -> bool: + 
""" + Tests whether a value *f* can be converted to a complex number. + """ + try: + complex(f) + return True + except (ValueError, TypeError): + return False + + def try_int(i: Any) -> bool: """ Tests whether a value *i* can be converted to an integer. diff --git a/tests/test_task_parameters.py b/tests/test_task_parameters.py index b5f17f05b..6bca67460 100644 --- a/tests/test_task_parameters.py +++ b/tests/test_task_parameters.py @@ -25,8 +25,8 @@ def test_settings_parameter(self): ) self.assertEqual( # parsing of semicolon separated values - p.parse("param1=1;2;3;4,param2=a;b;true;false"), - {"param1": (1, 2, 3, 4), "param2": ("a", "b", True, False)}, + p.parse("param1=1;2;3j;4j,param2=a;b;true;false"), + {"param1": (1, 2, 3j, 4j), "param2": ("a", "b", True, False)}, ) self.assertEqual( # if a parameter is set multiple times, prioritize last one @@ -53,9 +53,9 @@ def test_multi_settings_parameter(self): ) self.assertEqual( # parsing of semicolon separated values - p.parse("obj1,k1=1;2;3;4,k2=a;b;true;false:obj2,k3=5;6;x;y"), + p.parse("obj1,k1=1;2;3j;4j,k2=a;b;true;false:obj2,k3=5;6;x;y"), { - "obj1": {"k1": (1, 2, 3, 4), "k2": ("a", "b", True, False)}, + "obj1": {"k1": (1, 2, 3j, 4j), "k2": ("a", "b", True, False)}, "obj2": {"k3": (5, 6, "x", "y")}, }, ) diff --git a/tests/test_util.py b/tests/test_util.py index 82373f9e1..30769c453 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -7,7 +7,7 @@ from columnflow.util import ( create_random_name, maybe_import, MockModule, DotDict, Derivable, - safe_div, try_float, try_int, is_regex, is_pattern, pattern_matcher, + safe_div, try_float, try_int, try_complex, is_regex, is_pattern, pattern_matcher, ) @@ -44,6 +44,13 @@ def test_try_int_try_float(self): self.assertFalse(try_number(1j)) self.assertFalse(try_number([1, 2])) + def test_try_complex(self): + self.assertTrue(try_complex("1.2+2.5j")) + self.assertFalse(try_complex("some_string")) + self.assertFalse(try_complex([1, 2])) + # real numbers are also 
complex number + self.assertTrue(try_complex("5.0")) + def test_is_regex(self): self.assertTrue(is_regex(r"^foo\d+.*$")) self.assertFalse(is_regex(r"^no$atEnd")) From f6590844d56f7978cb0a3b52ba37a0b8100db56c Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 19 Apr 2024 15:18:43 +0200 Subject: [PATCH 107/119] fix serializing of SettingsParameter --- columnflow/tasks/framework/parameters.py | 7 ++++--- tests/test_task_parameters.py | 9 +++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/columnflow/tasks/framework/parameters.py b/columnflow/tasks/framework/parameters.py index 3c4501e4e..6b9844c76 100644 --- a/columnflow/tasks/framework/parameters.py +++ b/columnflow/tasks/framework/parameters.py @@ -9,6 +9,7 @@ import law from columnflow.util import try_float, try_complex, DotDict +from columnflow.types import Iterable class SettingsParameter(law.CSVParameter): @@ -32,7 +33,7 @@ def parse_setting(cls, setting: str) -> tuple[str, float | bool | str]: pair = setting.split("=", 1) key, value = pair if len(pair) == 2 else (pair[0], "True") if ";" in value: - # split by semicolon and parse each value + # split by ";" and parse each value value = tuple(cls.parse_value(v) for v in value.split(";")) else: value = cls.parse_value(value) @@ -51,7 +52,8 @@ def parse_value(cls, value): return value @classmethod - def serialize_setting(cls, name: str, value: str) -> str: + def serialize_setting(cls, name: str, value: str | Iterable[str]) -> str: + value = ";".join(str(v) for v in law.util.make_tuple(value)) return f"{name}={value}" def __init__(self, **kwargs): @@ -114,7 +116,6 @@ def parse(self, inp): ) # next, merge dicts outputs = law.util.merge_dicts(*outputs, deep=True) - return outputs def serialize(self, value): diff --git a/tests/test_task_parameters.py b/tests/test_task_parameters.py index 6bca67460..51f0f5deb 100644 --- a/tests/test_task_parameters.py +++ b/tests/test_task_parameters.py @@ -24,7 +24,7 @@ def test_settings_parameter(self): 
{"param1": 10.0, "param2": True, "param3": "text", "param4": False}, ) self.assertEqual( - # parsing of semicolon separated values + # parsing of lists of values, separated via ";" p.parse("param1=1;2;3j;4j,param2=a;b;true;false"), {"param1": (1, 2, 3j, 4j), "param2": ("a", "b", True, False)}, ) @@ -39,6 +39,11 @@ def test_settings_parameter(self): p.serialize({"param1": 2, "param2": False}), "param1=2,param2=False", ) + print(p.serialize({"param1": [1, 2j, "A", True, False]})) + self.assertEqual( + p.serialize({"param1": [1, 2j, "A", True, False]}), + "param1=1;2j;A;True;False", + ) def test_multi_settings_parameter(self): p = MultiSettingsParameter() @@ -52,7 +57,7 @@ def test_multi_settings_parameter(self): {"obj1": {"k1": 10.0, "k2": True, "k3": "text"}, "obj2": {"k4": False}}, ) self.assertEqual( - # parsing of semicolon separated values + # parsing of lists of values, separated via ";" p.parse("obj1,k1=1;2;3j;4j,k2=a;b;true;false:obj2,k3=5;6;x;y"), { "obj1": {"k1": (1, 2, 3j, 4j), "k2": ("a", "b", True, False)}, From a5c364014a3b8d5f5d89dbd8366698050eb43726 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 19 Apr 2024 15:29:41 +0200 Subject: [PATCH 108/119] add slicing of histograms --- columnflow/plotting/plot_util.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/columnflow/plotting/plot_util.py b/columnflow/plotting/plot_util.py index c55012d6d..d7d8932fb 100644 --- a/columnflow/plotting/plot_util.py +++ b/columnflow/plotting/plot_util.py @@ -14,7 +14,7 @@ import order as od -from columnflow.util import maybe_import, try_int +from columnflow.util import maybe_import, try_int, try_complex math = maybe_import("math") hist = maybe_import("hist") @@ -122,6 +122,17 @@ def apply_variable_settings( h = h[{var_inst.name: hist.rebin(rebin_factor)}] hists[proc_inst] = h + slices = getattr(var_inst, "slice", None) or var_inst.x("slice", None) + if ( + slices and isinstance(slices, Iterable) and len(slices) >= 2 and + 
try_complex(slices[0]) and try_complex(slices[1]) + ): + slice_0 = int(slices[0]) if try_int(slices[0]) else complex(slices[0]) + slice_1 = int(slices[1]) if try_int(slices[1]) else complex(slices[1]) + for proc_inst, h in list(hists.items()): + h = h[{var_inst.name: slice(slice_0, slice_1)}] + hists[proc_inst] = h + return hists From 0324c2adae60cca8595935ddacabf8050e021cc1 Mon Sep 17 00:00:00 2001 From: Mathis Frahm Date: Fri, 17 May 2024 08:58:39 +0200 Subject: [PATCH 109/119] add class attribute for SettingsParameter delimiters --- columnflow/tasks/framework/parameters.py | 6 ++++-- tests/test_task_parameters.py | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/columnflow/tasks/framework/parameters.py b/columnflow/tasks/framework/parameters.py index 6b9844c76..2fedcd21e 100644 --- a/columnflow/tasks/framework/parameters.py +++ b/columnflow/tasks/framework/parameters.py @@ -27,14 +27,16 @@ class SettingsParameter(law.CSVParameter): p.serialize({"param1": 2, "param2": False}) => "param1=2,param2=False" """ + settings_delimiter = "=" + tuple_delimiter = ";" @classmethod def parse_setting(cls, setting: str) -> tuple[str, float | bool | str]: - pair = setting.split("=", 1) + pair = setting.split(cls.settings_delimiter, 1) key, value = pair if len(pair) == 2 else (pair[0], "True") if ";" in value: # split by ";" and parse each value - value = tuple(cls.parse_value(v) for v in value.split(";")) + value = tuple(cls.parse_value(v) for v in value.split(cls.tuple_delimiter)) else: value = cls.parse_value(value) return (key, value) diff --git a/tests/test_task_parameters.py b/tests/test_task_parameters.py index 51f0f5deb..e06040800 100644 --- a/tests/test_task_parameters.py +++ b/tests/test_task_parameters.py @@ -14,6 +14,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def test_settings_parameter(self): + # check that the default delimiters have not been changed + self.assertEqual(SettingsParameter.settings_delimiter, 
"=") + self.assertEqual(SettingsParameter.tuple_delimiter, ";") + + # initialize a SettingParameter p = SettingsParameter() # parsing @@ -46,6 +51,7 @@ def test_settings_parameter(self): ) def test_multi_settings_parameter(self): + # initialize a MultiSettingsParameter p = MultiSettingsParameter() # parsing From 18077ac94a3fcdb7f1371d949f9f6b927e01884d Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 23 May 2024 14:31:22 +0200 Subject: [PATCH 110/119] fix in selector_steps default handling --- columnflow/tasks/cutflow.py | 2 +- columnflow/tasks/framework/mixins.py | 15 +++++++-------- columnflow/tasks/reduction.py | 4 ++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/columnflow/tasks/cutflow.py b/columnflow/tasks/cutflow.py index 141490b84..5c16dbee7 100644 --- a/columnflow/tasks/cutflow.py +++ b/columnflow/tasks/cutflow.py @@ -221,7 +221,7 @@ def prepare_hists(steps): events = self.norm_weight_producer(events) # overwrite steps if not defined yet - if steps is None: + if steps == self.selector_steps_default: steps = sel.steps.fields # prepare histograms and exprepssions once diff --git a/columnflow/tasks/framework/mixins.py b/columnflow/tasks/framework/mixins.py index ec53fd096..b3604dee8 100644 --- a/columnflow/tasks/framework/mixins.py +++ b/columnflow/tasks/framework/mixins.py @@ -586,8 +586,10 @@ class SelectorStepsMixin(SelectorMixin): parameter for this task. 
""" + selector_steps_default = ('_DEFAULT',) + selector_steps = law.CSVParameter( - default=None, + default=selector_steps_default, description="a subset of steps of the selector to apply; uses all steps when None; " "None default", brace_expand=True, @@ -628,12 +630,9 @@ def resolve_param_values(cls, params: dict[str, Any]) -> dict[str, Any]: ) # sort selector steps when the order does not matter - if not cls.selector_steps_order_sensitive and "selector_steps" in params: + if "selector_steps" in params and not cls.selector_steps_order_sensitive: params["selector_steps"] = tuple(sorted(params["selector_steps"])) - if "selector_steps" in params and params["selector_steps"] == (None,): - params["selector_steps"] = None - return params @classmethod @@ -668,9 +667,9 @@ def store_parts(self) -> law.util.InsertableDict: parts = super().store_parts() steps = self.selector_steps - if not self.selector_steps_order_sensitive and steps is not None: - steps = sorted(steps) - if steps is not None: + if steps != self.selector_steps_default: + if not self.selector_steps_order_sensitive: + steps = sorted(steps) parts["selector"] += ("__steps_" + "_".join(steps) if steps else "__inclusive") return parts diff --git a/columnflow/tasks/reduction.py b/columnflow/tasks/reduction.py index 87cb61559..813f79936 100644 --- a/columnflow/tasks/reduction.py +++ b/columnflow/tasks/reduction.py @@ -122,7 +122,7 @@ def run(self): # define columns to read for the differently structured selection masks read_sel_columns = set() # open either selector steps of the full event selection mask - read_sel_columns.add(Route("steps.*" if self.selector_steps else "event")) + read_sel_columns.add(Route("steps.*" if self.selector_steps and self.selector_steps != self.selector_steps_default else "event")) # add object masks, depending on the columns to write # (as object masks are dynamic and deeply nested, preload the meta info to access fields) sel_results = 
inputs["selection"]["results"].load(formatter="dask_awkward") @@ -183,7 +183,7 @@ def run(self): ) # build the event mask - if self.selector_steps: + if self.selector_steps and self.selector_steps != self.selector_steps_default: # check if all steps are present missing_steps = set(self.selector_steps) - set(sel.steps.fields) if missing_steps: From 5684b8e7281d155b3b5d4c53455196575b008342 Mon Sep 17 00:00:00 2001 From: maadcoen Date: Thu, 23 May 2024 15:26:01 +0200 Subject: [PATCH 111/119] added recursive concatentation in case of large amount of arrays. Awkward seems to breakdown when number of concatenated arrays exceeds 2 ** 7 --- columnflow/columnar_util_Ghent.py | 11 ++++++++++- columnflow/production/categories.py | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/columnflow/columnar_util_Ghent.py b/columnflow/columnar_util_Ghent.py index e5c1bef45..54ccbfe2b 100644 --- a/columnflow/columnar_util_Ghent.py +++ b/columnflow/columnar_util_Ghent.py @@ -7,7 +7,7 @@ from __future__ import annotations __all__ = [ - "TetraVec" + "TetraVec", "safe_concatenate", ] from columnflow.util import maybe_import @@ -26,3 +26,12 @@ def TetraVec(arr: ak.Array) -> ak.Array: with_name="PtEtaPhiMLorentzVector", behavior=coffea.nanoevents.methods.vector.behavior) return TetraVec + + +def safe_concatenate(arrays, *args, **kwargs): + n = len(arrays) + if n > 2 ** 7: + c1 = safe_concatenate(arrays[:n // 2], *args, **kwargs) + c2 = safe_concatenate(arrays[n // 2:], *args, **kwargs) + return ak.concatenate([c1, c2], *args, **kwargs) + return ak.concatenate(arrays, *args, **kwargs) diff --git a/columnflow/production/categories.py b/columnflow/production/categories.py index 415b5dbd7..afd71a448 100644 --- a/columnflow/production/categories.py +++ b/columnflow/production/categories.py @@ -14,6 +14,7 @@ from columnflow.production import Producer, producer from columnflow.util import maybe_import from columnflow.columnar_util import set_ak_column +from 
columnflow.columnar_util_Ghent import safe_concatenate np = maybe_import("numpy") ak = maybe_import("awkward") @@ -48,7 +49,7 @@ def category_ids( category_ids.append(ak.singletons(ak.nan_to_none(ids))) # combine - category_ids = ak.concatenate(category_ids, axis=1) + category_ids = safe_concatenate(category_ids, axis=1) # save, optionally on a target events array if target_events is None: From acf3e31323a7ad5b41d45b9632e4d6af842a8c2d Mon Sep 17 00:00:00 2001 From: maadcoen Date: Tue, 28 May 2024 12:01:25 +0200 Subject: [PATCH 112/119] deal with case where no leptons are present --- columnflow/calibration/cmsGhent/lepton_mva.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/columnflow/calibration/cmsGhent/lepton_mva.py b/columnflow/calibration/cmsGhent/lepton_mva.py index 05e41924d..98c325afe 100644 --- a/columnflow/calibration/cmsGhent/lepton_mva.py +++ b/columnflow/calibration/cmsGhent/lepton_mva.py @@ -120,11 +120,15 @@ def lepton_mva_producer(self: Calibrator, events: ak.Array, **kwargs) -> ak.Arra features = np.transpose(np.array(ak.flatten(features, axis=2))) # make c-contiguous (rows are stored as contiguous blocks of memory.) 
features = np.ascontiguousarray(features) - # call xgboost predictor - scores = self.mva[lepton].inplace_predict(features) - # unflatten into an awkward array - scores = ak.unflatten(scores, counts) - # set the scores as an additional field for muons + + if np.any(features): + # call xgboost predictor + scores = self.mva[lepton].inplace_predict(features) + # unflatten into an awkward array + scores = ak.unflatten(scores, counts) + # set the scores as an additional field for muons + else: + scores = ak.zeros_like(events[lepton][lepton_mva_inputs[lepton][0]], dtype=np.float32) events = set_ak_column(events, f"{lepton}.mvaTOP", scores) return events From daa7f4f5d4e2bf0f8d019781d82585fc67bcd3ef Mon Sep 17 00:00:00 2001 From: David Kavtaradze Date: Thu, 30 May 2024 15:42:37 +0200 Subject: [PATCH 113/119] update to custom_lfn function --- columnflow/tasks/external.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/columnflow/tasks/external.py b/columnflow/tasks/external.py index a6a875400..6d2997f21 100644 --- a/columnflow/tasks/external.py +++ b/columnflow/tasks/external.py @@ -10,7 +10,7 @@ import time import shutil import subprocess - +import glob import luigi import law import order as od @@ -103,7 +103,8 @@ def run(self): lfns = [] for key in sorted(self.dataset_info_inst.keys): self.logger.info(f"get lfns for dataset key {key} {msg}") - lfns.extend(get_dataset_lfns(self.dataset_inst, self.global_shift_inst, key)) + # lfns.extend(get_dataset_lfns(self.dataset_inst, self.global_shift_inst, key)) + lfns.extend(get_dataset_lfns(self,key)) if self.validate and len(lfns) != self.dataset_info_inst.n_files: raise ValueError( @@ -117,6 +118,21 @@ def run(self): tmp.dump(lfns, indent=4, formatter="json") self.transfer(tmp) + def custom_get_dataset_lfns( + self, + # dataset_inst: od.Dataset, + # shift_inst: od.Shift, + dataset_key: str, + ) -> list[str]: + """ + Function to get the LFN information for custom datasets + The path of 
custom files have to be given in law.cfg file as [custom_pnfs_fs] + """ + base = law.config.get_expanded('custom_pnfs_fs', "base") + out = glob.glob(f"{base}{dataset_key}/*/*/*.root") + return out + + def get_dataset_lfns_dasgoclient( self, dataset_inst: od.Dataset, From a897f63fab14024dc88cb51a8a509cca42b42157 Mon Sep 17 00:00:00 2001 From: David Kavtaradze Date: Thu, 30 May 2024 18:24:22 +0200 Subject: [PATCH 114/119] correcting arguments --- columnflow/tasks/external.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/columnflow/tasks/external.py b/columnflow/tasks/external.py index 6d2997f21..bf12dff8d 100644 --- a/columnflow/tasks/external.py +++ b/columnflow/tasks/external.py @@ -103,8 +103,10 @@ def run(self): lfns = [] for key in sorted(self.dataset_info_inst.keys): self.logger.info(f"get lfns for dataset key {key} {msg}") - # lfns.extend(get_dataset_lfns(self.dataset_inst, self.global_shift_inst, key)) - lfns.extend(get_dataset_lfns(self,key)) + if msg=='via dasgoclient': + lfns.extend(get_dataset_lfns(self.dataset_inst, self.global_shift_inst, key)) + else: + lfns.extend(get_dataset_lfns(self,key)) if self.validate and len(lfns) != self.dataset_info_inst.n_files: raise ValueError( @@ -120,8 +122,6 @@ def run(self): def custom_get_dataset_lfns( self, - # dataset_inst: od.Dataset, - # shift_inst: od.Shift, dataset_key: str, ) -> list[str]: """ From 424e152a3e98f65aa58ec9076db72b077103a3d6 Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Tue, 4 Jun 2024 17:53:18 +0200 Subject: [PATCH 115/119] add custom x tick labels --- columnflow/plotting/plot_util.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/columnflow/plotting/plot_util.py b/columnflow/plotting/plot_util.py index d7d8932fb..68c43ea2c 100644 --- a/columnflow/plotting/plot_util.py +++ b/columnflow/plotting/plot_util.py @@ -217,6 +217,12 @@ def prepare_style_config( tx = range(int(xlim[0]), int(xlim[1]+1)) style_config["ax_cfg"]["xticks"] = tx 
style_config["ax_cfg"]["minorxticks"] = [] + + # add custom bin labels if specified and same amount of x ticks + if x_labels := variable_inst.x_labels: + if len(x_labels) == len(tx): + style_config["ax_cfg"]["xticklabels"] = x_labels + if variable_inst.discrete_y: style_config["ax_cfg"]["minoryticks"] = [] From 8493bd382c43f5d941c161b665818ccccc7d613c Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Wed, 14 Aug 2024 11:31:27 +0200 Subject: [PATCH 116/119] classes for easy access to CMS color schemes + custom ones --- columnflow/plotting/cmsGhent/colors.py | 90 ++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 columnflow/plotting/cmsGhent/colors.py diff --git a/columnflow/plotting/cmsGhent/colors.py b/columnflow/plotting/cmsGhent/colors.py new file mode 100644 index 000000000..8f8640674 --- /dev/null +++ b/columnflow/plotting/cmsGhent/colors.py @@ -0,0 +1,90 @@ +class DefaultColors: + def __init__(self): + self.grey = "#94a4a2" + self.grey2 = "#717581" + self.grey3 = "#5D5F66" + self.grey4 = "#44464A" + self.grey5 = "#3D3E3F" + + def pastel(self): + return PastelColors() + + def rainbow(self): + return RainbowColors() + + def cat_six(self): + return CATColorsSix() + + def cat_ten(self): + return CATColorsTen() + + +class CATColorsSix(DefaultColors): + def __init__(self): + super().__init__() + self.red = "#e42536" + self.orange = "#f89c20" + self.blue = "#5790fc" + self.purple = "#7a21dd" + self.violet = "#964a8b" + self.grey = "#9c9ca1" + +class CATColorsTen(DefaultColors): + def __init__(self): + super().__init__() + self.blue = "#3f90da" + self.blue_light = "#92dadd" + self.orange = "#ffa90e" + self.orange_dark = "#e76300" + self.red = "#bd1f01" + self.purple = "#832db6" + self.brown = "#a96b59" + self.ochre = "#b9ac70" + +class RainbowColors(DefaultColors): + def __init__(self): + super().__init__() + self.purple = "#d23be7" + self.blue = "#4355db" + self.blue_light = "#34bbe6" + self.green = "#49da9a" + self.lime = "#a3e048" 
+ self.yellow = "#f7d038" + self.orange = "#eb7532" + self.red = "#e6261f" + self.colors = [ + self.purple, + self.blue, + self.blue_light, + self.green, + self.lime, + self.yellow, + self.orange, + self.red + ] + +class PastelColors(DefaultColors): + def __init__(self): + super().__init__() + self.yellow = "#ffa600" + self.orange = "#ff843c" + self.orange_dark = "#ff6562" + self.red = "#fd5385" + self.violet_light = "#da52a2" + self.violet = "#ab59b5" + self.purple = "#6f5fba" + self.blue = "#1761b0" + self.colors = [ + self.yellow, + self.orange, + self.orange_dark, + self.red, + self.violet_light, + self.violet, + self.purple, + self.blue + ] + + + + From 89bef618439a298234ca91899e9b686dfd9c6052 Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Wed, 14 Aug 2024 11:43:53 +0200 Subject: [PATCH 117/119] indexing possible --- columnflow/plotting/cmsGhent/colors.py | 84 ++++++++++++++++---------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/columnflow/plotting/cmsGhent/colors.py b/columnflow/plotting/cmsGhent/colors.py index 8f8640674..8018174ca 100644 --- a/columnflow/plotting/cmsGhent/colors.py +++ b/columnflow/plotting/cmsGhent/colors.py @@ -11,69 +11,92 @@ def pastel(self): def rainbow(self): return RainbowColors() - + def cat_six(self): return CATColorsSix() def cat_ten(self): return CATColorsTen() - + + def __getitem__(self, i): + return self.colors[i] class CATColorsSix(DefaultColors): def __init__(self): super().__init__() - self.red = "#e42536" + self.red = "#e42536" self.orange = "#f89c20" - self.blue = "#5790fc" + self.blue = "#5790fc" self.purple = "#7a21dd" self.violet = "#964a8b" - self.grey = "#9c9ca1" + self.grey = "#9c9ca1" + self.colors = [ + self.red, + self.orange, + self.blue, + self.purple, + self.violet, + self.grey, + ] + class CATColorsTen(DefaultColors): def __init__(self): super().__init__() - self.blue = "#3f90da" - self.blue_light = "#92dadd" - self.orange = "#ffa90e" + self.blue = "#3f90da" + self.blue_light = 
"#92dadd" + self.orange = "#ffa90e" self.orange_dark = "#e76300" - self.red = "#bd1f01" - self.purple = "#832db6" - self.brown = "#a96b59" - self.ochre = "#b9ac70" + self.red = "#bd1f01" + self.purple = "#832db6" + self.brown = "#a96b59" + self.ochre = "#b9ac70" + self.colors = [ + self.blue, + self.blue_light, + self.orange, + self.orange_dark, + self.red, + self.purple, + self.brown, + self.ochre, + ] + class RainbowColors(DefaultColors): def __init__(self): super().__init__() - self.purple = "#d23be7" - self.blue = "#4355db" + self.purple = "#d23be7" + self.blue = "#4355db" self.blue_light = "#34bbe6" - self.green = "#49da9a" - self.lime = "#a3e048" - self.yellow = "#f7d038" - self.orange = "#eb7532" - self.red = "#e6261f" + self.green = "#49da9a" + self.lime = "#a3e048" + self.yellow = "#f7d038" + self.orange = "#eb7532" + self.red = "#e6261f" self.colors = [ - self.purple, + self.purple, self.blue, self.blue_light, self.green, self.lime, self.yellow, self.orange, - self.red + self.red, ] + class PastelColors(DefaultColors): def __init__(self): super().__init__() - self.yellow = "#ffa600" - self.orange = "#ff843c" - self.orange_dark = "#ff6562" - self.red = "#fd5385" + self.yellow = "#ffa600" + self.orange = "#ff843c" + self.orange_dark = "#ff6562" + self.red = "#fd5385" self.violet_light = "#da52a2" - self.violet = "#ab59b5" - self.purple = "#6f5fba" - self.blue = "#1761b0" + self.violet = "#ab59b5" + self.purple = "#6f5fba" + self.blue = "#1761b0" self.colors = [ self.yellow, self.orange, @@ -82,9 +105,6 @@ def __init__(self): self.violet_light, self.violet, self.purple, - self.blue + self.blue, ] - - - From 5119f96fc83e2cb7cbc8f76a1eb7347aa5edc767 Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Wed, 14 Aug 2024 11:46:53 +0200 Subject: [PATCH 118/119] grey for 10 --- columnflow/plotting/cmsGhent/colors.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/columnflow/plotting/cmsGhent/colors.py 
b/columnflow/plotting/cmsGhent/colors.py index 8018174ca..40b940d57 100644 --- a/columnflow/plotting/cmsGhent/colors.py +++ b/columnflow/plotting/cmsGhent/colors.py @@ -1,6 +1,6 @@ class DefaultColors: def __init__(self): - self.grey = "#94a4a2" + self.grey = "#94a4a2" self.grey2 = "#717581" self.grey3 = "#5D5F66" self.grey4 = "#44464A" @@ -17,10 +17,11 @@ def cat_six(self): def cat_ten(self): return CATColorsTen() - + def __getitem__(self, i): return self.colors[i] + class CATColorsSix(DefaultColors): def __init__(self): super().__init__() @@ -51,6 +52,8 @@ def __init__(self): self.purple = "#832db6" self.brown = "#a96b59" self.ochre = "#b9ac70" + self.grey = "#94a4a2" + self.grey2 = "#717581" self.colors = [ self.blue, self.blue_light, @@ -60,6 +63,8 @@ def __init__(self): self.purple, self.brown, self.ochre, + self.grey, + self.grey2 ] @@ -107,4 +112,3 @@ def __init__(self): self.purple, self.blue, ] - From 2a12157470fc0b206008991f6a8c35a5a7579eef Mon Sep 17 00:00:00 2001 From: Jan van der Linden Date: Wed, 14 Aug 2024 11:50:32 +0200 Subject: [PATCH 119/119] linto --- columnflow/plotting/cmsGhent/colors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/columnflow/plotting/cmsGhent/colors.py b/columnflow/plotting/cmsGhent/colors.py index 40b940d57..5f486bb5a 100644 --- a/columnflow/plotting/cmsGhent/colors.py +++ b/columnflow/plotting/cmsGhent/colors.py @@ -52,7 +52,7 @@ def __init__(self): self.purple = "#832db6" self.brown = "#a96b59" self.ochre = "#b9ac70" - self.grey = "#94a4a2" + self.grey = "#94a4a2" self.grey2 = "#717581" self.colors = [ self.blue, @@ -64,7 +64,7 @@ def __init__(self): self.brown, self.ochre, self.grey, - self.grey2 + self.grey2, ]