From 3db9017bf2af8649dab12ba7491701bb30116b37 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Thu, 4 Dec 2025 08:54:01 -0500 Subject: [PATCH 1/7] fix: Prevent segfaults from thread ID array overflow --- projects/rocprofiler-systems/CMakeLists.txt | 31 ++++ .../rocprofiler-systems/cmake/Packages.cmake | 10 ++ .../docs/reference/development-guide.rst | 37 +++++ .../source/lib/common/defines.h.in | 4 + .../lib/rocprof-sys/library/sampling.cpp | 139 +++++++++++------- 5 files changed, 167 insertions(+), 54 deletions(-) diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index f4a9d30eb9f..6ac3968a3b5 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -305,6 +305,37 @@ elseif(NOT ROCPROFSYS_MAX_THREADS EQUAL _MAX_THREADS) ) endif() +# --------------------------------------------------------------------------------# +# Maximum Storage Threads Configuration +# --------------------------------------------------------------------------------# +# Default to ROCPROFSYS_MAX_THREADS if not explicitly set +if(NOT DEFINED ROCPROFSYS_MAX_STORAGE_THREADS) + set(ROCPROFSYS_MAX_STORAGE_THREADS + "${ROCPROFSYS_MAX_THREADS}" + CACHE STRING + "Maximum number of storage thread slots" + ) +endif() + +# Auto-adjust if set below minimum safe threshold +if(ROCPROFSYS_MAX_STORAGE_THREADS LESS ROCPROFSYS_MAX_THREADS) + rocprofiler_systems_message( + WARNING + "ROCPROFSYS_MAX_STORAGE_THREADS (${ROCPROFSYS_MAX_STORAGE_THREADS}) < ROCPROFSYS_MAX_THREADS (${ROCPROFSYS_MAX_THREADS}). Auto-adjusting to prevent runtime errors." 
+ ) + set(ROCPROFSYS_MAX_STORAGE_THREADS + "${ROCPROFSYS_MAX_THREADS}" + CACHE STRING + "Maximum number of storage thread slots" + FORCE + ) +endif() + +rocprofiler_systems_add_feature( + ROCPROFSYS_MAX_STORAGE_THREADS + "Maximum storage thread slots (defaults to ROCPROFSYS_MAX_THREADS)" +) + set(ROCPROFSYS_MAX_UNWIND_DEPTH "64" CACHE STRING diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake index 18f9f6b412c..1206832c32a 100644 --- a/projects/rocprofiler-systems/cmake/Packages.cmake +++ b/projects/rocprofiler-systems/cmake/Packages.cmake @@ -850,6 +850,15 @@ set(TIMEMORY_MAX_THREADS "Max statically-allocated threads" FORCE ) + +# Add option for maximum storage threads +set(TIMEMORY_MAX_STORAGE_THREADS + "${ROCPROFSYS_MAX_STORAGE_THREADS}" + CACHE STRING + "Maximum threads for storage array allocation in timemory operations" + FORCE +) + set(TIMEMORY_SETTINGS_PREFIX "ROCPROFSYS_" CACHE STRING @@ -1174,6 +1183,7 @@ endif() target_compile_definitions( rocprofiler-systems-compile-definitions INTERFACE ROCPROFSYS_MAX_THREADS=${ROCPROFSYS_MAX_THREADS} + ROCPROFSYS_MAX_STORAGE_THREADS=${ROCPROFSYS_MAX_STORAGE_THREADS} ) foreach(_LIB ${ROCPROFSYS_EXTENSION_LIBRARIES}) diff --git a/projects/rocprofiler-systems/docs/reference/development-guide.rst b/projects/rocprofiler-systems/docs/reference/development-guide.rst index f1470ef4146..8d0c7d1bce5 100644 --- a/projects/rocprofiler-systems/docs/reference/development-guide.rst +++ b/projects/rocprofiler-systems/docs/reference/development-guide.rst @@ -336,6 +336,43 @@ a segmentation fault occurs. To fix this issue, a new model is being adopted which has all the benefits of this model but permits dynamic expansion. 
+Configuring thread limits +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +ROCm Systems Profiler provides two CMake configuration options to control thread-related memory allocation: + +* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported by the application (default: calculated based on CPU cores, minimum 128) +* ``ROCPROFSYS_MAX_STORAGE_THREADS``: Maximum number of storage slots for thread-local data (default: matches ``ROCPROFSYS_MAX_THREADS``) + +These settings control two distinct aspects: + +* ``ROCPROFSYS_MAX_THREADS`` controls the thread ID manager capacity - the maximum number of threads the application can create +* ``ROCPROFSYS_MAX_STORAGE_THREADS`` controls the storage array size - how many slots are allocated for thread-local data storage + +By default, ``ROCPROFSYS_MAX_STORAGE_THREADS`` is set equal to ``ROCPROFSYS_MAX_THREADS`` to prevent segmentation faults. +However, for memory optimization, you can set ``ROCPROFSYS_MAX_STORAGE_THREADS`` to a higher value only when needed. + +**Example: Building with custom thread limits** + +.. code-block:: shell + + # Build with support for 8192 threads + cmake -B build \ + -DROCPROFSYS_MAX_THREADS=8192 \ + -DROCPROFSYS_MAX_STORAGE_THREADS=8192 \ + .. + cmake --build build + +**Important considerations** +.. warning:: + + Setting ``ROCPROFSYS_MAX_STORAGE_THREADS`` below ``ROCPROFSYS_MAX_THREADS`` will cause CMake to auto-adjust ROCPROFSYS_MAX_STORAGE_THREADS to upward to prevent runtime crashes. Setting it above ``ROCPROFSYS_MAX_THREADS`` wastes memory and will generate a warning. + +.. note:: + + The conditional expansion logic ensures that storage arrays only expand beyond 2048 slots when explicitly configured, + helping to minimize memory overhead for typical applications while supporting high-thread-count workloads when needed. 
+ Sampling model ======================================== diff --git a/projects/rocprofiler-systems/source/lib/common/defines.h.in b/projects/rocprofiler-systems/source/lib/common/defines.h.in index 1d9c5bec095..62450965d67 100644 --- a/projects/rocprofiler-systems/source/lib/common/defines.h.in +++ b/projects/rocprofiler-systems/source/lib/common/defines.h.in @@ -93,6 +93,10 @@ # define ROCPROFSYS_MAX_THREADS @ROCPROFSYS_MAX_THREADS@ #endif +#if !defined(ROCPROFSYS_MAX_STORAGE_THREADS) +# define ROCPROFSYS_MAX_STORAGE_THREADS @ROCPROFSYS_MAX_STORAGE_THREADS@ +#endif + #if !defined(ROCPROFSYS_MAX_UNWIND_DEPTH) # define ROCPROFSYS_MAX_UNWIND_DEPTH @ROCPROFSYS_MAX_UNWIND_DEPTH@ #endif diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp index 8b98777b7e7..ae2a0842cdc 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp @@ -796,77 +796,108 @@ configure(bool _setup, int64_t _tid) if(_signal_types->count(get_sampling_realtime_signal()) > 0) { - _sampler->configure(timer{ get_sampling_realtime_signal(), CLOCK_REALTIME, - SIGEV_THREAD_ID, get_sampling_realtime_freq(), - get_sampling_realtime_delay(), _tid, - threading::get_sys_tid() }); + try + { + _sampler->configure(timer{ get_sampling_realtime_signal(), CLOCK_REALTIME, + SIGEV_THREAD_ID, get_sampling_realtime_freq(), + get_sampling_realtime_delay(), _tid, + threading::get_sys_tid() }); + } catch(const std::exception& e) + { + ROCPROFSYS_VERBOSE( + 1, "Failed to configure realtime sampling timer for thread %ld: %s\n", + _tid, e.what()); + _signal_types->erase(get_sampling_realtime_signal()); + } } if(_signal_types->count(get_sampling_cputime_signal()) > 0) { - _sampler->configure( - timer{ get_sampling_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, - SIGEV_THREAD_ID, get_sampling_cputime_freq(), - 
get_sampling_cputime_delay(), _tid, threading::get_sys_tid() }); + try + { + _sampler->configure(timer{ + get_sampling_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, + SIGEV_THREAD_ID, get_sampling_cputime_freq(), + get_sampling_cputime_delay(), _tid, threading::get_sys_tid() }); + } catch(const std::exception& e) + { + ROCPROFSYS_VERBOSE( + 1, "Failed to configure cputime sampling timer for thread %ld: %s\n", + _tid, e.what()); + _signal_types->erase(get_sampling_cputime_signal()); + } } if(_signal_types->count(get_sampling_overflow_signal()) > 0) { - if(_signal_types->size() == 1) - trait::runtime_enabled::set(false); + try + { + if(_signal_types->size() == 1) + trait::runtime_enabled::set(false); - _perf_sampler = std::make_unique(); + _perf_sampler = std::make_unique(); - struct perf_event_attr _pe; - memset(&_pe, 0, sizeof(_pe)); + struct perf_event_attr _pe; + memset(&_pe, 0, sizeof(_pe)); - auto _freq = get_sampling_overflow_freq(); - auto _overflow_event = - get_setting_value("ROCPROFSYS_SAMPLING_OVERFLOW_EVENT") - .value_or("perf::PERF_COUNT_HW_CACHE_REFERENCES"); + auto _freq = get_sampling_overflow_freq(); + auto _overflow_event = + get_setting_value("ROCPROFSYS_SAMPLING_OVERFLOW_EVENT") + .value_or("perf::PERF_COUNT_HW_CACHE_REFERENCES"); - perf::config_overflow_sampling(_pe, _overflow_event, _freq); + perf::config_overflow_sampling(_pe, _overflow_event, _freq); - _pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN; + _pe.sample_type = + PERF_SAMPLE_TIME | PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN; - _pe.wakeup_events = 10; - _pe.exclude_idle = 1; - _pe.exclude_kernel = 1; - _pe.exclude_hv = 1; - _pe.exclude_callchain_kernel = 1; - _pe.disabled = 1; - _pe.inherit = 0; + _pe.wakeup_events = 10; + _pe.exclude_idle = 1; + _pe.exclude_kernel = 1; + _pe.exclude_hv = 1; + _pe.exclude_callchain_kernel = 1; + _pe.disabled = 1; + _pe.inherit = 0; - if(_pe.type == PERF_TYPE_SOFTWARE) + if(_pe.type == PERF_TYPE_SOFTWARE) + { + _pe.use_clockid = 1; 
+ _pe.clockid = CLOCK_REALTIME; + } + + auto _perf_open_error = + _perf_sampler->open(_pe, _info->index_data->system_value); + + ROCPROFSYS_REQUIRE(!_perf_open_error) + << "perf backend for overflow failed to activate: " + << *_perf_open_error; + + _perf_sampler->set_ready_signal(get_sampling_overflow_signal()); + _sampler->configure( + overflow{ get_sampling_overflow_signal(), + [](int _sig, pid_t, long, int64_t _idx) { + perf::get_instance(_idx)->set_ready_signal(_sig); + return true; + }, + [](int, pid_t, long, int64_t _idx) { + return perf::get_instance(_idx)->start(); + }, + [](int, pid_t, long, int64_t _idx) { + if(!perf::get_instance(_idx) || + !perf::get_instance(_idx)->is_open()) + return true; + auto _stopped = perf::get_instance(_idx)->stop(); + if(_stopped) perf::get_instance(_idx)->close(); + return _stopped; + }, + _tid, threading::get_sys_tid() }); + } catch(const std::exception& exc) { - _pe.use_clockid = 1; - _pe.clockid = CLOCK_REALTIME; + ROCPROFSYS_VERBOSE(1, + "Overflow sampling setup failed for thread %ld: %s\n", + _tid, exc.what()); + _signal_types->erase(get_sampling_overflow_signal()); + if(_perf_sampler) _perf_sampler.reset(); } - - auto _perf_open_error = - _perf_sampler->open(_pe, _info->index_data->system_value); - - ROCPROFSYS_REQUIRE(!_perf_open_error) - << "perf backend for overflow failed to activate: " << *_perf_open_error; - - _perf_sampler->set_ready_signal(get_sampling_overflow_signal()); - _sampler->configure(overflow{ - get_sampling_overflow_signal(), - [](int _sig, pid_t, long, int64_t _idx) { - perf::get_instance(_idx)->set_ready_signal(_sig); - return true; - }, - [](int, pid_t, long, int64_t _idx) { - return perf::get_instance(_idx)->start(); - }, - [](int, pid_t, long, int64_t _idx) { - if(!perf::get_instance(_idx) || !perf::get_instance(_idx)->is_open()) - return true; - auto _stopped = perf::get_instance(_idx)->stop(); - if(_stopped) perf::get_instance(_idx)->close(); - return _stopped; - }, - _tid, 
threading::get_sys_tid() }); } if(get_use_tmp_files()) From c8e24623f3be0d3a08814b42cea6a74cd4064b43 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Mon, 8 Dec 2025 09:53:39 -0500 Subject: [PATCH 2/7] Refactor thread limit validation and improve documentation --- projects/rocprofiler-systems/CMakeLists.txt | 36 ++---------- .../docs/reference/development-guide.rst | 45 +++++++------- .../source/lib/core/state.cpp | 8 +-- .../source/lib/core/utility.hpp | 4 ++ .../components/pthread_create_gotcha.cpp | 22 ++----- .../lib/rocprof-sys/library/runtime.cpp | 9 +-- .../lib/rocprof-sys/library/thread_info.cpp | 58 ++++++++++++++++++- .../lib/rocprof-sys/library/thread_info.hpp | 11 ++++ .../rocprof-sys-thread-limit-tests.cmake | 15 ++--- 9 files changed, 119 insertions(+), 89 deletions(-) diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index 6ac3968a3b5..3f7c871d9ab 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -236,10 +236,9 @@ rocprofiler_systems_add_option(ROCPROFSYS_BUILD_CODECOV "Build for code coverage rocprofiler_systems_add_option(ROCPROFSYS_INSTALL_PERFETTO_TOOLS "Install perfetto tools (i.e. 
traced, perfetto, etc.)" OFF ) -rocprofiler_systems_add_option(ROCPROFSYS_BUILD_SQLITE3 - "Enable building sqlite3 library internally" ON +rocprofiler_systems_add_option(ROCPROFILER_BUILD_SQLITE3 + "Enable building sqlite3 library internally" OFF ) - rocprofiler_systems_add_option(ROCPROFSYS_BUILD_GTEST "Enable building googletest library internally" ON ) @@ -305,37 +304,14 @@ elseif(NOT ROCPROFSYS_MAX_THREADS EQUAL _MAX_THREADS) ) endif() -# --------------------------------------------------------------------------------# -# Maximum Storage Threads Configuration -# --------------------------------------------------------------------------------# -# Default to ROCPROFSYS_MAX_THREADS if not explicitly set -if(NOT DEFINED ROCPROFSYS_MAX_STORAGE_THREADS) - set(ROCPROFSYS_MAX_STORAGE_THREADS - "${ROCPROFSYS_MAX_THREADS}" - CACHE STRING - "Maximum number of storage thread slots" - ) -endif() - -# Auto-adjust if set below minimum safe threshold -if(ROCPROFSYS_MAX_STORAGE_THREADS LESS ROCPROFSYS_MAX_THREADS) +if(ROCPROFSYS_MAX_THREADS LESS 128) rocprofiler_systems_message( - WARNING - "ROCPROFSYS_MAX_STORAGE_THREADS (${ROCPROFSYS_MAX_STORAGE_THREADS}) < ROCPROFSYS_MAX_THREADS (${ROCPROFSYS_MAX_THREADS}). Auto-adjusting to prevent runtime errors." - ) - set(ROCPROFSYS_MAX_STORAGE_THREADS - "${ROCPROFSYS_MAX_THREADS}" - CACHE STRING - "Maximum number of storage thread slots" - FORCE + AUTHOR_WARNING + "ROCPROFSYS_MAX_THREADS (=${ROCPROFSYS_MAX_THREADS}) is less than 128 which may be too low for some applications. Setting it to at least 128." 
) + set(ROCPROFSYS_MAX_THREADS 128) endif() -rocprofiler_systems_add_feature( - ROCPROFSYS_MAX_STORAGE_THREADS - "Maximum storage thread slots (defaults to ROCPROFSYS_MAX_THREADS)" -) - set(ROCPROFSYS_MAX_UNWIND_DEPTH "64" CACHE STRING diff --git a/projects/rocprofiler-systems/docs/reference/development-guide.rst b/projects/rocprofiler-systems/docs/reference/development-guide.rst index 8d0c7d1bce5..6307d3935bb 100644 --- a/projects/rocprofiler-systems/docs/reference/development-guide.rst +++ b/projects/rocprofiler-systems/docs/reference/development-guide.rst @@ -331,48 +331,45 @@ Currently, most thread data is effectively stored in a static for release builds. During finalization, ROCm Systems Profiler iterates through the thread-data and transforms that data into something that can be passed along to Perfetto and/or Timemory. -The downside of the current model is that if the user exceeds ``ROCPROFSYS_MAX_THREADS``, -a segmentation fault occurs. To fix this issue, -a new model is being adopted which has all the benefits of this model -but permits dynamic expansion. +In the current model, if the user exceeds ``ROCPROFSYS_MAX_THREADS`` at runtime, +thread creation fails gracefully with a warning message, and the excess threads operate with thread-local +fallback and profiling will be skipped and not persisted to output files. To support truly dynamic thread limits without +compile-time constraints, a new model is being adopted which has all the benefits of static allocation +but permits dynamic expansion beyond ``ROCPROFSYS_MAX_THREADS``. Currently, the thread limit +can be increased at compile-time using the ``ROCPROFSYS_MAX_THREADS`` CMake configuration option. 
Configuring thread limits ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -ROCm Systems Profiler provides two CMake configuration options to control thread-related memory allocation: +ROCm Systems Profiler uses a single CMake configuration option to control thread-related memory allocation: -* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported by the application (default: calculated based on CPU cores, minimum 128) -* ``ROCPROFSYS_MAX_STORAGE_THREADS``: Maximum number of storage slots for thread-local data (default: matches ``ROCPROFSYS_MAX_THREADS``) +* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported (default: ``max(128, 16 × CPU_cores)``, must be power of 2) -These settings control two distinct aspects: +This setting controls: -* ``ROCPROFSYS_MAX_THREADS`` controls the thread ID manager capacity - the maximum number of threads the application can create -* ``ROCPROFSYS_MAX_STORAGE_THREADS`` controls the storage array size - how many slots are allocated for thread-local data storage +* Thread ID manager capacity (maximum thread IDs that can be tracked) +* Storage array sizes for thread-local data across the codebase +* Timemory's internal thread storage (``TIMEMORY_MAX_THREADS``) -By default, ``ROCPROFSYS_MAX_STORAGE_THREADS`` is set equal to ``ROCPROFSYS_MAX_THREADS`` to prevent segmentation faults. -However, for memory optimization, you can set ``ROCPROFSYS_MAX_STORAGE_THREADS`` to a higher value only when needed. +**Build-time validation:** -**Example: Building with custom thread limits** +CMake enforces that ``ROCPROFSYS_MAX_THREADS`` must be a power of 2: + +.. code-block:: cmake + + # Valid: 128, 256, 512, 1024, 2048, 4096, 8192, 16384, ... + # Invalid: 100, 3000, 5000, 10000, ... (FATAL_ERROR) + # Values < 128 will be automatically set to 128 with a warning +**Example: Building with custom thread limit** .. 
code-block:: shell # Build with support for 8192 threads cmake -B build \ -DROCPROFSYS_MAX_THREADS=8192 \ - -DROCPROFSYS_MAX_STORAGE_THREADS=8192 \ .. cmake --build build -**Important considerations** -.. warning:: - - Setting ``ROCPROFSYS_MAX_STORAGE_THREADS`` below ``ROCPROFSYS_MAX_THREADS`` will cause CMake to auto-adjust ROCPROFSYS_MAX_STORAGE_THREADS to upward to prevent runtime crashes. Setting it above ``ROCPROFSYS_MAX_THREADS`` wastes memory and will generate a warning. - -.. note:: - - The conditional expansion logic ensures that storage arrays only expand beyond 2048 slots when explicitly configured, - helping to minimize memory overhead for typical applications while supporting high-thread-count workloads when needed. - Sampling model ======================================== diff --git a/projects/rocprofiler-systems/source/lib/core/state.cpp b/projects/rocprofiler-systems/source/lib/core/state.cpp index 007090b9366..63fc5e3654f 100644 --- a/projects/rocprofiler-systems/source/lib/core/state.cpp +++ b/projects/rocprofiler-systems/source/lib/core/state.cpp @@ -51,15 +51,15 @@ get_thread_state_value() auto& get_thread_state_history(int64_t _idx = utility::get_thread_index()) { - static auto _v = utility::get_filled_array( - []() { return utility::get_reserved_vector(32); }); - - if(_idx >= ROCPROFSYS_MAX_THREADS) + if(_idx < 0 || _idx >= ROCPROFSYS_MAX_THREADS) { static thread_local auto _tl_v = utility::get_reserved_vector(32); return _tl_v; } + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(32); }); + return _v.at(_idx); } } // namespace diff --git a/projects/rocprofiler-systems/source/lib/core/utility.hpp b/projects/rocprofiler-systems/source/lib/core/utility.hpp index 4fd05a1be6e..344a2f7345a 100644 --- a/projects/rocprofiler-systems/source/lib/core/utility.hpp +++ b/projects/rocprofiler-systems/source/lib/core/utility.hpp @@ -24,6 +24,7 @@ #include "concepts.hpp" +#include #include #include #include @@ -47,6 
+48,9 @@ namespace utility inline auto get_thread_index() { + // Internal/offset threads should not consume TID counter slots + if(tim::threading::offset_this_id()) return static_cast(-1); + static std::atomic _c{ 0 }; static thread_local int64_t _v = _c++; return _v; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp index 06f20f445f9..a2b8a155f81 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp @@ -62,9 +62,6 @@ namespace component { using bundle_t = tim::lightweight_tuple; using category_region_t = tim::lightweight_tuple>; -// The maximum limit for the number of threads is set at 4096. declared and stored in the -// set_storage struct's `types.hpp` file. -constexpr size_t allowed_max_threads = 4096; namespace { @@ -174,6 +171,9 @@ pthread_create_gotcha::wrapper::operator()() const return m_routine(m_arg); } + // Set offset flag BEFORE any function that might call utility::get_thread_index() + if(m_config.offset) tim::threading::offset_this_id(true); + push_thread_state(ThreadState::Internal); int64_t _tid = -1; @@ -184,20 +184,8 @@ pthread_create_gotcha::wrapper::operator()() const auto _coverage = (get_mode() == Mode::Coverage); const auto& _parent_info = thread_info::get(m_config.parent_tid, InternalTID); const auto& _info = thread_info::init(m_config.offset); - auto _sequent_value = _info->index_data ? _info->index_data->sequent_value : -1; - if(static_cast(_sequent_value) >= allowed_max_threads) - { - static std::once_flag thread_limit_warning_flag; - std::call_once(thread_limit_warning_flag, []() { - ROCPROFSYS_WARNING_F( - 1, - "[rocprof-sys][WARNING] Maximum allowed thread limit (%zu) " - "reached. 
Further thread creation and profiling will be " - "disabled to prevent resource exhaustion.\n", - allowed_max_threads); - }); - return m_routine(m_arg); - } + // If thread_info::init returns empty optional, the thread limit was exceeded + if(!_info) return m_routine(m_arg); auto _dtor = [&]() { set_thread_state(ThreadState::Internal); if(_is_sampling) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp index c80b4172ce3..c7c8f9d8334 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp @@ -62,15 +62,16 @@ auto root_process_id = auto& get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index()) { - static auto _v = utility::get_filled_array( - []() { return utility::get_reserved_vector(64); }); - - if(_idx >= ROCPROFSYS_MAX_THREADS) + // Check bounds FIRST to prevent out_of_range access + if(_idx < 0 || _idx >= ROCPROFSYS_MAX_THREADS) { static thread_local auto _tl_v = utility::get_reserved_vector(128); return _tl_v; } + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(64); }); + return _v.at(_idx); } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp index 37846e1d393..30354497f80 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp @@ -61,12 +61,24 @@ get_index_data() auto& get_info_data(int64_t _tid) { + // Bounds check to prevent out_of_range when _tid >= MAX_THREADS + if(_tid < 0 || _tid >= ROCPROFSYS_MAX_THREADS) + { + static thread_local auto _tl_v = std::optional{}; + return _tl_v; + } return get_info_data()->at(_tid); } auto& get_index_data(int64_t _tid) { 
+ // Bounds check to prevent out_of_range when _tid >= MAX_THREADS + if(_tid < 0 || _tid >= ROCPROFSYS_MAX_THREADS) + { + static thread_local auto _tl_v = std::optional{}; + return _tl_v; + } return get_index_data()->at(_tid); } @@ -175,7 +187,6 @@ thread_info::init(bool _offset) { static thread_local bool _once = false; auto& _info_data = get_info_data(); - auto _tid = utility::get_thread_index(); if(!_info_data) { @@ -183,6 +194,51 @@ thread_info::init(bool _offset) return (_dummy.reset(), _dummy); // always reset for safety } + // Internal threads (_offset=true) use thread-local storage to not consume user TID + // slots Check this before calling get_thread_index() to avoid incrementing the global + // counter + if(_offset) + { + static thread_local auto _tl_internal = std::optional{}; + if(!_once && (_once = true)) + { + threading::offset_this_id(_offset); + _tl_internal = thread_info{}; + _tl_internal->is_offset = true; + // For internal threads, use a pseudo index_data without consuming TID pool + _tl_internal->index_data = thread_index_data(true); + _tl_internal->lifetime.first = tim::get_clock_real_now(); + _tl_internal->causal_count = &offset_causal_count; + set_thread_state(ThreadState::Disabled); + + ROCPROFSYS_BASIC_VERBOSE_F( + 2, + "Internal thread %li on PID %i (rank: %i) using thread-local storage " + "(not consuming user TID slot)\n", + _tl_internal->index_data->system_value, process::get_id(), dmp::rank()); + } + return _tl_internal; + } + + // Only allocate TID for non-offset (user) threads + auto _tid = utility::get_thread_index(); + + // Prevent access beyond MAX_THREADS + if(_tid < 0 || _tid >= ROCPROFSYS_MAX_THREADS) + { + static std::once_flag thread_limit_warning_flag; + std::call_once(thread_limit_warning_flag, []() { + ROCPROFSYS_WARNING_F( + 1, + "[rocprof-sys][WARNING] Maximum allowed thread limit (%d) " + "reached. 
Further thread creation and profiling will be " + "disabled to prevent resource exhaustion.\n", + ROCPROFSYS_MAX_THREADS); + }); + static thread_local auto _tl_dummy = std::optional{}; + return (_tl_dummy.reset(), _tl_dummy); + } + if(!_once && (_once = true)) { grow_data(_tid); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp index dee22fe68a8..f5c506f8a76 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp @@ -77,6 +77,17 @@ struct thread_index_data native_tid_t pthread_value = ::pthread_self(); stl_tid_t stl_value = std::this_thread::get_id(); + thread_index_data() = default; + + // Special constructor for internal threads that don't consume TID counter + explicit thread_index_data(bool _is_internal) + : internal_value(_is_internal ? -1 : utility::get_thread_index()) + , system_value(tim::threading::get_sys_tid()) + , sequent_value(_is_internal ? 
-1 : tim::threading::get_id()) + , pthread_value(::pthread_self()) + , stl_value(std::this_thread::get_id()) + {} + std::string as_string() const; }; diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake index 3e4ab1fc713..8a8444a386a 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake @@ -41,21 +41,18 @@ set(_thread_limit_environment "ROCPROFSYS_TIMEMORY_COMPONENTS=wall_clock,peak_rss,page_rss" ) -# Maximum allowed threads -set(ALLOWED_MAX_THREADS 4096) +math(EXPR THREAD_VAL_1 "${ROCPROFSYS_MAX_THREADS} - 1") +math(EXPR THREAD_VAL_2 "${ROCPROFSYS_MAX_THREADS} + 24") -math(EXPR THREAD_VAL_1 "${ROCPROFSYS_MAX_THREADS} + 24") -math(EXPR THREAD_VAL_2 "${ALLOWED_MAX_THREADS} + 1") - -set(THREAD_VALUES ${THREAD_VAL_1} ${THREAD_VAL_2}) +set(THREAD_VALUES ${THREAD_VAL_1} ${THREAD_VAL_2} ${ROCPROFSYS_MAX_THREADS}) # Loop over thread values foreach(THREADS IN LISTS THREAD_VALUES) set(THREAD_PASS_VALUE ${THREADS}) math(EXPR THREAD_FAIL_VALUE "${THREADS} + 1") - if(${THREADS} GREATER_EQUAL ${ALLOWED_MAX_THREADS}) - math(EXPR THREAD_PASS_VALUE "${ALLOWED_MAX_THREADS} - 1") - math(EXPR THREAD_FAIL_VALUE "${THREADS}") + if(${THREADS} GREATER_EQUAL ${ROCPROFSYS_MAX_THREADS}) + math(EXPR THREAD_PASS_VALUE "${ROCPROFSYS_MAX_THREADS} - 1") + math(EXPR THREAD_FAIL_VALUE "${ROCPROFSYS_MAX_THREADS} + 1") endif() set(_thread_limit_pass_regex "\\|${THREAD_PASS_VALUE}>>>") From 3fda3bdba2e2fa8915effe347f3cd2c19390dfc8 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Mon, 8 Dec 2025 10:44:29 -0500 Subject: [PATCH 3/7] Edited previous changes --- projects/rocprofiler-systems/CMakeLists.txt | 5 +++-- projects/rocprofiler-systems/cmake/Packages.cmake | 10 ---------- .../docs/reference/development-guide.rst | 8 ++++---- .../rocprofiler-systems/source/lib/common/defines.h.in | 4 ---- 
4 files changed, 7 insertions(+), 20 deletions(-) diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index 3f7c871d9ab..ce22cbf2639 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -236,9 +236,10 @@ rocprofiler_systems_add_option(ROCPROFSYS_BUILD_CODECOV "Build for code coverage rocprofiler_systems_add_option(ROCPROFSYS_INSTALL_PERFETTO_TOOLS "Install perfetto tools (i.e. traced, perfetto, etc.)" OFF ) -rocprofiler_systems_add_option(ROCPROFILER_BUILD_SQLITE3 - "Enable building sqlite3 library internally" OFF +rocprofiler_systems_add_option(ROCPROFSYS_BUILD_SQLITE3 + "Enable building sqlite3 library internally" ON ) + rocprofiler_systems_add_option(ROCPROFSYS_BUILD_GTEST "Enable building googletest library internally" ON ) diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake index 1206832c32a..18f9f6b412c 100644 --- a/projects/rocprofiler-systems/cmake/Packages.cmake +++ b/projects/rocprofiler-systems/cmake/Packages.cmake @@ -850,15 +850,6 @@ set(TIMEMORY_MAX_THREADS "Max statically-allocated threads" FORCE ) - -# Add option for maximum storage threads -set(TIMEMORY_MAX_STORAGE_THREADS - "${ROCPROFSYS_MAX_STORAGE_THREADS}" - CACHE STRING - "Maximum threads for storage array allocation in timemory operations" - FORCE -) - set(TIMEMORY_SETTINGS_PREFIX "ROCPROFSYS_" CACHE STRING @@ -1183,7 +1174,6 @@ endif() target_compile_definitions( rocprofiler-systems-compile-definitions INTERFACE ROCPROFSYS_MAX_THREADS=${ROCPROFSYS_MAX_THREADS} - ROCPROFSYS_MAX_STORAGE_THREADS=${ROCPROFSYS_MAX_STORAGE_THREADS} ) foreach(_LIB ${ROCPROFSYS_EXTENSION_LIBRARIES}) diff --git a/projects/rocprofiler-systems/docs/reference/development-guide.rst b/projects/rocprofiler-systems/docs/reference/development-guide.rst index 6307d3935bb..bdc76569ae8 100644 --- 
a/projects/rocprofiler-systems/docs/reference/development-guide.rst +++ b/projects/rocprofiler-systems/docs/reference/development-guide.rst @@ -333,10 +333,10 @@ ROCm Systems Profiler iterates through the thread-data and transforms that data into something that can be passed along to Perfetto and/or Timemory. In the current model, if the user exceeds ``ROCPROFSYS_MAX_THREADS`` at runtime, thread creation fails gracefully with a warning message, and the excess threads operate with thread-local -fallback and profiling will be skipped and not persisted to output files. To support truly dynamic thread limits without -compile-time constraints, a new model is being adopted which has all the benefits of static allocation -but permits dynamic expansion beyond ``ROCPROFSYS_MAX_THREADS``. Currently, the thread limit -can be increased at compile-time using the ``ROCPROFSYS_MAX_THREADS`` CMake configuration option. +fallback and profiling will be skipped and not persisted to output files for threads beyond ROCPROFSYS_MAX_THREADS. +To support truly dynamic thread limits without compile-time constraints, a new model is being adopted which +has all the benefits of static allocation but permits dynamic expansion beyond ``ROCPROFSYS_MAX_THREADS``. +Currently, the thread limit can be increased at compile-time using the ``ROCPROFSYS_MAX_THREADS`` CMake configuration option. 
Configuring thread limits ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/projects/rocprofiler-systems/source/lib/common/defines.h.in b/projects/rocprofiler-systems/source/lib/common/defines.h.in index 62450965d67..1d9c5bec095 100644 --- a/projects/rocprofiler-systems/source/lib/common/defines.h.in +++ b/projects/rocprofiler-systems/source/lib/common/defines.h.in @@ -93,10 +93,6 @@ # define ROCPROFSYS_MAX_THREADS @ROCPROFSYS_MAX_THREADS@ #endif -#if !defined(ROCPROFSYS_MAX_STORAGE_THREADS) -# define ROCPROFSYS_MAX_STORAGE_THREADS @ROCPROFSYS_MAX_STORAGE_THREADS@ -#endif - #if !defined(ROCPROFSYS_MAX_UNWIND_DEPTH) # define ROCPROFSYS_MAX_UNWIND_DEPTH @ROCPROFSYS_MAX_UNWIND_DEPTH@ #endif From c14a68d803063192a3959a3aa4be2f30b27822b4 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Fri, 12 Dec 2025 07:03:49 -0500 Subject: [PATCH 4/7] Keeping only changes related to issue, Reverting optimization changes --- projects/rocprofiler-systems/CMakeLists.txt | 2 - .../source/lib/core/state.cpp | 8 +- .../source/lib/core/utility.hpp | 4 - .../components/pthread_create_gotcha.cpp | 19 ++- .../lib/rocprof-sys/library/runtime.cpp | 9 +- .../lib/rocprof-sys/library/sampling.cpp | 139 +++++++----------- .../lib/rocprof-sys/library/thread_info.cpp | 58 +------- .../lib/rocprof-sys/library/thread_info.hpp | 11 -- .../rocprof-sys-thread-limit-tests.cmake | 6 +- 9 files changed, 80 insertions(+), 176 deletions(-) diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index ce22cbf2639..a26de294b3c 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -304,7 +304,6 @@ elseif(NOT ROCPROFSYS_MAX_THREADS EQUAL _MAX_THREADS) "ROCPROFSYS_MAX_THREADS (=${ROCPROFSYS_MAX_THREADS}) must be a power of 2. We were unable to verify it so we are emitting this warning instead. 
Estimate resulted in: ${_MAX_THREADS}" ) endif() - if(ROCPROFSYS_MAX_THREADS LESS 128) rocprofiler_systems_message( AUTHOR_WARNING @@ -312,7 +311,6 @@ if(ROCPROFSYS_MAX_THREADS LESS 128) ) set(ROCPROFSYS_MAX_THREADS 128) endif() - set(ROCPROFSYS_MAX_UNWIND_DEPTH "64" CACHE STRING diff --git a/projects/rocprofiler-systems/source/lib/core/state.cpp b/projects/rocprofiler-systems/source/lib/core/state.cpp index 63fc5e3654f..007090b9366 100644 --- a/projects/rocprofiler-systems/source/lib/core/state.cpp +++ b/projects/rocprofiler-systems/source/lib/core/state.cpp @@ -51,15 +51,15 @@ get_thread_state_value() auto& get_thread_state_history(int64_t _idx = utility::get_thread_index()) { - if(_idx < 0 || _idx >= ROCPROFSYS_MAX_THREADS) + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(32); }); + + if(_idx >= ROCPROFSYS_MAX_THREADS) { static thread_local auto _tl_v = utility::get_reserved_vector(32); return _tl_v; } - static auto _v = utility::get_filled_array( - []() { return utility::get_reserved_vector(32); }); - return _v.at(_idx); } } // namespace diff --git a/projects/rocprofiler-systems/source/lib/core/utility.hpp b/projects/rocprofiler-systems/source/lib/core/utility.hpp index 344a2f7345a..4fd05a1be6e 100644 --- a/projects/rocprofiler-systems/source/lib/core/utility.hpp +++ b/projects/rocprofiler-systems/source/lib/core/utility.hpp @@ -24,7 +24,6 @@ #include "concepts.hpp" -#include #include #include #include @@ -48,9 +47,6 @@ namespace utility inline auto get_thread_index() { - // Internal/offset threads should not consume TID counter slots - if(tim::threading::offset_this_id()) return static_cast(-1); - static std::atomic _c{ 0 }; static thread_local int64_t _v = _c++; return _v; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp index a2b8a155f81..ac9a14684e0 100644 
--- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp @@ -171,9 +171,6 @@ pthread_create_gotcha::wrapper::operator()() const return m_routine(m_arg); } - // Set offset flag BEFORE any function that might call utility::get_thread_index() - if(m_config.offset) tim::threading::offset_this_id(true); - push_thread_state(ThreadState::Internal); int64_t _tid = -1; @@ -184,8 +181,20 @@ pthread_create_gotcha::wrapper::operator()() const auto _coverage = (get_mode() == Mode::Coverage); const auto& _parent_info = thread_info::get(m_config.parent_tid, InternalTID); const auto& _info = thread_info::init(m_config.offset); - // If thread_info::init returns empty optional, the thread limit was exceeded - if(!_info) return m_routine(m_arg); + auto _sequent_value = _info->index_data ? _info->index_data->sequent_value : -1; + if(static_cast(_sequent_value) >= ROCPROFSYS_MAX_THREADS) + { + static std::once_flag thread_limit_warning_flag; + std::call_once(thread_limit_warning_flag, []() { + ROCPROFSYS_WARNING_F( + 1, + "[rocprof-sys][WARNING] Maximum allowed thread limit (%zu) " + "reached. 
Further thread creation and profiling will be " + "disabled to prevent resource exhaustion.\n", + ROCPROFSYS_MAX_THREADS); + }); + return m_routine(m_arg); + } auto _dtor = [&]() { set_thread_state(ThreadState::Internal); if(_is_sampling) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp index c7c8f9d8334..c80b4172ce3 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/runtime.cpp @@ -62,16 +62,15 @@ auto root_process_id = auto& get_sampling_on_child_threads_history(int64_t _idx = utility::get_thread_index()) { - // Check bounds FIRST to prevent out_of_range access - if(_idx < 0 || _idx >= ROCPROFSYS_MAX_THREADS) + static auto _v = utility::get_filled_array( + []() { return utility::get_reserved_vector(64); }); + + if(_idx >= ROCPROFSYS_MAX_THREADS) { static thread_local auto _tl_v = utility::get_reserved_vector(128); return _tl_v; } - static auto _v = utility::get_filled_array( - []() { return utility::get_reserved_vector(64); }); - return _v.at(_idx); } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp index ae2a0842cdc..8b98777b7e7 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp @@ -796,108 +796,77 @@ configure(bool _setup, int64_t _tid) if(_signal_types->count(get_sampling_realtime_signal()) > 0) { - try - { - _sampler->configure(timer{ get_sampling_realtime_signal(), CLOCK_REALTIME, - SIGEV_THREAD_ID, get_sampling_realtime_freq(), - get_sampling_realtime_delay(), _tid, - threading::get_sys_tid() }); - } catch(const std::exception& e) - { - ROCPROFSYS_VERBOSE( - 1, "Failed to configure realtime sampling timer for thread %ld: %s\n", - 
_tid, e.what()); - _signal_types->erase(get_sampling_realtime_signal()); - } + _sampler->configure(timer{ get_sampling_realtime_signal(), CLOCK_REALTIME, + SIGEV_THREAD_ID, get_sampling_realtime_freq(), + get_sampling_realtime_delay(), _tid, + threading::get_sys_tid() }); } if(_signal_types->count(get_sampling_cputime_signal()) > 0) { - try - { - _sampler->configure(timer{ - get_sampling_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, - SIGEV_THREAD_ID, get_sampling_cputime_freq(), - get_sampling_cputime_delay(), _tid, threading::get_sys_tid() }); - } catch(const std::exception& e) - { - ROCPROFSYS_VERBOSE( - 1, "Failed to configure cputime sampling timer for thread %ld: %s\n", - _tid, e.what()); - _signal_types->erase(get_sampling_cputime_signal()); - } + _sampler->configure( + timer{ get_sampling_cputime_signal(), CLOCK_THREAD_CPUTIME_ID, + SIGEV_THREAD_ID, get_sampling_cputime_freq(), + get_sampling_cputime_delay(), _tid, threading::get_sys_tid() }); } if(_signal_types->count(get_sampling_overflow_signal()) > 0) { - try - { - if(_signal_types->size() == 1) - trait::runtime_enabled::set(false); + if(_signal_types->size() == 1) + trait::runtime_enabled::set(false); - _perf_sampler = std::make_unique(); + _perf_sampler = std::make_unique(); - struct perf_event_attr _pe; - memset(&_pe, 0, sizeof(_pe)); + struct perf_event_attr _pe; + memset(&_pe, 0, sizeof(_pe)); - auto _freq = get_sampling_overflow_freq(); - auto _overflow_event = - get_setting_value("ROCPROFSYS_SAMPLING_OVERFLOW_EVENT") - .value_or("perf::PERF_COUNT_HW_CACHE_REFERENCES"); + auto _freq = get_sampling_overflow_freq(); + auto _overflow_event = + get_setting_value("ROCPROFSYS_SAMPLING_OVERFLOW_EVENT") + .value_or("perf::PERF_COUNT_HW_CACHE_REFERENCES"); - perf::config_overflow_sampling(_pe, _overflow_event, _freq); + perf::config_overflow_sampling(_pe, _overflow_event, _freq); - _pe.sample_type = - PERF_SAMPLE_TIME | PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN; + _pe.sample_type = PERF_SAMPLE_TIME | 
PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN; - _pe.wakeup_events = 10; - _pe.exclude_idle = 1; - _pe.exclude_kernel = 1; - _pe.exclude_hv = 1; - _pe.exclude_callchain_kernel = 1; - _pe.disabled = 1; - _pe.inherit = 0; + _pe.wakeup_events = 10; + _pe.exclude_idle = 1; + _pe.exclude_kernel = 1; + _pe.exclude_hv = 1; + _pe.exclude_callchain_kernel = 1; + _pe.disabled = 1; + _pe.inherit = 0; - if(_pe.type == PERF_TYPE_SOFTWARE) - { - _pe.use_clockid = 1; - _pe.clockid = CLOCK_REALTIME; - } - - auto _perf_open_error = - _perf_sampler->open(_pe, _info->index_data->system_value); - - ROCPROFSYS_REQUIRE(!_perf_open_error) - << "perf backend for overflow failed to activate: " - << *_perf_open_error; - - _perf_sampler->set_ready_signal(get_sampling_overflow_signal()); - _sampler->configure( - overflow{ get_sampling_overflow_signal(), - [](int _sig, pid_t, long, int64_t _idx) { - perf::get_instance(_idx)->set_ready_signal(_sig); - return true; - }, - [](int, pid_t, long, int64_t _idx) { - return perf::get_instance(_idx)->start(); - }, - [](int, pid_t, long, int64_t _idx) { - if(!perf::get_instance(_idx) || - !perf::get_instance(_idx)->is_open()) - return true; - auto _stopped = perf::get_instance(_idx)->stop(); - if(_stopped) perf::get_instance(_idx)->close(); - return _stopped; - }, - _tid, threading::get_sys_tid() }); - } catch(const std::exception& exc) + if(_pe.type == PERF_TYPE_SOFTWARE) { - ROCPROFSYS_VERBOSE(1, - "Overflow sampling setup failed for thread %ld: %s\n", - _tid, exc.what()); - _signal_types->erase(get_sampling_overflow_signal()); - if(_perf_sampler) _perf_sampler.reset(); + _pe.use_clockid = 1; + _pe.clockid = CLOCK_REALTIME; } + + auto _perf_open_error = + _perf_sampler->open(_pe, _info->index_data->system_value); + + ROCPROFSYS_REQUIRE(!_perf_open_error) + << "perf backend for overflow failed to activate: " << *_perf_open_error; + + _perf_sampler->set_ready_signal(get_sampling_overflow_signal()); + _sampler->configure(overflow{ + 
get_sampling_overflow_signal(), + [](int _sig, pid_t, long, int64_t _idx) { + perf::get_instance(_idx)->set_ready_signal(_sig); + return true; + }, + [](int, pid_t, long, int64_t _idx) { + return perf::get_instance(_idx)->start(); + }, + [](int, pid_t, long, int64_t _idx) { + if(!perf::get_instance(_idx) || !perf::get_instance(_idx)->is_open()) + return true; + auto _stopped = perf::get_instance(_idx)->stop(); + if(_stopped) perf::get_instance(_idx)->close(); + return _stopped; + }, + _tid, threading::get_sys_tid() }); } if(get_use_tmp_files()) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp index 30354497f80..37846e1d393 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.cpp @@ -61,24 +61,12 @@ get_index_data() auto& get_info_data(int64_t _tid) { - // Bounds check to prevent out_of_range when _tid >= MAX_THREADS - if(_tid < 0 || _tid >= ROCPROFSYS_MAX_THREADS) - { - static thread_local auto _tl_v = std::optional{}; - return _tl_v; - } return get_info_data()->at(_tid); } auto& get_index_data(int64_t _tid) { - // Bounds check to prevent out_of_range when _tid >= MAX_THREADS - if(_tid < 0 || _tid >= ROCPROFSYS_MAX_THREADS) - { - static thread_local auto _tl_v = std::optional{}; - return _tl_v; - } return get_index_data()->at(_tid); } @@ -187,6 +175,7 @@ thread_info::init(bool _offset) { static thread_local bool _once = false; auto& _info_data = get_info_data(); + auto _tid = utility::get_thread_index(); if(!_info_data) { @@ -194,51 +183,6 @@ thread_info::init(bool _offset) return (_dummy.reset(), _dummy); // always reset for safety } - // Internal threads (_offset=true) use thread-local storage to not consume user TID - // slots Check this before calling get_thread_index() to avoid incrementing the global - // counter - if(_offset) - { 
- static thread_local auto _tl_internal = std::optional{}; - if(!_once && (_once = true)) - { - threading::offset_this_id(_offset); - _tl_internal = thread_info{}; - _tl_internal->is_offset = true; - // For internal threads, use a pseudo index_data without consuming TID pool - _tl_internal->index_data = thread_index_data(true); - _tl_internal->lifetime.first = tim::get_clock_real_now(); - _tl_internal->causal_count = &offset_causal_count; - set_thread_state(ThreadState::Disabled); - - ROCPROFSYS_BASIC_VERBOSE_F( - 2, - "Internal thread %li on PID %i (rank: %i) using thread-local storage " - "(not consuming user TID slot)\n", - _tl_internal->index_data->system_value, process::get_id(), dmp::rank()); - } - return _tl_internal; - } - - // Only allocate TID for non-offset (user) threads - auto _tid = utility::get_thread_index(); - - // Prevent access beyond MAX_THREADS - if(_tid < 0 || _tid >= ROCPROFSYS_MAX_THREADS) - { - static std::once_flag thread_limit_warning_flag; - std::call_once(thread_limit_warning_flag, []() { - ROCPROFSYS_WARNING_F( - 1, - "[rocprof-sys][WARNING] Maximum allowed thread limit (%d) " - "reached. 
Further thread creation and profiling will be " - "disabled to prevent resource exhaustion.\n", - ROCPROFSYS_MAX_THREADS); - }); - static thread_local auto _tl_dummy = std::optional{}; - return (_tl_dummy.reset(), _tl_dummy); - } - if(!_once && (_once = true)) { grow_data(_tid); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp index f5c506f8a76..dee22fe68a8 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_info.hpp @@ -77,17 +77,6 @@ struct thread_index_data native_tid_t pthread_value = ::pthread_self(); stl_tid_t stl_value = std::this_thread::get_id(); - thread_index_data() = default; - - // Special constructor for internal threads that don't consume TID counter - explicit thread_index_data(bool _is_internal) - : internal_value(_is_internal ? -1 : utility::get_thread_index()) - , system_value(tim::threading::get_sys_tid()) - , sequent_value(_is_internal ? 
-1 : tim::threading::get_id()) - , pthread_value(::pthread_self()) - , stl_value(std::this_thread::get_id()) - {} - std::string as_string() const; }; diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake index 8a8444a386a..4c263be644b 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake @@ -70,9 +70,9 @@ foreach(THREADS IN LISTS THREAD_VALUES) REWRITE_ARGS -e -v 2 -i 1024 --label return args RUNTIME_ARGS -e -v 1 -i 1024 --label return args RUN_ARGS 35 2 ${THREADS} - SAMPLING_TIMEOUT 180 - REWRITE_TIMEOUT 180 - RUNTIME_TIMEOUT 360 + SAMPLING_TIMEOUT 480 + REWRITE_TIMEOUT 480 + RUNTIME_TIMEOUT 480 RUNTIME_PASS_REGEX "${_thread_limit_pass_regex}" SAMPLING_PASS_REGEX "${_thread_limit_pass_regex}" REWRITE_RUN_PASS_REGEX "${_thread_limit_pass_regex}" From ecebe82f6fb836e833267c797e04eb41900fd405 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Fri, 12 Dec 2025 10:20:01 -0500 Subject: [PATCH 5/7] correcting formatting --- .../rocprof-sys/library/components/pthread_create_gotcha.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp index ac9a14684e0..58e93881471 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp @@ -188,7 +188,7 @@ pthread_create_gotcha::wrapper::operator()() const std::call_once(thread_limit_warning_flag, []() { ROCPROFSYS_WARNING_F( 1, - "[rocprof-sys][WARNING] Maximum allowed thread limit (%zu) " + "[rocprof-sys][WARNING] Maximum allowed thread limit (%d) " "reached. 
Further thread creation and profiling will be " "disabled to prevent resource exhaustion.\n", ROCPROFSYS_MAX_THREADS); From 94d5c2c30c3a51355b3d5e07af6ebe99538b7ff9 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Wed, 24 Dec 2025 13:35:03 -0500 Subject: [PATCH 6/7] Fix thread limit configuration and update documentation --- projects/rocprofiler-systems/CMakeLists.txt | 8 ++++++-- projects/rocprofiler-systems/CMakePresets.json | 2 +- .../docs/reference/development-guide.rst | 8 ++++---- .../library/components/pthread_create_gotcha.cpp | 4 ++-- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index a26de294b3c..f6e451fb641 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -275,9 +275,13 @@ else() math(EXPR ROCPROFSYS_THREAD_COUNT "16 * ${ROCPROFSYS_PROCESSOR_COUNT}") compute_pow2_ceil(ROCPROFSYS_THREAD_COUNT "16 * ${ROCPROFSYS_PROCESSOR_COUNT}") - # set the default to 2048 if it could not be calculated + # Fatal error if pow2 calculation failed (e.g., Python3 not found) if(ROCPROFSYS_THREAD_COUNT LESS 2) - set(ROCPROFSYS_THREAD_COUNT 2048) + rocprofiler_systems_message( + FATAL_ERROR + "Failed to compute power of 2 ceiling for ROCPROFSYS_THREAD_COUNT. " + "Ensure dependency is available. 
Processor count: ${ROCPROFSYS_PROCESSOR_COUNT}" + ) endif() endif() diff --git a/projects/rocprofiler-systems/CMakePresets.json b/projects/rocprofiler-systems/CMakePresets.json index 8150973ae93..7bb4579e195 100644 --- a/projects/rocprofiler-systems/CMakePresets.json +++ b/projects/rocprofiler-systems/CMakePresets.json @@ -15,7 +15,7 @@ "ROCPROFSYS_BUILD_LIBIBERTY": "ON", "ROCPROFSYS_BUILD_TBB": "ON", "ROCPROFSYS_BUILD_TESTING": "ON", - "ROCPROFSYS_MAX_THREADS": "64", + "ROCPROFSYS_MAX_THREADS": "128", "ROCPROFSYS_STRIP_LIBRARIES": "OFF", "ROCPROFSYS_USE_PYTHON": "ON", "ROCPROFSYS_USE_ROCM": "ON" diff --git a/projects/rocprofiler-systems/docs/reference/development-guide.rst b/projects/rocprofiler-systems/docs/reference/development-guide.rst index bdc76569ae8..56a2ceb00cb 100644 --- a/projects/rocprofiler-systems/docs/reference/development-guide.rst +++ b/projects/rocprofiler-systems/docs/reference/development-guide.rst @@ -327,13 +327,13 @@ Thread-data class Currently, most thread data is effectively stored in a static ``std::array, ROCPROFSYS_MAX_THREADS>`` instance. -``ROCPROFSYS_MAX_THREADS`` is a value defined a compile-time and set to ``2048`` +``ROCPROFSYS_MAX_THREADS`` is a value defined at compile-time (default: ``max(128, pow2_ceil(16 * nproc))``) for release builds. During finalization, ROCm Systems Profiler iterates through the thread-data and transforms that data into something that can be passed along to Perfetto and/or Timemory. In the current model, if the user exceeds ``ROCPROFSYS_MAX_THREADS`` at runtime, -thread creation fails gracefully with a warning message, and the excess threads operate with thread-local -fallback and profiling will be skipped and not persisted to output files for threads beyond ROCPROFSYS_MAX_THREADS. +thread creation fails gracefully with a warning message, excess threads operate with thread-local fallback, +and profiling is skipped and not persisted to output files for threads beyond ``ROCPROFSYS_MAX_THREADS``. 
To support truly dynamic thread limits without compile-time constraints, a new model is being adopted which has all the benefits of static allocation but permits dynamic expansion beyond ``ROCPROFSYS_MAX_THREADS``. Currently, the thread limit can be increased at compile-time using the ``ROCPROFSYS_MAX_THREADS`` CMake configuration option. @@ -343,7 +343,7 @@ Configuring thread limits ROCm Systems Profiler uses a single CMake configuration option to control thread-related memory allocation: -* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported (default: ``max(128, 16 × CPU_cores)``, must be power of 2) +* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported (default: ``max(128, pow2_ceil(16 * nproc))``; must be a power of 2, values < ``128`` are raised to ``128``) This setting controls: diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp index 58e93881471..b6d34084a3e 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp @@ -188,10 +188,10 @@ pthread_create_gotcha::wrapper::operator()() const std::call_once(thread_limit_warning_flag, []() { ROCPROFSYS_WARNING_F( 1, - "[rocprof-sys][WARNING] Maximum allowed thread limit (%d) " + "[rocprof-sys][WARNING] Maximum allowed thread limit (%zu) " "reached. 
Further thread creation and profiling will be " "disabled to prevent resource exhaustion.\n", - ROCPROFSYS_MAX_THREADS); + static_cast(ROCPROFSYS_MAX_THREADS)); }); return m_routine(m_arg); } From bc4bfcaee1749edf608318ab98a7348686787504 Mon Sep 17 00:00:00 2001 From: Anuj Shukla Date: Tue, 6 Jan 2026 15:36:09 -0500 Subject: [PATCH 7/7] Add changelog entry for PR #2172 (thread limit improvements) --- projects/rocprofiler-systems/CHANGELOG.md | 18 ++++++++++++++++++ projects/rocprofiler-systems/CMakeLists.txt | 8 +------- projects/rocprofiler-systems/CMakePresets.json | 2 +- .../docs/reference/development-guide.rst | 8 +++----- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/projects/rocprofiler-systems/CHANGELOG.md b/projects/rocprofiler-systems/CHANGELOG.md index 7cdbb61f5b0..5697d258acb 100644 --- a/projects/rocprofiler-systems/CHANGELOG.md +++ b/projects/rocprofiler-systems/CHANGELOG.md @@ -4,6 +4,24 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/](https://rocm.docs.amd.com/projects/rocprofiler-systems/en/latest/). +## ROCm Systems Profiler 1.4.0 for ROCm x.y.z (unreleased) + +### Added + + +### Changed + + +### Removed + + +### Deprecated + + +### Resolved issues + +- Improved thread limit configuration and graceful handling when exceeding `ROCPROFSYS_MAX_THREADS` to prevent segfaults. + ## ROCm Systems Profiler 1.3.0 for ROCm 7.2.0 ### Added diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index f6e451fb641..ba806effc85 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -308,13 +308,7 @@ elseif(NOT ROCPROFSYS_MAX_THREADS EQUAL _MAX_THREADS) "ROCPROFSYS_MAX_THREADS (=${ROCPROFSYS_MAX_THREADS}) must be a power of 2. We were unable to verify it so we are emitting this warning instead. 
Estimate resulted in: ${_MAX_THREADS}" ) endif() -if(ROCPROFSYS_MAX_THREADS LESS 128) - rocprofiler_systems_message( - AUTHOR_WARNING - "ROCPROFSYS_MAX_THREADS (=${ROCPROFSYS_MAX_THREADS}) is less than 128 which may be too low for some applications. Setting it to at least 128." - ) - set(ROCPROFSYS_MAX_THREADS 128) -endif() + set(ROCPROFSYS_MAX_UNWIND_DEPTH "64" CACHE STRING diff --git a/projects/rocprofiler-systems/CMakePresets.json b/projects/rocprofiler-systems/CMakePresets.json index 7bb4579e195..8150973ae93 100644 --- a/projects/rocprofiler-systems/CMakePresets.json +++ b/projects/rocprofiler-systems/CMakePresets.json @@ -15,7 +15,7 @@ "ROCPROFSYS_BUILD_LIBIBERTY": "ON", "ROCPROFSYS_BUILD_TBB": "ON", "ROCPROFSYS_BUILD_TESTING": "ON", - "ROCPROFSYS_MAX_THREADS": "128", + "ROCPROFSYS_MAX_THREADS": "64", "ROCPROFSYS_STRIP_LIBRARIES": "OFF", "ROCPROFSYS_USE_PYTHON": "ON", "ROCPROFSYS_USE_ROCM": "ON" diff --git a/projects/rocprofiler-systems/docs/reference/development-guide.rst b/projects/rocprofiler-systems/docs/reference/development-guide.rst index 56a2ceb00cb..3b3ebc84ff2 100644 --- a/projects/rocprofiler-systems/docs/reference/development-guide.rst +++ b/projects/rocprofiler-systems/docs/reference/development-guide.rst @@ -327,8 +327,7 @@ Thread-data class Currently, most thread data is effectively stored in a static ``std::array, ROCPROFSYS_MAX_THREADS>`` instance. -``ROCPROFSYS_MAX_THREADS`` is a value defined at compile-time (default: ``max(128, pow2_ceil(16 * nproc))``) -for release builds. During finalization, +``ROCPROFSYS_MAX_THREADS`` is a value defined at compile-time for release builds. During finalization, ROCm Systems Profiler iterates through the thread-data and transforms that data into something that can be passed along to Perfetto and/or Timemory. 
In the current model, if the user exceeds ``ROCPROFSYS_MAX_THREADS`` at runtime, @@ -343,7 +342,7 @@ Configuring thread limits ROCm Systems Profiler uses a single CMake configuration option to control thread-related memory allocation: -* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported (default: ``max(128, pow2_ceil(16 * nproc))``; must be a power of 2, values < ``128`` are raised to ``128``) +* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported (default if not explicitly set: ``128`` if nproc < 8, otherwise ``pow2_ceil(16 * nproc)``; must be a power of 2) This setting controls: @@ -357,9 +356,8 @@ CMake enforces that ``ROCPROFSYS_MAX_THREADS`` must be a power of 2: .. code-block:: cmake - # Valid: 128, 256, 512, 1024, 2048, 4096, 8192, 16384, ... + # Valid: 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, ... any power of 2 # Invalid: 100, 3000, 5000, 10000, ... (FATAL_ERROR) - # Values < 128 will be automatically set to 128 with a warning **Example: Building with custom thread limit** .. code-block:: shell