diff --git a/projects/rocprofiler-systems/CHANGELOG.md b/projects/rocprofiler-systems/CHANGELOG.md index 9938a6b49ed..a8396c66ffb 100644 --- a/projects/rocprofiler-systems/CHANGELOG.md +++ b/projects/rocprofiler-systems/CHANGELOG.md @@ -17,6 +17,11 @@ Full documentation for ROCm Systems Profiler is available at [https://rocm.docs. - By default, tracing uses deferred trace generation (cached mode) for improved performance and minimal runtime overhead. - `--trace` / `-T` CLI flag enables tracing with cached mode by default. - `--trace-legacy` / `-L` CLI flag enables legacy direct mode for tracing. +- Changed thread storage allocation from a hard-coded 4096-element array to a size computed at compile time from the `ROCPROFSYS_MAX_THREADS` configuration option. + +### Resolved issues + +- Fixed a segmentation fault that terminated the application when the number of created threads exceeded the configured `ROCPROFSYS_MAX_THREADS` limit. ### Removed diff --git a/projects/rocprofiler-systems/CMakeLists.txt b/projects/rocprofiler-systems/CMakeLists.txt index f4a9d30eb9f..ba806effc85 100644 --- a/projects/rocprofiler-systems/CMakeLists.txt +++ b/projects/rocprofiler-systems/CMakeLists.txt @@ -275,9 +275,13 @@ else() math(EXPR ROCPROFSYS_THREAD_COUNT "16 * ${ROCPROFSYS_PROCESSOR_COUNT}") compute_pow2_ceil(ROCPROFSYS_THREAD_COUNT "16 * ${ROCPROFSYS_PROCESSOR_COUNT}") - # set the default to 2048 if it could not be calculated + # Fatal error if pow2 calculation failed (e.g., Python3 not found) if(ROCPROFSYS_THREAD_COUNT LESS 2) - set(ROCPROFSYS_THREAD_COUNT 2048) + rocprofiler_systems_message( + FATAL_ERROR + "Failed to compute power of 2 ceiling for ROCPROFSYS_THREAD_COUNT. " + "Ensure dependency is available. 
Processor count: ${ROCPROFSYS_PROCESSOR_COUNT}" + ) endif() endif() diff --git a/projects/rocprofiler-systems/docs/reference/development-guide.rst b/projects/rocprofiler-systems/docs/reference/development-guide.rst index f1470ef4146..3b3ebc84ff2 100644 --- a/projects/rocprofiler-systems/docs/reference/development-guide.rst +++ b/projects/rocprofiler-systems/docs/reference/development-guide.rst @@ -327,14 +327,46 @@ Thread-data class Currently, most thread data is effectively stored in a static ``std::array<std::unique_ptr<T>, ROCPROFSYS_MAX_THREADS>`` instance. -``ROCPROFSYS_MAX_THREADS`` is a value defined a compile-time and set to ``2048`` -for release builds. During finalization, +``ROCPROFSYS_MAX_THREADS`` is a value defined at compile-time for release builds. During finalization, ROCm Systems Profiler iterates through the thread-data and transforms that data into something that can be passed along to Perfetto and/or Timemory. -The downside of the current model is that if the user exceeds ``ROCPROFSYS_MAX_THREADS``, -a segmentation fault occurs. To fix this issue, -a new model is being adopted which has all the benefits of this model -but permits dynamic expansion. +In the current model, if the user exceeds ``ROCPROFSYS_MAX_THREADS`` at runtime, +a warning message is printed once, the excess threads still run but operate with a thread-local fallback, +and profiling is skipped for threads beyond ``ROCPROFSYS_MAX_THREADS``, so no data for them is written to the output files. +To support truly dynamic thread limits without compile-time constraints, a new model is being adopted which +has all the benefits of static allocation but permits dynamic expansion beyond ``ROCPROFSYS_MAX_THREADS``. +Currently, the thread limit can be increased at compile-time using the ``ROCPROFSYS_MAX_THREADS`` CMake configuration option. 
+ +Configuring thread limits +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +ROCm Systems Profiler uses a single CMake configuration option to control thread-related memory allocation: + +* ``ROCPROFSYS_MAX_THREADS``: Maximum number of threads supported (default if not explicitly set: ``128`` if nproc < 8, otherwise ``pow2_ceil(16 * nproc)``; must be a power of 2) + +This setting controls: + +* Thread ID manager capacity (maximum thread IDs that can be tracked) +* Storage array sizes for thread-local data across the codebase +* Timemory's internal thread storage (``TIMEMORY_MAX_THREADS``) + +**Build-time validation:** + +CMake requires ``ROCPROFSYS_MAX_THREADS`` to be a power of 2: + +.. code-block:: cmake + + # Valid: 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, ... any power of 2 + # Invalid: 100, 3000, 5000, 10000, ... (FATAL_ERROR) +**Example: Building with custom thread limit** + +.. code-block:: shell + + # Build with support for 8192 threads + cmake -B build \ + -DROCPROFSYS_MAX_THREADS=8192 \ + -S . + cmake --build build Sampling model ======================================== diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp index 783f862593f..9fd283a13f6 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/pthread_create_gotcha.cpp @@ -64,9 +64,6 @@ namespace component { using bundle_t = tim::lightweight_tuple; using category_region_t = tim::lightweight_tuple>; -// The maximum limit for the number of threads is set at 4096. declared and stored in the -// set_storage struct's `types.hpp` file. 
-constexpr size_t allowed_max_threads = 4096; namespace { @@ -187,7 +184,7 @@ pthread_create_gotcha::wrapper::operator()() const const auto& _parent_info = thread_info::get(m_config.parent_tid, InternalTID); const auto& _info = thread_info::init(m_config.offset); auto _sequent_value = _info->index_data ? _info->index_data->sequent_value : -1; - if(static_cast(_sequent_value) >= allowed_max_threads) + if(static_cast(_sequent_value) >= ROCPROFSYS_MAX_THREADS) { static std::once_flag thread_limit_warning_flag; std::call_once(thread_limit_warning_flag, []() { @@ -196,7 +193,7 @@ pthread_create_gotcha::wrapper::operator()() const "[rocprof-sys][WARNING] Maximum allowed thread limit (%zu) " "reached. Further thread creation and profiling will be " "disabled to prevent resource exhaustion.\n", - allowed_max_threads); + static_cast(ROCPROFSYS_MAX_THREADS)); }); return m_routine(m_arg); } diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake index b81f9f86f6f..41e0b5b774a 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-thread-limit-tests.cmake @@ -40,21 +40,18 @@ set(_thread_limit_environment "ROCPROFSYS_TIMEMORY_COMPONENTS=wall_clock,peak_rss,page_rss" ) -# Maximum allowed threads -set(ALLOWED_MAX_THREADS 4096) +math(EXPR THREAD_VAL_1 "${ROCPROFSYS_MAX_THREADS} - 1") +math(EXPR THREAD_VAL_2 "${ROCPROFSYS_MAX_THREADS} + 24") -math(EXPR THREAD_VAL_1 "${ROCPROFSYS_MAX_THREADS} + 24") -math(EXPR THREAD_VAL_2 "${ALLOWED_MAX_THREADS} + 1") - -set(THREAD_VALUES ${THREAD_VAL_1} ${THREAD_VAL_2}) +set(THREAD_VALUES ${THREAD_VAL_1} ${THREAD_VAL_2} ${ROCPROFSYS_MAX_THREADS}) # Loop over thread values foreach(THREADS IN LISTS THREAD_VALUES) set(THREAD_PASS_VALUE ${THREADS}) math(EXPR THREAD_FAIL_VALUE "${THREADS} + 1") - if(${THREADS} GREATER_EQUAL ${ALLOWED_MAX_THREADS}) - math(EXPR 
THREAD_PASS_VALUE "${ALLOWED_MAX_THREADS} - 1") - math(EXPR THREAD_FAIL_VALUE "${THREADS}") + if(${THREADS} GREATER_EQUAL ${ROCPROFSYS_MAX_THREADS}) + math(EXPR THREAD_PASS_VALUE "${ROCPROFSYS_MAX_THREADS} - 1") + math(EXPR THREAD_FAIL_VALUE "${ROCPROFSYS_MAX_THREADS} + 1") endif() set(_thread_limit_pass_regex "\\|${THREAD_PASS_VALUE}>>>") @@ -72,9 +69,9 @@ foreach(THREADS IN LISTS THREAD_VALUES) REWRITE_ARGS -e -v 2 -i 1024 --label return args RUNTIME_ARGS -e -v 1 -i 1024 --label return args RUN_ARGS 35 2 ${THREADS} - SAMPLING_TIMEOUT 180 - REWRITE_TIMEOUT 180 - RUNTIME_TIMEOUT 360 + SAMPLING_TIMEOUT 480 + REWRITE_TIMEOUT 480 + RUNTIME_TIMEOUT 480 RUNTIME_PASS_REGEX "${_thread_limit_pass_regex}" SAMPLING_PASS_REGEX "${_thread_limit_pass_regex}" REWRITE_RUN_PASS_REGEX "${_thread_limit_pass_regex}"