From 3613597f67b19d3719161a344eb3fd8d89200695 Mon Sep 17 00:00:00 2001 From: ks734 Date: Tue, 5 May 2026 10:35:19 +0000 Subject: [PATCH 01/10] RDKEMW-16955: Log firebolt state of the app in OOMCrash plugin --- bundle/lib/source/DobbySpecConfig.cpp | 9 +- bundle/runtime-schemas/defs-plugins.json | 3 - rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 214 ++++++++++++++---- rdkPlugins/OOMCrash/source/OOMCrashPlugin.h | 1 + 4 files changed, 182 insertions(+), 45 deletions(-) diff --git a/bundle/lib/source/DobbySpecConfig.cpp b/bundle/lib/source/DobbySpecConfig.cpp index 5d85f526b..8b35bcd42 100644 --- a/bundle/lib/source/DobbySpecConfig.cpp +++ b/bundle/lib/source/DobbySpecConfig.cpp @@ -635,7 +635,7 @@ bool DobbySpecConfig::parseSpec(ctemplate::TemplateDictionary* dictionary, // step 6 - enable the RDK plugins section dictionary->ShowSection(ENABLE_RDK_PLUGINS); - // step 6.5 - add any default plugins in the settings file + // step 6.1 - add any default plugins in the settings file Json::Value rdkPluginData = mRdkPluginsData; for (const auto& pluginName : mDefaultPlugins) { @@ -643,6 +643,13 @@ bool DobbySpecConfig::parseSpec(ctemplate::TemplateDictionary* dictionary, mRdkPluginsJson[pluginName]["required"] = false; } + + // step 6.2 - always enable the OOMCrash plugin (unless already configured) + if (!mRdkPluginsJson.isMember("oomcrash")) + { + mRdkPluginsJson["oomcrash"]["data"] = Json::Value(Json::objectValue); + mRdkPluginsJson["oomcrash"]["required"] = false; + } // step 7 - process RDK plugins json into dictionary if (!processRdkPlugins(mSpec["rdkPlugins"], mDictionary)) { diff --git a/bundle/runtime-schemas/defs-plugins.json b/bundle/runtime-schemas/defs-plugins.json index ac029e737..9c4aa1c40 100644 --- a/bundle/runtime-schemas/defs-plugins.json +++ b/bundle/runtime-schemas/defs-plugins.json @@ -651,9 +651,6 @@ }, "data": { "type": "object", - "required": [ - "path" - ], "properties": { "path": { "type": "string" diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 7eaa84f9a..030699094 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -18,6 +18,11 @@ */ #include "OOMCrashPlugin.h" + +#include + +#define FIREBOLT_STATE "fireboltState" +#define FIREBOLT_STATE_PREV "fireboltState_prev" /** * Need to do this at the start of every plugin to make sure the correct * C methods are visible to allow PluginLauncher to find the plugin @@ -70,11 +75,12 @@ bool OOMCrash::postInstallation() return false; } - const std::string path = mContainerConfig->rdk_plugins->oomcrash->data->path; + const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path; + const std::string path = pathPtr ? pathPtr : ""; if (path.empty()) { - AI_LOG_ERROR("OOMCrash path is empty"); - return false; + AI_LOG_INFO("OOMCrash path not configured, skipping mount setup for container '%s'", mUtils->getContainerId().c_str()); + return true; } if (!mUtils->mkdirRecursive((mRootfsPath + path).c_str(), 0755) && errno != EEXIST) @@ -112,23 +118,17 @@ bool OOMCrash::postHalt() return false; } - bool oomDetected = false; - if (mUtils->exitStatus != 0) - oomDetected = checkForOOM(); + bool oomDetected = checkForOOM(); + + const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path; + const std::string path = pathPtr ? pathPtr : ""; - if (oomDetected) + if (oomDetected && !path.empty()) createFileForOOM(); // Remove the crashFile if container exits normally or if no OOM detected - if (mUtils->exitStatus == 0 || !oomDetected) + if (!path.empty() && (mUtils->exitStatus == 0 || !oomDetected)) { - std::string path = mContainerConfig->rdk_plugins->oomcrash->data->path; - if (path.empty()) - { - AI_LOG_ERROR("OOMCrash path is empty"); - return false; - } - std::string crashFile = path + "/oom_crashed_" + mUtils->getContainerId() + ".txt"; if (remove(crashFile.c_str()) != 0) { @@ -173,68 +173,200 @@ std::vector OOMCrash::getDependencies() const } /** - * @brief Read cgroup file. + * @brief Read the oom_kill counter from the cgroup memory.oom_control file. + * + * The memory.oom_control file contains multiple key-value lines, e.g.: * - * @param[out] val gives the number of times that the cgroup limit was exceeded. + * Kernel >= 4.13: + * oom_kill_disable 0 + * under_oom 0 + * oom_kill 1 * - * @return true on successfully reading from the file. + * Kernel < 4.13: + * oom_kill_disable 0 + * under_oom 0 + * + * On older kernels the 'oom_kill' counter does not exist, so we fall back + * to the 'under_oom' flag which is 1 while the cgroup is in OOM state. + * + * @param[out] val Set to the value of the 'oom_kill' field (or 'under_oom' + * on older kernels) on success. + * + * @return true on successfully reading and parsing the field. */ bool OOMCrash::readCgroup(unsigned long *val) { - std::string path = "/sys/fs/cgroup/memory/" + mUtils->getContainerId() + "/memory.failcnt"; + std::string path = "/sys/fs/cgroup/memory/" + mUtils->getContainerId() + "/memory.oom_control"; FILE *fp = fopen(path.c_str(), "r"); if (!fp) { - if (errno != ENOENT) - AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno)); - + AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno)); return false; } char* line = nullptr; size_t len = 0; ssize_t rd; + bool foundOomKill = false; + unsigned long underOom = 0; + bool foundUnderOom = false; - if ((rd = getline(&line, &len, fp)) < 0) + while ((rd = getline(&line, &len, fp)) > 0) { - if (line) - free(line); - fclose(fp); - AI_LOG_ERROR("failed to read cgroup file line (%d - %s)", errno, strerror(errno)); - return false; + unsigned long v; + // sscanf won't match "oom_kill_disable" because the space in the + // format requires whitespace where "_disable" has an underscore. + if (sscanf(line, "oom_kill %lu", &v) == 1) + { + *val = v; + foundOomKill = true; + break; + } + if (sscanf(line, "under_oom %lu", &v) == 1) + { + underOom = v; + foundUnderOom = true; + } } - *val = strtoul(line, nullptr, 0); - + if (line) + free(line); fclose(fp); - free(line); - return true; + // Prefer oom_kill (kernel >= 4.13); fall back to under_oom for older kernels + if (foundOomKill) + return true; + + if (foundUnderOom) + { + AI_LOG_INFO("'oom_kill' field not present (kernel < 4.13), using 'under_oom' fallback"); + *val = underOom; + return true; + } + + AI_LOG_ERROR("neither 'oom_kill' nor 'under_oom' found in '%s'", path.c_str()); + return false; +} + +/** + * @brief Check if memory (or memory+swap) max usage reached the configured + * limit, indicating the container hit its memory ceiling. + * + * This is used as a fallback OOM indicator on older kernels (< 4.13) where + * the oom_kill counter does not exist and under_oom is transient. + * memory.max_usage_in_bytes is the high-water mark and persists until the + * cgroup is destroyed. + * + * @return true if max usage >= limit for memory or memory+swap. + */ +bool OOMCrash::isMemoryAtLimit() +{ + std::string basePath = "/sys/fs/cgroup/memory/" + mUtils->getContainerId(); + + const char *pairs[][2] = { + { "/memory.max_usage_in_bytes", "/memory.limit_in_bytes" }, + { "/memory.memsw.max_usage_in_bytes", "/memory.memsw.limit_in_bytes" }, + }; + + for (const auto &pair : pairs) + { + unsigned long maxUsage = 0, limit = 0; + std::string maxPath = basePath + pair[0]; + std::string limPath = basePath + pair[1]; + + FILE *fpMax = fopen(maxPath.c_str(), "r"); + FILE *fpLim = fopen(limPath.c_str(), "r"); + + bool ok = (fpMax && fpLim && + fscanf(fpMax, "%lu", &maxUsage) == 1 && + fscanf(fpLim, "%lu", &limit) == 1); + + if (fpMax) fclose(fpMax); + if (fpLim) fclose(fpLim); + + if (ok && limit > 0 && maxUsage >= limit) + { + AI_LOG_INFO("%s=%lu reached %s=%lu", pair[0]+1, maxUsage, pair[1]+1, limit); + return true; + } + } + + return false; } /** - * @brief Check for Out of Memory by reading cgroup file. + * @brief Check for Out of Memory by reading cgroup files. + * + * Detection priority: + * 1. oom_kill > 0 (kernel >= 4.13, definitive) + * 2. under_oom > 0 (kernel < 4.13, transient flag) + * 3. max_usage_in_bytes >= limit (all kernels, persistent high-water mark) * * @return true if OOM detected. */ bool OOMCrash::checkForOOM() { - unsigned long failCnt; - bool status; - if (readCgroup(&failCnt) && (failCnt > 0)) + unsigned long oomKill = 0; + bool cgroupRead = readCgroup(&oomKill); + + // Priority 1 & 2: oom_kill or under_oom confirmed OOM + if (cgroupRead && oomKill > 0) { - AI_LOG_WARN("memory allocation failure detected in %s container, likely OOM (failcnt = %lu)", mUtils->getContainerId().c_str(), failCnt); - status = true; + AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", + oomKill, mUtils->getContainerId().c_str()); + } + // Priority 3: on kernel < 4.13 under_oom may have cleared — check max_usage + else if (isMemoryAtLimit()) + { + AI_LOG_WARN("oom_control did not confirm OOM but max memory usage reached limit " + "for container '%s'", mUtils->getContainerId().c_str()); } else { - AI_LOG_WARN("No OOM failure detected in %s container", mUtils->getContainerId().c_str()); - status = false; + AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); + return false; } - return status; + + // OOM kill confirmed - retrieve firebolt state from annotations. + // AppService often transitions the app to "background" after the OOM kill + // but before postHalt runs. Since the container exited abnormally, prefer + // the previous fireboltState value (which was the state at the time of the + // actual OOM kill) over the current value which may have been overwritten + // by a post-crash transition. + std::map annotations = mUtils->getAnnotations(); + std::string fireboltState; + + auto prevIt = annotations.find(FIREBOLT_STATE_PREV); + if (prevIt != annotations.end()) + { + fireboltState = prevIt->second; + AI_LOG_INFO("Using previous fireboltState '%s' (current may have been " + "set after OOM kill)", fireboltState.c_str()); + } + else + { + auto it = annotations.find(FIREBOLT_STATE); + if (it != annotations.end()) + { + fireboltState = it->second; + } + } + + if (!fireboltState.empty()) + { + AI_LOG_WARN("OOM kill detected: container '%s' fireboltState '%s'", + mUtils->getContainerId().c_str(), fireboltState.c_str()); + } + else + { + AI_LOG_WARN("OOM kill detected: container '%s' (firebolt state unknown)", + mUtils->getContainerId().c_str()); + } + + return true; } /** diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h index b43a16327..fd3ed42e1 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.h @@ -57,6 +57,7 @@ class OOMCrash : public RdkPluginBase private: bool readCgroup(unsigned long *val); + bool isMemoryAtLimit(); bool checkForOOM(); void createFileForOOM(); From 3e48ece78670305a0ea96c4c599c36f71d22f485 Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Fri, 19 Jun 2026 18:27:43 +0530 Subject: [PATCH 02/10] RDKEMW-16955: Log firebolt state of the app in OOMCrash plugin --- rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 47 +++++++++++++------ 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 035e628b1..ff2ec5438 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -118,7 +118,9 @@ bool OOMCrash::postHalt() return false; } - bool oomDetected = checkForOOM(); + bool oomDetected = false; + if (mUtils->exitStatus != 0) + oomDetected = checkForOOM(); const char *pathPtr = mContainerConfig->rdk_plugins->oomcrash->data->path; const std::string path = pathPtr ? pathPtr : ""; @@ -343,9 +345,13 @@ bool OOMCrash::isMemoryAtLimit() * @brief Check for Out of Memory by reading cgroup files. * * Detection priority: - * 1. oom_kill > 0 (kernel >= 4.13, definitive) - * 2. under_oom > 0 (kernel < 4.13, transient flag) - * 3. max_usage_in_bytes >= limit (all kernels, persistent high-water mark) + * 1. oom_kill > 0 — from memory.oom_control (v1) or memory.events (v2). + * Authoritative: if readCgroup() succeeds and returns 0, + * the kernel confirms no OOM kill occurred and detection + * stops here (isMemoryAtLimit() is NOT consulted). + * 2. isMemoryAtLimit() — only reached when readCgroup() itself failed (cgroup + * file unreadable). Compares max usage against the + * configured limit as a last-resort heuristic. * * @return true if OOM detected. */ @@ -355,17 +361,23 @@ bool OOMCrash::checkForOOM() unsigned long oomKill = 0; bool cgroupRead = readCgroup(&oomKill); - // Priority 1 & 2: oom_kill or under_oom confirmed OOM if (cgroupRead && oomKill > 0) { + // cgroup counter is authoritative — OOM kill confirmed AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", oomKill, mUtils->getContainerId().c_str()); } - // Priority 3: on kernel < 4.13 under_oom may have cleared — check max_usage + else if (cgroupRead) + { + // cgroup read succeeded and counter is 0 — kernel says no OOM kill + AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); + return false; + } else if (isMemoryAtLimit()) { - AI_LOG_WARN("oom_control did not confirm OOM but max memory usage reached limit " - "for container '%s'", mUtils->getContainerId().c_str()); + // cgroup file was unreadable — fall back to high-water-mark heuristic + AI_LOG_WARN("cgroup unreadable; max memory usage reached limit for container '%s'", + mUtils->getContainerId().c_str()); } else { @@ -374,11 +386,12 @@ bool OOMCrash::checkForOOM() } // OOM kill confirmed - retrieve firebolt state from annotations. - // AppService often transitions the app to "background" after the OOM kill - // but before postHalt runs. Since the container exited abnormally, prefer - // the previous fireboltState value (which was the state at the time of the - // actual OOM kill) over the current value which may have been overwritten - // by a post-crash transition. + // Both the previous (fireboltState_prev) and current (fireboltState) values + // are read. AppService often transitions the app (e.g. to "background") + // after the OOM kill but before postHalt runs, so the current value may + // reflect a post-crash transition rather than the state at the time of the + // kill. fireboltState_prev is therefore preferred for the final log; both + // are reported so the transition can be observed in the logs. std::map annotations = mUtils->getAnnotations(); std::string fireboltState; @@ -386,8 +399,12 @@ bool OOMCrash::checkForOOM() if (prevIt != annotations.end()) { fireboltState = prevIt->second; - AI_LOG_INFO("Using previous fireboltState '%s' (current may have been " - "set after OOM kill)", fireboltState.c_str()); + std::string currentState; + auto curIt = annotations.find(FIREBOLT_STATE); + if (curIt != annotations.end()) + currentState = curIt->second; + AI_LOG_INFO("Using previous fireboltState '%s' (current '%s' may have been " + "set after OOM kill)", fireboltState.c_str(), currentState.c_str()); } else { From 52ae6f8cdcff551057af92056cffd8a869b55907 Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Mon, 22 Jun 2026 14:33:05 +0530 Subject: [PATCH 03/10] RDKEMW-16955: Log firebolt state of the app in OOMCrash plugin --- rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 85 +++++++++++-------- 1 file changed, 50 insertions(+), 35 deletions(-) diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index ff2ec5438..5031625e3 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -175,32 +175,30 @@ std::vector OOMCrash::getDependencies() const } /** - * @brief Read cgroup file to detect OOM. + * @brief Read the cgroup OOM kill counter for the container. * - * On cgroups v1 reads memory.failcnt from - * /sys/fs/cgroup/memory//memory.failcnt. + * cgroups v1: parses memory.oom_control from + * /sys/fs/cgroup/memory//memory.oom_control * - * On cgroups v2 reads the oom_kill field from - * /sys/fs/cgroup//memory.events. + * Kernel >= 4.13 exposes an 'oom_kill' field — a monotonic count of + * processes killed by the OOM killer. This is the preferred value. * - * The memory.oom_control file contains multiple key-value lines, e.g.: + * Kernel < 4.13 does not have 'oom_kill'; 'under_oom' is 1 while the + * cgroup is actively under OOM pressure (transient — may clear before + * postHalt runs; isMemoryAtLimit() is then used as a final fallback). * - * Kernel >= 4.13: - * oom_kill_disable 0 - * under_oom 0 - * oom_kill 1 + * Note: memory.failcnt counts allocation failures, NOT OOM kills. + * With swap available the kernel may swap rather than kill, and the + * counter resets to 0 on cgroup recreation, making it unreliable. * - * Kernel < 4.13: - * oom_kill_disable 0 - * under_oom 0 + * cgroups v2: reads the 'oom_kill' field from + * /sys/fs/cgroup//memory.events — a monotonic count of OOM kills. + * Falls back to the system.slice scope path if the direct path is absent. * - * On older kernels the 'oom_kill' counter does not exist, so we fall back - * to the 'under_oom' flag which is 1 while the cgroup is in OOM state. + * @param[out] val Set to the oom_kill counter (v1 kernel >= 4.13 or v2), + * or the under_oom flag (v1 kernel < 4.13) on success. * - * @param[out] val Set to the value of the 'oom_kill' field (or 'under_oom' - * on older kernels) on success. - * - * @return true on successfully reading and parsing the field. + * @return true on successfully reading and parsing the value. */ bool OOMCrash::readCgroup(unsigned long *val) @@ -221,7 +219,7 @@ bool OOMCrash::readCgroup(unsigned long *val) else { // On cgroups v1, use the legacy per-controller path - path = "/sys/fs/cgroup/memory/" + containerId + "/memory.failcnt"; + path = "/sys/fs/cgroup/memory/" + containerId + "/memory.oom_control"; } FILE *fp = fopen(path.c_str(), "r"); @@ -246,9 +244,6 @@ bool OOMCrash::readCgroup(unsigned long *val) char* line = nullptr; size_t len = 0; ssize_t rd; - bool foundOomKill = false; - unsigned long underOom = 0; - bool foundUnderOom = false; if (isCgroupV2) { @@ -276,22 +271,42 @@ bool OOMCrash::readCgroup(unsigned long *val) } else { - // v1: single numeric value in memory.failcnt - if ((rd = getline(&line, &len, fp)) < 0) + // v1: parse key-value memory.oom_control. + // Prefer 'oom_kill' (kernel >= 4.13, monotonic); fall back to + // 'under_oom' (kernel < 4.13, transient — 1 while OOM is active). + unsigned long oomKill = 0, underOom = 0; + bool foundOomKill = false, foundUnderOom = false; + + while ((rd = getline(&line, &len, fp)) > 0) { - if (line) - free(line); - fclose(fp); - AI_LOG_ERROR("failed to read cgroup file line (%d - %s)", errno, strerror(errno)); - return false; + unsigned long v; + if (sscanf(line, "oom_kill %lu", &v) == 1) + { + oomKill = v; + foundOomKill = true; + } + else if (sscanf(line, "under_oom %lu", &v) == 1) + { + underOom = v; + foundUnderOom = true; + } } - - *val = strtoul(line, nullptr, 0); - - fclose(fp); free(line); + fclose(fp); - return true; + if (foundOomKill) + { + *val = oomKill; + return true; + } + if (foundUnderOom) + { + AI_LOG_INFO("'oom_kill' not present (kernel < 4.13), using 'under_oom' value"); + *val = underOom; + return true; + } + AI_LOG_ERROR("neither 'oom_kill' nor 'under_oom' found in '%s'", path.c_str()); + return false; } } From 248ca33b2d64c79d6ed05d64f9e2e6884c19fb4e Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Tue, 23 Jun 2026 12:47:15 +0530 Subject: [PATCH 04/10] RDKEMW-16955: Debug-Log firebolt state of the app in OOMCrash plugin --- rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 83 ++++++++++++++++--- .../test_runner/bundle_generation.py | 2 + 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 5031625e3..250e6b087 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -19,7 +19,10 @@ #include "OOMCrashPlugin.h" +#include +#include #include +#include #define FIREBOLT_STATE "fireboltState" #define FIREBOLT_STATE_PREV "fireboltState_prev" @@ -234,7 +237,11 @@ bool OOMCrash::readCgroup(unsigned long *val) if (!fp) { - if (errno != ENOENT) + if (errno == ENOENT) + AI_LOG_WARN("cgroup file '%s' not found — cgroup may have been " + "destroyed before postHalt (race condition)", + path.c_str()); + else AI_LOG_ERROR("failed to open '%s' (%d - %s)", path.c_str(), errno, strerror(errno)); return false; @@ -299,12 +306,20 @@ bool OOMCrash::readCgroup(unsigned long *val) *val = oomKill; return true; } - if (foundUnderOom) + if (foundUnderOom && underOom > 0) { - AI_LOG_INFO("'oom_kill' not present (kernel < 4.13), using 'under_oom' value"); + AI_LOG_INFO("'oom_kill' not present (kernel < 4.13), 'under_oom' is active"); *val = underOom; return true; } + if (foundUnderOom) + { + // under_oom is 0 — OOM pressure may have cleared before postHalt. + // Return false so checkForOOM() falls through to isMemoryAtLimit(). + AI_LOG_INFO("'oom_kill' not present (kernel < 4.13), 'under_oom' is 0 " + "(transient flag may have cleared); deferring to memory limit check"); + return false; + } AI_LOG_ERROR("neither 'oom_kill' nor 'under_oom' found in '%s'", path.c_str()); return false; } @@ -361,12 +376,17 @@ bool OOMCrash::isMemoryAtLimit() * * Detection priority: * 1. oom_kill > 0 — from memory.oom_control (v1) or memory.events (v2). - * Authoritative: if readCgroup() succeeds and returns 0, - * the kernel confirms no OOM kill occurred and detection - * stops here (isMemoryAtLimit() is NOT consulted). - * 2. isMemoryAtLimit() — only reached when readCgroup() itself failed (cgroup - * file unreadable). Compares max usage against the - * configured limit as a last-resort heuristic. + * Authoritative: kernel OOM kill confirmed. + * 2. oom_kill == 0 but isMemoryAtLimit() — "soft OOM". When swap is + * available (memsw.limit > mem.limit), the kernel swaps + * rather than invoking the OOM killer. DobbyInit detects + * allocation failures (failcnt) and kills the app with + * SIGTERM. oom_kill stays 0 but max_usage hitting the + * limit confirms memory pressure caused the death. + * 3. oom_kill == 0 and memory not at limit — no OOM, normal termination. + * 4. readCgroup() failed + isMemoryAtLimit() — cgroup files unreadable + * (under_oom=0 on kernel < 4.13, or ENOENT race). + * Falls back to high-water-mark heuristic. * * @return true if OOM detected. */ @@ -376,15 +396,52 @@ bool OOMCrash::checkForOOM() unsigned long oomKill = 0; bool cgroupRead = readCgroup(&oomKill); + // Helper: append a log line to /tmp/oomcrash_.log using direct file I/O. + // sd_journal_send() from the forked child is unreliable under memory pressure + // (and stderr routes through the same journald socket on systemd services). + // Direct file writes use page cache and are far more resilient. + const std::string logFile = "/tmp/oomcrash_" + mUtils->getContainerId() + ".log"; + auto fileLog = [&logFile](const char *fmt, ...) + { + char buf[256]; + va_list ap; + va_start(ap, fmt); + int n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + if (n > 0) + { + int fd = open(logFile.c_str(), O_WRONLY | O_CREAT | O_APPEND, 0644); + if (fd >= 0) + { + (void)write(fd, buf, std::min(n, (int)sizeof(buf) - 1)); + close(fd); + } + } + }; + if (cgroupRead && oomKill > 0) { // cgroup counter is authoritative — OOM kill confirmed AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", oomKill, mUtils->getContainerId().c_str()); + fileLog("OOMCrash: oom_control reports OOM (value=%lu) for container '%s'\n", + oomKill, mUtils->getContainerId().c_str()); + } + else if (cgroupRead && isMemoryAtLimit()) + { + // oom_kill is 0 (kernel OOM killer didn't fire) but memory usage hit + // the limit. This happens when swap is available: the kernel swaps + // rather than killing, failcnt increments, and DobbyInit terminates + // the app with SIGTERM. Treat as a "soft OOM". + AI_LOG_WARN("oom_kill=0 but max memory usage reached limit for container '%s' " + "(soft OOM — killed by init due to memory pressure, not by kernel OOM killer)", + mUtils->getContainerId().c_str()); + fileLog("OOMCrash: soft OOM for container '%s' (oom_kill=0, memory at limit)\n", + mUtils->getContainerId().c_str()); } else if (cgroupRead) { - // cgroup read succeeded and counter is 0 — kernel says no OOM kill + // cgroup read succeeded, counter is 0, and memory didn't hit limit AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); return false; } @@ -393,6 +450,8 @@ bool OOMCrash::checkForOOM() // cgroup file was unreadable — fall back to high-water-mark heuristic AI_LOG_WARN("cgroup unreadable; max memory usage reached limit for container '%s'", mUtils->getContainerId().c_str()); + fileLog("OOMCrash: cgroup unreadable, memory at limit for container '%s'\n", + mUtils->getContainerId().c_str()); } else { @@ -434,11 +493,15 @@ bool OOMCrash::checkForOOM() { AI_LOG_WARN("OOM kill detected: container '%s' fireboltState '%s'", mUtils->getContainerId().c_str(), fireboltState.c_str()); + fileLog("OOMCrash: container '%s' fireboltState '%s'\n", + mUtils->getContainerId().c_str(), fireboltState.c_str()); } else { AI_LOG_WARN("OOM kill detected: container '%s' (firebolt state unknown)", mUtils->getContainerId().c_str()); + fileLog("OOMCrash: container '%s' (firebolt state unknown)\n", + mUtils->getContainerId().c_str()); } return true; diff --git a/tests/L2_testing/test_runner/bundle_generation.py b/tests/L2_testing/test_runner/bundle_generation.py index b0a480db7..6205db849 100755 --- a/tests/L2_testing/test_runner/bundle_generation.py +++ b/tests/L2_testing/test_runner/bundle_generation.py @@ -85,8 +85,10 @@ def _normalise_config(config): mount["options"] = [opt for opt in mount["options"] if not str(opt).startswith("size=")] # Networking plugin can be auto-disabled depending on environment + # OOMCrash plugin is auto-injected by DobbySpecConfig when not present if isinstance(cfg.get("rdkPlugins"), dict): cfg["rdkPlugins"].pop("networking", None) + cfg["rdkPlugins"].pop("oomcrash", None) return cfg From 64d370bcfe43d808bd48bbca8d86cf22709be17d Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Wed, 24 Jun 2026 17:59:20 +0530 Subject: [PATCH 05/10] RDKEMW-16955: Debug-Log firebolt state of the app in OOMCrash plugin --- rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 54 +++++++------------ 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 250e6b087..96a997441 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -19,10 +19,8 @@ #include "OOMCrashPlugin.h" -#include -#include #include -#include +#include #define FIREBOLT_STATE "fireboltState" #define FIREBOLT_STATE_PREV "fireboltState_prev" @@ -396,36 +394,19 @@ bool OOMCrash::checkForOOM() unsigned long oomKill = 0; bool cgroupRead = readCgroup(&oomKill); - // Helper: append a log line to /tmp/oomcrash_.log using direct file I/O. - // sd_journal_send() from the forked child is unreliable under memory pressure - // (and stderr routes through the same journald socket on systemd services). - // Direct file writes use page cache and are far more resilient. - const std::string logFile = "/tmp/oomcrash_" + mUtils->getContainerId() + ".log"; - auto fileLog = [&logFile](const char *fmt, ...) - { - char buf[256]; - va_list ap; - va_start(ap, fmt); - int n = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - if (n > 0) - { - int fd = open(logFile.c_str(), O_WRONLY | O_CREAT | O_APPEND, 0644); - if (fd >= 0) - { - (void)write(fd, buf, std::min(n, (int)sizeof(buf) - 1)); - close(fd); - } - } - }; + // Open syslog connection immediately (LOG_NDELAY) so the socket credentials + // are captured while this child PID still exists. Unlike sd_journal_send() + // (used by AI_LOG), syslog() embeds the ident and PID in the message text + // itself, so journald can process the message even after the child exits. + openlog("DobbyDaemon", LOG_PID | LOG_NDELAY, LOG_DAEMON); if (cgroupRead && oomKill > 0) { // cgroup counter is authoritative — OOM kill confirmed AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", oomKill, mUtils->getContainerId().c_str()); - fileLog("OOMCrash: oom_control reports OOM (value=%lu) for container '%s'\n", - oomKill, mUtils->getContainerId().c_str()); + syslog(LOG_WARNING, "OOMCrash: oom_control reports OOM (value=%lu) for container '%s'", + oomKill, mUtils->getContainerId().c_str()); } else if (cgroupRead && isMemoryAtLimit()) { @@ -436,13 +417,14 @@ bool OOMCrash::checkForOOM() AI_LOG_WARN("oom_kill=0 but max memory usage reached limit for container '%s' " "(soft OOM — killed by init due to memory pressure, not by kernel OOM killer)", mUtils->getContainerId().c_str()); - fileLog("OOMCrash: soft OOM for container '%s' (oom_kill=0, memory at limit)\n", - mUtils->getContainerId().c_str()); + syslog(LOG_WARNING, "OOMCrash: soft OOM for container '%s' (oom_kill=0, memory at limit)", + mUtils->getContainerId().c_str()); } else if (cgroupRead) { // cgroup read succeeded, counter is 0, and memory didn't hit limit AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); + closelog(); return false; } else if (isMemoryAtLimit()) @@ -450,12 +432,13 @@ bool OOMCrash::checkForOOM() // cgroup file was unreadable — fall back to high-water-mark heuristic AI_LOG_WARN("cgroup unreadable; max memory usage reached limit for container '%s'", mUtils->getContainerId().c_str()); - fileLog("OOMCrash: cgroup unreadable, memory at limit for container '%s'\n", - mUtils->getContainerId().c_str()); + syslog(LOG_WARNING, "OOMCrash: cgroup unreadable, memory at limit for container '%s'", + mUtils->getContainerId().c_str()); } else { AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); + closelog(); return false; } @@ -493,17 +476,18 @@ bool OOMCrash::checkForOOM() { AI_LOG_WARN("OOM kill detected: container '%s' fireboltState '%s'", mUtils->getContainerId().c_str(), fireboltState.c_str()); - fileLog("OOMCrash: container '%s' fireboltState '%s'\n", - mUtils->getContainerId().c_str(), fireboltState.c_str()); + syslog(LOG_WARNING, "OOMCrash: container '%s' fireboltState '%s'", + mUtils->getContainerId().c_str(), fireboltState.c_str()); } else { AI_LOG_WARN("OOM kill detected: container '%s' (firebolt state unknown)", mUtils->getContainerId().c_str()); - fileLog("OOMCrash: container '%s' (firebolt state unknown)\n", - mUtils->getContainerId().c_str()); + syslog(LOG_WARNING, "OOMCrash: container '%s' (firebolt state unknown)", + mUtils->getContainerId().c_str()); } + closelog(); return true; } From 9572d444e7058365873e2e0567338a5f6a22f5ed Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Thu, 25 Jun 2026 18:39:23 +0530 Subject: [PATCH 06/10] RDKEMW-16955: Debug-Log firebolt state of the app in OOMCrash plugin --- rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 96a997441..e30a99546 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -20,7 +20,7 @@ #include "OOMCrashPlugin.h" #include -#include +#include #define FIREBOLT_STATE "fireboltState" #define FIREBOLT_STATE_PREV "fireboltState_prev" @@ -148,6 +148,16 @@ bool OOMCrash::postHalt() } AI_LOG_INFO("OOMCrash postHalt hook is running for container with hostname %s", mUtils->getContainerId().c_str()); + + // TEMPORARY DIAGNOSTIC: keep this forked child process alive briefly so + // journald has time to resolve /proc// and process the AI_LOG + // messages above before the child calls _exit(0). If the AI_LOG lines + // now appear reliably in the journal (whereas without this sleep they + // were intermittently dropped), it proves the log loss is a journald + // PID-resolution race against the short-lived child, not a logging bug. + // REMOVE before merging. + usleep(200000); // 200 ms + return true; } @@ -167,9 +177,12 @@ std::vector OOMCrash::getDependencies() const std::vector dependencies; const rt_defs_plugins_oom_crash* pluginConfig = mContainerConfig->rdk_plugins->oomcrash; - for (size_t i = 0; i < pluginConfig->depends_on_len; i++) + if (pluginConfig && pluginConfig->depends_on) { - dependencies.push_back(pluginConfig->depends_on[i]); + for (size_t i = 0; i < pluginConfig->depends_on_len; i++) + { + dependencies.push_back(pluginConfig->depends_on[i]); + } } return dependencies; @@ -394,19 +407,11 @@ bool OOMCrash::checkForOOM() unsigned long oomKill = 0; bool cgroupRead = readCgroup(&oomKill); - // Open syslog connection immediately (LOG_NDELAY) so the socket credentials - // are captured while this child PID still exists. Unlike sd_journal_send() - // (used by AI_LOG), syslog() embeds the ident and PID in the message text - // itself, so journald can process the message even after the child exits. - openlog("DobbyDaemon", LOG_PID | LOG_NDELAY, LOG_DAEMON); - if (cgroupRead && oomKill > 0) { // cgroup counter is authoritative — OOM kill confirmed AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", oomKill, mUtils->getContainerId().c_str()); - syslog(LOG_WARNING, "OOMCrash: oom_control reports OOM (value=%lu) for container '%s'", - oomKill, mUtils->getContainerId().c_str()); } else if (cgroupRead && isMemoryAtLimit()) { @@ -417,14 +422,11 @@ bool OOMCrash::checkForOOM() AI_LOG_WARN("oom_kill=0 but max memory usage reached limit for container '%s' " "(soft OOM — killed by init due to memory pressure, not by kernel OOM killer)", mUtils->getContainerId().c_str()); - syslog(LOG_WARNING, "OOMCrash: soft OOM for container '%s' (oom_kill=0, memory at limit)", - mUtils->getContainerId().c_str()); } else if (cgroupRead) { // cgroup read succeeded, counter is 0, and memory didn't hit limit AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); - closelog(); return false; } else if (isMemoryAtLimit()) @@ -432,13 +434,10 @@ bool OOMCrash::checkForOOM() // cgroup file was unreadable — fall back to high-water-mark heuristic AI_LOG_WARN("cgroup unreadable; max memory usage reached limit for container '%s'", mUtils->getContainerId().c_str()); - syslog(LOG_WARNING, "OOMCrash: cgroup unreadable, memory at limit for container '%s'", - mUtils->getContainerId().c_str()); } else { AI_LOG_INFO("No OOM kill detected in container '%s'", mUtils->getContainerId().c_str()); - closelog(); return false; } @@ -476,18 +475,13 @@ bool OOMCrash::checkForOOM() { AI_LOG_WARN("OOM kill detected: container '%s' fireboltState '%s'", mUtils->getContainerId().c_str(), fireboltState.c_str()); - syslog(LOG_WARNING, "OOMCrash: container '%s' fireboltState '%s'", - mUtils->getContainerId().c_str(), fireboltState.c_str()); } else { AI_LOG_WARN("OOM kill detected: container '%s' (firebolt state unknown)", mUtils->getContainerId().c_str()); - syslog(LOG_WARNING, "OOMCrash: container '%s' (firebolt state unknown)", - mUtils->getContainerId().c_str()); } - closelog(); return true; } From 29a31047235544181468998dc17795150f454203 Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Fri, 26 Jun 2026 10:49:30 +0530 Subject: [PATCH 07/10] RDKEMW-16955: Fix intermittent loss of plugin hook logs in journald --- .../lib/source/DobbyRdkPluginManager.cpp | 85 +++++++++++++++++++ rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 11 --- 2 files changed, 85 insertions(+), 11 deletions(-) diff --git a/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp b/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp index c9bbd727c..2910e4478 100644 --- a/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp +++ b/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp @@ -42,6 +42,84 @@ #include #include +#ifdef USE_SYSTEMD +#define SD_JOURNAL_SUPPRESS_LOCATION +#include +#include + +// ----------------------------------------------------------------------------- +/** + * @brief Redirect this process's AI_LOG output to a journald *stream* fd + * instead of per-message datagrams. + * + * Plugin hooks run inside a short-lived worker process forked by + * executeHookTimeout(). If that worker logs via sd_journal_send() (datagrams) + * and then _exit()s, journald may not have resolved /proc/ for message + * attribution before the pid disappears, so the messages get dropped (a + * journald PID-resolution race). A journald stream fd avoids this: journald + * captures the sender's identity via SO_PEERCRED at connect time - while we + * are still alive - so every line written is attributed correctly even after + * we exit, and the kernel preserves the buffered stream data until journald + * reads EOF. + * + * @return The stream fd on success (left open for the lifetime of the + * process), or -1 on failure (logging is left unchanged). + */ +static int redirectLoggingToJournaldStream() +{ + int journalFd = sd_journal_stream_fd("DobbyDaemon", LOG_INFO, 1 /* level_prefix */); + if (journalFd < 0) + { + // Leave the inherited logger in place - no worse than before + return -1; + } + + AICommon::initLogging( + [journalFd](int level, const char *file, const char *func, int line, const char *message) + { + int priority; + const char *levelStr; + switch (level) + { + case AI_DEBUG_LEVEL_FATAL: priority = LOG_CRIT; levelStr = "FTL: "; break; + case AI_DEBUG_LEVEL_ERROR: priority = LOG_ERR; levelStr = "ERR: "; break; + case AI_DEBUG_LEVEL_WARNING: priority = LOG_WARNING; levelStr = "WRN: "; break; + case AI_DEBUG_LEVEL_MILESTONE: priority = LOG_NOTICE; levelStr = "MIL: "; break; + case AI_DEBUG_LEVEL_INFO: priority = LOG_INFO; levelStr = "NFO: "; break; + case AI_DEBUG_LEVEL_DEBUG: priority = LOG_DEBUG; levelStr = "DBG: "; break; + default: priority = LOG_INFO; levelStr = ": "; break; + } + + // With level_prefix enabled, journald reads a leading "" on each + // line to set the message priority, then stores the remainder as + // the message text. Mirror the usual Dobby log format for the text. + char buf[512]; + int len; + if (!file || !func || (line <= 0)) + { + len = snprintf(buf, sizeof(buf), "<%d>%s%s\n", + priority, levelStr, message); + } + else + { + len = snprintf(buf, sizeof(buf), "<%d>%s< M:%.64s F:%.64s L:%d > %s\n", + priority, levelStr, file, func, line, message); + } + if (len < 0) + return; + if (len > static_cast(sizeof(buf))) + len = sizeof(buf); + + // write() copies into journald's socket buffer synchronously, so the + // data is safe even if we _exit() immediately afterwards. + ssize_t written = TEMP_FAILURE_RETRY(write(journalFd, buf, len)); + (void)written; + }); + + return journalFd; +} +#endif // USE_SYSTEMD + // ----------------------------------------------------------------------------- /** * @brief Create instance of DobbyRdkPlugin Manager and load all plugins that @@ -715,6 +793,13 @@ bool DobbyRdkPluginManager::executeHookTimeout(const std::string &pluginName, if (setsid() < 0) _exit(EXIT_FAILURE); +#ifdef USE_SYSTEMD + // This worker is short-lived: it runs the hook and then _exit()s below. + // Route its logs through a journald stream fd so they aren't dropped by + // the journald PID-resolution race (see redirectLoggingToJournaldStream). + redirectLoggingToJournaldStream(); +#endif + char result = static_cast(executeHook(pluginName, hook)); // Set result in shared memory *static_cast(sharedMemory) = result; diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index e30a99546..772df63d8 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -20,7 +20,6 @@ #include "OOMCrashPlugin.h" #include -#include #define FIREBOLT_STATE "fireboltState" #define FIREBOLT_STATE_PREV "fireboltState_prev" @@ -148,16 +147,6 @@ bool OOMCrash::postHalt() } AI_LOG_INFO("OOMCrash postHalt hook is running for container with hostname %s", mUtils->getContainerId().c_str()); - - // TEMPORARY DIAGNOSTIC: keep this forked child process alive briefly so - // journald has time to resolve /proc// and process the AI_LOG - // messages above before the child calls _exit(0). If the AI_LOG lines - // now appear reliably in the journal (whereas without this sleep they - // were intermittently dropped), it proves the log loss is a journald - // PID-resolution race against the short-lived child, not a logging bug. - // REMOVE before merging. - usleep(200000); // 200 ms - return true; } From d5af721b90e27ecc62c0befd43f6e9bc9cab0cc4 Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Fri, 26 Jun 2026 12:02:29 +0530 Subject: [PATCH 08/10] RDKEMW-16955: Fix intermittent loss of plugin hook logs in journald --- .../lib/source/DobbyRdkPluginManager.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp b/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp index 2910e4478..aaf867fdb 100644 --- a/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp +++ b/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp @@ -92,19 +92,15 @@ static int redirectLoggingToJournaldStream() // With level_prefix enabled, journald reads a leading "" on each // line to set the message priority, then stores the remainder as - // the message text. Mirror the usual Dobby log format for the text. + // the message text. Match the daemon's journald format exactly: the + // text is just ": " (file/func/line were carried in + // separate CODE_* fields, not the message text). + (void)file; + (void)func; + (void)line; char buf[512]; - int len; - if (!file || !func || (line <= 0)) - { - len = snprintf(buf, sizeof(buf), "<%d>%s%s\n", + int len = snprintf(buf, sizeof(buf), "<%d>%s%s\n", priority, levelStr, message); - } - else - { - len = snprintf(buf, sizeof(buf), "<%d>%s< M:%.64s F:%.64s L:%d > %s\n", - priority, levelStr, file, func, line, message); - } if (len < 0) return; if (len > static_cast(sizeof(buf))) From f80cc9921f0ffb5f9a6382f2dadef27abf04bc21 Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Fri, 26 Jun 2026 14:21:04 +0530 Subject: [PATCH 09/10] Revert - Fix intermittent loss of plugin hook logs in journald --- .../lib/source/DobbyRdkPluginManager.cpp | 81 ------------------- 1 file changed, 81 deletions(-) diff --git a/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp b/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp index aaf867fdb..c9bbd727c 100644 --- a/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp +++ b/pluginLauncher/lib/source/DobbyRdkPluginManager.cpp @@ -42,80 +42,6 @@ #include #include -#ifdef USE_SYSTEMD -#define SD_JOURNAL_SUPPRESS_LOCATION -#include -#include - -// ----------------------------------------------------------------------------- -/** - * @brief Redirect this process's AI_LOG output to a journald *stream* fd - * instead of per-message datagrams. - * - * Plugin hooks run inside a short-lived worker process forked by - * executeHookTimeout(). If that worker logs via sd_journal_send() (datagrams) - * and then _exit()s, journald may not have resolved /proc/ for message - * attribution before the pid disappears, so the messages get dropped (a - * journald PID-resolution race). A journald stream fd avoids this: journald - * captures the sender's identity via SO_PEERCRED at connect time - while we - * are still alive - so every line written is attributed correctly even after - * we exit, and the kernel preserves the buffered stream data until journald - * reads EOF. - * - * @return The stream fd on success (left open for the lifetime of the - * process), or -1 on failure (logging is left unchanged). - */ -static int redirectLoggingToJournaldStream() -{ - int journalFd = sd_journal_stream_fd("DobbyDaemon", LOG_INFO, 1 /* level_prefix */); - if (journalFd < 0) - { - // Leave the inherited logger in place - no worse than before - return -1; - } - - AICommon::initLogging( - [journalFd](int level, const char *file, const char *func, int line, const char *message) - { - int priority; - const char *levelStr; - switch (level) - { - case AI_DEBUG_LEVEL_FATAL: priority = LOG_CRIT; levelStr = "FTL: "; break; - case AI_DEBUG_LEVEL_ERROR: priority = LOG_ERR; levelStr = "ERR: "; break; - case AI_DEBUG_LEVEL_WARNING: priority = LOG_WARNING; levelStr = "WRN: "; break; - case AI_DEBUG_LEVEL_MILESTONE: priority = LOG_NOTICE; levelStr = "MIL: "; break; - case AI_DEBUG_LEVEL_INFO: priority = LOG_INFO; levelStr = "NFO: "; break; - case AI_DEBUG_LEVEL_DEBUG: priority = LOG_DEBUG; levelStr = "DBG: "; break; - default: priority = LOG_INFO; levelStr = ": "; break; - } - - // With level_prefix enabled, journald reads a leading "" on each - // line to set the message priority, then stores the remainder as - // the message text. Match the daemon's journald format exactly: the - // text is just ": " (file/func/line were carried in - // separate CODE_* fields, not the message text). - (void)file; - (void)func; - (void)line; - char buf[512]; - int len = snprintf(buf, sizeof(buf), "<%d>%s%s\n", - priority, levelStr, message); - if (len < 0) - return; - if (len > static_cast(sizeof(buf))) - len = sizeof(buf); - - // write() copies into journald's socket buffer synchronously, so the - // data is safe even if we _exit() immediately afterwards. - ssize_t written = TEMP_FAILURE_RETRY(write(journalFd, buf, len)); - (void)written; - }); - - return journalFd; -} -#endif // USE_SYSTEMD - // ----------------------------------------------------------------------------- /** * @brief Create instance of DobbyRdkPlugin Manager and load all plugins that @@ -789,13 +715,6 @@ bool DobbyRdkPluginManager::executeHookTimeout(const std::string &pluginName, if (setsid() < 0) _exit(EXIT_FAILURE); -#ifdef USE_SYSTEMD - // This worker is short-lived: it runs the hook and then _exit()s below. - // Route its logs through a journald stream fd so they aren't dropped by - // the journald PID-resolution race (see redirectLoggingToJournaldStream). - redirectLoggingToJournaldStream(); -#endif - char result = static_cast(executeHook(pluginName, hook)); // Set result in shared memory *static_cast(sharedMemory) = result; From 7707a2c01489696b66d02155e6b1df0824265948 Mon Sep 17 00:00:00 2001 From: ks734_comcast Date: Fri, 26 Jun 2026 15:01:57 +0530 Subject: [PATCH 10/10] RDKEMW-16955: Fix copilot comments --- rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp | 100 +++++++++++++----- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp index 772df63d8..a6be71cc0 100644 --- a/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp +++ b/rdkPlugins/OOMCrash/source/OOMCrashPlugin.cpp @@ -330,38 +330,83 @@ bool OOMCrash::readCgroup(unsigned long *val) * limit, indicating the container hit its memory ceiling. * * This is used as a fallback OOM indicator on older kernels (< 4.13) where - * the oom_kill counter does not exist and under_oom is transient. - * memory.max_usage_in_bytes is the high-water mark and persists until the - * cgroup is destroyed. + * the oom_kill counter does not exist and under_oom is transient. The + * high-water mark persists until the cgroup is destroyed. + * + * cgroups v1: compares memory.max_usage_in_bytes against memory.limit_in_bytes + * (and the memsw equivalents) under /sys/fs/cgroup/memory/. + * + * cgroups v2: compares memory.peak against memory.max (and the swap + * equivalents) under /sys/fs/cgroup/, falling back to the + * system.slice scope path - mirroring readCgroup(). A limit file may + * hold the literal "max" (unlimited); such pairs are skipped. * * @return true if max usage >= limit for memory or memory+swap. */ bool OOMCrash::isMemoryAtLimit() { - std::string basePath = "/sys/fs/cgroup/memory/" + mUtils->getContainerId(); + const std::string containerId = mUtils->getContainerId(); - const char *pairs[][2] = { - { "/memory.max_usage_in_bytes", "/memory.limit_in_bytes" }, - { "/memory.memsw.max_usage_in_bytes", "/memory.memsw.limit_in_bytes" }, - }; + // Detect cgroups version by checking for the v2 unified hierarchy + struct stat st; + const bool isCgroupV2 = (stat("/sys/fs/cgroup/cgroup.controllers", &st) == 0); - for (const auto &pair : pairs) + // Reads a single value from a cgroup file. Returns false on open/parse + // failure or when the file holds the literal "max" (v2 unlimited). + auto readValue = [](const std::string &filePath, unsigned long *out) -> bool { - unsigned long maxUsage = 0, limit = 0; - std::string maxPath = basePath + pair[0]; - std::string limPath = basePath + pair[1]; + FILE *fp = fopen(filePath.c_str(), "r"); + if (!fp) + return false; + + char token[64] = {0}; + bool ok = (fscanf(fp, "%63s", token) == 1); + fclose(fp); + + if (!ok || strcmp(token, "max") == 0) + return false; + + char *end = nullptr; + unsigned long v = strtoul(token, &end, 10); + if (end == token || *end != '\0') + return false; + + *out = v; + return true; + }; - FILE *fpMax = fopen(maxPath.c_str(), "r"); - FILE *fpLim = fopen(limPath.c_str(), "r"); + std::string basePath; + const char *pairs[2][2]; + + if (isCgroupV2) + { + // Resolve the container's cgroup dir in the unified hierarchy, matching + // readCgroup()'s direct + system.slice scope fallback. + basePath = "/sys/fs/cgroup/" + containerId; + if (stat((basePath + "/memory.max").c_str(), &st) != 0) + { + basePath = "/sys/fs/cgroup/system.slice/dobby-" + containerId + ".scope"; + } - bool ok = (fpMax && fpLim && - fscanf(fpMax, "%lu", &maxUsage) == 1 && - fscanf(fpLim, "%lu", &limit) == 1); + // memory.peak (kernel >= 5.19) vs memory.max + // memory.swap.peak (kernel >= 6.5) vs memory.swap.max + pairs[0][0] = "/memory.peak"; pairs[0][1] = "/memory.max"; + pairs[1][0] = "/memory.swap.peak"; pairs[1][1] = "/memory.swap.max"; + } + else + { + basePath = "/sys/fs/cgroup/memory/" + containerId; - if (fpMax) fclose(fpMax); - if (fpLim) fclose(fpLim); + pairs[0][0] = "/memory.max_usage_in_bytes"; pairs[0][1] = "/memory.limit_in_bytes"; + pairs[1][0] = "/memory.memsw.max_usage_in_bytes"; pairs[1][1] = "/memory.memsw.limit_in_bytes"; + } - if (ok && limit > 0 && maxUsage >= limit) + for (const auto &pair : pairs) + { + unsigned long maxUsage = 0, limit = 0; + if (readValue(basePath + pair[0], &maxUsage) && + readValue(basePath + pair[1], &limit) && + limit > 0 && maxUsage >= limit) { AI_LOG_INFO("%s=%lu reached %s=%lu", pair[0]+1, maxUsage, pair[1]+1, limit); return true; @@ -393,14 +438,17 @@ bool OOMCrash::isMemoryAtLimit() bool OOMCrash::checkForOOM() { - unsigned long oomKill = 0; - bool cgroupRead = readCgroup(&oomKill); + unsigned long oomIndicator = 0; + bool cgroupRead = readCgroup(&oomIndicator); - if (cgroupRead && oomKill > 0) + if (cgroupRead && oomIndicator > 0) { - // cgroup counter is authoritative — OOM kill confirmed - AI_LOG_INFO("oom_control reports OOM (value=%lu) for container '%s'", - oomKill, mUtils->getContainerId().c_str()); + // cgroup OOM indicator is authoritative — OOM confirmed. The exact + // source depends on the hierarchy/kernel (memory.oom_control oom_kill + // or under_oom on v1, memory.events oom_kill on v2), so keep the + // wording generic. + AI_LOG_INFO("cgroup OOM indicator set (value=%lu) for container '%s'", + oomIndicator, mUtils->getContainerId().c_str()); } else if (cgroupRead && isMemoryAtLimit()) {