NVIDIA · svcnvidia-nemo-ci · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
@@ -0,0 +1,14 @@
+{
+  "hooks": {
+    "UserPromptSubmit": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "printf '{\"hookSpecificOutput\":{\"hookEventName\":\"UserPromptSubmit\",\"additionalContext\":\"MANDATORY WORKFLOW — never skip or reorder: (1) Read the artifact first (commit, file, error, PR). (2) Identify and invoke the relevant skill via the Skill tool BEFORE forming any answer or plan — even when the answer seems obvious. (3) Only then answer using the skill context. Skipping step 2 is not allowed.\"}}'"
+          }
+        ]
+      }
+    ]
+  }
+}
@@ -61,6 +61,10 @@ inputs:
     description: "Platform to run tests on (e.g. dgx_h100, dgx_gb200)"
     required: false
     default: "dgx_h100"
+  cadence:
+    description: "Trigger cadence for cadence filter (pr|nightly|mergegroup). Empty disables filter."
+    required: false
+    default: ""
 runs:
   using: "composite"
   steps:
@@ -136,6 +140,9 @@ runs:
         if [ "${{ inputs.lightweight }}" == "true" ]; then
           ARGS+=(--enable-lightweight-mode)
         fi
+        if [ -n "${{ inputs.cadence }}" ]; then
+          ARGS+=(--cadence "${{ inputs.cadence }}")  # quoted so the value stays a single argument (no word-splitting/globbing)
+        fi
 
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)

@@ -1,4 +1,4 @@
 enabled: true
 auto_sync_draft: false
 auto_sync_ready: true
-trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "Connor-XY", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Mellonta", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "WanZzzzzz", "Wohox", "YangFei1990", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "aroshanghias-nvd", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "fitsumreda", "frsun-nvda", "gautham-kollu", "gdengk", "guihong-nv", "guyueh1", "hexinw-nvidia", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "keshavb96", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "minitu", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "sheliang-nv", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sraman-rgb", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "wujingyue", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "Connor-XY", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Mellonta", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "WanZzzzzz", "Wohox", "YangFei1990", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "aroshanghias-nvd", "asolergi-nv", "balasaajay", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "fitsumreda", "frsun-nvda", "gautham-kollu", "gdengk", "guihong-nv", "guyueh1", "hexinw-nvidia", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kajalj22", "kanz-nv", "kevalmorabia97", "ko3n1g", "ksivaman", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "minitu", "mkhona-nvidia", "nanz-nv", "ntajbakhsh", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "sheliang-nv", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sraman-rgb", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "wujingyue", "xiaoyao0115", "xuantengh", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
diff --git a/.github/oncall_schedule.json b/.github/oncall_schedule.json
@@ -1,12 +1,4 @@
 [
-    {
-        "user": "asolergi-nv",
-        "date": "2026-04-22"
-    },
-    {
-        "user": "maanug-nv",
-        "date": "2026-04-29"
-    },
     {
         "user": "dimapihtar",
         "date": "2026-05-06"
@@ -46,5 +38,13 @@
     {
         "user": "wujingyue",
         "date": "2026-07-08"
+    },
+    {
+        "user": "Connor-XY",
+        "date": "2026-07-15"
+    },
+    {
+        "user": "Phlip79",
+        "date": "2026-07-22"
     }
 ]
@@ -3,6 +3,15 @@
 
 :warning: For major changes (either in lines of code or in its impact), please make sure to first share a design doc with the team. If you're unsure what's the best way to do so, contact the @mcore-oncall.
 
+## Issue tracking
+
+For PRs from open-source community contributors:
+
+- **New features**: a linked issue is **required**. Please open a [feature request](https://github.com/NVIDIA/Megatron-LM/issues/new?template=feature_request.md) and reference it here before submitting the PR.
+- **Small updates (bug fixes, minor improvements)**: a linked issue is **recommended** and will accelerate the PR review process.
+
+Linked issue: <!-- e.g. Fixes #1234 / Related to #1234 -->
+
 ## Contribution process
 
 ### Pre-checks

@@ -65,6 +65,7 @@ jobs:
           import json
           import requests
           import re
+          import time
 
           # GitHub API configuration
           GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
@@ -88,21 +89,38 @@ jobs:
               "X-GitHub-Api-Version": "2022-11-28",
           }
 
-          def make_request(endpoint, method="GET", data=None):
-              """Make a request to the GitHub API with error handling."""
+          def make_request(endpoint, method="GET", data=None, max_retries=5):
+              """Call the GitHub API, retrying rate-limit/5xx/connection errors with backoff; return parsed JSON, or None on failure."""
               url = f"{API_BASE}/{endpoint}"
-              try:
-                  if method == "GET":
-                      response = requests.get(url, headers=headers)
-                  else:
-                      response = requests.post(url, headers=headers, json=data)
-                  response.raise_for_status()
-                  return response.json()
-              except requests.exceptions.RequestException as e:
-                  print(f"Error making request to {endpoint}: {str(e)}")
-                  if hasattr(e.response, 'text'):
-                      print(f"Response: {e.response.text}")
-                  return None
+              for attempt in range(max_retries):
+                  delay = 2 ** attempt  # exponential backoff shared by every retry path: 1s, 2s, 4s, ...
+                  try:
+                      if method == "GET":
+                          response = requests.get(url, headers=headers, timeout=30)
+                      else:
+                          response = requests.post(url, headers=headers, json=data, timeout=30)
+                      if response.status_code == 429 or (response.status_code == 403 and "Retry-After" in response.headers):
+                          retry_after = response.headers.get("Retry-After", "")  # may be missing or an HTTP-date rather than seconds
+                          retry_after = int(retry_after) if retry_after.isdecimal() else delay  # never let int() raise ValueError here
+                          print(f"Rate limited on {endpoint}, retrying in {retry_after}s (attempt {attempt + 1}/{max_retries})")
+                          time.sleep(retry_after)
+                          continue
+                      if response.status_code >= 500:
+                          print(f"Server error {response.status_code} on {endpoint}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
+                          time.sleep(delay)
+                          continue
+                      response.raise_for_status()  # remaining 4xx responses are not retried; handled by the generic except below
+                      return response.json()
+                  except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
+                      print(f"Transient error on {endpoint}: {e}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
+                      time.sleep(delay)
+                  except requests.exceptions.RequestException as e:
+                      print(f"Error making request to {endpoint}: {str(e)}")
+                      if hasattr(e, 'response') and e.response is not None:
+                          print(f"Response: {e.response.text}")
+                      return None
+              print(f"Max retries ({max_retries}) exceeded for {endpoint}")
+              return None
 
           def is_internal_contributor(pr_info):
               """Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member)."""
@@ -166,8 +184,16 @@ jobs:
 
           # Get current running and queued workflows
           print("Fetching workflow runs...")
-          queued_workflow_runs = make_request("actions/runs?status=queued").get("workflow_runs", [])
-          in_progress_workflow_runs = make_request("actions/runs?status=in_progress").get("workflow_runs", [])
+          queued_resp = make_request("actions/runs?status=queued")
+          if queued_resp is None:  # make_request returns None when the call could not be completed
+              print("Failed to fetch queued workflow runs after retries, exiting")
+              raise SystemExit(1)  # same exit status as exit(1) without relying on the site-module helper
+          queued_workflow_runs = queued_resp.get("workflow_runs", [])
+          in_progress_resp = make_request("actions/runs?status=in_progress")
+          if in_progress_resp is None:  # abort rather than compute concurrency from incomplete data
+              print("Failed to fetch in-progress workflow runs after retries, exiting")
+              raise SystemExit(1)
+          in_progress_workflow_runs = in_progress_resp.get("workflow_runs", [])
 
           # For external contributors, enforce a single global concurrency limit across ALL branches.
           # For internal contributors, enforce per-branch limits as before.
@@ -199,7 +225,11 @@ jobs:
 
           # Get waiting CI workflows for test environment
           print("Fetching deployments...")
-          pending_workflows = make_request("actions/runs?status=waiting").get("workflow_runs", [])
+          waiting_resp = make_request("actions/runs?status=waiting")
+          if waiting_resp is None:  # make_request returns None when the call could not be completed
+              print("Failed to fetch waiting workflow runs after retries, exiting")
+              raise SystemExit(1)  # same exit status as exit(1) without relying on the site-module helper
+          pending_workflows = waiting_resp.get("workflow_runs", [])
           print("Pending workflows:", len(pending_workflows))
           pending_workflows = [run for run in pending_workflows
                               if run["name"] == "CICD Megatron-LM" and matches_queue(run, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
@@ -220,7 +250,11 @@ jobs:
               print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")
 
               deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
-              deployment = make_request(deployment_url)[0]
+              deployments = make_request(deployment_url)
+              if not deployments:  # covers both None (request failed) and an empty list (nothing to index)
+                  print(f"Failed to fetch pending deployments for run {workflow_id}")
+                  raise SystemExit(1)  # same exit status as exit(1) without relying on the site-module helper
+              deployment = deployments[0]
               environment_id = deployment["environment"]["id"]
 
               # Approve the deployment