Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/blossom-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ jobs:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'
fetch-depth: 0
- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
Expand Down
4 changes: 4 additions & 0 deletions build_config/accvlab_build_config/helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
"""

from .build_utils import (
CudaArchitectureSelection,
load_config,
detect_cuda_info,
get_compile_flags,
select_cuda_architectures_for_nvcc,
run_external_build,
get_abs_setup_dir,
)
Expand All @@ -29,9 +31,11 @@
)

__all__ = [
'CudaArchitectureSelection',
'load_config',
'detect_cuda_info',
'get_compile_flags',
'select_cuda_architectures_for_nvcc',
'run_external_build',
'get_abs_setup_dir',
'build_cmake_args',
Expand Down
172 changes: 162 additions & 10 deletions build_config/accvlab_build_config/helpers/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,150 @@
"""

import os
import re
from pathlib import Path
import shutil
import subprocess
import sys
from typing import Optional
from typing import List, NamedTuple, Optional


class CudaArchitectureSelection(NamedTuple):
    """Result of matching requested CUDA architectures against ``nvcc``.

    Attributes:
        architectures: CUDA architectures to compile as cubin (``sm_XX``)
            targets.
        ptx_architectures: At most one architecture to emit as a PTX
            (``compute_XX``) target, populated only when a detected GPU
            architecture had to be capped to what ``nvcc`` supports.
    """

    architectures: List[str]
    ptx_architectures: List[str]


def _find_nvcc() -> Optional[str]:
    """Locate the ``nvcc`` binary used to query supported target architectures.

    Search order: the ``CUDACXX`` environment variable, then
    ``$CUDA_HOME/bin/nvcc`` and ``$CUDA_PATH/bin/nvcc`` (kept only if the
    path exists), and finally a ``PATH`` lookup. Returns ``None`` when no
    compiler can be found.
    """
    explicit = os.environ.get("CUDACXX")
    if explicit:
        return explicit

    for root_var in ("CUDA_HOME", "CUDA_PATH"):
        cuda_root = os.environ.get(root_var)
        if not cuda_root:
            continue
        nvcc_path = os.path.join(cuda_root, "bin", "nvcc")
        if os.path.exists(nvcc_path):
            return nvcc_path

    return shutil.which("nvcc")


def _detect_nvcc_supported_architectures() -> List[str]:
    """Query ``nvcc`` for the virtual GPU architectures it can target.

    Returns compute-capability strings such as ``['70', '75', '80', '90']``,
    deduplicated and sorted numerically. Returns an empty list when ``nvcc``
    cannot be located or the query fails for any reason.
    """
    nvcc = _find_nvcc()
    if nvcc is None:
        return []

    try:
        output = subprocess.run(
            [nvcc, "--list-gpu-arch"],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=10,
        ).stdout
    except Exception:
        # Best effort: a missing binary, non-zero exit, or timeout simply
        # means we cannot constrain the architecture list.
        return []

    seen = set()
    archs: List[str] = []
    for arch in re.findall(r"compute_([0-9]+)", output):
        if arch not in seen:
            seen.add(arch)
            archs.append(arch)

    return sorted(archs, key=int)


def _split_cuda_architectures(value: str) -> List[str]:
    """Split a comma- or semicolon-separated architecture list, dropping blanks."""
    pieces = (piece.strip() for piece in re.split(r"[,;]", value))
    return [piece for piece in pieces if piece]


def _forward_compatible_ptx_architecture(
    supported_architectures: List[str], max_architecture: int
) -> Optional[str]:
    """Choose the PTX target to emit after an architecture was capped.

    Prefers the newest "base" architecture (a multiple of 10, e.g. ``90``)
    supported at or below ``max_architecture``; falls back to the newest
    in-range architecture of any kind. Non-numeric entries are ignored.
    Returns ``None`` when no candidate qualifies.
    """
    in_range: List[str] = []
    for arch in supported_architectures:
        try:
            value = int(arch)
        except ValueError:
            continue
        if value <= max_architecture:
            in_range.append(arch)

    base_archs = [arch for arch in in_range if int(arch) % 10 == 0]
    if base_archs:
        return max(base_archs, key=int)
    return max(in_range, key=int) if in_range else None


def select_cuda_architectures_for_nvcc(
    cuda_architectures: List[str],
) -> CudaArchitectureSelection:
    """Select CUDA cubin and PTX targets supported by the installed ``nvcc``.

    Numeric architectures above the maximum that ``nvcc`` supports are capped
    to that maximum (duplicates produced by capping are dropped). When any
    capping occurs, a single PTX target is added: the newest
    forward-compatible base architecture supported by ``nvcc`` at or below
    the cap — e.g. with a maximum of ``96`` the PTX target is ``90``.

    Args:
        cuda_architectures: CUDA architecture numbers to select from, for
            example ``["80", "90", "103"]``.

    Returns:
        CudaArchitectureSelection: The capped cubin architectures plus, when
        capping occurred, the single PTX target. If ``nvcc`` cannot be found
        or queried, the input is returned unchanged with no PTX targets.
    """
    supported = _detect_nvcc_supported_architectures()
    if not cuda_architectures or not supported:
        # Nothing to cap against: pass the request through untouched.
        return CudaArchitectureSelection(cuda_architectures, [])

    ceiling = max(int(arch) for arch in supported)

    capped: List[str] = []
    capping_occurred = False
    for arch in cuda_architectures:
        try:
            numeric = int(arch)
        except ValueError:
            # Non-numeric entries (e.g. suffixed archs) pass through as-is.
            candidate = arch
        else:
            capping_occurred = capping_occurred or numeric > ceiling
            candidate = str(min(numeric, ceiling))
        if candidate not in capped:
            capped.append(candidate)

    ptx_targets: List[str] = []
    if capping_occurred:
        ptx = _forward_compatible_ptx_architecture(supported, ceiling)
        if ptx is not None:
            ptx_targets.append(ptx)

    return CudaArchitectureSelection(capped, ptx_targets)


def missing_torch_error() -> RuntimeError:
Expand Down Expand Up @@ -106,8 +246,8 @@ def load_config(default_config: Optional[dict] = None) -> dict:
config[key] = env_val.lower() in ('1', 'true', 'yes', 'on')
elif isinstance(config[key], int):
config[key] = int(env_val)
elif key == 'CUSTOM_CUDA_ARCHS' and env_val:
config[key] = env_val.split(',')
elif key == 'CUSTOM_CUDA_ARCHS':
config[key] = _split_cuda_architectures(env_val) if env_val else None
else:
config[key] = env_val

Expand Down Expand Up @@ -152,7 +292,12 @@ def detect_cuda_info():


def get_compile_flags(config, cuda_info, include_dirs=None):
"""Construct compilation flags
"""Construct compilation flags.

If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to
the maximum supported by ``nvcc``. If any architecture is capped, the newest
forward-compatible base architecture supported by ``nvcc`` is also emitted
as a PTX target.

Args:
config (dict): Build configuration
Expand Down Expand Up @@ -202,17 +347,24 @@ def get_compile_flags(config, cuda_info, include_dirs=None):

# CUDA flags (only if CUDA is available)
if cuda_info['cuda_available']:
cuda_archs = (
config['CUSTOM_CUDA_ARCHS']
if config['CUSTOM_CUDA_ARCHS'] is not None
else cuda_info['gpu_architectures']
)
ptx_archs: List[str] = []
if config['CUSTOM_CUDA_ARCHS'] is not None:
cuda_archs = config['CUSTOM_CUDA_ARCHS']
else:
arch_selection = select_cuda_architectures_for_nvcc(cuda_info['gpu_architectures'])
cuda_archs = arch_selection.architectures
ptx_archs = arch_selection.ptx_architectures

if not cuda_archs:
cuda_archs = ['70', '75', '80', '86'] # Default modern architectures
arch_selection = select_cuda_architectures_for_nvcc(['70', '75', '80', '86'])
cuda_archs = arch_selection.architectures
ptx_archs = arch_selection.ptx_architectures

# Generate architecture flags
for arch in cuda_archs:
flags['nvcc'].extend([f'-gencode=arch=compute_{arch},code=sm_{arch}'])
for arch in ptx_archs:
flags['nvcc'].extend([f'-gencode=arch=compute_{arch},code=compute_{arch}'])

# CUDA compilation flags
flags['nvcc'].extend(
Expand Down
60 changes: 28 additions & 32 deletions build_config/accvlab_build_config/helpers/cmake_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import List, Optional

from .build_utils import missing_torch_error, require_torch_cuda_support
from .build_utils import detect_cuda_info, select_cuda_architectures_for_nvcc

# Marker file at the ACCV-Lab monorepo root (see `.nav` in the repository).
_NAV_MARKER = ".nav"
Expand Down Expand Up @@ -57,35 +57,16 @@ def _normalize_cpp_standard(value: str) -> str:
return v


def _detect_cuda_architectures() -> List[str]:
"""
Try to detect CUDA architectures from PyTorch if available.

Returns a list like ['70', '75', '80']. Returns an empty list if PyTorch is
CUDA-enabled but no CUDA devices are available.

Raises:
RuntimeError: If PyTorch is not installed or is installed without CUDA
support. ACCV-Lab CUDA extension builds require a CUDA-enabled
PyTorch wheel, so this is treated as a build configuration error
rather than as "CUDA not detected".
"""
try:
import torch # type: ignore
except ImportError as exc:
raise missing_torch_error() from exc

require_torch_cuda_support(torch)
def _format_cmake_cuda_architectures(archs: List[str], ptx_archs: List[str]) -> List[str]:
    """Render architecture lists in ``CMAKE_CUDA_ARCHITECTURES`` syntax.

    Args:
        archs: Architectures to build as real (cubin) targets.
        ptx_archs: Architectures to build as virtual (PTX) targets.

    Returns:
        The plain ``archs`` list when no PTX targets are requested, letting
        CMake apply its default code generation; otherwise each cubin
        architecture suffixed with ``-real`` followed by each PTX
        architecture suffixed with ``-virtual``.
    """
    if not ptx_archs:
        return archs

    cmake_archs: List[str] = [f"{arch}-real" for arch in archs]
    cmake_archs.extend(f"{arch}-virtual" for arch in ptx_archs)
    return cmake_archs


def get_project_root() -> Path:
Expand All @@ -112,6 +93,11 @@ def _build_cmake_args_from_env() -> List[str]:
"""
Build a list of -D CMake arguments from environment variables to harmonize
build configuration across setuptools, external CMake, and scikit-build flows.

If ``CUSTOM_CUDA_ARCHS`` is unset, detected CUDA architectures are capped to
the maximum supported by ``nvcc``. If capping occurs, CMake builds cubins for
the capped architectures and adds one PTX target for the newest supported
forward-compatible base architecture.
"""
args: List[str] = []
# Always export compile_commands.json for tooling/validation
Expand Down Expand Up @@ -139,9 +125,15 @@ def _build_cmake_args_from_env() -> List[str]:
args.append(f'-DCMAKE_CUDA_ARCHITECTURES={norm_archs}')
else:
# Attempt auto-detection via torch; if empty, let CMake defaults apply
detected = _detect_cuda_architectures()
cuda_info = detect_cuda_info()
detected = cuda_info['gpu_architectures'] if cuda_info['cuda_available'] else []
if detected:
args.append(f'-DCMAKE_CUDA_ARCHITECTURES={";".join(detected)}')
selection = select_cuda_architectures_for_nvcc(detected)
cmake_archs = _format_cmake_cuda_architectures(
selection.architectures,
selection.ptx_architectures,
)
args.append(f'-DCMAKE_CUDA_ARCHITECTURES={";".join(cmake_archs)}')

# VERBOSE_BUILD -> CMAKE_VERBOSE_MAKEFILE
if _parse_bool_env(os.environ.get("VERBOSE_BUILD", "")):
Expand Down Expand Up @@ -186,7 +178,7 @@ def _build_cmake_args_package_scm_version(repo_root: Path) -> List[str]:
Pass numeric version from setuptools-scm to CMake as a repo-aligned package
version define (and harmless for CMake projects that ignore the variable).
"""
from setuptools_scm import get_version
from setuptools_scm import get_version # type: ignore

v = get_version(
root=str(repo_root),
Expand All @@ -203,6 +195,10 @@ def _build_cmake_args_package_scm_version(repo_root: Path) -> List[str]:
def build_cmake_args() -> List[str]:
    """Assemble the full list of CMake ``-D`` arguments.

    Combines the environment-derived flags with the repo-aligned
    setuptools-scm version define.

    Auto-detected CUDA architectures are capped to ``nvcc`` support when
    ``CUSTOM_CUDA_ARCHS`` is unset. If capping occurs, one PTX target is
    emitted for the newest supported forward-compatible base architecture.
    """
    repo_root = get_project_root()
    env_args = _build_cmake_args_from_env()
    version_args = _build_cmake_args_package_scm_version(repo_root)
    return env_args + version_args
Expand Down
10 changes: 5 additions & 5 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
FROM nvidia/cuda:12.8.2-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -86,7 +86,7 @@ RUN pip install numpy==1.23.5 \
scipy==1.15.3 \
opencv-python-headless==4.5.5.64

RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124
RUN pip install torch==2.11.0 torchvision==0.26.0 torchaudio==2.11.0 --index-url https://download.pytorch.org/whl/cu128

RUN pip install black

Expand All @@ -101,13 +101,13 @@ RUN pip install sphinx \
RUN pip install ninja \
scikit-build

RUN pip install pycuda==2025.1.1 \
RUN pip install pycuda==2025.1.2 \
pybind11==3.0.0 \
cvcuda-cu12==0.15.0

RUN pip install pandas==1.5.3 \
IPython \
nvidia-dali-cuda120==1.51.2 \
nvidia-dali-cuda120==1.53.0 \
nvtx \
psutil \
numba==0.59 \
Expand All @@ -123,7 +123,7 @@ RUN apt-get install -y libjpeg-dev zlib1g-dev

RUN pip install --upgrade pip setuptools==80.9.0 wheel setuptools-scm>=8

RUN pip install cupy==13.6.0
RUN pip install cupy-cuda12x==13.6.0

WORKDIR /workspace

Expand Down
Loading