diff --git a/.github/workflows/Dockerfile.mlir b/.github/workflows/Dockerfile.mlir
new file mode 100644
index 0000000..9980418
--- /dev/null
+++ b/.github/workflows/Dockerfile.mlir
@@ -0,0 +1,41 @@
+# Build image for KunQuant-MLIR: a manylinux_2_28 base plus the CUDA 13.2
+# toolkit, the development libraries and build tools installed below, and lit.
+FROM quay.io/pypa/manylinux_2_28_x86_64:2025.05.16-1
+
+# Delete the pipx-managed cmake venv (presumably so the dnf cmake installed below
+# is the one in use), then register NVIDIA's rhel8 repository and install the CUDA
+# toolkit. dnf caches are removed in this same layer so they never reach the image.
+RUN rm -rf /opt/_internal/pipx/venvs/cmake \
+    && dnf install -y \
+        ca-certificates \
+        cmake \
+        curl \
+        dnf-plugins-core \
+        libxml2-devel \
+        libzstd-devel \
+        ninja-build \
+        pkgconf-pkg-config \
+        zlib-devel \
+    && dnf config-manager --add-repo \
+        https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \
+    && dnf install -y cuda-toolkit-13-2 \
+    && dnf clean all \
+    && rm -rf /var/cache/dnf
+
+# Install lit into the CPython 3.12 interpreter and link it into /usr/local/bin.
+RUN /opt/python/cp312-cp312/bin/python -m pip install --no-cache-dir lit \
+    && ln -s /opt/python/cp312-cp312/bin/lit /usr/local/bin/lit
+
+# Point CMake and other tooling at the CUDA 13.2 install.
+ENV CUDA_PATH=/usr/local/cuda-13.2
+ENV CUDA_HOME=/usr/local/cuda-13.2
+ENV CUDAToolkit_ROOT=/usr/local/cuda-13.2
+ENV CMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc
+ENV PATH=/usr/local/cuda-13.2/bin:${PATH}
+# Prepend the CUDA directories but keep whatever the base image already exported;
+# the ":+" form avoids a trailing ":" (an empty entry means the current directory)
+# when the variable was previously unset.
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.2/lib64:/usr/local/cuda-13.2/lib64/stubs${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+# Give the stub driver library the libcuda.so.1 name as well, next to libcuda.so.
+RUN ln -sf libcuda.so /usr/local/cuda-13.2/lib64/stubs/libcuda.so.1
diff --git a/.github/workflows/build-llvm.yml b/.github/workflows/build-llvm.yml
index 144bcb4..d82c25e 100644
--- a/.github/workflows/build-llvm.yml
+++ b/.github/workflows/build-llvm.yml
@@ -24,15 +24,18 @@ jobs:
             const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
             const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
             if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
-            return commit;
+            core.setOutput('commit', commit.trim());
+            return commit.trim();
 
       - name: Checkout llvm-project at commit
         uses: actions/checkout@v4
         with:
           repository: llvm/llvm-project
-          ref: ${{ steps.read_commit.outputs.result }}
+          # treat the value as a tag name and fetch tags so shallow fetch can find it
+          ref: refs/tags/${{ steps.read_commit.outputs.commit }}
           path: llvm-project
-          fetch-depth: 1  # shallow clone, only the specified ref
+          fetch-depth: 1
+          fetch-tags: true
 
       - name: Install build dependencies
         run: |
@@ -49,7 +52,10 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir" \
             -DLLVM_TARGETS_TO_BUILD="NVPTX" \
             -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DCMAKE_INSTALL_PREFIX=install
+            -DLLVM_BUILD_TOOLS=OFF \
+            -DLLVM_BUILD_TESTS=ON \
+            -DLLVM_INSTALL_UTILS=ON \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/build-static/install
 
       - name: Configure (dynamic)
         if: matrix.link == 'dynamic'
@@ -58,11 +64,14 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir" \
             -DLLVM_TARGETS_TO_BUILD="NVPTX" \
             -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DMLIR_BUILD_LLVM_DYLIB=ON \
-            -DMLIR_LINK_LLVM_DYLIB=ON \
+            -DLLVM_BUILD_TOOLS=OFF \
+            -DMLIR_BUILD_MLIR_DYLIB=ON \
+            -DMLIR_LINK_MLIR_DYLIB=ON \
             -DLLVM_BUILD_LLVM_DYLIB=ON \
             -DLLVM_LINK_LLVM_DYLIB=ON \
-            -DCMAKE_INSTALL_PREFIX=install
+            -DLLVM_BUILD_TESTS=ON \
+            -DLLVM_INSTALL_UTILS=ON \
+            -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/build-dynamic/install
 
       - name: Build and install
         run: |
@@ -71,7 +80,7 @@ jobs:
       - name: Create archive of install
         run: |
           mkdir -p artifacts
-          COMMIT=${{ steps.read_commit.outputs.result }}
+          COMMIT=${{ steps.read_commit.outputs.commit }}
           tar -C build-${{ matrix.link }} -czf artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz install
           echo "Created artifacts/llvm-mlir-install-${{ matrix.link }}-${COMMIT}.tar.gz"
 
@@ -110,30 +119,15 @@ jobs:
             const content = fs.readFileSync('mlir/llvm_commit.txt', 'utf8');
             const commit = content.split(/\r?\n/).find(l => !/^\s*#/.test(l) && l.trim() !== '') || '';
             if (!commit) throw new Error('No commit found in mlir/llvm_commit.txt');
+            // Trim like the build job does, so the tag and asset names used by the
+            // release step match the archive names the build job produced.
+            core.setOutput('commit', commit.trim());
             return commit;
-
-      - name: Create GitHub Release
-        id: create_release
-        uses: actions/create-release@v1
-        with:
-          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.result }}
-          release_name: LLVM-MLIR ${{ steps.read_commit_publish.outputs.result }}
-          body: "Automated build artifacts for LLVM+MLIR"
-          draft: false
-          prerelease: false
-
-      - name: Upload static asset to release
-        uses: actions/upload-release-asset@v1
-        with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_name: llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_content_type: application/gzip
-
-      - name: Upload dynamic asset to release
-        uses: actions/upload-release-asset@v1
+      - name: Release
+        uses: softprops/action-gh-release@v3
         with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_name: llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.result }}.tar.gz
-          asset_content_type: application/gzip
+          tag_name: llvm-mlir-${{ steps.read_commit_publish.outputs.commit }}
+          name: LLVM-MLIR prebuilt ${{ steps.read_commit_publish.outputs.commit }}
+          files: |
+            artifacts/llvm-mlir-install-static-${{ steps.read_commit_publish.outputs.commit }}.tar.gz
+            artifacts/llvm-mlir-install-dynamic-${{ steps.read_commit_publish.outputs.commit }}.tar.gz
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 7852876..e1393d2 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -110,4 +110,130 @@ jobs:
     - name: Alpha158 test
       working-directory: ./
       run: |
-        python ./tests/test_alpha158.py --inputs ./input.npz --ref ./alpha158.npz --action run_avx2
\ No newline at end of file
+        python ./tests/test_alpha158.py --inputs ./input.npz --ref ./alpha158.npz --action run_avx2
+  cuda-mlir:
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+        cache: 'pip'
+    - name: Install build dependencies
+      run: |
+        set -eux
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+          build-essential \
+          ca-certificates \
+          cmake \
+          curl \
+          git \
+          libxml2-dev \
+          libzstd-dev \
+          ninja-build \
+          pkg-config \
+          zlib1g-dev
+    - name: Cache CUDA 13.2 toolkit
+      id: cache-cuda
+      uses: actions/cache@v4
+      with:
+        path: .cache/cuda-13.2
+        key: ${{ runner.os }}-${{ runner.arch }}-cuda-mlir-minimal-13.2-v1
+    - name: Install CUDA 13.2
+      if: steps.cache-cuda.outputs.cache-hit != 'true'
+      run: |
+        set -eux
+        curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin \
+          -o /tmp/cuda-ubuntu2404.pin
+        sudo install -m 0644 /tmp/cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600
+        curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404-keyring.gpg \
+          -o /tmp/cuda-archive-keyring.gpg
+        sudo install -m 0644 /tmp/cuda-archive-keyring.gpg /usr/share/keyrings/cuda-archive-keyring.gpg
+        echo "deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /" \
+          | sudo tee /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list
+        sudo apt-get update
+        sudo apt-get install -y --no-install-recommends \
+          cuda-nvcc-13-2 \
+          cuda-cudart-dev-13-2 \
+          cuda-driver-dev-13-2 \
+          libnvvm-13-2
+        mkdir -p .cache
+        sudo tar -C /usr/local -cf "$RUNNER_TEMP/cuda-13.2.tar" cuda-13.2
+        tar -C .cache -xf "$RUNNER_TEMP/cuda-13.2.tar"
+    - name: Prepare CUDA 13.2
+      run: |
+        set -eux
+        if [ ! -d /usr/local/cuda-13.2 ]; then
+          sudo ln -s "$GITHUB_WORKSPACE/.cache/cuda-13.2" /usr/local/cuda-13.2
+        fi
+        sudo ln -sf libcuda.so /usr/local/cuda-13.2/lib64/stubs/libcuda.so.1
+        test -x /usr/local/cuda-13.2/bin/nvcc
+        test -x /usr/local/cuda-13.2/bin/ptxas
+        test -f /usr/local/cuda-13.2/include/cuda.h
+        test -f /usr/local/cuda-13.2/include/cuda_runtime.h
+        test -f /usr/local/cuda-13.2/lib64/stubs/libcuda.so
+        test -f /usr/local/cuda-13.2/nvvm/libdevice/libdevice.10.bc
+        /usr/local/cuda-13.2/bin/nvcc --version
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install numpy==1.26.4 lit cupy-cuda13x
+        lit --version
+    - name: Read LLVM tag
+      id: llvm_tag
+      run: |
+        tag="$(sed -e 's/#.*//' -e '/^[[:space:]]*$/d' mlir/llvm_commit.txt | head -n1 | tr -d '[:space:]')"
+        test -n "$tag"
+        echo "tag=$tag" >> "$GITHUB_OUTPUT"
+        echo "LLVM tag: $tag"
+    - name: Download prebuilt LLVM/MLIR
+      env:
+        LLVM_TAG: ${{ steps.llvm_tag.outputs.tag }}
+      run: |
+        set -eux
+        mkdir -p "$RUNNER_TEMP/llvm-mlir"
+        curl -fL --retry 3 \
+          "https://github.com/Menooker/KunQuant/releases/download/llvm-mlir-${LLVM_TAG}/llvm-mlir-install-static-${LLVM_TAG}.tar.gz" \
+          -o "$RUNNER_TEMP/llvm-mlir.tar.gz"
+        tar -xzf "$RUNNER_TEMP/llvm-mlir.tar.gz" -C "$RUNNER_TEMP/llvm-mlir" --strip-components=1
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/mlir/MLIRConfig.cmake"
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/llvm/LLVMConfig.cmake"
+        echo "LLVM_PREFIX=$RUNNER_TEMP/llvm-mlir" >> "$GITHUB_ENV"
+    - name: Configure MLIR backend
+      env:
+        CUDA_PATH: /usr/local/cuda-13.2
+        CUDA_HOME: /usr/local/cuda-13.2
+      run: |
+        cmake -S . -B build/mlir-ci -G Ninja \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DKUN_BUILD_CPU_RUNNER=OFF \
+          -DKUN_BUILD_MLIR=ON \
+          -DLLVM_DIR="$LLVM_PREFIX/lib/cmake/llvm" \
+          -DMLIR_DIR="$LLVM_PREFIX/lib/cmake/mlir" \
+          -DCUDAToolkit_ROOT=/usr/local/cuda-13.2 \
+          -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc \
+          -DPython_EXECUTABLE="$(python -c 'import sys; print(sys.executable)')" \
+          -DPYTHON_EXECUTABLE="$(python -c 'import sys; print(sys.executable)')" \
+          -DLLVM_EXTERNAL_LIT="$(command -v lit)"
+    - name: Run KunQuant MLIR tests
+      env:
+        CUDA_PATH: /usr/local/cuda-13.2
+        CUDA_HOME: /usr/local/cuda-13.2
+      run: cmake --build build/mlir-ci --target check-kun-mlir --parallel 4
+    - name: Check KunQuant-MLIR imports
+      env:
+        CUDA_PATH: /usr/local/cuda-13.2
+        CUDA_HOME: /usr/local/cuda-13.2
+      run: |
+        export LD_LIBRARY_PATH="$LLVM_PREFIX/lib:/usr/local/cuda-13.2/lib64/stubs:${LD_LIBRARY_PATH:-}"
+        python - <<'PY'
+        import KunQuantMLIR.KunMLIR as direct
+        from KunQuant.jit import KunMLIR as compat
+        import KunQuant.jit.KunMLIR as submodule
+        assert direct.__file__ == compat.__file__ == submodule.__file__
+        assert "/KunQuantMLIR/" in direct.__file__
+        print(direct.__file__)
+        PY
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index a60b5b2..d1332ca 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -4,16 +4,34 @@ name: Create and publish a Docker image
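-# Configures this workflow to run every time a change is pushed to the branch called `release`.
+# This workflow only runs when triggered manually (`workflow_dispatch`); the `image` input selects which image(s) to build.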
 on:
   workflow_dispatch:
+    inputs:
+      image:
+        description: Image to build
+        required: true
+        default: both
+        type: choice
+        options:
+        - core
+        - mlir
+        - both
 
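-# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
+# Defines the Container registry domain used by the login step. The image names themselves come from the job matrix below.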
 env:
   REGISTRY: ghcr.io
-  IMAGE_NAME: ${{ github.repository }}
 
 # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
 jobs:
   build-and-push-image:
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - target: core
+            image: ghcr.io/menooker/kunquant
+            dockerfile: Dockerfile
+          - target: mlir
+            image: ghcr.io/menooker/kunquant-mlir
+            dockerfile: Dockerfile.mlir
     # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
     permissions:
       contents: read
@@ -23,9 +41,11 @@ jobs:
       #
     steps:
       - name: Checkout repository
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         uses: actions/checkout@v4
       # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
       - name: Log in to the Container registry
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
         with:
           registry: ${{ env.REGISTRY }}
@@ -33,27 +53,31 @@ jobs:
           password: ${{ secrets.GITHUB_TOKEN }}
       # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
       - name: Extract metadata (tags, labels) for Docker
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         id: meta
         uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
         with:
-          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          images: ${{ matrix.image }}
       # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see [Usage](https://github.com/docker/build-push-action#usage) in the README of the `docker/build-push-action` repository.
       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
       - name: Build and push Docker image
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         id: push
         uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
         with:
           context: .github/workflows
+          file: .github/workflows/${{ matrix.dockerfile }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
       
       # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see [Using artifact attestations to establish provenance for builds](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds).
       - name: Generate artifact attestation
+        if: ${{ inputs.image == matrix.target || inputs.image == 'both' }}
         uses: actions/attest-build-provenance@v2
         with:
-          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
+          subject-name: ${{ matrix.image }}
           subject-digest: ${{ steps.push.outputs.digest }}
           push-to-registry: true
       
diff --git a/.github/workflows/publish-kunquant-mlir.yml b/.github/workflows/publish-kunquant-mlir.yml
new file mode 100644
index 0000000..e8c365f
--- /dev/null
+++ b/.github/workflows/publish-kunquant-mlir.yml
@@ -0,0 +1,125 @@
+name: Publish KunQuant-MLIR to PyPI and TestPyPI
+
+on:
+  workflow_dispatch:
+    inputs:
+      target:
+        description: Publish target
+        required: true
+        default: testpypi
+        type: choice
+        options:
+        - testpypi
+        - pypi
+        - both
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    name: Build KunQuant-MLIR wheel
+    runs-on: ubuntu-24.04
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+        cache: pip
+    - name: Install build dependencies
+      run: |
+        set -eux
+        python -m pip install --upgrade pip
+        python -m pip install cibuildwheel
+    - name: Read LLVM tag
+      id: llvm_tag
+      run: |
+        tag="$(sed -e 's/#.*//' -e '/^[[:space:]]*$/d' mlir/llvm_commit.txt | head -n1 | tr -d '[:space:]')"
+        test -n "$tag"
+        echo "tag=$tag" >> "$GITHUB_OUTPUT"
+        echo "LLVM tag: $tag"
+    - name: Download prebuilt LLVM/MLIR
+      env:
+        LLVM_TAG: ${{ steps.llvm_tag.outputs.tag }}
+      run: |
+        set -eux
+        mkdir -p "$RUNNER_TEMP/llvm-mlir"
+        curl -fL --retry 3 \
+          "https://github.com/Menooker/KunQuant/releases/download/llvm-mlir-${LLVM_TAG}/llvm-mlir-install-static-${LLVM_TAG}.tar.gz" \
+          -o "$RUNNER_TEMP/llvm-mlir.tar.gz"
+        tar -xzf "$RUNNER_TEMP/llvm-mlir.tar.gz" -C "$RUNNER_TEMP/llvm-mlir" --strip-components=1
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/mlir/MLIRConfig.cmake"
+        test -f "$RUNNER_TEMP/llvm-mlir/lib/cmake/llvm/LLVMConfig.cmake"
+        echo "LLVM_PREFIX=$RUNNER_TEMP/llvm-mlir" >> "$GITHUB_ENV"
+    - name: Build wheel
+      run: |
+        set -eux
+        export CIBW_BUILD="cp312-manylinux_x86_64"
+        export CIBW_BUILD_FRONTEND="build"
+        export CIBW_MANYLINUX_X86_64_IMAGE="ghcr.io/menooker/kunquant-mlir:main"
+        export CIBW_REPAIR_WHEEL_COMMAND_LINUX="auditwheel repair --exclude libcuda.so --exclude libcuda.so.1 -w {dest_dir} {wheel}"
+        export CIBW_ENVIRONMENT="KUN_USE_GIT_VERSION=0 CUDA_PATH=/usr/local/cuda-13.2 CUDA_HOME=/usr/local/cuda-13.2 CUDAToolkit_ROOT=/usr/local/cuda-13.2 CMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc LLVM_DIR=/host${LLVM_PREFIX}/lib/cmake/llvm MLIR_DIR=/host${LLVM_PREFIX}/lib/cmake/mlir LLVM_EXTERNAL_LIT=lit LD_LIBRARY_PATH=/usr/local/cuda-13.2/lib64:/usr/local/cuda-13.2/lib64/stubs"
+        python -m cibuildwheel python/kunquant_mlir --platform linux --output-dir wheelhouse
+    - name: Check wheel contents
+      run: |
+        python - <<'PY'
+        import glob
+        import zipfile
+
+        wheel, = glob.glob("wheelhouse/*.whl")
+        with zipfile.ZipFile(wheel) as zf:
+            names = set(zf.namelist())
+        assert "KunQuantMLIR/KunMLIR.abi3.so" in names
+        assert "KunQuantMLIR/libKunCudaRuntime.so" in names
+        print(wheel)
+        PY
+    - name: Store wheel
+      uses: actions/upload-artifact@v4
+      with:
+        name: kunquant-mlir-wheel
+        path: wheelhouse/*.whl
+        if-no-files-found: error
+
+  publish-to-testpypi:
+    name: Publish KunQuant-MLIR to TestPyPI
+    if: ${{ inputs.target == 'testpypi' || inputs.target == 'both' }}
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: testpypi
+      url: https://test.pypi.org/p/KunQuant-MLIR
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+    - name: Download wheel
+      uses: actions/download-artifact@v4
+      with:
+        name: kunquant-mlir-wheel
+        path: dist/
+    - name: Publish wheel to TestPyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        repository-url: https://test.pypi.org/legacy/
+
+  publish-to-pypi:
+    name: Publish KunQuant-MLIR to PyPI
+    if: ${{ inputs.target == 'pypi' || inputs.target == 'both' }}
+    needs: build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/KunQuant-MLIR
+    permissions:
+      contents: read
+      id-token: write
+    steps:
+    - name: Download wheel
+      uses: actions/download-artifact@v4
+      with:
+        name: kunquant-mlir-wheel
+        path: dist/
+    - name: Publish wheel to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.gitignore b/.gitignore
index d9c82ef..4f534f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .vscode/*
+.claude/*
 *.pyc
 build/*
 tests/cpp/generated/*
@@ -8,4 +9,7 @@ KunQuant.egg-info/*
 dist/*
 *.pyd
 *.dll
-*.lib
\ No newline at end of file
+*.lib
+.codex
+python/kunquant_mlir/build/*
+python/kunquant_mlir/KunQuant_MLIR.egg-info/*
diff --git a/.gitmodules b/.gitmodules
index 5557432..463cb32 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "3rdparty/nanobind"]
 	path = 3rdparty/nanobind
 	url = https://github.com/wjakob/nanobind
+[submodule "3rdparty/nlohmann_json"]
+	path = 3rdparty/nlohmann_json
+	url = https://github.com/nlohmann/json
diff --git a/3rdparty/nlohmann_json b/3rdparty/nlohmann_json
new file mode 160000
index 0000000..484483a
--- /dev/null
+++ b/3rdparty/nlohmann_json
@@ -0,0 +1 @@
+Subproject commit 484483acad6d562306efc9b3c6d413404f1b1f8a
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..f2693f4
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,110 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What This Project Is
+
+KunQuant is a compiler, optimizer, and code generator for financial factor expressions (e.g., WorldQuant Alpha101, Qlib Alpha158). It takes Python-defined financial expressions, applies optimization passes, generates C++ code with SIMD and parallelism, and executes it via a Python binding (KunRunner). Achieves 170x+ speedup over naive Pandas.
+
+## Build & Install
+
+```bash
+# Standard install
+pip install .
+
+# Editable install with C++ tests
+KUN_BUILD_TESTS=1 pip install -e .
+```
+
+Build environment variables:
+- `KUN_BUILD_TYPE=Debug|Release` — default: Release
+- `KUN_BUILD_TESTS=1` — enables `KunTest` and `KunCApiTest` targets
+- `KUN_NO_AVX2=1` — disable AVX2/FMA (for older CPUs)
+- `KUN_DEBUG=1` — print internal compiler pass output
+- `KUN_DEBUG_JIT=1` — print JIT C++ compilation details
+
+## Running Tests
+
+```bash
+# Python IR transformation tests (no build required)
+python tests/test.py
+python tests/test2.py
+
+# Streaming mode tests
+python tests/test_stream.py
+
+# C++ runtime tests (requires KUN_BUILD_TESTS=1)
+python tests/test_runtime.py
+
+# Alpha101 correctness (random data)
+python tests/test_alpha101.py
+
+# Integration tests
+bash tests/tests.sh
+```
+
+## Architecture
+
+The pipeline: **Python expression graph → optimization passes → C++ code generation → JIT compile → shared library → KunRunner execution**
+
+### Python Compiler Layer (`KunQuant/`)
+
+- **`Op.py`** — Core IR. All operations inherit from `OpBase`. `Builder` is a thread-local context manager that records ops as they're constructed. Key traits: `WindowedOp`, `ReductionOp`, `SinkOpTrait`, `CrossSectionalOp`.
+- **`Stage.py`** — `Function` holds the op graph; `OpInfo` tracks use counts. Provides topological sort and dead-op elimination.
+- **`Driver.py`** — Orchestrates compilation. `KunCompilerConfig` holds config (dtype, layout, streaming). `compileit()` is the main entry point; `optimize()` runs the pass pipeline.
+- **`ops/`** — Concrete op types: `ElewiseOp.py` (Add, Mul, etc.), `ReduceOp.py` (ReduceAdd, ReduceRank, etc.), `CompOp.py` (Greater, Less, etc.), `MiscOp.py`.
+- **`passes/`** — All optimization passes and code generation:
+  - `InferWindow.py` — Infers time-window sizes
+  - `SpecialOpt.py` — Domain-specific rewrites (stddev, rank)
+  - `Decompose.py` — Expands windowed ops into `ForeachBackWindow` loops + reductions
+  - `ExprFold.py` — Constant folding and algebraic simplification
+  - `TempWindowElim.py` — Eliminates intermediate window buffers
+  - `MergeLoops.py` — Fuses compatible loops
+  - `Partitioner.py` — Partitions ops into parallel execution blocks
+  - `CodegenCpp.py` — Emits C++ source from the final IR
+- **`jit/cfake.py`** — Invokes MSVC/GCC/Clang to compile generated C++ to a shared library.
+- **`predefined/`** — Ready-to-use factor libraries: `Alpha101.py` (101 factors), `Alpha158.py`.
+- **`runner/`** — Python bindings via nanobind; `KunRunner` loads shared libs, creates executors, runs graphs.
+
+### Optimization Pass Order (in `Driver.optimize()`)
+
+1. InferWindow → SpecialOpt → Decompose → `experimental_expand` (no-op unless `options["experimental_expand"]` is set) → ExprFold → SpecialOpt → ExprFold → DecomposeRank → MoveDupRankOutput → TempWindowElim
+
+Post-compile (`post_optimize()`): TempWindowElim → InferInputWindow → MergeLoops
+
+### C++ Runtime (`cpp/`)
+
+- **`cpp/Kun/`** — Core runtime: `Runtime.cpp` (execution engine), `Executor.cpp`, `Module.cpp`, `CApi.cpp`, `Ops.hpp` (operator implementations), `Rank.hpp`, `Scale.hpp`, `SkipList.cpp` (sorted stream state).
+- **`cpp/KunSIMD/`** — SIMD vector ops for x86 (AVX2/AVX512) and ARM (NEON).
+- **`cpp/Python/`** — nanobind bindings.
+
+### Memory Layouts
+
+- `TS` (Time-Stock): time is outer dimension — default for batch mode
+- `STs` (Stock-Time-blocked): stocks are outer, time is inner with blocking — better for streaming
+
+Recommended blocking: 8 stocks (float + AVX2), 4 stocks (double + AVX2).
+
+## Typical Usage Pattern
+
+```python
+from KunQuant.Op import Builder, Input, Output
+from KunQuant.ops import *
+from KunQuant.Stage import Function
+from KunQuant.jit import cfake
+from KunQuant.Driver import KunCompilerConfig
+
+with Builder() as b:
+    close = Input("close")
+    # ... define factor expressions ...
+    Output(some_expr, "factor_name")
+
+f = Function(b.ops)
+lib = cfake.compileit([("mylib", f, KunCompilerConfig())], "out_lib", cfake.CppCompilerConfig())
+modu = lib.getModule("mylib")
+
+# Execute
+from KunQuant.runner import KunRunner as kr
+executor = kr.createMultiThreadExecutor(num_threads)
+result = kr.runGraph(executor, modu, input_dict, start_time, num_time)
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7143309..ed21190 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(KUN_AVX512DQ "Enable AVX512DQ instruction set" OFF)
 option(KUN_AVX512VL "Enable AVX512VL instruction set" OFF)
 option(KUN_NO_AVX2 "Disable AVX2 and FMA instruction set" OFF)
 option(KUN_SANITIZER "Enable sanitizer" OFF)
+option(KUN_BUILD_CPU_RUNNER "Build the CPU KunRunner extension/runtime" ON)
 
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "(Clang|GNU|AppleClang)")
@@ -72,40 +73,81 @@ else()
     endif()
 endif()
 
-
-file(GLOB_RECURSE KunRuntimeSrc ${PROJECT_SOURCE_DIR}/cpp/Kun/*.cpp
-    ${PROJECT_SOURCE_DIR}/cpp/KunSIMD/*.cpp)
-add_library(KunRuntime SHARED ${KunRuntimeSrc})
-target_compile_definitions(KunRuntime PRIVATE KUN_CORE_LIB=1)
-if (NOT WIN32)
-    target_link_libraries(KunRuntime PRIVATE dl)
+if(NOT DEFINED PYTHON_EXECUTABLE)
+    set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
 endif()
 
-file(GLOB_RECURSE KunPythonSrc ${PROJECT_SOURCE_DIR}/cpp/Python/*.cpp)
-# STABLE_ABI: build a single Python-version-independent .abi3.so on
-# CPython ≥ 3.12; nanobind silently disables this and falls back to
-# regular ABI on older Pythons.  Saves us one rebuild per Python
-# minor we want to ship.
-nanobind_add_module(KunRunner STABLE_ABI ${KunPythonSrc})
+message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}")
 
-target_link_libraries(KunRunner PUBLIC KunRuntime)
+if(KUN_BUILD_CPU_RUNNER)
+    file(GLOB_RECURSE KunRuntimeSrc ${PROJECT_SOURCE_DIR}/cpp/Kun/*.cpp
+        ${PROJECT_SOURCE_DIR}/cpp/KunSIMD/*.cpp)
+    add_library(KunRuntime SHARED ${KunRuntimeSrc})
+    target_compile_definitions(KunRuntime PRIVATE KUN_CORE_LIB=1)
+    if (NOT WIN32)
+        target_link_libraries(KunRuntime PRIVATE dl)
+    endif()
 
+    file(GLOB_RECURSE KunPythonSrc ${PROJECT_SOURCE_DIR}/cpp/Python/*.cpp)
+    # STABLE_ABI: build a single Python-version-independent .abi3.so on
+    # CPython ≥ 3.12; nanobind silently disables this and falls back to
+    # regular ABI on older Pythons.  Saves us one rebuild per Python
+    # minor we want to ship.
+    nanobind_add_module(KunRunner STABLE_ABI ${KunPythonSrc})
 
+    target_link_libraries(KunRunner PUBLIC KunRuntime)
 
-file(GLOB_RECURSE KunTestSrc ${PROJECT_SOURCE_DIR}/tests/cpp/*.cpp)
-add_library(KunTest SHARED EXCLUDE_FROM_ALL ${KunTestSrc})
-target_link_libraries(KunTest KunRuntime)
 
 
-file(GLOB_RECURSE KunCApiTestSrc ${PROJECT_SOURCE_DIR}/tests/capi/*.cpp)
-add_executable(KunCApiTest EXCLUDE_FROM_ALL ${KunCApiTestSrc})
-target_link_libraries(KunCApiTest KunRuntime)
-add_dependencies(KunCApiTest KunTest)
+    file(GLOB_RECURSE KunTestSrc ${PROJECT_SOURCE_DIR}/tests/cpp/*.cpp)
+    add_library(KunTest SHARED EXCLUDE_FROM_ALL ${KunTestSrc})
+    target_link_libraries(KunTest KunRuntime)
 
-if(NOT DEFINED PYTHON_EXECUTABLE)
-    set(PYTHON_EXECUTABLE ${Python_EXECUTABLE})
-endif()
 
-message(STATUS "PYTHON_EXECUTABLE = ${PYTHON_EXECUTABLE}")
+    file(GLOB_RECURSE KunCApiTestSrc ${PROJECT_SOURCE_DIR}/tests/capi/*.cpp)
+    add_executable(KunCApiTest EXCLUDE_FROM_ALL ${KunCApiTestSrc})
+    target_link_libraries(KunCApiTest KunRuntime)
+    add_dependencies(KunCApiTest KunTest)
 
-add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
\ No newline at end of file
+    add_custom_target(TestingTargets DEPENDS KunCApiTest KunTest KunRunner)
+endif()
+
+#===------------------------------------------------------------------------===#
+# Optional MLIR backend (kun-opt + kunir/kungpu dialects)
+#===------------------------------------------------------------------------===#
+option(KUN_BUILD_MLIR "Build MLIR backend with kunir/kungpu dialects" OFF)
+if(KUN_BUILD_MLIR)
+  set(KUN_MLIR_PYTHON_PACKAGE_DIR "${PROJECT_SOURCE_DIR}/KunQuantMLIR"
+      CACHE PATH "Output directory for the KunQuant-MLIR Python package binaries")
+
+  find_package(MLIR REQUIRED CONFIG)
+  find_package(CUDAToolkit REQUIRED)
+  message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
+  message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
+
+  if(NOT CMAKE_CUDA_COMPILER)
+    set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}" CACHE FILEPATH
+        "nvcc used by CMake's CUDA-language support.")
+  endif()
+  get_filename_component(KUN_CUDA_TOOLKIT_ROOT
+                         "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY)
+  get_filename_component(KUN_CUDA_TOOLKIT_ROOT
+                         "${KUN_CUDA_TOOLKIT_ROOT}" DIRECTORY)
+  message(STATUS
+      "KunQuant MLIR CUDA toolkit = ${KUN_CUDA_TOOLKIT_ROOT} "
+      "(version ${CUDAToolkit_VERSION})")
+
+  set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/bin)
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/lib)
+  set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
+
+  list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
+  list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
+
+  include(TableGen)
+  include(AddLLVM)
+  include(AddMLIR)
+  include(HandleLLVMOptions)
+
+  add_subdirectory(mlir)
+endif()
diff --git a/KunQuant/Driver.py b/KunQuant/Driver.py
index 513a75d..5e93469 100644
--- a/KunQuant/Driver.py
+++ b/KunQuant/Driver.py
@@ -29,6 +29,10 @@ def optimize(f: Function, options: dict)->Dict[str, int]:
     # optimize before decompose to let value ranges work
     special_optimize(f, options)
     decompose(f, options)
+    # Experimental: expand ops such as ExpMovingAvg / WindowedLinearRegression*
+    # into Accumulator chains and windowed reductions.  No-op unless
+    # options["experimental_expand"] is set; currently enabled by the GPU backend.
+    experimental_expand(f, options)
     expr_fold(f, options)
     special_optimize(f, options)
     expr_fold(f, options)
diff --git a/KunQuant/Op.py b/KunQuant/Op.py
index bd96014..e2696f9 100644
--- a/KunQuant/Op.py
+++ b/KunQuant/Op.py
@@ -509,6 +509,14 @@ class StateConsumerTrait:
     '''
     pass
 
+class MayRequireWholeTime:
+    '''
+    Ops whose state may depend on the full time history (cannot be rebuilt from a
+    bounded warmup window).  Defaults to False; override to report True.
+    '''
+    def is_whole_time_required(self) -> bool:
+        return False
+
 class ReductionOp(OpBase, StatefulOpTrait):
     '''
     Base class of all reduction ops. A reduction op takes inputs that is originated from a IterValue. The input must be in a loop (v.get_parent() is a loop). The data produced
diff --git a/KunQuant/jit/KunMLIR.py b/KunQuant/jit/KunMLIR.py
new file mode 100644
index 0000000..a519d61
--- /dev/null
+++ b/KunQuant/jit/KunMLIR.py
@@ -0,0 +1,16 @@
+"""Compatibility shim for the optional KunQuant-MLIR extension module."""
+
+from importlib import import_module as _import_module
+import sys as _sys
+
+try:
+    _KunMLIR = _import_module("KunQuantMLIR.KunMLIR")
+except ModuleNotFoundError as e:
+    if e.name and e.name.startswith("KunQuantMLIR"):
+        raise ImportError(
+            "KunQuant MLIR extension is not installed. "
+            "Install KunQuant-MLIR to use KunQuant.jit.KunMLIR."
+        ) from e
+    raise
+
+_sys.modules[__name__] = _KunMLIR
diff --git a/KunQuant/jit/cuda.py b/KunQuant/jit/cuda.py
new file mode 100644
index 0000000..7d81d5a
--- /dev/null
+++ b/KunQuant/jit/cuda.py
@@ -0,0 +1,11 @@
+"""Compatibility shim for the optional KunQuant-MLIR CUDA backend."""
+
+try:
+    from KunQuantMLIR.jit_cuda import *  # noqa: F401,F403
+except ModuleNotFoundError as e:
+    if e.name and e.name.startswith("KunQuantMLIR"):
+        raise ImportError(
+            "KunQuant MLIR/CUDA backend is not installed. "
+            "Install KunQuant-MLIR to use KunQuant.jit.cuda."
+        ) from e
+    raise
diff --git a/KunQuant/jit/env.py b/KunQuant/jit/env.py
index d0975c3..c40cc6d 100644
--- a/KunQuant/jit/env.py
+++ b/KunQuant/jit/env.py
@@ -104,3 +104,65 @@ def get_compiler_env():
                 print("Reset env", "PATH+=", extra_path, "INCLUDE=", env['INCLUDE'], "LIB=", env['LIB'])
     _env = env
     return env
+
+
+def _format_cuda_sm(major: int, minor: int) -> str:
+    return "sm_{}{}".format(int(major), int(minor))
+
+
+def _format_cuda_sm_from_capability(capability) -> str:
+    """Normalize a (major, minor) tuple or a "86" / "8.6" / "sm_86"-like value to `sm_xx`."""
+    if isinstance(capability, tuple):
+        if len(capability) == 2:
+            return _format_cuda_sm(capability[0], capability[1])
+        raise ValueError(f"unexpected CUDA capability tuple: {capability!r}")
+    digits = str(capability).strip().lower()
+    if digits.startswith("sm_"):
+        digits = digits[3:]
+    digits = digits.replace(".", "")
+    if digits.isdigit():  # False for the empty string as well
+        return f"sm_{digits}"
+    raise ValueError(f"unexpected CUDA capability value: {capability!r}")
+
+
+def get_cuda_compute_capability() -> str:
+    """Report the active CUDA device's architecture string, e.g. `sm_86`.
+
+    CuPy is queried first; if importing it or querying the device raises,
+    PyTorch is tried next.  When both attempts fail a RuntimeError is raised
+    whose message lists the failure recorded for each library.
+    """
+    failures = []
+
+    try:
+        import cupy as cp
+        device = cp.cuda.Device()
+        cap = getattr(device, "compute_capability", None)
+        if cap is not None:
+            return _format_cuda_sm_from_capability(cap)
+
+        attrs = cp.cuda.runtime.getDeviceProperties(device.id)
+        return _format_cuda_sm(attrs["major"], attrs["minor"])
+    except Exception as exc:
+        failures.append(f"cupy: {type(exc).__name__}: {exc}")
+
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            raise RuntimeError("torch.cuda.is_available() is false")
+        major, minor = torch.cuda.get_device_capability()
+        return _format_cuda_sm(major, minor)
+    except Exception as exc:
+        failures.append(f"torch: {type(exc).__name__}: {exc}")
+
+    details = "; ".join(failures)
+    raise RuntimeError(
+        "Could not determine CUDA compute capability from CuPy or PyTorch: " + details)
+
+
+def has_cuda_device() -> bool:
+    try:
+        get_cuda_compute_capability()
+    except RuntimeError:
+        return False
+    return True
diff --git a/KunQuant/ops/CompOp.py b/KunQuant/ops/CompOp.py
index 934bc30..fd75bd2 100644
--- a/KunQuant/ops/CompOp.py
+++ b/KunQuant/ops/CompOp.py
@@ -11,7 +11,12 @@
 def _is_fast_stat(opt: dict, attrs: dict) -> bool:
     return not opt.get("no_fast_stat", True) and not attrs.get("no_fast_stat", False)
 
-def _decide_use_skip_list(window: int, blocking_len: int) -> bool:
+def _decide_use_skip_list(options: dict, window: int, blocking_len: int) -> bool:
+    # GPU lowering doesn't implement SkipList ops; the caller can force
+    # the naive ForeachBackWindow path with `options["no_skip_list"]`
+    # regardless of window/blocking_len cost.
+    if options.get("no_skip_list", False):
+        return False
     naive_cost = window
     skip_list_cost = math.log2(window) * blocking_len * 5
     return skip_list_cost < naive_cost
@@ -65,7 +70,7 @@ def on_skip_list(self, skplist: SkipListState, cur: OpBase) -> OpBase:
     def decompose(self, options: dict) -> List[OpBase]:
         window = self.attrs["window"]
         blocking_len = options["blocking_len"]
-        if _decide_use_skip_list(window, blocking_len):
+        if _decide_use_skip_list(options, window, blocking_len):
             b = Builder(self.get_parent())
             with b:
                 newv = self.inputs[0]
@@ -469,7 +474,7 @@ class TsArgMax(WindowedReduce):
     def decompose(self, options: dict) -> List[OpBase]:
         window = self.attrs["window"]
         blocking_len = options["blocking_len"]
-        if _decide_use_skip_list(window, blocking_len):
+        if _decide_use_skip_list(options, window, blocking_len):
             b = Builder(self.get_parent())
             with b:
                 TsArgMin(0-self.inputs[0], window)
@@ -690,6 +695,11 @@ def required_input_window(self) -> int:
         return self.attrs["window"] + 1
     
     def decompose(self, options: dict) -> List[OpBase]:
+        if options.get("no_skip_list", False):
+            raise RuntimeError(
+                "WindowedQuantile has no non-skip-list decompose path; "
+                "it cannot run under options[\"no_skip_list\"]=True "
+                "(e.g. on the GPU backend)")
         b = Builder(self.get_parent())
         window = self.attrs["window"]
         v = self.inputs[0]
diff --git a/KunQuant/ops/MiscOp.py b/KunQuant/ops/MiscOp.py
index 218d487..ba6fa17 100644
--- a/KunQuant/ops/MiscOp.py
+++ b/KunQuant/ops/MiscOp.py
@@ -1,5 +1,5 @@
 import KunQuant
-from KunQuant.Op import AcceptSingleValueInputTrait, Input, OpBase, WindowedTrait, SinkOpTrait, CrossSectionalOp, GlobalStatefulProducerTrait, GloablStatefulOpTrait, StateConsumerTrait, UnaryElementwiseOp, BinaryElementwiseOp
+from KunQuant.Op import AcceptSingleValueInputTrait, Input, OpBase, WindowedTrait, SinkOpTrait, CrossSectionalOp, GlobalStatefulProducerTrait, GloablStatefulOpTrait, StateConsumerTrait, MayRequireWholeTime, UnaryElementwiseOp, BinaryElementwiseOp
 from typing import List, Tuple, Union
 
 class BackRef(OpBase, WindowedTrait):
@@ -29,15 +29,35 @@ def get_state_variable_name_prefix(self) -> str:
     def generate_step_code(self, idx: str, time_idx: str, inputs: List[str], buf_name: str) -> str:
         return f"auto v{idx} = sum_{idx}.step({buf_name}, {inputs[0]}, {time_idx});"
 
-class Accumulator(OpBase, GlobalStatefulProducerTrait):
+class Accumulator(OpBase, GlobalStatefulProducerTrait, MayRequireWholeTime):
     '''
     Accumulator is a stateful op that accumulates the input value over time.
-    It can be used to compute running totals, moving averages, etc.'''
-    def __init__(self, v: OpBase, name: str) -> None:
-        super().__init__([v], [("name", name)])
+    It can be used to compute running totals, moving averages, etc.
+
+    `init_val` is the initial scalar stored in the slot before the first
+    time step.  Pass a float (default 0) for a plain numeric init, or the
+    string "nan" for a NaN init (mirrors ConstantOp's "nan" handling).
+    '''
+    def __init__(self, v: OpBase, name: str,
+                  is_whole_time_required: bool = False,
+                  init_val: Union[float, str] = 0) -> None:
+        if isinstance(init_val, str) and init_val != "nan":
+            raise RuntimeError(
+                f"Accumulator init_val str must be 'nan', got {init_val!r}")
+        super().__init__([v],
+                          [("name", name),
+                           ("whole_time", is_whole_time_required),
+                           ("init_val", init_val)])
+    def is_whole_time_required(self) -> bool:
+        return self.attrs["whole_time"]
     def get_state_variable_name_prefix(self) -> str:
         return "accu_"
-    
+
+    def generate_init_code(self, idx: str, elem_type: str, simd_lanes: int, inputs: List[str], aligned: bool) -> str:
+        from KunQuant.passes.CodegenCpp import _float_value_to_float
+        init = _float_value_to_float(self.attrs["init_val"], elem_type)
+        return f"{self.get_func_or_class_full_name(elem_type, simd_lanes)} {self.get_state_variable_name_prefix()}{idx} {{ {init} }};"
+
     def generate_step_code(self, idx: str, time_idx: str, inputs: List[str]) -> str:
         return f"auto v{idx} = accu_{idx}.asValue();"
 
@@ -72,13 +92,20 @@ def verify(self, func: 'KunQuant.Stage.Function') -> None:
     
 class ReturnFirstValue(OpBase):
     '''
-    Return the first value of the input. It is used keep the dependency of the input op, like SetAccumulator.
+    Return inputs[0] as this op's value; the remaining inputs are kept
+    only as dependencies (graph-level keep-alives).
+
+    KunQuant's Python IR is a graph IR — an op with no users is dropped
+    during topo sort / GC.  SetAccumulator is side-effecting but produces
+    no consumer-visible value, so attaching it as inputs[1:] of
+    ReturnFirstValue is how we keep it reachable from a graph output.
     '''
     def __init__(self, v: List[OpBase]) -> None:
         super().__init__(v, [])
     
 
-class ExpMovingAvg(OpBase, GloablStatefulOpTrait, AcceptSingleValueInputTrait):
+class ExpMovingAvg(OpBase, GloablStatefulOpTrait, AcceptSingleValueInputTrait,
+                    MayRequireWholeTime):
     '''
     Exponential Moving Average (EMA)
     Similar to pd.DataFrame.ewm(span=window, adjust=False, ignore_na=True).mean()
@@ -115,6 +142,9 @@ def generate_init_code(self, idx: str, elem_type: str, simd_lanes: int, inputs:
     def generate_step_code(self, idx: str, time_idx: str, inputs: List[str]) -> str:
         return f"auto v{idx} = ema_{idx}.step({inputs[0]}, {time_idx});"
 
+    def is_whole_time_required(self) -> bool:
+        return True
+
 class WindowedLinearRegression(OpBase, WindowedTrait, GlobalStatefulProducerTrait):
     '''
     Compute states of Windowed Linear Regression
diff --git a/KunQuant/passes/CodegenMLIR.py b/KunQuant/passes/CodegenMLIR.py
new file mode 100644
index 0000000..f578fc2
--- /dev/null
+++ b/KunQuant/passes/CodegenMLIR.py
@@ -0,0 +1,11 @@
+"""Compatibility shim for the optional KunQuant-MLIR codegen backend."""
+
+try:
+    from KunQuantMLIR.codegen_mlir import *  # noqa: F401,F403
+except ModuleNotFoundError as e:
+    if e.name and e.name.startswith("KunQuantMLIR"):
+        raise ImportError(
+            "KunQuant MLIR codegen backend is not installed. "
+            "Install KunQuant-MLIR to use KunQuant.passes.CodegenMLIR."
+        ) from e
+    raise
diff --git a/KunQuant/passes/ExperimentalExpand.py b/KunQuant/passes/ExperimentalExpand.py
new file mode 100644
index 0000000..f0af28d
--- /dev/null
+++ b/KunQuant/passes/ExperimentalExpand.py
@@ -0,0 +1,292 @@
+"""Experimental stateful-op expansion pass (currently GPU-only).
+
+Gated on ``options["experimental_expand"]`` — when False (default), the
+pass returns immediately so the CPU pipeline is untouched.  Runs after
+the first ``decompose`` so user-facing composite ops (e.g.
+``WindowedLinearRegressionSlope``) have already been broken into
+``WindowedLinearRegression`` + per-extractor ``Impl`` ops.
+
+Replaces ops that the kunir codegen doesn't lower directly with
+``Accumulator + Select + SetAccumulator`` chains (and FBW reductions) it
+does support:
+
+* ``ExpMovingAvg(v, span)`` → an ``Accumulator(init_val="nan")`` carrying
+  the running EMA.  The NaN init doubles as the "not yet seeded" sentinel
+  — first non-NaN ``v`` is stored verbatim, subsequent non-NaN ``v`` uses
+  the pandas ``ewm(adjust=False, ignore_na=True)`` update.  An ``__init``
+  Input is not supported yet — the pass raises on encounter.
+
+* ``WindowedLinearRegression(v, window)`` → ``FastWindowedSum`` for the
+  running sum / sum-of-squares, plus a ``ForeachBackWindow`` +
+  ``WindowLoopIndex`` + ``ReduceAdd`` for the position-weighted sum_xy.
+  Intermediate ops are stashed in ``state[lin_op] : List[OpBase]`` so each
+  consumer Impl (``Slope``, ``RSqaure``, ``Resi``) can pick the entries it
+  needs and emit its final formula.
+
+* ``SetInfOrNanToValue(a, value)`` → ``Select(isnan(a - a), value, a)``
+  (mirrors the C++ implementation; ``a - a`` is NaN for both NaN and ±Inf).
+
+* ``ReduceDecayLinear(v, window)`` → ``ReduceAdd(v * weight)`` where
+  ``weight = (WindowLoopIndex + 1) / (window * (window + 1) / 2)``.
+"""
+
+from typing import Callable, Dict, List, Optional, Tuple, Type
+
+from KunQuant.Op import (
+    OpBase, Builder, ConstantOp, ForeachBackWindow, IterValue,
+    WindowedTempOutput, WindowLoopIndex,
+)
+from KunQuant.ops.ElewiseOp import Select, Equals, Not, SetInfOrNanToValue
+from KunQuant.ops.ReduceOp import ReduceAdd, ReduceDecayLinear
+from KunQuant.ops.MiscOp import (
+    FastWindowedSum, Accumulator, SetAccumulator,
+    ExpMovingAvg, WindowedLinearRegression,
+    WindowedLinearRegressionSlopeImpl,
+    WindowedLinearRegressionRSqaureImpl,
+    WindowedLinearRegressionResiImpl,
+)
+from KunQuant.Stage import Function
+from .Util import kun_pass
+
+
+# ── EMA expansion ───────────────────────────────────────────────────
+
+def _expand_ema(op: ExpMovingAvg) -> OpBase:
+    """Build the Accumulator-based chain inside the current Builder.
+
+    The slot is initialised to NaN, which serves as the "not yet seeded"
+    sentinel: a NaN ``prev`` means we still need to seed with the first
+    non-NaN ``x``.  The SetAccumulator's mask is ``notnan_x``, so NaN
+    inputs leave the slot unchanged (pandas ignore_na=True).
+    """
+    if len(op.inputs) >= 2:
+        raise RuntimeError(
+            "experimental_expand: ExpMovingAvg with an `__init` Input is "
+            "not supported yet on the GPU backend")
+    span  = op.attrs["window"]
+    alpha = 2.0 / (span + 1)
+    x     = op.inputs[0]
+
+    # `is_whole_time_required=True` propagates the kernel's
+    # unreliable_count to the sentinel so the runtime collapses to a
+    # single chunk — EMA's per-stock state can't survive a chunk
+    # boundary reset.
+    prev    = Accumulator(x, f"ema_{span}", init_val="nan",
+                          is_whole_time_required=True)
+    notnan_x  = Equals(x, x)
+    prev_nan  = Not(Equals(prev, prev))
+
+    formula = x * alpha + prev * (1.0 - alpha)
+    #   prev is NaN (still warmup):
+    #     - x non-NaN → seed with x
+    #     - x NaN     → keep NaN (Select returns `x`)
+    #   prev is set:
+    #     - x non-NaN → standard formula
+    #     - x NaN     → carry prev unchanged
+    new_ema = Select(prev_nan, x, Select(notnan_x, formula, prev))
+    # mask = notnan_x: on NaN x we don't touch the slot (preserves
+    # both the NaN-sentinel and the carried prev).  SetAccumulator
+    # returns the slot's new value for this step (mask ? value : prev),
+    # which matches `new_ema` here — use it directly as the EMA result.
+    return SetAccumulator(prev, notnan_x, new_ema)
+
+
+# ── WindowedLinearRegression intermediate state ────────────────────
+
+# Field names in the per-op `state` list returned by `_expand_linreg`.
+# Consumers index by these constants for clarity.
+_LR_SUM_Y    = 0   # FastWindowedSum(v,    window)
+_LR_SUM_YY   = 1   # FastWindowedSum(v*v,  window)
+_LR_SUM_XY   = 2   # Σ_{i=0..window-1} i * v[t-window+1+i]
+_LR_SLOPE    = 3
+_LR_INTERCEPT = 4
+_LR_V        = 5   # original v (for the Resi consumer)
+
+
+def _expand_linreg(op: WindowedLinearRegression) -> List[OpBase]:
+    """Emit running sums + the closed-form slope/intercept for v
+    regressed on the integer position x = 0..window-1 within the window.
+
+    The x positions are treated as constants (i.e. no NaN-aware
+    re-indexing) — for an input with NaN entries the running sums become
+    NaN via the FastWindowedSum / FBW NaN propagation and consumers
+    return NaN through.
+    """
+    window = op.attrs["window"]
+    v      = op.inputs[0]
+
+    # sum_y = rolling sum of v over the window; NaN until window full.
+    # FastWindowedSum requires a WindowedDataSourceOp input sized window+1.
+    sum_y  = FastWindowedSum(WindowedTempOutput(v, window + 1), window)
+    # sum_yy = rolling sum of v² — same pattern over a v*v intermediate.
+    sum_yy = FastWindowedSum(WindowedTempOutput(v * v, window + 1), window)
+    # sum_xy = Σ idx * v where idx is the window position (0=oldest,
+    # window-1=newest).  Express via FBW + WindowLoopIndex + Mul +
+    # ReduceAdd; OOB reads (warmup) return NaN, so sum_xy is NaN until
+    # the window fills.
+    wtemp = WindowedTempOutput(v, window)
+    with ForeachBackWindow(wtemp, window) as each:
+        idx     = WindowLoopIndex(each)
+        val     = IterValue(each, wtemp)
+        contrib = idx * val
+    sum_xy = ReduceAdd(contrib)
+
+    # Compile-time constants for x:
+    #   sum_x  = Σ i  for i in [0, window)       = window*(window-1)/2
+    #   sum_xx = Σ i² for i in [0, window)       = window*(window-1)*(2*window-1)/6
+    # ⇒ denom = window*sum_xx - sum_x² = window²(window-1)(window+1)/12
+    n      = float(window)
+    sum_x  = n * (n - 1) / 2.0
+    denom  = (n * n) * (n - 1.0) * (n + 1.0) / 12.0   # constant; assume window > 1
+    slope     = (sum_xy * n - sum_y * sum_x) / denom
+    intercept = (sum_y - slope * sum_x) / n
+
+    state = [None] * 6
+    state[_LR_SUM_Y]     = sum_y
+    state[_LR_SUM_YY]    = sum_yy
+    state[_LR_SUM_XY]    = sum_xy
+    state[_LR_SLOPE]     = slope
+    state[_LR_INTERCEPT] = intercept
+    state[_LR_V]         = v
+    return state
+
+
+# ── Consumer formulas (one per Impl op) ─────────────────────────────
+
+def _expand_lr_slope(op: OpBase, state: Dict[OpBase, List[OpBase]]) -> OpBase:
+    # The slope is already stored in the producer's state list; just look it up.
+    return state[op.inputs[0]][_LR_SLOPE]
+
+
+def _expand_lr_rsquare(op: OpBase,
+                       state: Dict[OpBase, List[OpBase]]) -> OpBase:
+    lin_op = op.inputs[0]
+    lr_state = state[lin_op]
+    # SS_reg = slope² * (window*sum_xx - sum_x²) / window = slope² * denom / window
+    # SS_tot = sum_yy - sum_y²/window
+    # R²     = SS_reg / SS_tot
+    n     = float(lin_op.attrs["window"])
+    denom = (n * n) * (n - 1.0) * (n + 1.0) / 12.0
+    slope = lr_state[_LR_SLOPE]
+    ss_reg = (slope * slope) * (denom / n)
+    ss_tot = (
+        lr_state[_LR_SUM_YY] -
+        (lr_state[_LR_SUM_Y] * lr_state[_LR_SUM_Y]) / n)
+    return ss_reg / ss_tot
+
+
+def _expand_lr_resi(op: OpBase,
+                    state: Dict[OpBase, List[OpBase]]) -> OpBase:
+    lin_op = op.inputs[0]
+    lr_state = state[lin_op]
+    # residual at the newest window position (x = window-1):
+    #   v_t - (slope * (window-1) + intercept)
+    pred = (
+        lr_state[_LR_SLOPE] * float(lin_op.attrs["window"] - 1) +
+        lr_state[_LR_INTERCEPT])
+    return lr_state[_LR_V] - pred
+
+
+# ── SetInfOrNanToValue expansion ────────────────────────────────────
+
+def _expand_set_inf_or_nan(op: SetInfOrNanToValue) -> OpBase:
+    # Mirrors the C++ implementation in cpp/Kun/Ops.hpp:
+    # `mask = isnan(a - a); return select(mask, v, a)`.
+    # `a - a` is 0 for finite `a` and NaN for NaN/±Inf (Inf-Inf == NaN),
+    # so isnan-of-diff catches both NaN and Inf in one shot.
+    a = op.inputs[0]
+    diff = a - a
+    mask = Not(Equals(diff, diff))
+    return Select(mask, ConstantOp(op.attrs["value"]), a)
+
+
+# ── DecayLinear reduction expansion ─────────────────────────────────
+
+def _expand_decay_linear(op: ReduceDecayLinear) -> OpBase:
+    if len(op.inputs) != 1:
+        raise RuntimeError(
+            f"experimental_expand: ReduceDecayLinear expects one input "
+            f"(op = {op})")
+    window = int(op.attrs["window"])
+    denom = (1.0 + window) * window / 2.0
+    loop = op.get_loop()
+    with loop:
+        idx = WindowLoopIndex(loop)
+        weight = (idx + 1.0) * (1.0 / denom)
+        contrib = op.inputs[0] * weight
+    return ReduceAdd(contrib)
+
+
+# ── Dispatch table helpers ──────────────────────────────────────────
+
+ExpandFunc = Callable[[OpBase, Dict[OpBase, List[OpBase]]], OpBase]
+ExpandRule = Tuple[Type[OpBase], ExpandFunc]
+
+
+_EXPAND_RULES: List[ExpandRule] = [
+    (ExpMovingAvg, lambda op, state: _expand_ema(op)),
+    (WindowedLinearRegressionSlopeImpl, _expand_lr_slope),
+    (WindowedLinearRegressionRSqaureImpl, _expand_lr_rsquare),
+    (WindowedLinearRegressionResiImpl, _expand_lr_resi),
+    (SetInfOrNanToValue, lambda op, state: _expand_set_inf_or_nan(op)),
+    (ReduceDecayLinear, lambda op, state: _expand_decay_linear(op)),
+]
+
+
+def _find_expand_rule(op: OpBase) -> Optional[ExpandFunc]:
+    # First matching (type, expander) entry wins; None when no rule applies.
+    return next(
+        (fn for kind, fn in _EXPAND_RULES if isinstance(op, kind)),
+        None)
+
+
+# ── Pass driver ─────────────────────────────────────────────────────
+
+def _experimental_expand_impl(ops: List[OpBase],
+                              options: dict) -> Optional[List[OpBase]]:
+    """Rewrite `ops` via the expand rules; return the new list, or None if nothing changed."""
+    # state[lin_op] = list of intermediate ops; consumers pick by index.
+    state: Dict[OpBase, List[OpBase]] = {}
+    replace_map: Dict[OpBase, OpBase] = {}
+    out: List[OpBase] = []
+    changed = False
+
+    for op in ops:
+        op.replace_inputs(replace_map)
+
+        if isinstance(op, WindowedLinearRegression):
+            b = Builder(op.get_parent())
+            with b:
+                lin_state = _expand_linreg(op)
+            out.extend(b.ops)
+            state[op] = lin_state
+            # The LinearRegression op produces a "state handle" Value
+            # consumed only by its Impl ops, which we lower below via
+            # `state[]` lookup — so we don't keep `op` in `out` and we
+            # don't enter it in `replace_map`.  Consumers find the same
+            # original Python object by identity through `op.inputs[0]`.
+            changed = True
+            continue
+
+        expand = _find_expand_rule(op)
+        if expand is not None:
+            b = Builder(op.get_parent())
+            with b:
+                new_val = expand(op, state)
+            out.extend(b.ops)
+            replace_map[op] = new_val
+            changed = True
+            continue
+
+        out.append(op)
+
+    return out if changed else None
+
+
+@kun_pass
+def experimental_expand(f: Function, options: dict = {}):
+    # Opt-in pass: does nothing unless options["experimental_expand"] is truthy.
+    if options.get("experimental_expand", False):
+        expanded = _experimental_expand_impl(f.ops, options)
+        if expanded is not None:
+            f.set_ops(expanded)
diff --git a/KunQuant/passes/Partitioner.py b/KunQuant/passes/Partitioner.py
index 7f5c82f..c6534cd 100644
--- a/KunQuant/passes/Partitioner.py
+++ b/KunQuant/passes/Partitioner.py
@@ -1,4 +1,4 @@
-from KunQuant.Op import OpBase, Output, Input, CrossSectionalOp, GraphSourceTrait, ConstantOp, ReductionOp, BoolOpTrait, GlobalStatefulProducerTrait, StateConsumerTrait
+from KunQuant.Op import OpBase, Output, Input, CrossSectionalOp, GraphSourceTrait, ConstantOp, ReductionOp, BoolOpTrait, GlobalStatefulProducerTrait, StateConsumerTrait, WindowedTempOutput
 from KunQuant.ops.MiscOp import ReturnFirstValue
 from KunQuant.Stage import Function, OpInfo
 from KunQuant.ops import GenericPartition
@@ -289,10 +289,34 @@ def add_to_naming_table(v: str) -> str:
                 # input is shared by all ops
                 assert(op not in op_lookup_table)
                 op_lookup_table[op] = p
+    # Map output-name → producer partition.  Tracks the partition that
+    # owns the Output op for each cross-partition / graph name.  Used by
+    # the WTO(Input) peel below to record the real upstream dependency
+    # after dereferencing the WTO wrapper.
+    name_to_output_partition: Dict[str, _Partition] = {}
+    for op, owner in op_lookup_table.items():
+        if isinstance(op, Output):
+            name_to_output_partition[op.attrs["name"]] = owner
     hash_cache: Dict['OpBase', int] = dict()
     for p in partitions:
         name_to_input = dict()
         depending : typing.OrderedDict[_Partition, None] = OrderedDict()
+
+        def get_local_input(out_name: str, prefer: OpBase = None) -> OpBase:
+            """Return p's local `Input(out_name)`, creating it if needed.
+            If `prefer` is given and already lives in `p.ops`, reuse it
+            instead of allocating a new Input."""
+            inop = name_to_input.get(out_name)
+            if inop is not None:
+                return inop
+            if prefer is not None and prefer in p.ops:
+                inop = prefer
+            else:
+                inop = Input(out_name)
+                p.add(None, inop)
+            name_to_input[out_name] = inop
+            return inop
+
         # for each op in partition
         for op in list(p.ops):
             for idx, inp in enumerate(op.inputs):
@@ -300,6 +324,29 @@ def add_to_naming_table(v: str) -> str:
                     # if the partition depends on an op of another partition
                     if inp.get_parent():
                         raise RuntimeError("Bad cross partition op: " + str(inp) + "\ncur op=" + str(op))
+                    # If the input of an op is a WindowedTempOutput wrapping a partition Input, peel it off.
+                    # original: Op(WindowedTempOutput(Input("xxx")))
+                    # peeled: Op(Input("xxx"))
+                    # Note that the WindowedTempOutput should be in another partition,
+                    # which has been processed already in the parent loop `for p in partitions`.
+                    # Input("xxx") should be an input of that partition
+                    orig_inp = inp
+                    while isinstance(inp, WindowedTempOutput) and \
+                            isinstance(inp.inputs[0], Input):
+                        inp = inp.inputs[0]
+                    # if Op(WindowedTempOutput(Input("xxx"))) pattern is found ...
+                    if inp is not orig_inp:
+                        # orig_inp is the WindowedTempOutput
+                        orig_info = f.op_to_id[orig_inp]
+                        if op in orig_info.uses:
+                            del orig_info.uses[op]
+                        # inp is the Input
+                        out_name = inp.attrs["name"]
+                        producer = name_to_output_partition.get(out_name)
+                        if producer is not None and producer != p:
+                            depending[producer] = None
+                        op.inputs[idx] = get_local_input(out_name, prefer=inp)
+                        continue
                     inp_info = f.op_to_id[inp]
                     if isinstance(inp, ConstantOp):
                         if op in inp_info.uses:
@@ -318,6 +365,7 @@ def add_to_naming_table(v: str) -> str:
                             inp_partition = op_lookup_table[inp]
                             inp_partition.add(None, outop)
                             op_lookup_table[outop] = inp_partition
+                            name_to_output_partition[out_name] = inp_partition
                         else:
                             out_name = outop.attrs["name"]
                             inp_partition = op_lookup_table[outop]
@@ -327,13 +375,7 @@ def add_to_naming_table(v: str) -> str:
                         out_name = inp.attrs["name"]
                     if op in inp_info.uses:
                         del inp_info.uses[op]
-                    
-                    inop = name_to_input.get(out_name, None)
-                    if not inop:
-                        inop = Input(out_name)
-                        p.add(None, inop)
-                        name_to_input[out_name] = inop
-                    op.inputs[idx] = inop
+                    op.inputs[idx] = get_local_input(out_name)
         p.depending = depending
         p.stage_op = GenericPartition([], None)
     
diff --git a/KunQuant/passes/TempWindowElim.py b/KunQuant/passes/TempWindowElim.py
index 7646e21..bf51eb2 100644
--- a/KunQuant/passes/TempWindowElim.py
+++ b/KunQuant/passes/TempWindowElim.py
@@ -1,5 +1,8 @@
 from KunQuant.passes.Util import kun_pass
-from KunQuant.Op import OpBase, WindowedTempOutput, Input, Output, traverse_replace_map
+from KunQuant.Op import (
+    OpBase, WindowedTempOutput, Input, Output, WindowedTrait,
+    traverse_replace_map,
+)
 from KunQuant.Stage import Function
 from typing import List, Dict, Tuple
 
@@ -9,13 +12,18 @@ def _get_temp_out_with_window(op: OpBase, window: int):
     w = op.attrs["window"]
     return w >= window, w
 
-def for_each_op(op: OpBase, f: Function, replace_map: dict) -> Tuple[OpBase, OpBase]:
+def for_each_op(op: OpBase, f: Function, replace_map: dict, may_slice_time: bool) -> Tuple[OpBase, OpBase]:
     if not isinstance(op, WindowedTempOutput):
         return (op, None)
     inp = op.inputs[0]
     # temp window on input, simply eliminate it
     if isinstance(inp, Input):
         return (None, inp)
+    # If nobody consumes this as a windowed source, the temp window is just
+    # the current input value and can be folded away.
+    if not any(isinstance(user, WindowedTrait)
+               for user in f.op_to_id[op].uses):
+        return (None, inp)
     # check if the input of WindowedTempOutput is used in Output or other WindowedTempOutput
     inp_info = f.op_to_id[inp]
     window = op.attrs["window"]
@@ -24,8 +32,10 @@ def for_each_op(op: OpBase, f: Function, replace_map: dict) -> Tuple[OpBase, OpB
     for user, _ in inp_info.uses.items():
         if user == op:
             continue
-        # if the user is used by Output, return the output
-        if isinstance(user, Output):
+        # if the user is an Output, return the output.  When the
+        # runtime may slice time, reading history from an output buffer can
+        # race across time chunks; keep a local temp window instead.
+        if not may_slice_time and isinstance(user, Output):
             return (None, traverse_replace_map(user, replace_map))
         # select the max window op with the larger id
         checked, w = _get_temp_out_with_window(user, window)
@@ -37,7 +47,28 @@ def for_each_op(op: OpBase, f: Function, replace_map: dict) -> Tuple[OpBase, OpB
         return (None, traverse_replace_map(max_window_op, replace_map))
     return (op, None)
 
-def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
+def _unwrap_output_wto(ops: List[OpBase], f: Function) -> bool:
+    """Rewrite Output(WindowedTempOutput(x)) → Output(x)."""
+    changed = False
+    for op in ops:
+        if not isinstance(op, Output):
+            continue
+        src = op.inputs[0]
+        if not isinstance(src, WindowedTempOutput):
+            continue
+        old_src = src
+        while isinstance(src, WindowedTempOutput):
+            src = src.inputs[0]
+        if op in f.op_to_id[old_src].uses:
+            del f.op_to_id[old_src].uses[op]
+        f.op_to_id[src].uses[op] = 1
+        op.inputs[0] = src
+        changed = True
+    return changed
+
+def temp_window_elim_impl(ops: List[OpBase], f: Function, options: dict) -> List[OpBase]:
+    _unwrap_output_wto(ops, f)
+    may_slice_time = options.get("may_slice_time", False)
     replace_map = dict()
     out = []
     changed = False
@@ -45,7 +76,7 @@ def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
         if op in replace_map:
             continue
         op.replace_inputs(replace_map)
-        normal, replacer = for_each_op(op, f, replace_map)
+        normal, replacer = for_each_op(op, f, replace_map, may_slice_time)
         if normal is not None:
             out.append(op)
         else:
@@ -57,7 +88,7 @@ def temp_window_elim_impl(ops: List[OpBase], f: Function) -> List[OpBase]:
 
 @kun_pass
 def temp_window_elim(f: Function, options: dict = {}):
-    newops = temp_window_elim_impl(f.ops, f)
+    newops = temp_window_elim_impl(f.ops, f, options)
     if newops is not None:
         newops = Function.topo_sort_ops(newops)
-        f.set_ops(newops)
\ No newline at end of file
+        f.set_ops(newops)
diff --git a/KunQuant/passes/__init__.py b/KunQuant/passes/__init__.py
index 9299a42..97cdf0d 100644
--- a/KunQuant/passes/__init__.py
+++ b/KunQuant/passes/__init__.py
@@ -6,4 +6,5 @@
 from .CodegenCpp import codegen_cpp
 from .InferWindow import infer_window
 from .InferWindow import infer_input_window
-from .MergeLoops import merge_loops
\ No newline at end of file
+from .MergeLoops import merge_loops
+from .ExperimentalExpand import experimental_expand
\ No newline at end of file
diff --git a/KunQuant/predefined/talib.py b/KunQuant/predefined/talib.py
index abf4b94..ff27603 100644
--- a/KunQuant/predefined/talib.py
+++ b/KunQuant/predefined/talib.py
@@ -57,7 +57,7 @@ def decompose(self, options: dict) -> List[OpBase]:
             tr = TRANGE(high, low, close)
 
             mask_true = Equals(ConstantOp(0), ConstantOp(0))
-            cnt_acc = Accumulator(tr, f"atr_cnt_{window}")
+            cnt_acc = Accumulator(tr, f"atr_cnt_{window}", is_whole_time_required=True)
             prev_cnt = cnt_acc
             new_cnt = prev_cnt + 1
             set_cnt = SetAccumulator(cnt_acc, mask_true, new_cnt)
@@ -102,15 +102,15 @@ def decompose(self, options: dict) -> List[OpBase]:
 
             mask_true = Equals(ConstantOp(0), ConstantOp(0))
             dummy = ReturnFirstValue([high, low])
-            cnt_acc = Accumulator(dummy, f"sar_cnt_{af_init}_{af_step}_{af_max}")
+            cnt_acc = Accumulator(dummy, f"sar_cnt_{af_init}_{af_step}_{af_max}", is_whole_time_required=True)
             prev_cnt = cnt_acc
             set_cnt = SetAccumulator(cnt_acc, mask_true, prev_cnt + 1)
             is_bar_0 = Equals(prev_cnt, ConstantOp(0))
             is_bar_1 = Equals(prev_cnt, ConstantOp(1))
 
-            sar_acc = Accumulator(dummy, f"sar_value_{af_init}_{af_step}_{af_max}")
-            ep_acc = Accumulator(dummy, f"sar_ep_{af_init}_{af_step}_{af_max}")
-            af_acc = Accumulator(dummy, f"sar_af_{af_init}_{af_step}_{af_max}")
+            sar_acc = Accumulator(dummy, f"sar_value_{af_init}_{af_step}_{af_max}", is_whole_time_required=True)
+            ep_acc = Accumulator(dummy, f"sar_ep_{af_init}_{af_step}_{af_max}", is_whole_time_required=True)
+            af_acc = Accumulator(dummy, f"sar_af_{af_init}_{af_step}_{af_max}", is_whole_time_required=True)
 
             prev_sar = sar_acc
             prev_ep = ep_acc
diff --git a/KunQuantMLIR/OverlapRunner.py b/KunQuantMLIR/OverlapRunner.py
new file mode 100644
index 0000000..4deef06
--- /dev/null
+++ b/KunQuantMLIR/OverlapRunner.py
@@ -0,0 +1,280 @@
+"""Pipelined CUDA runner for overlapping copies with KunMLIR launches."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Protocol, Tuple, Union
+
+import numpy as np
+
+try:
+    import cupy as cp
+except ImportError as exc:
+    raise ImportError(
+        "KunQuantMLIR.OverlapRunner requires CuPy for asynchronous CUDA "
+        "H2D/D2H copies. Install a CuPy build matching your CUDA runtime, "
+        "for example cupy-cuda12x."
+    ) from exc
+
+from KunQuantMLIR import KunMLIR
+
+
+CudaStream = Union[cp.cuda.Stream, cp.cuda.ExternalStream]
+
+
+class DLPackProvider(Protocol):
+    def __dlpack__(self) -> object:
+        ...
+
+
+def _stream_from_executor(executor: KunMLIR.Executor) -> CudaStream:
+    if executor.stream == 0:
+        return cp.cuda.Stream.null
+    return cp.cuda.ExternalStream(executor.stream)
+
+
+@dataclass
+class PendingResult:
+    """Host outputs from an asynchronous D2H copy.
+
+    The NumPy arrays are intentionally kept alive by this object until the
+    completion event has passed. Call ``wait()`` before reading them.
+    """
+
+    _output_names: List[str]
+    _host_output_block: np.ndarray
+    _done_event: cp.cuda.Event
+    _outputs: Optional[Dict[str, np.ndarray]] = None
+
+    def wait(self) -> Dict[str, np.ndarray]:
+        self._done_event.synchronize()
+        if self._outputs is None:
+            self._outputs = {
+                name: self._host_output_block[i]
+                for i, name in enumerate(self._output_names)
+            }
+        return self._outputs
+
+
+@dataclass
+class _Slot:
+    index: int
+    dev_inputs: Dict[str, cp.ndarray]
+    dev_outputs: Optional[Dict[str, cp.ndarray]]
+    output_length: Optional[int]
+    output_num_stocks: Optional[int]
+    graph_executable: Optional[KunMLIR.Executable]
+    h2d_event: cp.cuda.Event
+    compute_event: cp.cuda.Event
+    free_event: Optional[cp.cuda.Event]
+    h2d_sources: Optional[Dict[str, np.ndarray]]
+    host_output_block: Optional[np.ndarray]
+
+
+class OverlapRunner:
+    """Submit KunMLIR ``runGraph`` calls through a copy/compute pipeline.
+
+    The runner owns three non-blocking streams shared by all slots:
+    one for H2D input copies, one for the KunMLIR executor, and one for
+    D2H output copies. Slots only own reusable device input/output buffers
+    and references that must stay alive while async copies are in flight.
+    """
+
+    def __init__(self, executable: KunMLIR.Executable,
+                 executor: KunMLIR.Executor,
+                 num_slots: int = 3) -> None:
+        if num_slots < 2:
+            raise ValueError("OverlapRunner requires at least two slots")
+
+        self.executable = executable
+        self.executor = executor
+        self.compute_stream = _stream_from_executor(executor)
+        self.output_names = list(executable.output_names)
+
+        self.h2d_stream = cp.cuda.Stream(non_blocking=True)
+        self.d2h_stream = cp.cuda.Stream(non_blocking=True)
+
+        self._slots: List[_Slot] = [
+            _Slot(
+                index=i,
+                dev_inputs={},
+                dev_outputs=None,
+                output_length=None,
+                output_num_stocks=None,
+                graph_executable=None,
+                h2d_event=cp.cuda.Event(),
+                compute_event=cp.cuda.Event(),
+                free_event=None,
+                h2d_sources=None,
+                host_output_block=None,
+            )
+            for i in range(num_slots)
+        ]
+        self._next_slot = 0
+
+    @property
+    def num_slots(self) -> int:
+        return len(self._slots)
+
+    def submit(self, inputs: Dict[str, np.ndarray], cur_time: int = 0,
+               length: int = 0, mask: int = 0,
+               min_chunk_warmup_factor: int = 4,
+               sm_fill_factor: float = 1.5,
+               use_cuda_graph: bool = False) -> PendingResult:
+        # Validate on the host first: a rejected call must not consume a
+        # slot or enqueue H2D copies that no event would then track.
+        host_inputs = self._prepare_host_inputs(inputs)
+        output_length, num_stocks, output_dtype = self._output_spec(
+            host_inputs, length)
+
+        slot = self._slots[self._next_slot]
+        self._next_slot = (self._next_slot + 1) % len(self._slots)
+        if slot.free_event is not None:
+            slot.free_event.synchronize()
+        slot.h2d_sources = host_inputs
+        run_inputs, inputs_resized = self._copy_inputs_to_device(
+            slot, host_inputs)
+        dev_outputs = self._cached_outputs_for_run(
+            slot, output_length, num_stocks, output_dtype, inputs_resized)
+        h2d_done = slot.h2d_event
+        h2d_done.record(self.h2d_stream)
+        slot.free_event = h2d_done
+        # Compute may only start once this slot's inputs are on the device.
+        self.compute_stream.wait_event(h2d_done)
+        executable = self._executable_for_slot(slot, use_cuda_graph)
+        try:
+            ret = self.executor.runGraph(
+                executable,
+                run_inputs,
+                cur_time=cur_time,
+                length=length,
+                outputs=dev_outputs,
+                mask=mask,
+                min_chunk_warmup_factor=min_chunk_warmup_factor,
+                sm_fill_factor=sm_fill_factor,
+                use_cuda_graph=use_cuda_graph,
+            )
+        except Exception:
+            compute_done = slot.compute_event
+            compute_done.record(self.compute_stream)
+            slot.free_event = compute_done
+            raise
+
+        compute_done = slot.compute_event
+        compute_done.record(self.compute_stream)
+        slot.free_event = compute_done
+        slot.dev_outputs = self._to_cupy_outputs(ret)
+        # Copy results back on the D2H stream once compute has finished.
+        self.d2h_stream.wait_event(compute_done)
+        host_output_block: Optional[np.ndarray] = None
+        try:
+            host_output_block = self._allocate_pinned_output_block(
+                slot.dev_outputs)
+            for i, name in enumerate(self.output_names):
+                dev_output = slot.dev_outputs[name]
+                cp.asnumpy(dev_output, stream=self.d2h_stream,
+                           out=host_output_block[i], blocking=False)
+        finally:
+            d2h_done = cp.cuda.Event()
+            d2h_done.record(self.d2h_stream)
+            slot.free_event = d2h_done
+            slot.host_output_block = host_output_block
+
+        result = PendingResult(self.output_names, host_output_block, d2h_done)
+        return result
+
+    def synchronize(self) -> None:
+        for slot in self._slots:
+            if slot.free_event is not None:
+                slot.free_event.synchronize()
+            slot.h2d_sources = None
+            slot.host_output_block = None
+
+    def _prepare_host_inputs(self, inputs: Dict[str, np.ndarray]
+                             ) -> Dict[str, np.ndarray]:
+        host_inputs: Dict[str, np.ndarray] = {}
+        for name, value in inputs.items():
+            arr = np.asarray(value)
+            if not arr.flags.c_contiguous:
+                arr = np.ascontiguousarray(arr)
+            host_inputs[name] = arr
+        return host_inputs
+
+    def _copy_inputs_to_device(self, slot: _Slot,
+                               host_inputs: Dict[str, np.ndarray]
+                               ) -> Tuple[Dict[str, cp.ndarray], bool]:
+        run_inputs: Dict[str, cp.ndarray] = {}
+        resized = False
+        for name, host in host_inputs.items():
+            dev = slot.dev_inputs.get(name)
+            if dev is None or dev.shape != host.shape or dev.dtype != host.dtype:
+                dev = cp.empty(host.shape, dtype=host.dtype)
+                slot.dev_inputs[name] = dev
+                resized = True
+            dev.set(host, stream=self.h2d_stream)
+            run_inputs[name] = dev
+        return run_inputs, resized
+
+    def _output_spec(self, host_inputs: Dict[str, np.ndarray],
+                     length: int) -> Tuple[int, int, np.dtype]:
+        if not host_inputs:
+            raise ValueError("OverlapRunner requires at least one input")
+        first = next(iter(host_inputs.values()))
+        if first.ndim != 2:
+            raise ValueError("OverlapRunner expects 2-D TS inputs")
+        output_length = first.shape[0] if length == 0 else length
+        return output_length, first.shape[1], first.dtype
+
+    def _cached_outputs_for_run(self, slot: _Slot,
+                                length: int,
+                                num_stocks: int,
+                                dtype: np.dtype,
+                                inputs_resized: bool
+                                ) -> Dict[str, cp.ndarray]:
+        needs_alloc = (
+            slot.dev_outputs is None or
+            slot.output_length != length or
+            slot.output_num_stocks != num_stocks or
+            inputs_resized
+        )
+        if needs_alloc:
+            slot.dev_outputs = {
+                name: cp.empty((length, num_stocks), dtype=dtype)
+                for name in self.output_names
+            }
+            slot.output_length = length
+            slot.output_num_stocks = num_stocks
+        return slot.dev_outputs
+
+    def _allocate_pinned_output_block(
+        self, dev_outputs: Dict[str, cp.ndarray]
+    ) -> np.ndarray:
+        if not self.output_names:
+            raise ValueError("OverlapRunner requires at least one output")
+        first = dev_outputs[self.output_names[0]]
+        shape = first.shape
+        dtype = np.dtype(first.dtype)
+        pinned = cp.cuda.alloc_pinned_memory(len(self.output_names) *
+                                            first.nbytes)
+        return np.frombuffer(
+            pinned, dtype=dtype, count=len(self.output_names) * first.size
+        ).reshape((len(self.output_names),) + shape)
+
+    def _to_cupy_outputs(self, outputs: Dict[str, Union[cp.ndarray,
+                                                        DLPackProvider]]
+                         ) -> Dict[str, cp.ndarray]:
+        ret: Dict[str, cp.ndarray] = {}
+        for name, value in outputs.items():
+            if isinstance(value, cp.ndarray):
+                ret[name] = value
+            else:
+                ret[name] = cp.from_dlpack(value)
+        return ret
+
+    def _executable_for_slot(self, slot: _Slot,
+                             use_cuda_graph: bool) -> KunMLIR.Executable:
+        if not use_cuda_graph:
+            return self.executable
+        if slot.graph_executable is None:
+            slot.graph_executable = self.executable.clone()
+        return slot.graph_executable
diff --git a/KunQuantMLIR/__init__.py b/KunQuantMLIR/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/KunQuantMLIR/codegen_mlir.py b/KunQuantMLIR/codegen_mlir.py
new file mode 100644
index 0000000..f92b643
--- /dev/null
+++ b/KunQuantMLIR/codegen_mlir.py
@@ -0,0 +1,416 @@
+"""Translate a (post-optimize) KunQuant Function into a KunMLIR module
+holding a single kunir.func inside a gpu.module.
+
+This is the GPU-side counterpart to passes.CodegenCpp.codegen_cpp; it
+runs after the same Driver.optimize() pipeline the CPU path uses, then
+walks the lowered IR and emits kunir ops via the KunMLIR.IRBuilder
+pybind class.
+
+Scope (v0): only the ops kunir currently supports.
+  - Elemwise binary: Add, Sub, Mul, Div, Max, Min
+  - Elemwise unary:  Abs, Log, Sign
+  - Cross-sectional: Rank, Scale
+  - Windowed:        WindowedTempOutput, ForeachBackWindow + IterValue,
+                      ReduceAdd / ReduceMul / ReduceMax / ReduceMin
+  - Boundaries:      Input, Output
+
+Anything else raises NotImplementedError with the offending op printed.
+"""
+
+from __future__ import annotations
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    # KunMLIR is a compiled extension built alongside the MLIR support,
+    # only imported here for type checking — no runtime dependency added
+    # to the codegen path itself.
+    from KunQuantMLIR import KunMLIR
+
+from KunQuant.Op import (
+    OpBase, Input, Output, ForeachBackWindow, IterValue, WindowedTempOutput,
+    WindowLoopIndex, ReductionOp, SimpleCrossSectionalOp, ConstantOp,
+    WindowedTrait, Rank, Scale,
+)
+from KunQuant.ops.ElewiseOp import (
+    Add, Sub, Mul, Div, Max, Min, Abs, Log, Exp, Sqrt, Sign,
+    AddConst, SubConst, MulConst, DivConst,
+    GreaterThanConst, LessThanConst,
+    GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
+    And, Or, Not, Select,
+)
+from KunQuant.ops.ReduceOp import (
+    ReduceAdd, ReduceMul, ReduceMax, ReduceMin,
+    ReduceArgMax, ReduceArgMin, ReduceRank,
+)
+from KunQuant.ops.MiscOp import (
+    BackRef, FastWindowedSum,
+    Accumulator, SetAccumulator, ReturnFirstValue,
+)
+from KunQuant.Stage import Function
+
+
+# ── Op-class → IRBuilder method dispatch ────────────────────────────
+
+_BINARY = {
+    Add: "add", Sub: "sub", Mul: "mul", Div: "div",
+    Max: "max", Min: "min",
+    GreaterThan:  "gt", GreaterEqual: "ge",
+    LessThan:     "lt", LessEqual:    "le",
+    Equals:       "eq",
+    And:          "and_", Or:         "or_",
+}
+# Const-on-one-side variants — emit ConstantOp + the matching binary op.
+# `swap=True` puts the scalar on the LEFT (e.g. SubConst(x, v, swap=True)
+# means `v - x`, where for plain SubConst it would mean `x - v`).
+_BINARY_CONST = {
+    AddConst: "add", SubConst: "sub", MulConst: "mul", DivConst: "div",
+    GreaterThanConst: "gt", LessThanConst: "lt",
+}
+_UNARY = {
+    Abs: "abs", Log: "log", Exp: "exp", Sqrt: "sqrt", Sign: "sign",
+    Not: "not_",
+    # NOTE: cross-sectional ops are intentionally absent.  Their
+    # partitions are routed to a pre-compiled CUmodule by
+    # `_maybe_external_partition` below; they never become kunir ops.
+}
+_REDUCE = {
+    ReduceAdd: "reduce_add", ReduceMul: "reduce_mul",
+    ReduceMax: "reduce_max", ReduceMin: "reduce_min",
+    ReduceArgMin: "reduce_argmin", ReduceArgMax: "reduce_argmax",
+}
+# Reduces that need a 2nd input (the outer-scope "current" value).
+# `ReduceRank(iter_val, current)` is the only one today; kept as a separate
+# table so `_emit_reduction` can dispatch without conflating arity.
+_REDUCE_WITH_CURRENT = {
+    ReduceRank: "reduce_rank",
+}
+
+
+# ── Target spec carrier ─────────────────────────────────────────────
+
+class TargetSpec:
+    """GPU launch parameters mirrored from kunir.target_spec."""
+    def __init__(self, *, occupancy: int = 1, warps_per_cta: int = 4,
+                 smem_size: int = 49152, vector_size: int = 1):
+        self.occupancy     = occupancy
+        self.warps_per_cta = warps_per_cta
+        self.smem_size     = smem_size
+        self.vector_size   = vector_size
+
+
+# ── Helpers ─────────────────────────────────────────────────────────
+
+def _kunir_symbol(name: str) -> str:
+    """Coerce a partition name into a valid kunir / PTX symbol.
+
+    The partitioner derives a partition's name from the names of its
+    Output ops; when a partition is "intermediate-only" (every output
+    is consumed by a downstream partition, none is a user-facing
+    Output), those names come from `OpBase.hash_hex` which starts with
+    a digit half the time.  Digits are fine for buffer-table keys
+    (CPU runtime indexes by name) but ptxas rejects them as
+    `.entry` symbols.
+
+    Prefix any such name with a single `_` so the kunir.func symbol
+    is always a valid identifier, while leaving `input_names` /
+    `output_names` (the public buffer-table keys) untouched.
+    """
+    if name and name[0].isdigit():
+        return "_" + name
+    return name
+
+def _index_loop_members(f: Function) -> Tuple[
+        Dict[ForeachBackWindow, List[OpBase]],
+        Dict[ForeachBackWindow, List[ReductionOp]]]:
+    """For each ForeachBackWindow in `f`, collect the body ops (those
+    whose `_parent_loop` is the loop) and the reduction ops (whose
+    `get_loop()` is the loop).  Both lists keep f.ops topo order."""
+    body_ops: Dict[ForeachBackWindow, List[OpBase]] = {}
+    reductions: Dict[ForeachBackWindow, List[ReductionOp]] = {}
+    for op in f.ops:
+        if isinstance(op, ReductionOp):
+            loop = op.get_loop()
+            reductions.setdefault(loop, []).append(op)
+        elif op.get_parent() is not None:
+            body_ops.setdefault(op.get_parent(), []).append(op)
+    return body_ops, reductions
+
+
+def _emit_simple(op: OpBase,
+                  ir: KunMLIR.IRBuilder,
+                  val_map: Dict[OpBase, KunMLIR.Value],
+                  ts_1: KunMLIR.Type) -> KunMLIR.Value:
+    """Emit a non-control-flow op via IRBuilder dispatch.  `ts_1` is the
+    kunir ts type with maxLookback=1, used by ops whose result has no
+    input to infer the element type from (currently only ConstantOp)."""
+    cls = type(op)
+    if cls in _BINARY:
+        # Resolve the builder method by name and apply it to both operands.
+        return getattr(ir, _BINARY[cls])(val_map[op.inputs[0]],
+                                           val_map[op.inputs[1]])
+    if cls in _BINARY_CONST:
+        # Materialize the scalar attr as a kunir.constant, then emit
+        # the matching binary op.  `swap=True` puts the scalar on the
+        # left-hand side (matters for Sub/Div, no-op for Add/Mul).
+        scalar = float(op.attrs["value"])
+        const_val = ir.constant(scalar, ts_1)
+        x = val_map[op.inputs[0]]
+        ir_op = getattr(ir, _BINARY_CONST[cls])
+        if op.attrs.get("swap", False):
+            return ir_op(const_val, x)
+        return ir_op(x, const_val)
+    if cls in _UNARY:
+        return getattr(ir, _UNARY[cls])(val_map[op.inputs[0]])
+    if isinstance(op, WindowedTempOutput):
+        return ir.windowed_output(val_map[op.inputs[0]],
+                                    int(op.attrs["window"]))
+    if isinstance(op, BackRef):
+        return ir.back_ref(val_map[op.inputs[0]], int(op.attrs["window"]))
+    if isinstance(op, FastWindowedSum):
+        return ir.fast_windowed_sum(val_map[op.inputs[0]],
+                                      int(op.attrs["window"]))
+    if isinstance(op, Select):
+        return ir.select(val_map[op.inputs[0]],
+                          val_map[op.inputs[1]],
+                          val_map[op.inputs[2]])
+    if isinstance(op, ConstantOp):
+        v = op.attrs["value"]
+        fv = float("nan") if v == "nan" else float(v)
+        return ir.constant(fv, ts_1)
+    if isinstance(op, WindowLoopIndex):
+        # Resolved by the kunir → kungpu pass to the enclosing
+        # for_each_back_window's induction variable.
+        return ir.window_loop_index(ts_1)
+    if isinstance(op, Accumulator):
+        # The Python op's `inputs[0]` is a keep-alive in the graph IR;
+        # it does NOT feed the slot.  The `name` attr is informational;
+        # each op identifies a distinct slot (kunir.accumulator is not
+        # Pure, so MLIR CSE will not dedup two accumulators).
+        init_v = op.attrs["init_val"]
+        init_f = float("nan") if init_v == "nan" else float(init_v)
+        return ir.accumulator(op.attrs["name"], ts_1, init_f)
+    if isinstance(op, SetAccumulator):
+        # Side-effecting (writes the slot) but also returns the slot's
+        # new value for the current step (`mask ? value : prev`), so
+        # downstream consumers can use the SetAccumulator's SSA result
+        # directly — matches the CPU C++ SetAccumulator semantics.
+        return ir.set_accumulator(val_map[op.inputs[0]],
+                                   val_map[op.inputs[1]],
+                                   val_map[op.inputs[2]])
+    if isinstance(op, ReturnFirstValue):
+        # In the Python graph IR, ReturnFirstValue's only job is to keep
+        # side-effecting siblings (SetAccumulator etc.) reachable from a
+        # graph output so the GC does not drop them.  In SSA-MLIR the
+        # side-effect ops are preserved by their own MemWrite semantics;
+        # ReturnFirstValue carries no new MLIR-level meaning, so we just
+        # forward the first input's Value.  Other inputs were already
+        # emitted in topo order before we got here.
+        return val_map[op.inputs[0]]
+    raise NotImplementedError(
+        f"CodegenMLIR: op type {cls.__name__} is not supported by the "
+        f"GPU backend yet (op = {op})")
+
+
+def _emit_reduction(op: ReductionOp,
+                     ir: KunMLIR.IRBuilder,
+                     val_map: Dict[OpBase, KunMLIR.Value]) -> KunMLIR.Value:
+    cls = type(op)
+    if cls in _REDUCE_WITH_CURRENT:
+        # ReduceRank(iter_val, current): 2 inputs.
+        if len(op.inputs) != 2:
+            raise NotImplementedError(
+                f"CodegenMLIR: {cls.__name__} expects 2 inputs (iter, "
+                f"current); got {len(op.inputs)} (op = {op})")
+        return getattr(ir, _REDUCE_WITH_CURRENT[cls])(
+            val_map[op.inputs[0]], val_map[op.inputs[1]])
+    if cls not in _REDUCE:
+        raise NotImplementedError(
+            f"CodegenMLIR: reduction {cls.__name__} not supported yet "
+            f"(op = {op})")
+    if len(op.inputs) != 1:
+        raise NotImplementedError(
+            f"CodegenMLIR: reductions with init_val are not supported "
+            f"yet (op = {op})")
+    return getattr(ir, _REDUCE[cls])(val_map[op.inputs[0]])
+
+
+# ── Main entry point ────────────────────────────────────────────────
+
+def _maybe_external_partition(f: Function, dtype: str) -> Optional[dict]:
+    """If `f` is a partition the GPU runtime handles as a pre-compiled
+    external kernel (bundled PTX loaded as a separate CUmodule), return
+    a descriptor dict that KunMLIR.compile() should append to the
+    executable's kernel list.  Otherwise return None.
+
+    The descriptor matches what KunMLIR.compile's `external_kernels=`
+    parameter expects:
+        {"name": <str>, "kind": <str>,
+         "inputs": [<str>...], "outputs": [<str>...]}
+
+    Detection mirrors CodegenCpp's "simple cross-sectional fast path"
+    (CodegenCpp.codegen_cpp's `len(f.ops) == 3` check): a partition
+    whose only compute op is a supported `SimpleCrossSectionalOp`
+    (currently Rank or Scale).  The partitioner places every CrossSectionalOp into its own
+    partition without other compute, so this shape is what we get.
+
+    The `kind` string is `cs_<op>_f{32,64}`.  Do not fabricate kinds for
+    cross-sectional ops unless the C++ runtime has a matching bundled
+    external kernel.
+    """
+    compute = [op for op in f.ops
+                if not isinstance(op, (Input, Output))]
+    if len(compute) != 1 or not isinstance(compute[0], SimpleCrossSectionalOp):
+        return None
+    if not isinstance(compute[0], (Rank, Scale)):
+        return None
+    inputs  = [op for op in f.ops if isinstance(op, Input)]
+    outputs = [op for op in f.ops if isinstance(op, Output)]
+    if len(inputs) != 1 or len(outputs) != 1:
+        return None  # surprising shape, let the regular path emit an error
+    if dtype not in ("f32", "f64"):
+        return None
+    op_kind = compute[0].__class__.__name__.lower()
+    return {
+        "name":    f.name or f"cs_{op_kind}",
+        "kind":    f"cs_{op_kind}_{dtype}",
+        "inputs":  [op.attrs["name"] for op in inputs],
+        "outputs": [op.attrs["name"] for op in outputs],
+    }
+
+
+def translate_function(f: Function, target: TargetSpec,
+                        ir: KunMLIR.IRBuilder,
+                        dtype: str = "f32",
+                        unreliable_count: int = 0) -> Optional[dict]:
+    """Emit `f` as a single kunir.func into the open `ir` (KunMLIR.IRBuilder).
+
+    If `f` is an externally-dispatched partition (e.g. a single cs_rank
+    op handled by the bundled cs_rank.ptx CUmodule), emit nothing into
+    the IRBuilder and return its descriptor dict so the caller can pass
+    it to KunMLIR.compile()'s `external_kernels=` list.  Otherwise
+    return `None` after emitting a kunir.func.
+
+    `unreliable_count` is the partition-local warmup depth — the caller
+    (`KunQuant.jit.cuda`) computes it via `infer_window(f)` on this
+    post-partition Function and feeds it in.
+    """
+    ext = _maybe_external_partition(f, dtype)
+    if ext is not None:
+        return ext
+
+    # 1.  Boundary ops in topo order — the kunir.func's I/O.
+    inputs:  List[Input]  = [op for op in f.ops if isinstance(op, Input)]
+    outputs: List[Output] = [op for op in f.ops if isinstance(op, Output)]
+    if not inputs:
+        raise ValueError("CodegenMLIR: function has no Input ops")
+    if not outputs:
+        raise ValueError("CodegenMLIR: function has no Output ops")
+
+    in_names  = [op.attrs["name"] for op in inputs]
+    out_names = [op.attrs["name"] for op in outputs]
+
+    # 2.  Pre-index loop members so we can emit each loop's body +
+    #     reductions contiguously (regardless of topo interleaving with
+    #     other loops).
+    body_ops_by_loop, reductions_by_loop = _index_loop_members(f)
+
+    # 3.  Open the kunir.func.  All inputs are ts<dtype, inf>; all
+    #     graph results are ts<dtype, 1>.
+    ts_inf = ir.ts_type(dtype, 0)
+    ts_1   = ir.ts_type(dtype, 1)
+
+    func_args = ir.begin_func(
+        name=_kunir_symbol(f.name or "kernel"),
+        input_types=[ts_inf] * len(inputs),
+        input_names=in_names,
+        output_names=out_names,
+        occupancy=target.occupancy, warps_per_cta=target.warps_per_cta,
+        smem_size=target.smem_size, vector_size=target.vector_size,
+        unreliable_count=unreliable_count,
+        result_types=[ts_1] * len(outputs),
+    )
+
+    val_map: Dict[OpBase, KunMLIR.Value] = {}
+    emitted = set()
+    for inp, val in zip(inputs, func_args):
+        val_map[inp] = val
+        emitted.add(inp)
+
+    # 4.  Walk f.ops in topo order, emitting one op (or one whole loop)
+    #     at a time.
+    for op in f.ops:
+        if op in emitted:
+            continue
+        if isinstance(op, Input):
+            continue                      # already mapped from func_args
+        if isinstance(op, Output):
+            # An Output may also be read as a windowed source within the
+            # same partition; emit a kunir.output_ref so downstream sees
+            # its gmem buffer as a ts handle.
+            if any(isinstance(u, WindowedTrait)
+                    for u in f.op_to_id[op].uses):
+                val_map[op] = ir.output_ref(op.attrs["name"],
+                                              val_map[op.inputs[0]])
+            continue                      # handled at the end via Return
+        if isinstance(op, ForeachBackWindow):
+            _emit_loop(op, ir, val_map, ts_1,
+                        body_ops_by_loop.get(op, []),
+                        reductions_by_loop.get(op, []),
+                        emitted)
+            continue
+        if isinstance(op, ReductionOp) or op.get_parent() is not None:
+            # Should have been emitted as part of its enclosing loop;
+            # if we hit it here, the loop never appeared first — that's
+            # a bug in topo sort or in this translator's iteration.
+            raise RuntimeError(
+                f"CodegenMLIR: reduction/body op visited before its "
+                f"enclosing loop ({op})")
+        val_map[op] = _emit_simple(op, ir, val_map, ts_1)
+        emitted.add(op)
+
+    # 5.  Close the function with Outputs in declared order.
+    return_values = [val_map[o.inputs[0]] for o in outputs]
+    ir.end_func(return_values)
+    return None
+
+
+def _emit_loop(loop: ForeachBackWindow,
+                ir: KunMLIR.IRBuilder,
+                val_map: Dict[OpBase, KunMLIR.Value],
+                ts_1: KunMLIR.Type,
+                body_ops: List[OpBase],
+                reductions: List[ReductionOp],
+                emitted: set) -> None:
+    loop_input_vals = [val_map[i] for i in loop.inputs]
+    n_results = len(reductions)
+    if n_results == 0:
+        raise NotImplementedError(
+            f"CodegenMLIR: ForeachBackWindow with no reductions "
+            f"(loop = {loop})")
+
+    block_args = ir.begin_for_each_back_window(
+        inputs=loop_input_vals,
+        window=int(loop.attrs["window"]),
+        result_types=[ts_1] * n_results,
+    )
+    # Block args mirror loop.inputs positionally.  Map the source-op
+    # → block-arg so IterValue can be resolved to the right one.
+    block_arg_by_src = {src: block_args[i]
+                          for i, src in enumerate(loop.inputs)}
+
+    # Body ops: IterValue → block arg; everything else uses _emit_simple.
+    for body_op in body_ops:
+        if isinstance(body_op, IterValue):
+            val_map[body_op] = block_arg_by_src[body_op.inputs[1]]
+        else:
+            val_map[body_op] = _emit_simple(body_op, ir, val_map, ts_1)
+        emitted.add(body_op)
+
+    # Reductions accumulate yield values, in topo order.
+    yield_vals = [_emit_reduction(r, ir, val_map) for r in reductions]
+    loop_results = ir.end_for_each_back_window(yield_vals)
+    for r, lr in zip(reductions, loop_results):
+        val_map[r] = lr
+        emitted.add(r)
+
+    emitted.add(loop)
diff --git a/KunQuantMLIR/jit_cuda.py b/KunQuantMLIR/jit_cuda.py
new file mode 100644
index 0000000..7e57cb8
--- /dev/null
+++ b/KunQuantMLIR/jit_cuda.py
@@ -0,0 +1,373 @@
+"""GPU JIT entry point for KunQuant-MLIR.
+
+Mirror of `KunQuant.jit.cfake.compileit` but targets a CUDA backend
+through the KunMLIR / kunir pipeline.  Reuses the existing Driver pass
+list (`Driver.optimize`) so any IR rewrites the CPU path benefits from
+also apply here — only the codegen layer is replaced.
+
+Two-tier config split matches the CPU path:
+
+  * Per-Function knobs live in `KunCompilerConfig` (the CPU-shared
+    dataclass): `dtype`, `blocking_len`, `partition_factor`,
+    `input_layout` / `output_layout` (TS only on GPU), `options`.
+  * Compile-/link-time knobs live in `CudaCompilerConfig`: `gpu_arch`,
+    `warps_per_cta`, `smem_size`, `occupancy`, `opt_level`,
+    `toolkit_path`.  Shared across every Function in a `Library`.
+
+Single-Function compile::
+
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import compile_func, CudaCompilerConfig
+    from KunQuant.Driver import KunCompilerConfig
+
+    exe = compile_func(f,
+                        KunCompilerConfig(input_layout="TS",
+                                            output_layout="TS"),
+                        CudaCompilerConfig(gpu_arch="sm_80"))
+    executor = KunMLIR.Executor()                       # default stream
+    out = executor.runGraph(exe, {"a": cp_a, "b": cp_b})  # length auto-inferred
+    executor.synchronize()
+
+Multi-Function compile (CPU `cfake.compileit` shape)::
+
+    from KunQuant.jit.cuda import compileit, CudaCompilerConfig
+    from KunQuant.Driver import KunCompilerConfig
+
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS")
+    ccfg = CudaCompilerConfig(gpu_arch="sm_80")
+    lib = compileit([("mod1", f1, kcfg), ("mod2", f2, kcfg)],
+                     "my_lib", ccfg)
+    exe = lib.getModule("mod1")
+"""
+
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from KunQuantMLIR import KunMLIR
+
+from KunQuant.Driver import KunCompilerConfig, optimize, post_optimize
+from KunQuant.Op import Input, Output, MayRequireWholeTime
+from KunQuant.passes import do_partition
+from KunQuant.passes.InferWindow import infer_window
+from KunQuant.Stage import Function
+from KunQuantMLIR.codegen_mlir import TargetSpec, translate_function
+
+
+# Sentinel passed via kunir.func's `unreliable_count` attribute to mean
+# "this partition needs the full time history; the runtime must launch
+# it as a single chunk".  Kept in sync with the kunir verifier (which
+# only allows -1 or non-negative) and the CUDA runtime's `computeChunkPlan`.
+# `_translate_partitions` picks this value whenever any op of a partition
+# is a `MayRequireWholeTime` whose `is_whole_time_required()` is true.
+_WHOLE_TIME_UNRELIABLE = -1
+
+
+# Standard locations searched when CudaCompilerConfig.toolkit_path is left
+# empty.  A toolkit dir must contain `nvvm/libdevice/libdevice.10.bc` (the
+# upstream `gpu-module-to-binary` pass links libdevice into the LLVM
+# module) and `bin/ptxas` (PTX → cubin).
+# `find_cuda_toolkit` probes the environment variables first and the
+# fallback paths second, each in the order listed here; the first entry
+# accepted by `_is_toolkit_dir` wins.
+_TOOLKIT_ENV_VARS  = ("CUDA_HOME", "CUDA_PATH", "CUDA_TOOLKIT_PATH",
+                       "CUDA_ROOT")
+_TOOLKIT_FALLBACKS = ("/usr/local/cuda", "/opt/cuda", "/opt/nvidia/cuda")
+
+
+def _is_toolkit_dir(path: str) -> bool:
+    """Return True iff `path` looks like a usable CUDA toolkit root.
+
+    A usable root contains both `nvvm/libdevice/libdevice.10.bc` and
+    `bin/ptxas` as regular files.  An empty (or otherwise falsy) `path`
+    is rejected without touching the filesystem.
+    """
+    # `bool(...)` keeps the declared return type honest: a bare
+    # `path and ...` chain hands back the falsy operand itself ("" / None)
+    # instead of False.
+    return bool(
+        path
+        and os.path.isfile(os.path.join(path, "nvvm", "libdevice",
+                                          "libdevice.10.bc"))
+        and os.path.isfile(os.path.join(path, "bin", "ptxas")))
+
+
+def find_cuda_toolkit(override: str = "") -> str:
+    """Return the first CUDA toolkit root that `_is_toolkit_dir` accepts.
+
+    Candidates are probed in this order, stopping at the first hit:
+      1. `override`, when non-empty
+      2. each non-empty variable named in `_TOOLKIT_ENV_VARS`
+      3. each path in `_TOOLKIT_FALLBACKS`
+
+    An `override` that does not look like a toolkit dir is not an error
+    by itself; the search simply continues with the next candidate.
+
+    Raises:
+        FileNotFoundError: when no candidate qualifies.  The message
+            lists every location that was consulted.
+    """
+    def _candidates():
+        # Lazily yields (label, path) pairs in probe order, so nothing is
+        # read from the environment once a hit has been returned.
+        if override:
+            yield f"override={override!r}", override
+        for env in _TOOLKIT_ENV_VARS:
+            val = os.environ.get(env, "")
+            if val:
+                yield f"${env}={val!r}", val
+        for fallback in _TOOLKIT_FALLBACKS:
+            yield f"fallback={fallback!r}", fallback
+
+    tried = []
+    for label, root in _candidates():
+        tried.append(label)
+        if _is_toolkit_dir(root):
+            return root
+
+    raise FileNotFoundError(
+        "Could not locate a CUDA toolkit (need "
+        "<root>/nvvm/libdevice/libdevice.10.bc and <root>/bin/ptxas). "
+        "Searched: " + ", ".join(tried) +
+        ". Set CUDA_PATH or pass toolkit_path explicitly.")
+
+
+@dataclass
+class CudaCompilerConfig:
+    """Compile- / link-time knobs that are shared across every Function
+    in a `Library`.  Per-Function graph-rewriting knobs (dtype,
+    blocking_len, partition_factor, layout, pass options) live in
+    `KunQuant.Driver.KunCompilerConfig` instead — the same dataclass
+    the CPU path uses.
+
+    How the fields are consumed in this module: `occupancy`,
+    `warps_per_cta` and `smem_size` populate the `TargetSpec` built in
+    `_translate_partitions`; `gpu_arch`, `opt_level` and `warps_per_cta`
+    are forwarded to `KunMLIR.compile`; `toolkit_path` is resolved through
+    `find_cuda_toolkit` before compilation starts.
+    """
+    # Target architecture string handed to `KunMLIR.compile(gpu_arch=...)`.
+    gpu_arch:    str = "sm_80"
+
+    # kunir.target_spec — graph-wide for v0.  `vector_size` is taken
+    # from the per-Function `KunCompilerConfig.blocking_len` at compile
+    # time (the two are the same concept on GPU).
+    occupancy:     int = 1
+    warps_per_cta: int = 4
+    # 49152 == 48 * 1024.  NOTE(review): presumably a byte count of shared
+    # memory — confirm against the kunir target_spec definition.
+    smem_size:     int = 49152
+
+    # LLVM optimization level (forwarded to #nvvm.target<O = ...>).
+    opt_level:     int  = 3
+    # Path to the CUDA toolkit (where libdevice.10.bc + ptxas live).
+    # Empty → upstream search: CUDA_HOME / CUDA_PATH / standard locations.
+    toolkit_path:  str  = ""
+
+
+def _resolve_vector_size(kcfg: KunCompilerConfig) -> int:
+    """Derive the kunir `vector_size` from the per-Function config.
+
+    Returns `int(kcfg.blocking_len)`, or 1 (scalar kunir) when the user
+    left `blocking_len` as None.
+    """
+    requested = kcfg.blocking_len
+    if requested is None:
+        return 1
+    return int(requested)
+
+
+def _gpu_pass_options(kcfg: KunCompilerConfig) -> dict:
+    """Assemble the `options` dict handed to the Driver passes on GPU.
+
+    Layering, lowest to highest precedence:
+
+      * `blocking_len` — seeded from `_resolve_vector_size(kcfg)`.
+      * `kcfg.options` — user-provided entries, merged on top (so a
+        user-supplied `blocking_len` wins over the seed).
+      * `may_slice_time` — set to True only if the user did not supply
+        it; the runtime may split one graph launch into several time
+        chunks, so True is the safe default.
+      * `no_skip_list` and `experimental_expand` — always forced to
+        True, overriding any user value.  The kunir codegen has no
+        lowering for `SkipList*` ops, and the pipeline lowering doesn't
+        know about ExpMovingAvg or the WindowedLinearRegression* family,
+        which the Accumulator-based expansion pass handles instead.
+    """
+    opts: dict = {}
+    opts["blocking_len"] = _resolve_vector_size(kcfg)
+
+    user_options = kcfg.options
+    if user_options:
+        opts.update(user_options)
+
+    if "may_slice_time" not in opts:
+        opts["may_slice_time"] = True
+
+    # Non-negotiable on GPU: applied last so nothing can override them.
+    opts.update(no_skip_list=True, experimental_expand=True)
+    return opts
+
+
+def _to_dtype_token(dtype: str) -> str:
+    """Translate a KunQuant dtype name into its kunir type token.
+
+    `"float"` maps to `"f32"` and `"double"` to `"f64"`.
+
+    Raises:
+        ValueError: for any other `dtype`.
+    """
+    for name, token in (("float", "f32"), ("double", "f64")):
+        if dtype == name:
+            return token
+    raise ValueError(f"compile_func: unsupported dtype '{dtype}' "
+                       f"(supported: float, double — kunir today only "
+                       f"lowers float on GPU)")
+
+
+def _validate_kun_cfg(kcfg: KunCompilerConfig) -> None:
+    """Reject per-Function configs the GPU backend cannot honour.
+
+    Checks, in order: `input_layout` is `"TS"`, `output_layout` is
+    `"TS"`, and `dtype` is `"float"` or `"double"`.
+
+    Raises:
+        ValueError: on the first check that fails.
+    """
+    if kcfg.input_layout != "TS":
+        bad_input = (f"GPU backend only supports input_layout='TS', got "
+                     f"{kcfg.input_layout!r}")
+        raise ValueError(bad_input)
+
+    if kcfg.output_layout != "TS":
+        bad_output = (f"GPU backend only supports output_layout='TS', got "
+                      f"{kcfg.output_layout!r}")
+        raise ValueError(bad_output)
+
+    supported_dtypes = ("float", "double")
+    if kcfg.dtype not in supported_dtypes:
+        bad_dtype = (f"KunCompilerConfig.dtype must be 'float' or 'double', got "
+                     f"{kcfg.dtype!r}")
+        raise ValueError(bad_dtype)
+
+
+def _graph_io_names(f: Function):
+    """Collect the user-facing input and output names of `f`.
+
+    Meant to be called BEFORE optimize / do_partition: those passes
+    mutate `f` and may spread its Input/Output ops over several
+    sub-Functions, so the graph-boundary names must be captured first.
+
+    Returns:
+        `(input_names, output_names)` — the `attrs["name"]` of every
+        `Input` / `Output` op, in `f.ops` order.
+
+    Raises:
+        ValueError: if `f` has no `Input` ops, or no `Output` ops.
+    """
+    input_names = []
+    for op in f.ops:
+        if isinstance(op, Input):
+            input_names.append(op.attrs["name"])
+
+    output_names = []
+    for op in f.ops:
+        if isinstance(op, Output):
+            output_names.append(op.attrs["name"])
+
+    if not input_names:
+        raise ValueError("compile_func: function has no Input ops")
+    if not output_names:
+        raise ValueError("compile_func: function has no Output ops")
+    return input_names, output_names
+
+
+def _run_full_pipeline(f: Function, kcfg: KunCompilerConfig):
+    """Drive `f` through optimize → infer_window → do_partition →
+    post_optimize, using the GPU pass options derived from `kcfg`.
+
+    `f` is mutated in place.
+
+    Returns:
+        `(impl, global_unreliable)` — the partitioned implementation
+        produced by `do_partition` (after `post_optimize`), and the
+        `infer_window` result taken on the optimized but not yet
+        partitioned `f`.
+    """
+    pass_opts = _gpu_pass_options(kcfg)
+    optimize(f, pass_opts)
+    # Snapshot must be taken here, before partitioning rewrites `f`.
+    unreliable_snapshot = infer_window(f, pass_opts)
+    _main_func, partitions = do_partition(f, kcfg.partition_factor, pass_opts)
+    post_optimize(partitions, pass_opts)
+    return partitions, unreliable_snapshot
+
+
+def _translate_partitions(impl, kcfg: KunCompilerConfig,
+                            ccfg: CudaCompilerConfig):
+    """Translate every partitioned Function in `impl` through one shared
+    `KunMLIR.IRBuilder`, so all kunir.funcs end up as siblings in a
+    single module.  Cross-partition buffers are stitched by name: each
+    partition's Input/Output names match those of its producer/consumer.
+
+    `translate_function` may return a descriptor instead of emitting IR
+    (cross-sectional partitions, currently cs_rank).  Those descriptors
+    are collected and handed back so the caller can forward them to
+    `KunMLIR.compile` as external kernels.
+
+    Returns:
+        `(module, externals)` — the finished module and the list of
+        external-kernel descriptors (possibly empty).
+    """
+    def _needs_whole_time(partition) -> bool:
+        # True as soon as one op declares it needs the full time history.
+        for op in partition.ops:
+            if isinstance(op, MayRequireWholeTime) \
+                    and op.is_whole_time_required():
+                return True
+        return False
+
+    spec = TargetSpec(occupancy=ccfg.occupancy,
+                      warps_per_cta=ccfg.warps_per_cta,
+                      smem_size=ccfg.smem_size,
+                      vector_size=_resolve_vector_size(kcfg))
+    builder = KunMLIR.IRBuilder()
+    dtype_token = _to_dtype_token(kcfg.dtype)
+
+    external_descs = []
+    for partition in impl:
+        # Warmup is computed per partition: kernel launches are
+        # serialised by the runtime, so a downstream kernel only has to
+        # cover its own local warmup, not that of its producers.
+        if _needs_whole_time(partition):
+            warmup = _WHOLE_TIME_UNRELIABLE
+        else:
+            warmup = max(infer_window(partition).values(), default=0)
+        descriptor = translate_function(partition, spec, builder,
+                                        dtype=dtype_token,
+                                        unreliable_count=warmup)
+        if descriptor is not None:
+            external_descs.append(descriptor)
+    return builder.finish(), external_descs
+
+
+def compile_func(f: Function, kcfg: KunCompilerConfig,
+                   ccfg: CudaCompilerConfig) -> KunMLIR.Executable:
+    """Compile one KunQuant Function into a GPU `KunMLIR.Executable`.
+
+    Steps, in execution order:
+
+      1. Validate `kcfg` (TS layouts, supported dtype).
+      2. Resolve the CUDA toolkit root, so a missing toolkit fails
+         before any graph rewriting happens.
+      3. Record the user-facing Input/Output names of `f`.
+      4. Run optimize / do_partition / post_optimize (mutates `f`).
+      5. Translate every partition into one KunMLIR module, collecting
+         descriptors for partitions handled as external kernels.
+      6. Hand everything to `KunMLIR.compile`.
+
+    Raises:
+        ValueError: invalid `kcfg`, or `f` lacks Input / Output ops.
+        FileNotFoundError: no usable CUDA toolkit was found.
+    """
+    _validate_kun_cfg(kcfg)
+
+    cuda_root = find_cuda_toolkit(ccfg.toolkit_path)
+
+    user_inputs, user_outputs = _graph_io_names(f)
+    partitions, unreliable_snapshot = _run_full_pipeline(f, kcfg)
+    module, external_descs = _translate_partitions(partitions, kcfg, ccfg)
+
+    return KunMLIR.compile(
+        module,
+        graph_inputs=user_inputs,
+        graph_outputs=user_outputs,
+        gpu_arch=ccfg.gpu_arch,
+        opt_level=ccfg.opt_level,
+        toolkit_path=cuda_root,
+        external_kernels=external_descs,
+        # Needed when no kernel is JIT-compiled at all: if every
+        # partition is external (e.g. a graph that is just
+        # `cs_rank(a)`), the MLIR module is empty and
+        # `data.warpsPerCta` would otherwise default to 1 — yet the
+        # cs_rank launch sizes blockDim from it.
+        warps_per_cta=ccfg.warps_per_cta,
+        output_unreliable=unreliable_snapshot,
+    )
+
+
+class Library:
+    """Name → `KunMLIR.Executable` registry returned by the
+    multi-Function `compileit`.  Mirrors the CPU `kr.Library` shape so
+    callers can compile several Functions at once and fetch each one
+    by name.
+    """
+    def __init__(self, libname: str = "") -> None:
+        # Insertion-ordered mapping; `names` relies on that ordering.
+        self._modules: dict = {}
+        self.libname = libname
+
+    def getModule(self, name: str) -> KunMLIR.Executable:
+        """Return the executable registered under `name`; raise
+        RuntimeError (listing the known names) when there is none."""
+        if name in self._modules:
+            return self._modules[name]
+        raise RuntimeError(
+            f"Library.getModule: no module named '{name}' "
+            f"(have: {sorted(self._modules)})")
+
+    @property
+    def names(self):
+        """Registered module names, in registration order."""
+        return [registered for registered in self._modules]
+
+    def _add(self, name: str, exe: KunMLIR.Executable) -> None:
+        """Register `exe` under `name`; a name may be used only once."""
+        if name not in self._modules:
+            self._modules[name] = exe
+            return
+        raise RuntimeError(
+            f"Library: duplicate module name '{name}'")
+
+
+def compileit(
+    funclist: List[Tuple[str, Function, KunCompilerConfig]],
+    libname: str,
+    compiler_config: CudaCompilerConfig,
+) -> Library:
+    """Compile several Functions and bundle the results in a `Library`.
+
+    Shaped after `KunQuant.jit.cfake.compileit(func, libname,
+    compiler_config)`; cfake's `tempdir`, `keep_files` and `load`
+    arguments have no GPU counterpart and are intentionally absent.
+
+    Args:
+        funclist: `(name, Function, KunCompilerConfig)` tuples.  The
+            third element carries the per-Function knobs (dtype /
+            blocking_len / partition_factor / layout / pass options).
+        libname: stored on the returned `Library`.
+        compiler_config: GPU-wide `CudaCompilerConfig`, applied to
+            every entry.
+
+    Returns:
+        A `Library` keyed by each tuple's `name`; fetch an executable
+        with `lib.getModule(name)`.
+
+    Entries are compiled one after another in list order, each through
+    `compile_func`, and registered right after they compile.
+    """
+    library = Library(libname=libname)
+    for entry_name, func, kun_cfg in funclist:
+        executable = compile_func(func, kun_cfg, compiler_config)
+        library._add(entry_name, executable)
+    return library
+
+
+def to_mlir(f: Function, kcfg: KunCompilerConfig,
+              ccfg: CudaCompilerConfig) -> KunMLIR.ModuleOp:
+    """Debugging aid: run the same validation, passes and translator as
+    `compile_func`, but stop before PTX/CUBIN generation and return the
+    KunMLIR module.
+
+    External (cs_rank) partitions never become kunir ops, so they do
+    not appear in the returned module.  Like `compile_func`, this
+    mutates `f` in place.  No CUDA toolkit lookup is performed here.
+    """
+    _validate_kun_cfg(kcfg)
+    # Result unused: called only so a graph without Input / Output ops
+    # is rejected up front.
+    _graph_io_names(f)
+    partitions, _unused_unreliable = _run_full_pipeline(f, kcfg)
+    module, _unused_externals = _translate_partitions(partitions, kcfg, ccfg)
+    return module
diff --git a/cpp/Kun/CApi.cpp b/cpp/Kun/CApi.cpp
index 653d04c..52f8faf 100644
--- a/cpp/Kun/CApi.cpp
+++ b/cpp/Kun/CApi.cpp
@@ -36,7 +36,7 @@ static std::shared_ptr<Library> *unwrapLibrary(KunLibraryHandle ptr) {
 KUN_API KunModuleHandle kunGetModuleFromLibrary(KunLibraryHandle lib,
                                                 const char *name) {
     auto &plib = *unwrapLibrary(lib);
-    return (KunModuleHandle)plib->getModule(name);
+    return (KunModuleHandle)const_cast<Module *>(plib->getModule(name));
 }
 
 KUN_API void kunUnloadLibrary(KunLibraryHandle ptr) {
diff --git a/cpp/Kun/CorrWith.hpp b/cpp/Kun/CorrWith.hpp
index 4ef7cea..425d75b 100644
--- a/cpp/Kun/CorrWith.hpp
+++ b/cpp/Kun/CorrWith.hpp
@@ -26,7 +26,6 @@ void CorrWith(RuntimeStage *stage, size_t time_idx,
         INPUT::getInput(&inbuf1, stage->stage->in_buffers[1], num_stocks);
     using T = typename std::decay<decltype(*input0)>::type;
     auto outinfo = stage->stage->out_buffers[0];
-    auto simd_len = stage->ctx->simd_len;
     T *output = stage->ctx->buffers[outinfo->id].getPtr<T>();
     auto time_end =
         std::min(__start + (time_idx + 1) * time_stride, __start + __length);
@@ -71,7 +70,6 @@ void RankCorrWith(RuntimeStage *stage, size_t time_idx,
         INPUT::getInput(&inbuf1, stage->stage->in_buffers[1], num_stocks);
     using T = typename std::decay<decltype(*input0)>::type;
     auto outinfo = stage->stage->out_buffers[0];
-    auto simd_len = stage->ctx->simd_len;
     T *output = stage->ctx->buffers[outinfo->id].getPtr<T>();
     auto time_end =
         std::min(__start + (time_idx + 1) * time_stride, __start + __length);
diff --git a/cpp/Kun/MathUtil.hpp b/cpp/Kun/MathUtil.hpp
index 71211a5..b8623fe 100644
--- a/cpp/Kun/MathUtil.hpp
+++ b/cpp/Kun/MathUtil.hpp
@@ -3,8 +3,12 @@
 
 namespace kun {
 namespace {
-size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; }
-size_t roundUp(size_t x, size_t y) { return divideAndCeil(x, y) * y; }
+// Ceiling division for unsigned sizes: returns ceil(x / y).
+// Requires y != 0.  `x + y - 1` is evaluated in size_t, so the result is
+// only meaningful while that sum does not wrap around.
+[[maybe_unused]] size_t divideAndCeil(size_t x, size_t y) {
+    return (x + y - 1) / y;
+}
+// Rounds x up to the nearest multiple of y, i.e. divideAndCeil(x, y) * y.
+// Requires y != 0.
+[[maybe_unused]] size_t roundUp(size_t x, size_t y) {
+    return divideAndCeil(x, y) * y;
+}
 
 } // namespace
 } // namespace kun
\ No newline at end of file
diff --git a/cpp/Kun/Ops.hpp b/cpp/Kun/Ops.hpp
index 457a2da..ac802cb 100644
--- a/cpp/Kun/Ops.hpp
+++ b/cpp/Kun/Ops.hpp
@@ -301,7 +301,12 @@ template <typename T, int stride>
 struct Accumulator {
     using simd_t = kun_simd::vec<T, stride>;
     using float_mask_t = typename simd_t::Masktype;
-    simd_t v = 0;
+    simd_t v;
+    // Default-init to 0 for backward compat with existing Accumulator()
+    // call sites; the codegen emits Accumulator{init_val} for non-zero
+    // inits and the brace-init binds to this constructor.
+    Accumulator() : v(0) {}
+    Accumulator(T init) : v(init) {}
     struct Value {
         simd_t v;
         Accumulator& acc;
diff --git a/cpp/Kun/Scale.hpp b/cpp/Kun/Scale.hpp
index 4c79005..9861451 100644
--- a/cpp/Kun/Scale.hpp
+++ b/cpp/Kun/Scale.hpp
@@ -22,7 +22,6 @@ KUN_TEMPLATE_EXPORT void ScaleStocks(RuntimeStage *stage, size_t time_idx,
         INPUT::getInput(&inbuf, stage->stage->in_buffers[0], num_stocks);
     using T = typename std::decay<decltype(*input)>::type;
     auto outinfo = stage->stage->out_buffers[0];
-    auto simd_len = stage->ctx->simd_len;
     T *output = OUTPUT::getOutput(&stage->ctx->buffers[outinfo->id], outinfo,
                                   num_stocks);
     auto time_end =
diff --git a/cpp/Kun/SkipList.cpp b/cpp/Kun/SkipList.cpp
index eaf6cb3..90a585a 100644
--- a/cpp/Kun/SkipList.cpp
+++ b/cpp/Kun/SkipList.cpp
@@ -148,8 +148,6 @@ struct SkipListImpl {
     }
 
     double get(int i, size_t &index, bool &ret) const {
-        int level;
-
         if (i < 0 || i >= size) {
             ret = false;
             return 0;
diff --git a/cpp/Kun/StreamBuffer.hpp b/cpp/Kun/StreamBuffer.hpp
index 0c67529..1540aa2 100644
--- a/cpp/Kun/StreamBuffer.hpp
+++ b/cpp/Kun/StreamBuffer.hpp
@@ -16,10 +16,11 @@ struct StreamBuffer {
     // fix-me: we can store the pre-aligned stock_count to avoid re-computation
     // of roundUp
     alignas(64) char buf[0];
-    T *getBuffer() const { return (T *)(buf); }
+    T *getBuffer() const { return (T *)const_cast<char *>(buf); }
     size_t *getPos(size_t idx, size_t stock_count, size_t window_size) const {
         assert(stock_count % 4 == 0);
-        return (size_t *)(buf + sizeof(T) * stock_count * window_size +
+        return (size_t *)(const_cast<char *>(buf) +
+                          sizeof(T) * stock_count * window_size +
                           idx * sizeof(size_t));
     }
     static size_t getBufferSize(size_t stock_count, size_t window_size,
@@ -44,7 +45,7 @@ struct StreamBuffer {
         pos += 1;
         pos = (pos >= window_size) ? 0 : pos;
         size_t *posbase = getPos(0, stock_count, window_size);
-        for (int i = 0; i < divideAndCeil(stock_count, simd_len); i++) {
+        for (size_t i = 0; i < divideAndCeil(stock_count, simd_len); i++) {
             posbase[i] = pos;
         }
         return ret;
diff --git a/doc/Customize.md b/doc/Customize.md
index 21c946d..f8aca63 100644
--- a/doc/Customize.md
+++ b/doc/Customize.md
@@ -137,6 +137,8 @@ The `CppCompilerConfig` controls how KunQuant calls the C++ compiler. To choose
 | opt_reduce | optimize WindowedSum by rolling sum algorithm |  bool  |  If in stream mode, False. Otherwise, True  |
 | fast_log | Use KunQuant's implementation of math log function instead of `std::log` |  bool  |  True  |
 | no_fast_stat | Disable fast rolling algorithm for statistics functions like stddev/corr/etc. Setting this flag to True may help to get better precision with the cost of performance. KunQuant will warn the precision issue if `options['no_fast_stat']==False`. To disable the warning and set no_fast_stat to False, set `options['no_fast_stat']=='no_warn'` |  bool or Literal\["no_warn"\]  |  If dtype is float or in stream mode, True. Otherwise, False |
+| no_skip_list | Disable the skip-list decompose path for large-window WindowedMin/WindowedMax/TsArgMin/TsArgMax/TsRank, falling back to the naive `ForeachBackWindow + Reduce*` lowering regardless of window/blocking_len cost.  `WindowedQuantile` has no non-skip-list path and will raise when this is set.  Set automatically by the GPU backend (`KunQuant.jit.cuda`) because the kunir codegen does not lower `SkipList*` ops. |  bool  |  False (CPU); forced True on GPU |
+| may_slice_time | Tell optimization passes that the runtime may execute one graph over multiple time chunks in parallel. When this is True, `TempWindowElim` will not aggressively optimize the temp windows. |  bool  |  False; GPU backend defaults to True unless explicitly set |
 
 ## Specifing Memory layouts and data types and enabling AVX512
 
@@ -185,4 +187,4 @@ There are some configurable options of function `compileit(...)` above that may
  * Input and output memory layout: `compileit(input_layout=?, output_layout=?)`. This affects how data are arranged in memory. Usually `STs` layout is faster than `TS` but may require some additional memory movement when you call the factor library.
  * Partition factor: `compileit(partition_factor=some_int)`. A larger Partition factor will put more computations in a single generated function in C++. Enlarging Partition factor may reduce the overhead of thread-scheduling and eliminate some of the temp buffers. However, if the factor is too high, the generated C++ code will suffer from register-spilling.
  * Blocking len: `compileit(blocking_len=some_int)`. It selects AVX2 or AVX512 instruction sets. Using AVX512 might have some slight performance gain over AVX2.
- * Unaligned stock number: `compileit(allow_unaligned=some_bool)`. By default `True`. When `allow_unaligned` is set to false, the generated C++ code will assume the number of stocks to be aligned with the SIMD length (e.g., 8 float32 on AVX2). This will slightly improve the performance.
\ No newline at end of file
+ * Unaligned stock number: `compileit(allow_unaligned=some_bool)`. By default `True`. When `allow_unaligned` is set to false, the generated C++ code will assume the number of stocks to be aligned with the SIMD length (e.g., 8 float32 on AVX2). This will slightly improve the performance.
diff --git a/doc/Operators.md b/doc/Operators.md
index 685ea97..87f8d93 100644
--- a/doc/Operators.md
+++ b/doc/Operators.md
@@ -424,13 +424,14 @@ class WindowedQuantile(OpBase, WindowedTrait):
     def __init__(self, v: OpBase, window: int, q: float) -> None:
         pass
 
-class ExpMovingAvg(OpBase, GloablStatefulOpTrait):
+class ExpMovingAvg(OpBase, GloablStatefulOpTrait, MayRequireWholeTime):
     '''
     Exponential Moving Average (EMA)
     Similar to pd.DataFrame.ewm(span=window, adjust=False, ignore_na=True).mean()
     optional parameter: init_val, the initial values for EMA. It must be an Input op with attr
     {"single_value":True}. The name of the Input op should starts with "__init".
     It should be an input of shape (num_stocks,)
+    Always requires the whole time history.
     '''
     def __init__(self, v: OpBase, window: int, init_val: Union[Input, None] = None) -> None:
         pass
@@ -451,16 +452,44 @@ class ReturnFirstValue(OpBase):
     def __init__(self, v: List[OpBase]) -> None:
         pass
 
-class Accumulator(OpBase, GloablStatefulOpTrait):
+class Accumulator(OpBase, GlobalStatefulProducerTrait, MayRequireWholeTime):
     '''
     Accumulator is a stateful op that accumulates the input value over time.
-    It can be used to compute running totals, moving averages, etc.'''
-    def __init__(self, v: OpBase, name: str) -> None:
+    It can be used to compute running totals, moving averages, etc.
+
+    The first positional input `v` serves only as a graph keepalive — it
+    does NOT feed the slot.  The slot's value is governed by `init_val` (its
+    initial contents) and by paired `SetAccumulator` ops (which write the slot).
+
+    Parameters:
+        v: keepalive input (any OpBase in the time-step's value graph).
+        name: human-readable label.  Per-op uniqueness is NOT required —
+            each `Accumulator` op identifies a distinct slot, even when two
+            ops share a name (no CSE / dedup).
+        is_whole_time_required: set to True if the accumulator's state
+            can only be reconstructed from the full time history (forces
+            the runtime to collapse to a single chunk).
+        init_val: initial scalar stored in the slot before the first time
+            step.  Pass a Python float (default `0`) for a numeric init,
+            or the string `"nan"` for a NaN init.  NaN init is useful as
+            a "not-yet-seeded" sentinel for ops like EMA.
+    '''
+    def __init__(self, v: OpBase, name: str,
+                  is_whole_time_required: bool = False,
+                  init_val: Union[float, str] = 0) -> None:
         pass
 
 class SetAccumulator(OpBase):
     '''
-    Set the value of an Accumulator to a value, if mask is set. Otherwise, it does nothing.
+    Conditionally overwrite an Accumulator's slot.  When `mask` is true at
+    the current time step, stores `value` into the slot; otherwise the slot
+    is unchanged.
+
+    The op also returns the slot's new value for the current step — i.e.
+    `mask ? value : prev_accumulator`.  Downstream consumers can use the
+    SetAccumulator's SSA result directly as the freshly-written value
+    without re-reading the slot.  `accu` must be the result of an
+    `Accumulator` op.
     '''
     def __init__(self, accu: OpBase, mask: OpBase, value: OpBase) -> None:
         pass
@@ -513,6 +542,15 @@ class StatefulOpTrait:
     pass
 
 
+class MayRequireWholeTime:
+    '''
+    Trait for ops whose state may depend on the full time history (i.e. it
+    cannot be rebuilt from a bounded warmup window).  The default
+    `is_whole_time_required()` returns False; an op that really needs the
+    whole history overrides it to return True.
+    '''
+    def is_whole_time_required(self) -> bool:
+        return False
+
+
 class CrossSectionalOp(OpBase):
     def __init__(self, v: OpBase) -> None:
         pass
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
new file mode 100644
index 0000000..fc04575
--- /dev/null
+++ b/mlir/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Build entry point for the mlir/ tree: sets up include paths and
+# compile/link flags, checks that LLVM provides the NVPTX target, then
+# descends into include/, lib/, Tools/kun-opt/ and test/.
+
+# Remember this directory's source and binary roots; both are used for the
+# include paths below.
+set(KUN_MLIR_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(KUN_MLIR_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+# Include paths: MLIR/LLVM installed headers, our source headers,
+# and the build dir (for generated .h.inc files from tablegen)
+include_directories(${LLVM_INCLUDE_DIRS})
+include_directories(${MLIR_INCLUDE_DIRS})
+include_directories(${KUN_MLIR_SOURCE_DIR}/include)
+include_directories(${KUN_MLIR_BINARY_DIR}/include)
+
+# LLVMConfig.cmake exports LLVM_DEFINITIONS as a single space-separated
+# *string* (e.g. "-D_GNU_SOURCE -D_GLIBCXX_USE_CXX11_ABI=1 ...").
+# Plain `add_definitions(${LLVM_DEFINITIONS})` works most of the time
+# (CMake splits on whitespace during unquoted expansion), but some
+# CMake versions cache the string as a single COMPILE_DEFINITIONS
+# entry, which then re-emits as one malformed `-D_GLIBCXX_USE_CXX11_ABI="1
+# -D__STDC_CONSTANT_MACROS ..."` flag — duplicating the macro and
+# producing the "_GLIBCXX_USE_CXX11_ABI redefined" warning.
+#
+# Tokenise the string explicitly first to keep CMake honest, then add the
+# resulting `-D...` tokens as plain compile options.  The temporary list
+# is unset afterwards so it does not leak into subdirectories.
+separate_arguments(_KUN_LLVM_DEFS UNIX_COMMAND "${LLVM_DEFINITIONS}")
+add_compile_options(${_KUN_LLVM_DEFS})
+unset(_KUN_LLVM_DEFS)
+
+# The MLIR backend requires LLVM to have been built with the NVPTX target
+# — we emit PTX and load cubins.  Fail loudly here so the diagnostic is
+# clear rather than getting cryptic linker errors later.
+if(NOT "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  message(FATAL_ERROR
+    "KunQuant MLIR backend requires LLVM with NVPTX target.  Reconfigure "
+    "your LLVM build with -DLLVM_TARGETS_TO_BUILD=\"X86;NVPTX\" "
+    "(currently: '${LLVM_TARGETS_TO_BUILD}').")
+endif()
+
+# Enable gc-sections so kun-opt pulls in only the code it actually uses
+# from MLIR static libraries, keeping the binary small.
+# Apple's linker spells the option -dead_strip and only the executable
+# link flags are extended there; elsewhere --gc-sections is appended to
+# both the executable and the shared-library link flags.
+if(NOT MSVC)
+  add_compile_options(-ffunction-sections -fdata-sections)
+  if(APPLE)
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-dead_strip")
+  else()
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,--gc-sections")
+    string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,--gc-sections")
+  endif()
+endif()
+
+# Sub-projects.  All settings above are directory-scoped, so they apply to
+# every subdirectory added from here on.
+add_subdirectory(include)
+add_subdirectory(lib)
+add_subdirectory(Tools/kun-opt)
+add_subdirectory(test)
diff --git a/mlir/Tools/kun-opt/CMakeLists.txt b/mlir/Tools/kun-opt/CMakeLists.txt
new file mode 100644
index 0000000..cdc4590
--- /dev/null
+++ b/mlir/Tools/kun-opt/CMakeLists.txt
@@ -0,0 +1,55 @@
+# Builds `kun-opt`, the command-line driver compiled from kun-opt.cpp and
+# linked against MLIROptLib plus the KunQuant dialect / conversion
+# libraries listed below.
+
+# LLVM (non-MLIR) components.  The NVPTX code generator and the pieces it
+# needs are added only when the LLVM install was built with that target.
+set(LLVM_LINK_COMPONENTS Support)
+if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
+  list(APPEND LLVM_LINK_COMPONENTS
+    NVPTXCodeGen NVPTXDesc NVPTXInfo
+    Passes Target TargetParser Core IRReader CodeGen MC AsmPrinter)
+endif()
+
+add_llvm_executable(kun-opt kun-opt.cpp)
+
+llvm_update_compile_flags(kun-opt)
+
+# Project-local libraries go through plain target_link_libraries; upstream
+# MLIR libraries go through mlir_target_link_libraries further down.
+target_link_libraries(kun-opt PRIVATE
+  # KunQuant dialects + passes
+  MLIRKunIrDialect
+  MLIRKunGpuDialect
+  MLIRKunIrToKunGpu
+)
+
+mlir_target_link_libraries(kun-opt PRIVATE
+  # MLIR opt infrastructure
+  MLIROptLib
+
+  # Standard dialects used inside kunir/kungpu IR
+  MLIRFuncDialect
+  MLIRFuncTransforms
+  MLIRArithDialect
+  MLIRMathDialect
+  MLIRSCFDialect
+  MLIRGPUDialect
+  MLIRLLVMDialect
+  MLIRControlFlowDialect
+  MLIRIndexDialect
+
+  # Conversion passes used by the kunir-to-llvm pipeline
+  MLIRSCFToControlFlow
+  MLIRControlFlowToLLVM
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRIndexToLLVM
+  MLIRGPUToNVVMTransforms
+  MLIRNVVMDialect
+  MLIRConvertToLLVMPass
+  MLIRReconcileUnrealizedCasts
+
+  # Core MLIR libraries
+  MLIRIR
+  MLIRParser
+  MLIRPass
+  MLIRTransforms
+  MLIRSupport
+  MLIRSideEffectInterfaces
+)
+
+# Verify no unexpected MLIR dialects are pulled in transitively
+# NOTE(review): confirm the exact check performed against upstream
+# AddMLIR.cmake; the description above is not verifiable from this file.
+mlir_check_all_link_libraries(kun-opt)
diff --git a/mlir/Tools/kun-opt/kun-opt.cpp b/mlir/Tools/kun-opt/kun-opt.cpp
new file mode 100644
index 0000000..5117474
--- /dev/null
+++ b/mlir/Tools/kun-opt/kun-opt.cpp
@@ -0,0 +1,59 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/Passes.h"
+#include "KunGpu/Pipelines.h"
+#include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/Passes.h"
+
+int main(int argc, char **argv) {
+  mlir::DialectRegistry registry;
+
+  // Core dialects used by kunir/kungpu
+  registry.insert<mlir::func::FuncDialect>();
+  registry.insert<mlir::arith::ArithDialect>();
+  registry.insert<mlir::math::MathDialect>();
+  registry.insert<mlir::scf::SCFDialect>();
+  registry.insert<mlir::gpu::GPUDialect>();
+  registry.insert<mlir::LLVM::LLVMDialect>();
+  registry.insert<mlir::cf::ControlFlowDialect>();
+  registry.insert<mlir::index::IndexDialect>();
+
+  // KunQuant dialects
+  registry.insert<kunir::KunIrDialect>();
+  registry.insert<kungpu::KunGpuDialect>();
+
+  // KunQuant passes & pipelines
+  kunir::registerKunIrToKunGpuPass();
+  kungpu::registerKunGpuPasses();
+  kungpu::registerKunIrToLLVMPass();
+
+  // Upstream passes used by the kunir-to-llvm pipeline (also lets users
+  // build the pipeline manually via --pass-pipeline=… for debugging).
+  mlir::registerCanonicalizerPass();
+  mlir::registerCSEPass();
+  mlir::registerLoopInvariantCodeMotionPass();
+  mlir::registerSCFToControlFlowPass();
+  mlir::registerConvertControlFlowToLLVMPass();
+  mlir::registerArithToLLVMConversionPass();
+  mlir::registerConvertIndexToLLVMPass();
+  mlir::registerConvertFuncToLLVMPass();
+  mlir::registerConvertGpuOpsToNVVMOpsPass();
+  mlir::registerConvertToLLVMPass();
+  mlir::registerReconcileUnrealizedCastsPass();
+
+  return mlir::asMainReturnCode(
+      mlir::MlirOptMain(argc, argv, "KunQuant MLIR optimizer\n", registry));
+}
diff --git a/mlir/include/CMakeLists.txt b/mlir/include/CMakeLists.txt
new file mode 100644
index 0000000..66b4ff9
--- /dev/null
+++ b/mlir/include/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(KunIr)
+add_subdirectory(KunGpu)
diff --git a/mlir/include/KunCuda/Runtime.h b/mlir/include/KunCuda/Runtime.h
new file mode 100644
index 0000000..7c82037
--- /dev/null
+++ b/mlir/include/KunCuda/Runtime.h
@@ -0,0 +1,387 @@
+//===- Runtime.h - kun_cuda runtime: ExecutableData + Executable -------===//
+//
+// Pure runtime piece, decoupled from the MLIR compiler and the Python
+// binding.  The compiler produces an `ExecutableData` (one cubin holding
+// N kernels + per-kernel I/O *names* + the user's graph_inputs /
+// graph_outputs lists).  The `Executable` ctor turns that into a loaded
+// kernel set plus a fully resolved schedule:
+//
+//   names → buffer indices  ──→  topo sort  ──→  slot plan
+//
+// This split keeps the *compiler* concerned only with what's in the
+// cubin, and lets the *runtime* own everything that's really a graph
+// concern (dependency analysis, schedule, memory plan).  When we add
+// CUDA-graph support later, all the input it needs already lives in the
+// runtime: per-kernel buffer indices, the producer-kernel-of-each-buffer
+// map, and the intermediate slot mapping.
+//
+// Buffer-table layout (assigned at Executable-construction time):
+//   indices [0 .. numGraphInputs)             → graph inputs
+//   indices [numGraphInputs .. firstInter)    → graph outputs
+//   indices [firstInter .. numBuffers)        → intermediates
+//
+// Memory planning:
+//   Intermediates share a pre-allocated slot pool sized to
+//   `peakIntermediateSlots`.  Slot reuse is computed by refcount + LIFO
+//   free pool over the topo-sorted schedule.  Slots are allocated lazily
+//   on the first launch (and re-allocated if `(timeLength, numStocks)`
+//   changes), then reused across subsequent launches with the same shape.
+//
+// This header forward-declares the three opaque CUDA Driver handle types so
+// consumers don't need to pull in <cuda.h>.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+extern "C" {
+typedef struct CUmod_st    *CUmodule;
+typedef struct CUfunc_st   *CUfunction;
+typedef struct CUstream_st *CUstream;
+} // extern "C"
+
+namespace kun_cuda {
+
+/// Internal: the resolved schedule + memory plan.  Forward-declared so
+/// the public header doesn't have to expose buffer-index tables,
+/// producer maps, etc.  Fully defined in Runtime.cpp.
+struct GraphPlan;
+
+/// Internal: context-local loaded CUDA modules, CUfunctions, and resolved
+/// graph plan.  Shared by cloned Executables; fully defined in the private
+/// runtime implementation.
+struct LoadedExecutable;
+
+/// Internal: CUDA Graph mode state.  Kept behind a pointer because normal
+/// launch mode does not need any graph objects.
+struct CudaGraphLaunchState;
+
+/// Forward-declared so `Executable::launchOnStream` can take an
+/// `Executor *` argument; the full definition lives below.
+class Executor;
+
+//===----------------------------------------------------------------------===//
+// Compile-time output (all names — runtime resolves them to indices)
+//===----------------------------------------------------------------------===//
+
+/// Kernel dispatch kind.  `Jit` kernels live in the cubin produced by
+/// the MLIR pipeline and are launched with the project-wide stock-major
+/// grid (block_x = warps_per_cta * 32, grid_x = ceil(S / block_x)).
+/// `ExtCs*` kernels are pre-compiled PTX bundled inside
+/// libKunCudaRuntime; the executor lazy-loads them as a second
+/// CUmodule and launches them with a time-major grid + dynamic shared
+/// memory sized to the cross-section (one CTA per timestep).
+enum class KernelKind : int32_t {
+  Jit           = 0,
+  ExtCsRankF32  = 1,
+  ExtCsRankF64  = 2,
+  ExtCsScaleF32 = 3,
+  ExtCsScaleF64 = 4,
+};
+
+/// Per-kernel element type.  Currently single-precision (f32) and
+/// double-precision (f64) are supported.  Determines the byte size used
+/// when allocating intermediate slots and validating user-supplied I/O.
+enum class Datatype : int32_t {
+  Float  = 0,   ///< f32 — 4 bytes/elem
+  Double = 1,   ///< f64 — 8 bytes/elem
+};
+
+/// Runtime launch backend.  Normal queues kernels one by one on the stream.
+/// CudaGraph builds a CUDA Graph node DAG and uses graph memory allocation
+/// nodes for intermediate buffers.
+enum class LaunchMode : int32_t {
+  Normal    = 0,
+  CudaGraph = 1,
+};
+
+inline size_t bytesPerElem(Datatype dt) noexcept {
+  return dt == Datatype::Double ? 8u : 4u;
+}
+
+/// Per-kernel metadata, in name form.  This is what the compiler can
+/// produce by walking a single lowered llvm.func — no graph topology
+/// reasoning required.
+struct KernelMeta {
+  std::string kernelName;                    ///< symbol in the cubin (Jit) or in the bundled PTX (ExtCs*)
+  KernelKind kind = KernelKind::Jit;         ///< picked by the MLIR pass; default is the regular path
+  std::vector<std::string> inputNames;       ///< kungpu.input_names, in argv order
+  std::vector<std::string> outputNames;      ///< kungpu.output_names, in argv order
+  /// Per-partition warmup depth (kungpu.unreliable_count on the gpu.func).
+  /// Drives the time-chunk grid: chunks ≥ 1 need this many extra time
+  /// steps before they can start writing reliable outputs, and the
+  /// chunk-size heuristic gates the minimum chunk size at K × warmup.
+  /// Always 0 for external cross-sectional kernels — they don't multi-chunk.
+  int64_t unreliableCount = 0;
+};
+
+/// What the compiler hands the runtime: a cubin + the kernels it
+/// contains, declared purely by name.  `graphInputs` / `graphOutputs`
+/// are user-supplied: they pick which named buffers cross the
+/// graph-runtime boundary; everything else a kernel produces is treated
+/// as an intermediate.
+struct ExecutableData {
+  std::vector<char> cubin;
+  int64_t warpsPerCta = 1;          ///< from kungpu.target_spec (graph-wide).
+                                     ///<   Drives JIT kernels' block_x.
+                                     ///<   External cross-sectional kernels IGNORE
+                                     ///<   this — they auto-tune block_x
+                                     ///<   from numStocks (see
+                                     ///<   launchExtCsKernel).
+  int64_t vectorSize  = 1;          ///< from kungpu.target_spec (graph-wide)
+  Datatype dtype      = Datatype::Float;  ///< element type of every kernel
+                                           ///<   I/O.  Graph-wide; verified
+                                           ///<   at compile time.  Used by
+                                           ///<   the runtime to size the
+                                           ///<   intermediate slot pool.
+  std::vector<KernelMeta> kernels;  ///< unordered set; runtime topo-sorts
+  std::vector<std::string> graphInputs;
+  std::vector<std::string> graphOutputs;
+  /// Per-graph-output warmup depth.  Walks the full dependency chain
+  /// (across partitions).  Used by the user-facing
+  /// `Executable::getOutputUnreliableCount` to tell callers how many
+  /// leading time steps of each Output buffer to skip.  Populated by
+  /// the Python frontend (which has the pre-partition `infer_window`
+  /// snapshot); empty when not supplied.
+  std::unordered_map<std::string, int64_t> outputUnreliable;
+
+  /// Write this artifact as `<dir>/<name>.json` plus `<dir>/<name>.cubin`.
+  /// The JSON metadata stores only the sibling cubin filename, never an
+  /// arbitrary cubin path.
+  void saveToFiles(const std::string &dir, const std::string &name) const;
+
+  /// Load `<dir>/<name>.json` and `<dir>/<name>.cubin` into a new data object.
+  static std::shared_ptr<ExecutableData>
+  loadFromFiles(const std::string &dir, const std::string &name);
+};
+
+//===----------------------------------------------------------------------===//
+// Executable
+//===----------------------------------------------------------------------===//
+
+/// RAII wrapper around a loaded cubin + the resolved graph plan.
+///
+/// Construction:
+///   1. Resolve names → buffer indices (graphInputs first, graphOutputs
+///      next, intermediates last).
+///   2. Build per-kernel int-index I/O lists and a producer-of-each-buffer
+///      table.
+///   3. Validate the graph (single producer; every consumer either a
+///      graph input or has a producer; every graph output is produced).
+///   4. Kahn topo sort over kernel-to-kernel edges.
+///   5. Slot plan via refcount + LIFO free pool.
+///   6. cuModuleLoadData + cuModuleGetFunction × N on the calling
+///      thread's primary CUDA context (which must already exist).
+///
+/// Destruction frees only this Executable's per-launch slot pool / CUDA graph
+/// state.  CUDA modules live in a shared LoadedExecutable and are unloaded
+/// when the last Executable sharing it is destroyed.
+class Executable {
+public:
+  /// Throws std::runtime_error on driver errors, missing CUDA context,
+  /// or graph-validation failures.  `ExecutableData` is immutable after
+  /// compile and may be shared by multiple Executables.
+  explicit Executable(std::shared_ptr<const ExecutableData> data);
+  ~Executable();
+
+  // Non-copyable, non-movable — wrap in unique_ptr / shared_ptr if you
+  // need transferable ownership.
+  Executable(const Executable &)            = delete;
+  Executable &operator=(const Executable &) = delete;
+  Executable(Executable &&)                 = delete;
+  Executable &operator=(Executable &&)      = delete;
+
+  // ── Accessors (compile-time data) ─────────────────────────────────
+  const ExecutableData &data() const noexcept { return *data_; }
+  std::shared_ptr<const ExecutableData> dataPtr() const noexcept {
+    return data_;
+  }
+  const std::vector<std::string> &graphInputs()  const noexcept { return data_->graphInputs; }
+  const std::vector<std::string> &graphOutputs() const noexcept { return data_->graphOutputs; }
+  int64_t warpsPerCta() const noexcept { return data_->warpsPerCta; }
+  int64_t vectorSize()  const noexcept { return data_->vectorSize; }
+  Datatype dtype()      const noexcept { return data_->dtype; }
+  size_t  numKernels()  const noexcept { return data_->kernels.size(); }
+  const std::unordered_map<std::string, int64_t> &
+  outputUnreliable() const noexcept {
+    return data_->outputUnreliable;
+  }
+
+  /// Create a new Executable with independent mutable launch state while
+  /// sharing immutable ExecutableData and loaded CUDA modules / functions.
+  std::unique_ptr<Executable> clone() const;
+
+  // ── Accessors (runtime-resolved plan) ─────────────────────────────
+  // Defined out-of-line so the header doesn't need GraphPlan's layout.
+
+  /// Topo-sorted indices into `data().kernels` — the order the runtime
+  /// launches kernels on the single CUDA stream.
+  const std::vector<int> &launchOrder() const noexcept;
+  /// Total buffer-table slots = numGraphInputs + numGraphOutputs +
+  /// (number of distinct intermediates produced by kernels).
+  int  numBuffers()            const noexcept;
+  /// Number of physical intermediate buffers actually allocated by the
+  /// runtime (after slot reuse).
+  int  peakIntermediateSlots() const noexcept;
+
+  /// Launch every kernel in `launchOrder` asynchronously on `stream`.
+  /// **Does not synchronize** — the caller (typically `Executor::runGraph`
+  /// + `Executor::synchronize`) owns waiting for completion.
+  ///
+  /// `args` keys must equal `graphInputs ++ graphOutputs` (order
+  /// doesn't matter; the runtime hashes them into the buffer table).
+  /// In normal mode, intermediate buffers are owned by the executable and
+  /// reused across launches with matching `(timeLength, numStocks)`.  In CUDA
+  /// Graph mode, intermediates are graph allocation nodes with free nodes after
+  /// their last consumers.
+  ///
+  /// Grid configuration (per kernel — identical because warps_per_cta
+  /// and vector_size are graph-wide):
+  ///   block_x = warps_per_cta * 32
+  ///   grid_x  = ceil_div(numStocks, block_x * vector_size)
+  ///
+  /// The device's MAX_SHARED_MEMORY_PER_BLOCK_OPTIN is read from `exec`
+  /// (`Executor::devMaxSmemBytes()`, cached at Executor construction) so
+  /// the runtime can validate `num_stocks * sizeof(T)` against the GPU's
+  /// smem cap before invoking cuLaunchKernel for external cross-sectional
+  /// kernels.  The check is a no-op when the executable contains no
+  /// external kernels.
+  ///
+  /// Throws std::runtime_error on validation or driver errors.  This is
+  /// a low-level entry point — most users go through `Executor::runGraph`.
+  /// Multi-chunk parameters (`mask`, `minChunkWarmupFactor`,
+  /// `smFillFactor`) drive the time-axis chunk grid for JIT kernels:
+  ///   - `mask` is the user-visible prefix-skip on graph outputs.  The
+  ///     output array's time dim is `timeLength - mask`; chunk 0 begins
+  ///     writes at `t == mask`.
+  ///   - `minChunkWarmupFactor` (≥ 1) gates the minimum chunk size at
+  ///     `factor * kernel.unreliableCount`, so the warmup-overlap
+  ///     region of a non-first chunk stays ≤ `1 / factor` of total
+  ///     compute.
+  ///   - `smFillFactor` (≥ 0) is the target chunks-on-GPU multiplier:
+  ///     JIT uses `num_chunks * stock_tiles ≥ smFillFactor * numSMs`;
+  ///     external cross-sectional kernels use
+  ///     `num_time_chunks ≥ smFillFactor * numSMs`.  1.0
+  ///     just fills the GPU; > 1 leaves slack for scheduler latency
+  ///     hiding.
+  /// `exec` owns the CUDA stream + the cached device attributes
+  /// (`devMaxSmemBytes()`, `numSMs()`).  External cross-sectional kernels
+  /// ignore the multi-chunk params — they keep their own auto-tune
+  /// path using the same Executor accessors.
+  void launchOnStream(Executor *exec,
+                       int64_t timeLength, int64_t numStocks,
+                       const std::vector<std::pair<std::string, uintptr_t>> &args,
+                       int64_t mask = 0,
+                       int minChunkWarmupFactor = 4,
+                       double smFillFactor = 1.5,
+                       LaunchMode mode = LaunchMode::Normal);
+
+private:
+  /// Allocate (or re-allocate, if shape changed) the intermediate slot
+  /// pool.  Each slot holds one `T × S` array of `dtype()` elements.
+  void ensureSlotPool(int64_t timeLength, int64_t numStocks);
+  /// Free all slot allocations.  Called from dtor and on shape change.
+  void freeSlotPool();
+  void launchCudaGraphOnStream(
+      Executor *exec,
+      int64_t timeLength, int64_t numStocks,
+      const std::vector<std::pair<std::string, uintptr_t>> &args,
+      int64_t mask,
+      int minChunkWarmupFactor,
+      double smFillFactor);
+  void resetCudaGraphState() noexcept;
+
+  Executable(std::shared_ptr<const ExecutableData> data,
+             std::shared_ptr<LoadedExecutable> loaded);
+
+  std::shared_ptr<const ExecutableData> data_;
+  std::shared_ptr<LoadedExecutable> loaded_;
+  std::unique_ptr<CudaGraphLaunchState> cudaGraphState_;
+
+  // Lazily allocated intermediate buffers, one CUdeviceptr per slot
+  // (stored as uintptr_t to keep the header CUDA-free).
+  std::vector<uintptr_t> slotBufs_;
+  int64_t cachedT_ = -1;
+  int64_t cachedS_ = -1;
+};
+
+//===----------------------------------------------------------------------===//
+// Executor — wraps a CUDA stream and exposes the runGraph / synchronize
+// pair, mirroring the CPU `kun::Executor` shape.
+//
+// Default constructor uses the CUDA default (NULL) stream.  The
+// stream-injecting constructor lets callers reuse a stream they already
+// own (e.g. `cupy.cuda.Stream`'s `.ptr`); the Executor does NOT take
+// ownership and never destroys the stream.
+//
+// `runGraph` is asynchronous — it queues every kernel in the executable
+// onto this stream and returns immediately.  Call `synchronize` (or wait
+// on the stream by other means) before reading results back to host.
+//
+// Thread / Executable model: an `Executable`'s intermediate slot pool
+// is mutable state shared by every `runGraph` call against it.  Driving
+// the same Executable from two Executors concurrently is unsafe — pair
+// them 1:1, or serialize the calls externally.
+//===----------------------------------------------------------------------===//
+
+class Executor {
+public:
+  /// Use the CUDA default stream.
+  Executor();
+  /// Reuse a stream the caller owns (e.g. cupy's `.ptr`).  We do not
+  /// destroy it; lifetime is the caller's responsibility.
+  explicit Executor(CUstream stream);
+  ~Executor();
+
+  Executor(const Executor &)            = delete;
+  Executor &operator=(const Executor &) = delete;
+  Executor(Executor &&)                 = delete;
+  Executor &operator=(Executor &&)      = delete;
+
+  /// Queue all kernels in `exe` on this executor's stream.  Async — does
+  /// not synchronize.  Throws std::runtime_error on validation / driver
+  /// errors.
+  ///
+  /// `mask` skips the first `mask` time rows of every output (output
+  /// time dim = `timeLength - mask`).  `minChunkWarmupFactor` and
+  /// `smFillFactor` shape the multi-chunk grid heuristic — see
+  /// `Executable::launchOnStream` for the meaning.  Defaults are tuned
+  /// to "fill the GPU with mild scheduler slack" while keeping warmup
+  /// overhead ≤ ~25%.
+  void runGraph(Executable &exe,
+                int64_t timeLength, int64_t numStocks,
+                const std::vector<std::pair<std::string, uintptr_t>> &args,
+                int64_t mask = 0,
+                int minChunkWarmupFactor = 4,
+                double smFillFactor = 1.5,
+                LaunchMode mode = LaunchMode::Normal);
+
+  /// Block until all queued work on this stream completes.
+  void synchronize();
+
+  /// Raw stream handle (default-stream Executor returns nullptr).
+  CUstream stream() const noexcept { return stream_; }
+  /// Cached MAX_SHARED_MEMORY_PER_BLOCK_OPTIN of the device this
+  /// Executor's CUcontext is bound to, queried once at construction.
+  /// Used to validate external cross-sectional dynamic-smem requests at launch time
+  /// without a per-launch driver call.
+  int devMaxSmemBytes() const noexcept { return devMaxSmemBytes_; }
+  /// Cached MULTIPROCESSOR_COUNT of the device this Executor's CUcontext
+  /// is bound to.  Used by `runGraph` for the chunk-grid heuristic
+  /// (target num_chunks × stock_tiles ≈ smFillFactor × numSMs).
+  int numSMs() const noexcept { return numSMs_; }
+
+private:
+  CUstream stream_ = nullptr;
+  int devMaxSmemBytes_ = 0;
+  int numSMs_ = 0;
+};
+
+} // namespace kun_cuda
diff --git a/mlir/include/KunGpu/CMakeLists.txt b/mlir/include/KunGpu/CMakeLists.txt
new file mode 100644
index 0000000..4ea697a
--- /dev/null
+++ b/mlir/include/KunGpu/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_mlir_dialect(KunGpuOps kungpu)
+
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name KunGpu)
+add_mlir_dialect_tablegen_target(MLIRKunGpuPassIncGen)
diff --git a/mlir/include/KunGpu/KunGpuDialect.h b/mlir/include/KunGpu/KunGpuDialect.h
new file mode 100644
index 0000000..390ea7b
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuDialect.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include "mlir/IR/Dialect.h"
+
+// Generated by TableGen
+#include "KunGpu/KunGpuOpsDialect.h.inc"
diff --git a/mlir/include/KunGpu/KunGpuDialect.td b/mlir/include/KunGpu/KunGpuDialect.td
new file mode 100644
index 0000000..5095858
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuDialect.td
@@ -0,0 +1,19 @@
+#ifndef KUNGPU_DIALECT_TD
+#define KUNGPU_DIALECT_TD
+
+include "mlir/IR/OpBase.td"
+
+def KunGpu_Dialect : Dialect {
+  let name = "kungpu";
+  let summary = "KunQuant GPU dialect for NVGPU-targeted computation";
+  let description = [{
+    The kungpu dialect lowers kunir dataflow operations to explicit GPU
+    control flow with an NVIDIA GPU thread model. It retains !kunir.ts
+    types and introduces explicit time loops, stock-thread mapping, and
+    shared memory operations. Raw pointers appear only when lowering to
+    llvm+nvvm dialects.
+  }];
+  let cppNamespace = "::kungpu";
+}
+
+#endif // KUNGPU_DIALECT_TD
diff --git a/mlir/include/KunGpu/KunGpuOps.h b/mlir/include/KunGpu/KunGpuOps.h
new file mode 100644
index 0000000..27eafe5
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuOps.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunIr/KunIrTypes.h"
+
+// Generated by TableGen
+#define GET_OP_CLASSES
+#include "KunGpu/KunGpuOps.h.inc"
diff --git a/mlir/include/KunGpu/KunGpuOps.td b/mlir/include/KunGpu/KunGpuOps.td
new file mode 100644
index 0000000..6495469
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuOps.td
@@ -0,0 +1,183 @@
+#ifndef KUNGPU_OPS_TD
+#define KUNGPU_OPS_TD
+
+include "KunGpu/KunGpuDialect.td"
+include "KunIr/KunIrTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+class KunGpu_Op<string mnemonic, list<Trait> traits = []>
+    : Op<KunGpu_Dialect, mnemonic, traits>;
+
+//===----------------------------------------------------------------------===//
+// Thread/block indexing
+//===----------------------------------------------------------------------===//
+
+def KunGpu_StockIdOp : KunGpu_Op<"stock_id", [Pure]> {
+  let summary = "Get the stock index assigned to this GPU thread";
+  let description = [{
+    Returns the logical stock index for the current GPU thread.
+    Lowers to: blockIdx.x * blockDim.x + threadIdx.x
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+def KunGpu_BlockStockCountOp : KunGpu_Op<"block_stock_count", [Pure]> {
+  let summary = "Number of stocks handled per GPU block (blockDim.x)";
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+def KunGpu_TimeLengthOp : KunGpu_Op<"time_length", [Pure]> {
+  let summary = "Number of time steps this GPU kernel must process";
+  let description = [{
+    Returns the length of the time dimension the current kernel invocation
+    is responsible for. Each thread iterates over [0, time_length).
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Time-chunk loop bounds
+//
+// The outer time loop is split into multiple chunks across the y dimension
+// of the launch grid (one CTA per (stock_tile, time_chunk)).  These ops
+// encode the per-chunk loop bounds; lowering reads gpu.block_id y for
+// chunk_idx.  When num_chunks == 1 callers should set chunk_size =
+// time_length so only chunk 0 (the full range) runs.
+//===----------------------------------------------------------------------===//
+
+def KunGpu_TimeLbOp : KunGpu_Op<"time_lb", [Pure]> {
+  let summary = "Time-loop lower bound (inclusive) for the current chunk";
+  let description = [{
+    Returns:
+      chunk_idx == 0  →  0
+      otherwise       →  chunk_idx * chunk_size - warmup
+    The warmup overlap lets chunks ≥ 1 prime their windowed rolling state
+    over the trailing `warmup` steps of the previous chunk before they
+    start writing reliable outputs.  `chunk_idx` is `gpu.block_id y`.
+
+    No operands — chunk_size / warmup are runtime scalars that the
+    kungpu-to-llvm pass prepends to the gpu.func signature; lowering
+    reads them from fixed arg positions, mirroring `kungpu.time_length`.
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+def KunGpu_TimeUbOp : KunGpu_Op<"time_ub", [Pure]> {
+  let summary = "Time-loop upper bound (exclusive) for the current chunk";
+  let description = [{
+    Returns `min((chunk_idx + 1) * chunk_size, time_length)`.  `chunk_idx`
+    is `gpu.block_id y`.  The last chunk gets clipped to `time_length` so
+    `time_length` need not be a multiple of `chunk_size`.
+
+    No operands — see TimeLbOp; both chunk_size and time_length are read
+    from gpu.func args at lowering time.
+  }];
+  let results = (outs Index:$result);
+  let assemblyFormat = "attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Time-series memory ops
+//
+// kungpu.windowed_temp allocates a per-thread circular (ring) buffer.
+// kungpu.ts.get / kungpu.ts.put provide the bridge between the abstract
+// !kunir.ts<elemType, N> handle and actual scalar element loads/stores.
+// The `ts` operand of get/put must be a function argument, a windowed_temp
+// or an accumulator.  get's `offset` operand counts steps back from the
+// tail; the `result` (or `value` for put) must match the ts element type.
+//===----------------------------------------------------------------------===//
+
+def KunGpu_AccumulatorOp : KunGpu_Op<"accumulator"> {
+  let summary = "Allocate a single-slot per-thread accumulator (alloca)";
+  let description = [{
+    Allocates a per-thread single-slot register backing a `kunir.accumulator`.
+    The result is a `ts<T, 1>` handle that ts.put / ts.get treat at offset 0
+    only — there is no time dimension and no circular indexing.  The slot is
+    initialised to `init_val` at allocation time (default 0.0).
+
+    NOT Pure: each op carries its own `init_val` and identifies a distinct
+    slot, so dedup'ing same-name accumulators would silently merge state.
+  }];
+  let arguments = (ins StrAttr:$name,
+                       DefaultValuedAttr<F64Attr, "0.0">:$init_val);
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat = "$name `:` type($result) attr-dict";
+}
+
+def KunGpu_WindowedTempOp : KunGpu_Op<"windowed_temp", []> {
+  let summary = "Allocate a per-thread windowed (circular) temporary buffer";
+  let description = [{
+    Allocates a thread-local circular buffer whose element type and window
+    length are encoded in the result type `!kunir.ts<elemType, N>`.
+    The buffer is used as the backing store for a windowed reduction;
+    it must be the `ts` operand of `ts.get` or `ts.put`.
+
+    Memory placement is decided by the kungpu-memory-planning pass, which
+    sets a discardable `kungpu.smem` boolean attribute (true → shared
+    memory, false / absent → local).  Use `isSmem()` / `setSmem(bool)`
+    rather than reading the attribute by name.
+  }];
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat = "`:` type($result) attr-dict";
+  let extraClassDeclaration = [{
+    /// Memory-planning result: true if the buffer should live in shared
+    /// memory.  Defaults to false (local) when the attribute is absent.
+    bool isSmem() {
+      auto a = (*this)->getAttrOfType<::mlir::BoolAttr>("kungpu.smem");
+      return a && a.getValue();
+    }
+    /// Set the memory-placement flag (used by kungpu-memory-planning).
+    void setSmem(bool v) {
+      (*this)->setAttr("kungpu.smem",
+                        ::mlir::BoolAttr::get(getContext(), v));
+    }
+  }];
+}
+
+def KunGpu_TsGetOp : KunGpu_Op<"ts.get", [MemoryEffects<[MemRead]>]> {
+  let summary = "Read a scalar from a time series at a tail-relative offset";
+  let description = [{
+    Reads the per-stock element from time series `ts` at `offset` steps back
+    from the tail.  The tail is the most recently-written position:
+      offset = 0  →  latest value (just written by the most recent put)
+      offset = 1  →  one step earlier
+      offset = k  →  k steps earlier  (must be < ts.maxLookback for windowed_temp)
+
+    `offset` is i32 (64-bit ops are slow on GPUs); result type = element type of `ts`.
+    Declared MemRead, not Pure: the value read depends on preceding ts.put ops.
+
+    Example:
+      %v   = kungpu.ts.get %close[%c0_i32] : !kunir.ts<f32, inf> -> f32
+      %old = kungpu.ts.get %wt[%c2_i32]    : !kunir.ts<f32, 5>  -> f32
+  }];
+  let arguments = (ins KunIr_AnyTs:$ts, I32:$offset);
+  let results = (outs AnyFloat:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$ts `[` $offset `]` `:` type($ts) `->` type($result) attr-dict";
+}
+
+def KunGpu_TsPutOp : KunGpu_Op<"ts.put"> {
+  let summary = "Append a scalar to the tail of a time series";
+  let description = [{
+    Appends scalar `value` as the new tail of time series `ts`.  No offset:
+    a put always goes to the next writable position (advancing the tail).
+    Subsequent ts.get on the same `ts` with offset = 0 will see this value.
+    `value` must have the same type as the element type of `ts`.
+
+    Example:
+      kungpu.ts.put %wt,  %v : !kunir.ts<f32, 5>, f32
+      kungpu.ts.put %out, %v : !kunir.ts<f32, 1>, f32
+  }];
+  let arguments = (ins KunIr_AnyTs:$ts, AnyFloat:$value);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$ts `,` $value `:` type($ts) `,` type($value) attr-dict";
+}
+
+#endif // KUNGPU_OPS_TD
diff --git a/mlir/include/KunGpu/KunGpuUtils.h b/mlir/include/KunGpu/KunGpuUtils.h
new file mode 100644
index 0000000..1f0c270
--- /dev/null
+++ b/mlir/include/KunGpu/KunGpuUtils.h
@@ -0,0 +1,61 @@
+//===- KunGpuUtils.h - Lookup helpers for kungpu metadata on func ops ----===//
+//
+// After convert-kungpu-to-llvm lowers `kunir.func` to `gpu.func`, the
+// original kunir.func metadata (target spec, input/output names) is
+// preserved as discardable attributes on the new gpu.func.  Accessors take
+// `Operation*` so they also work on whatever the gpu.func is later
+// rewritten to (e.g. `llvm.func` after convert-gpu-to-nvvm).
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "KunIr/KunIrAttrs.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace kungpu {
+
+/// Discardable-attribute names used to attach kunir.func metadata to the
+/// kernel function after phase 1 of convert-kungpu-to-llvm.
+///
+/// These are `inline constexpr` (C++17) so that every translation unit that
+/// includes this header names the same object.  A plain namespace-scope
+/// `constexpr` variable has internal linkage; the inline accessors in this
+/// header odr-use these constants, so with internal linkage each translation
+/// unit's copy of an accessor would refer to a different entity.
+inline constexpr llvm::StringLiteral kFuncTargetSpecAttr  = "kungpu.target_spec";
+inline constexpr llvm::StringLiteral kFuncInputNamesAttr  = "kungpu.input_names";
+inline constexpr llvm::StringLiteral kFuncOutputNamesAttr = "kungpu.output_names";
+inline constexpr llvm::StringLiteral kFuncUnreliableCountAttr = "kungpu.unreliable_count";
+
+/// Fetch the `kungpu.target_spec` attribute from `fn`.  Yields a null
+/// attribute when nothing is stored under that name or when the stored
+/// attribute is not a TargetSpecAttr.
+inline ::kunir::TargetSpecAttr getFuncTargetSpec(::mlir::Operation *fn) {
+  ::mlir::Attribute stored = fn->getAttr(kFuncTargetSpecAttr);
+  return ::llvm::dyn_cast_or_null<::kunir::TargetSpecAttr>(stored);
+}
+/// Attach `spec` to `fn` under the `kungpu.target_spec` name.
+inline void setFuncTargetSpec(::mlir::Operation *fn,
+                              ::kunir::TargetSpecAttr spec) {
+  ::llvm::StringRef key = kFuncTargetSpecAttr;
+  fn->setAttr(key, spec);
+}
+
+/// Fetch the `kungpu.input_names` array from `fn`.  Yields a null attribute
+/// when nothing is stored under that name or when it is not an ArrayAttr.
+inline ::mlir::ArrayAttr getFuncInputNames(::mlir::Operation *fn) {
+  ::mlir::Attribute stored = fn->getAttr(kFuncInputNamesAttr);
+  return ::llvm::dyn_cast_or_null<::mlir::ArrayAttr>(stored);
+}
+/// Attach `names` to `fn` under the `kungpu.input_names` name.
+inline void setFuncInputNames(::mlir::Operation *fn,
+                              ::mlir::ArrayAttr names) {
+  ::llvm::StringRef key = kFuncInputNamesAttr;
+  fn->setAttr(key, names);
+}
+
+/// Fetch the `kungpu.output_names` array from `fn`.  Yields a null attribute
+/// when nothing is stored under that name or when it is not an ArrayAttr.
+inline ::mlir::ArrayAttr getFuncOutputNames(::mlir::Operation *fn) {
+  ::mlir::Attribute stored = fn->getAttr(kFuncOutputNamesAttr);
+  return ::llvm::dyn_cast_or_null<::mlir::ArrayAttr>(stored);
+}
+/// Attach `names` to `fn` under the `kungpu.output_names` name.
+inline void setFuncOutputNames(::mlir::Operation *fn,
+                               ::mlir::ArrayAttr names) {
+  ::llvm::StringRef key = kFuncOutputNamesAttr;
+  fn->setAttr(key, names);
+}
+
+/// Read the `kungpu.unreliable_count` integer from `fn`.  Falls back to 0 when
+/// the attribute is missing or is not an IntegerAttr.
+inline int64_t getFuncUnreliableCount(::mlir::Operation *fn) {
+  if (auto count =
+          fn->getAttrOfType<::mlir::IntegerAttr>(kFuncUnreliableCountAttr))
+    return count.getInt();
+  return 0;
+}
+/// Store `v` on `fn` as a 64-bit integer attribute named
+/// `kungpu.unreliable_count`.
+inline void setFuncUnreliableCount(::mlir::Operation *fn, int64_t v) {
+  auto i64Ty = ::mlir::IntegerType::get(fn->getContext(), 64);
+  auto countAttr = ::mlir::IntegerAttr::get(i64Ty, v);
+  fn->setAttr(kFuncUnreliableCountAttr, countAttr);
+}
+
+} // namespace kungpu
diff --git a/mlir/include/KunGpu/Passes.h b/mlir/include/KunGpu/Passes.h
new file mode 100644
index 0000000..cf47230
--- /dev/null
+++ b/mlir/include/KunGpu/Passes.h
@@ -0,0 +1,20 @@
+#pragma once
+
+// KunIrOps.h must be included before Passes.h.inc so that ::kunir::FuncOp
+// is fully declared when the OperationPass<::kunir::FuncOp> template is used.
+#include "KunIr/KunIrOps.h"
+#include "mlir/Pass/Pass.h"
+#include <memory>
+
+namespace kungpu {
+
+// Pull in the TableGen-generated pass declarations for KunGpu/Passes.td.
+#define GEN_PASS_DECL
+#include "KunGpu/Passes.h.inc"
+
+// Factory functions named by the `constructor` fields in KunGpu/Passes.td.
+// They are declared by hand here; their definitions are not in this header.
+std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass();
+std::unique_ptr<mlir::Pass> createConvertKunGpuToLLVMPass();
+
+// Pull in the generated registration helpers.  This include comes after the
+// factory declarations above, which the pass definitions name as constructors.
+#define GEN_PASS_REGISTRATION
+#include "KunGpu/Passes.h.inc"
+
+} // namespace kungpu
diff --git a/mlir/include/KunGpu/Passes.td b/mlir/include/KunGpu/Passes.td
new file mode 100644
index 0000000..d4f9167
--- /dev/null
+++ b/mlir/include/KunGpu/Passes.td
@@ -0,0 +1,67 @@
+#ifndef KUNGPU_PASSES_TD
+#define KUNGPU_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+// Memory-planning pass, anchored on ::kunir::FuncOp and exposed on the command
+// line as `kungpu-memory-planning`.  It is instantiated through
+// ::kungpu::createWindowedTempMemoryPlanningPass() and names the KunGpu
+// dialect as its only dependent dialect.
+def WindowedTempMemoryPlanning
+    : Pass<"kungpu-memory-planning", "::kunir::FuncOp"> {
+  let summary = "Assign shared/local memory to kungpu.windowed_temp ops";
+  let description = [{
+    Greedy memory planning pass for kungpu.windowed_temp allocations.
+
+    Sorts windowed_temp ops by ascending window size and assigns shared memory
+    to as many as fit within the per-block budget.  All parameters are read
+    from the enclosing kunir.func's target_spec attribute:
+
+      budget_per_block = target_spec.smem_size / target_spec.occupancy
+      num_threads      = target_spec.warps_per_cta * 32
+      bytes_per_buffer = N * num_threads * target_spec.vector_size * elem_bytes
+
+    The result is written as a discardable boolean attribute "kungpu.smem" on
+    each windowed_temp op.  The pass does not change IR structure; address-space
+    selection is deferred to the to-LLVM lowering.
+  }];
+  let constructor = "::kungpu::createWindowedTempMemoryPlanningPass()";
+  let dependentDialects = ["::kungpu::KunGpuDialect"];
+}
+
+// Module-level lowering pass (anchored on ::mlir::ModuleOp), exposed as
+// `convert-kungpu-to-llvm` and instantiated through
+// ::kungpu::createConvertKunGpuToLLVMPass().
+// NOTE(review): the description lists scf among the produced dialects, but
+// dependentDialects names only func, LLVM, arith and gpu -- confirm the pass
+// never creates scf ops itself.
+// NOTE(review): the text below says kunir.func becomes func.func, while the
+// header comment of KunGpu/KunGpuUtils.h says it becomes gpu.func -- confirm
+// which one is current.
+def ConvertKunGpuToLLVM
+    : Pass<"convert-kungpu-to-llvm", "::mlir::ModuleOp"> {
+  let summary = "Lower kungpu ops and kunir.func to func.func + LLVM/GPU dialects";
+  let description = [{
+    Converts the KunGpu dialect to a mix of func, LLVM, GPU, arith, and scf dialects.
+
+    Type conversion:
+      !kunir.ts<T,N> → !llvm.ptr
+
+    Function signature:
+      kunir.func @f(%a: !kunir.ts<T,N>, ...) → func.func @f(i64, i64, !llvm.ptr, ...)
+      The two prepended i64 arguments are time_len and num_stocks.
+
+    Memory ops:
+      kungpu.windowed_temp → llvm.alloca circular buffer + head-state alloca
+      kungpu.ts.get        → GEP + llvm.load
+      kungpu.ts.put        → advance circular head + GEP + llvm.store
+      Global ts (function arg): TxS layout — element(t,s) = base + t*num_stocks + s
+      windowed_temp: circular index without modulo (compare + conditional subtract)
+
+    Shared memory (kungpu.smem = true):
+      A module-level llvm.mlir.global with addr_space=3 is emitted once per
+      windowed_temp; each thread accesses its own N-element slice at
+      offset threadIdx.x * N.
+
+    Thread indexing:
+      kungpu.time_length       → arith.index_cast %time_len : i64 to index
+      kungpu.stock_id          → blockIdx.x * blockDim.x + threadIdx.x
+      kungpu.block_stock_count → blockDim.x
+  }];
+  let constructor = "::kungpu::createConvertKunGpuToLLVMPass()";
+  let dependentDialects = [
+    "::mlir::func::FuncDialect",
+    "::mlir::LLVM::LLVMDialect",
+    "::mlir::arith::ArithDialect",
+    "::mlir::gpu::GPUDialect"
+  ];
+}
+
+#endif // KUNGPU_PASSES_TD
diff --git a/mlir/include/KunGpu/Pipelines.h b/mlir/include/KunGpu/Pipelines.h
new file mode 100644
index 0000000..aaff937
--- /dev/null
+++ b/mlir/include/KunGpu/Pipelines.h
@@ -0,0 +1,47 @@
+//===- Pipelines.h - Reusable kunir → LLVM lowering pipeline -------------===//
+//
+// Defines the canonical lowering pipeline that converts a `kunir.func`-based
+// module all the way down to the LLVM dialect.  Phase ordering:
+//
+//   1. kunir-to-kungpu                    (kunir.func nested)
+//   2. kungpu-memory-planning             (kunir.func nested)
+//   3. convert-kungpu-to-llvm             (module — also lowers kunir.func
+//                                          to func.func)
+//   4. loop-invariant-code-motion         (per func)
+//   5. canonicalize
+//   6. cse
+//   7. convert-scf-to-cf
+//   8. convert-control-flow-to-llvm
+//   9. convert-arith-to-llvm
+//  10. convert-index-to-llvm
+//  11. convert-func-to-llvm
+//  12. reconcile-unrealized-casts
+//
+// `kunir_to_ptx` will reuse `buildKunIrToLLVMPipeline` and append the
+// gpu→nvvm/llvm-translation passes after it.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+class OpPassManager;
+} // namespace mlir
+
+namespace kungpu {
+
+/// Append the kunir → LLVM dialect lowering passes to `pm`.  This is the
+/// shared entry point used by both the test wrapper pass below and by any
+/// downstream pipeline that needs to lower further (e.g. kunir_to_ptx).
+void buildKunIrToLLVMPipeline(::mlir::OpPassManager &pm);
+
+/// Single-pass wrapper that runs `buildKunIrToLLVMPipeline` on the current
+/// module.  Mainly for lit-testing the pipeline as a whole.
+std::unique_ptr<::mlir::Pass> createKunIrToLLVMPass();
+
+/// NOTE(review): presumably registers the wrapper pass returned by
+/// `createKunIrToLLVMPass` so tools can select it by name -- the definition
+/// is not in this header, confirm there.
+void registerKunIrToLLVMPass();
+
+} // namespace kungpu
diff --git a/mlir/include/KunGpu/PtxBackend.h b/mlir/include/KunGpu/PtxBackend.h
new file mode 100644
index 0000000..16ed5b9
--- /dev/null
+++ b/mlir/include/KunGpu/PtxBackend.h
@@ -0,0 +1,81 @@
+//===- PtxBackend.h - Compile a kunir module to a CUDA cubin -----------===//
+//
+// Pipeline (single source of truth):
+//
+//   kunir → llvm dialect (our buildKunIrToLLVMPipeline)
+//        → upstream `gpu-module-to-binary{format=bin}`
+//        → cubin bytes pulled off the resulting `gpu.binary` op
+//
+// `gpu-module-to-binary` (via NVVMTargetAttrImpl) takes care of:
+//   * MLIR → LLVM IR translation,
+//   * libdevice.10.bc location + linking + AlwaysInline + DCE,
+//   * the LLVM optimization pipeline,
+//   * PTX emission via NVPTXTargetMachine,
+//   * ptxas invocation.
+//
+// We just attach an `#nvvm.target<chip = ..., O = ...>` to the gpu.module
+// and run the pass.  No more manual ptxas plumbing on the main path.
+//
+// `compileKunIrToPtx` is kept for **debug / inspection**: same pipeline,
+// but `format=isa` so we can read the PTX text instead of the cubin.
+// The main `compileKunIrToExecutable` does NOT route through it.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "KunCuda/Runtime.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+#include <string>
+
+namespace kungpu {
+
+/// Options that end up on the upstream `#nvvm.target` attribute, which the
+/// `gpu-module-to-binary` pass consumes through NVVMTargetAttrImpl.
+///
+/// `targetCpu` follows LLVM/NVPTX naming: it holds the SM string
+/// ("sm_80", "sm_120", …) understood by ptxas / NVPTXTargetMachine.  The
+/// corresponding user-facing Python kwarg is `gpu_arch`.
+struct PtxCompileOptions {
+  /// Optimization level; maps to #nvvm.target<O = N>.
+  unsigned optLevel{3};
+
+  /// Target triple.
+  std::string targetTriple{"nvptx64-nvidia-cuda"};
+
+  /// Chip, e.g. "sm_120".
+  std::string targetCpu{"sm_80"};
+
+  /// Target features; empty → derived from chip.
+  std::string targetFeatures{};
+
+  /// Forwarded to gpu-module-to-binary's `toolkit` option.  Empty → the
+  /// pass searches CUDA_HOME / CUDA_PATH / standard paths.  Useful when
+  /// the right CUDA toolkit (the one with libdevice.10.bc + a matching
+  /// ptxas) isn't on PATH.
+  std::string toolkitPath{};
+};
+
+/// Lower kunir → llvm dialect → emit PTX text.  **Debug / inspection
+/// only** — the main compile path goes straight to cubin.
+///
+/// On success `ptxOut` holds the PTX assembly produced by the upstream
+/// `gpu-module-to-binary{format=isa}` pass.  Module is mutated in place
+/// (the gpu.module gets replaced with a gpu.binary op).
+///
+/// `options` supplies the target / optimization knobs (see
+/// PtxCompileOptions).  The outcome is reported through the returned
+/// mlir::LogicalResult.
+::mlir::LogicalResult compileKunIrToPtx(::mlir::ModuleOp module,
+                                          const PtxCompileOptions &options,
+                                          std::string &ptxOut);
+
+/// Main entry point: lower kunir, run gpu-module-to-binary{format=bin},
+/// and pull the cubin + per-kernel name metadata into an
+/// `ExecutableData` ready for `kun_cuda::Executable`.
+///
+/// Walks the lowered module for kernel metadata (name, target spec,
+/// I/O names) BEFORE the pass runs, since `gpu-module-to-binary`
+/// replaces the gpu.module with a gpu.binary op.  graphInputs /
+/// graphOutputs are NOT set here — the caller fills them on `out`
+/// after this returns (see KunCuda/Runtime.h).
+///
+/// The module is mutated in-place by the pipeline.
+///
+/// `options` supplies the target / optimization knobs (see
+/// PtxCompileOptions).  The outcome is reported through the returned
+/// mlir::LogicalResult.
+::mlir::LogicalResult
+compileKunIrToExecutable(::mlir::ModuleOp module,
+                          const PtxCompileOptions &options,
+                          ::kun_cuda::ExecutableData &out);
+
+} // namespace kungpu
diff --git a/mlir/include/KunIr/CMakeLists.txt b/mlir/include/KunIr/CMakeLists.txt
new file mode 100644
index 0000000..15db184
--- /dev/null
+++ b/mlir/include/KunIr/CMakeLists.txt
@@ -0,0 +1,8 @@
+# TableGen targets for the kunir dialect (KunIrOps.td) and for its op
+# interfaces (KunIrInterfaces.td).
+add_mlir_dialect(KunIrOps kunir)
+add_mlir_interface(KunIrInterfaces)
+
+# AttrDef generation (add_mlir_dialect does not cover attrdefs).
+# Produces KunIrOpsAttrDefs.h.inc / .cpp.inc from KunIrOps.td; the .h.inc is
+# the file included by KunIr/KunIrAttrs.h.
+set(LLVM_TARGET_DEFINITIONS KunIrOps.td)
+mlir_tablegen(KunIrOpsAttrDefs.h.inc -gen-attrdef-decls -attrdefs-dialect=kunir)
+mlir_tablegen(KunIrOpsAttrDefs.cpp.inc -gen-attrdef-defs  -attrdefs-dialect=kunir)
+add_mlir_dialect_tablegen_target(MLIRKunIrAttrsIncGen)
diff --git a/mlir/include/KunIr/KunIrAttrs.h b/mlir/include/KunIr/KunIrAttrs.h
new file mode 100644
index 0000000..3e5eb76
--- /dev/null
+++ b/mlir/include/KunIr/KunIrAttrs.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "KunIr/KunIrDialect.h"
+
+#define GET_ATTRDEF_CLASSES
+#include "KunIr/KunIrOpsAttrDefs.h.inc"
diff --git a/mlir/include/KunIr/KunIrAttrs.td b/mlir/include/KunIr/KunIrAttrs.td
new file mode 100644
index 0000000..97a7ddc
--- /dev/null
+++ b/mlir/include/KunIr/KunIrAttrs.td
@@ -0,0 +1,30 @@
+#ifndef KUNIR_ATTRS_TD
+#define KUNIR_ATTRS_TD
+
+include "KunIr/KunIrDialect.td"
+
+// TargetSpec attribute: hardware parameters for a kunir.func op.
+// All four parameters are stored as int64_t.  hasCustomAssemblyFormat is set,
+// so the attribute's parser and printer are hand-written in C++ rather than
+// generated from a declarative format.
+def KunIr_TargetSpecAttr : AttrDef<KunIr_Dialect, "TargetSpec"> {
+  let mnemonic = "target_spec";
+  let summary = "Hardware target parameters for a kunir function";
+  let description = [{
+    Describes the GPU launch configuration for a kunir function:
+      occupancy      — target concurrent blocks per SM
+      warps_per_cta  — warps per thread block
+      smem_size      — total shared memory bytes available on one SM
+                       (per-block budget = smem_size / occupancy)
+      vector_size    — scalar elements per thread per time step
+
+    Printed inline inside kunir.func as:
+      target {occupancy = N, warps_per_cta = N, smem_size = N, vector_size = N}
+  }];
+  let parameters = (ins
+    "int64_t":$occupancy,
+    "int64_t":$warps_per_cta,
+    "int64_t":$smem_size,
+    "int64_t":$vector_size
+  );
+  let hasCustomAssemblyFormat = 1;
+}
+
+#endif // KUNIR_ATTRS_TD
diff --git a/mlir/include/KunIr/KunIrDialect.h b/mlir/include/KunIr/KunIrDialect.h
new file mode 100644
index 0000000..d8e7aa5
--- /dev/null
+++ b/mlir/include/KunIr/KunIrDialect.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/Dialect.h"
+
+// Generated by TableGen
+#include "KunIr/KunIrOpsDialect.h.inc"
diff --git a/mlir/include/KunIr/KunIrDialect.td b/mlir/include/KunIr/KunIrDialect.td
new file mode 100644
index 0000000..f92be23
--- /dev/null
+++ b/mlir/include/KunIr/KunIrDialect.td
@@ -0,0 +1,23 @@
+#ifndef KUNIR_DIALECT_TD
+#define KUNIR_DIALECT_TD
+
+include "mlir/IR/OpBase.td"
+
+// The kunir dialect record.  Generated C++ lives in namespace ::kunir, and the
+// default type and attribute printer/parser hooks are enabled.
+// registerTypes() / registerAttrs() are extra member declarations added to the
+// generated dialect class; their definitions are not part of this file.
+def KunIr_Dialect : Dialect {
+  let name = "kunir";
+  let summary = "KunQuant IR dialect for dataflow financial factor computation";
+  let description = [{
+    The kunir dialect represents financial factor computations as a pure
+    dataflow graph. Operations consume and produce !kunir.ts values; there
+    is no explicit time-dimension iteration or memory access at this level.
+  }];
+  let cppNamespace = "::kunir";
+  let useDefaultTypePrinterParser = 1;
+  let useDefaultAttributePrinterParser = 1;
+  let extraClassDeclaration = [{
+    void registerTypes();
+    void registerAttrs();
+  }];
+}
+
+#endif // KUNIR_DIALECT_TD
diff --git a/mlir/include/KunIr/KunIrInterfaces.h b/mlir/include/KunIr/KunIrInterfaces.h
new file mode 100644
index 0000000..ef673cc
--- /dev/null
+++ b/mlir/include/KunIr/KunIrInterfaces.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "llvm/ADT/StringRef.h"
+
+#define GET_OP_INTERFACE_CLASSES
+#include "KunIr/KunIrInterfaces.h.inc"
diff --git a/mlir/include/KunIr/KunIrInterfaces.td b/mlir/include/KunIr/KunIrInterfaces.td
new file mode 100644
index 0000000..e680468
--- /dev/null
+++ b/mlir/include/KunIr/KunIrInterfaces.td
@@ -0,0 +1,70 @@
+#ifndef KUNIR_INTERFACES_TD
+#define KUNIR_INTERFACES_TD
+
+include "mlir/IR/OpBase.td"
+
+//===----------------------------------------------------------------------===//
+// BinaryArithInterface
+// Implemented by the binary elemwise ops (add, sub, mul, div, max, min, and,
+// or) and by the binary comparison ops (gt, ge, lt, le, eq): both op base
+// classes in KunIrOps.td declare this interface's methods.
+//===----------------------------------------------------------------------===//
+
+def KunIr_BinaryArithInterface : OpInterface<"BinaryArithInterface"> {
+  let cppNamespace = "::kunir";
+  let description = [{
+    Factory interface for binary kunir ops.  Each implementing op knows how to
+    lower itself to its corresponding scalar arith/math op.
+  }];
+  let methods = [
+    // buildScalarOp(b, loc, lhs, rhs) -> Value: one method, two value operands.
+    InterfaceMethod<
+      "Build the scalar arith/math op corresponding to this binary kunir op.",
+      "::mlir::Value", "buildScalarOp",
+      (ins "::mlir::OpBuilder &":$b, "::mlir::Location":$loc,
+           "::mlir::Value":$lhs, "::mlir::Value":$rhs)>
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// UnaryArithInterface
+// Implemented by the unary elemwise ops (abs, log, exp, sqrt, sign, not): the
+// unary op base class in KunIrOps.td declares this interface's methods.
+//===----------------------------------------------------------------------===//
+
+def KunIr_UnaryArithInterface : OpInterface<"UnaryArithInterface"> {
+  let cppNamespace = "::kunir";
+  let description = [{
+    Factory interface for unary kunir ops.  Each implementing op knows how to
+    lower itself to its corresponding scalar arith/math op.
+  }];
+  let methods = [
+    // buildScalarOp(b, loc, operand) -> Value: one method, one value operand.
+    InterfaceMethod<
+      "Build the scalar arith/math op corresponding to this unary kunir op.",
+      "::mlir::Value", "buildScalarOp",
+      (ins "::mlir::OpBuilder &":$b, "::mlir::Location":$loc,
+           "::mlir::Value":$operand)>
+  ];
+}
+
+//===----------------------------------------------------------------------===//
+// ReduceArithInterface
+// Implemented by reduce_* ops (reduce_add, reduce_mul, reduce_max, reduce_min).
+//===----------------------------------------------------------------------===//
+
+def KunIr_ReduceArithInterface : OpInterface<"ReduceArithInterface"> {
+  let cppNamespace = "::kunir";
+  let description = [{
+    Factory interface for reduction kunir ops.  Provides the identity element
+    for iter_args initialisation and a factory that emits one accumulation step.
+  }];
+  let methods = [
+    // getInitValue takes a FloatType, so the identity element is only
+    // obtainable for floating-point element types through this interface.
+    InterfaceMethod<
+      "Return the identity (init) value for this reduction as a typed float attribute.",
+      "::mlir::TypedAttr", "getInitValue",
+      (ins "::mlir::FloatType":$elemType)>,
+    // buildAccumOp(b, loc, acc, elem) -> Value: combines the running
+    // accumulator with one element.
+    InterfaceMethod<
+      "Build one accumulation step: newAcc = reduce_op(oldAcc, element).",
+      "::mlir::Value", "buildAccumOp",
+      (ins "::mlir::OpBuilder &":$b, "::mlir::Location":$loc,
+           "::mlir::Value":$acc, "::mlir::Value":$elem)>
+  ];
+}
+
+#endif // KUNIR_INTERFACES_TD
diff --git a/mlir/include/KunIr/KunIrOps.h b/mlir/include/KunIr/KunIrOps.h
new file mode 100644
index 0000000..43f0a07
--- /dev/null
+++ b/mlir/include/KunIr/KunIrOps.h
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/ControlFlowInterfaces.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrInterfaces.h"
+#include "KunIr/KunIrTypes.h"
+
+// NativeOpTrait<"Name"> expands to ::mlir::OpTrait::Name in generated code,
+// so the trait class must live in that namespace.  We prefix it with KunIr
+// to avoid collisions with MLIR builtins.
+namespace mlir {
+namespace OpTrait {
+
+/// Trait for element-wise kunir ts ops.
+///
+/// Provides a shared static inferReturnTypes that yields
+/// !kunir.ts<firstOperand.elemType, 1>.  Ops that carry InferTypeOpInterface
+/// and this trait require no per-op inferReturnTypes definition.
+///
+/// Type inference can run before the op verifier (for example from the
+/// generated assembly parser, which resolves operands with whatever types
+/// were written in the source text), so operand 0 is not guaranteed to be a
+/// !kunir.ts at this point.  Inference therefore fails with a diagnostic,
+/// rather than asserting inside llvm::cast, when the operand list is empty or
+/// the first operand has a non-ts type.
+template <typename ConcreteType>
+class KunIrElemwiseTsResultType
+    : public TraitBase<ConcreteType, KunIrElemwiseTsResultType> {
+public:
+  /// Appends the inferred result type to `inferred`.  Returns failure (and
+  /// emits an error at `loc` when a location is supplied) if the first
+  /// operand is missing or is not a !kunir.ts.
+  static mlir::LogicalResult inferReturnTypes(
+      mlir::MLIRContext *ctx, std::optional<mlir::Location> loc,
+      mlir::ValueRange operands, mlir::DictionaryAttr,
+      mlir::OpaqueProperties, mlir::RegionRange,
+      llvm::SmallVectorImpl<mlir::Type> &inferred) {
+    if (operands.empty())
+      return mlir::emitOptionalError(
+          loc, "expected at least one !kunir.ts operand to infer the result "
+               "type");
+    // dyn_cast instead of cast: malformed input must produce a diagnostic.
+    auto inputTy = llvm::dyn_cast<::kunir::TsType>(operands[0].getType());
+    if (!inputTy)
+      return mlir::emitOptionalError(
+          loc, "expected first operand to be a !kunir.ts, but got ",
+          operands[0].getType());
+    inferred.push_back(
+        ::kunir::TsType::get(ctx, inputTy.getElementType(), 1));
+    return mlir::success();
+  }
+};
+
+/// Result-type inference shared by the kunir comparison ops
+/// (gt/ge/lt/le/eq).
+///
+/// The inferred type is !kunir.ts<i1, 1> no matter what the operands are;
+/// the operand list is not inspected.
+template <typename ConcreteType>
+class KunIrCmpTsResultType
+    : public TraitBase<ConcreteType, KunIrCmpTsResultType> {
+public:
+  static mlir::LogicalResult inferReturnTypes(
+      mlir::MLIRContext *ctx, std::optional<mlir::Location>,
+      mlir::ValueRange, mlir::DictionaryAttr,
+      mlir::OpaqueProperties, mlir::RegionRange,
+      llvm::SmallVectorImpl<mlir::Type> &inferred) {
+    auto boolTy = mlir::IntegerType::get(ctx, 1);
+    auto resultTy = ::kunir::TsType::get(ctx, boolTy, 1);
+    inferred.push_back(resultTy);
+    return mlir::success();
+  }
+};
+
+} // namespace OpTrait
+} // namespace mlir
+
+// Convenient aliases in the kunir namespace.  The trait classes themselves
+// live in ::mlir::OpTrait under KunIr-prefixed names; these alias templates
+// expose them as ::kunir::OpTrait::ElemwiseTsResultType and
+// ::kunir::OpTrait::CmpTsResultType.
+namespace kunir {
+namespace OpTrait {
+template <typename ConcreteType>
+using ElemwiseTsResultType =
+    ::mlir::OpTrait::KunIrElemwiseTsResultType<ConcreteType>;
+template <typename ConcreteType>
+using CmpTsResultType =
+    ::mlir::OpTrait::KunIrCmpTsResultType<ConcreteType>;
+} // namespace OpTrait
+} // namespace kunir
+
+// Generated by TableGen
+#define GET_OP_CLASSES
+#include "KunIr/KunIrOps.h.inc"
diff --git a/mlir/include/KunIr/KunIrOps.td b/mlir/include/KunIr/KunIrOps.td
new file mode 100644
index 0000000..7739cde
--- /dev/null
+++ b/mlir/include/KunIr/KunIrOps.td
@@ -0,0 +1,569 @@
+#ifndef KUNIR_OPS_TD
+#define KUNIR_OPS_TD
+
+include "KunIr/KunIrDialect.td"
+include "KunIr/KunIrTypes.td"
+include "KunIr/KunIrInterfaces.td"
+include "KunIr/KunIrAttrs.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/IR/OpBase.td"
+include "mlir/IR/SymbolInterfaces.td"
+
+// Base class for every kunir op: binds the record to KunIr_Dialect and
+// forwards the mnemonic and the (default-empty) trait list to Op.
+class KunIr_Op<string mnemonic, list<Trait> traits = []>
+    : Op<KunIr_Dialect, mnemonic, traits>;
+
+//===----------------------------------------------------------------------===//
+// Element-wise binary ops
+//
+// Inputs may have different maxLookback values but must share the same element
+// type. The result always has maxLookback = 1 (only the current value).
+//
+// Caller-supplied traits are concatenated ahead of the fixed list (Pure,
+// InferTypeOpInterface, the KunIrElemwiseTsResultType native trait, and the
+// BinaryArithInterface method declarations).  The assembly form spells only
+// the two operand types; the result type is not written.
+//===----------------------------------------------------------------------===//
+
+class KunIr_BinaryElemwiseOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        InferTypeOpInterface,
+        NativeOpTrait<"KunIrElemwiseTsResultType">,
+        DeclareOpInterfaceMethods<KunIr_BinaryArithInterface>
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$lhs, KunIr_AnyTs:$rhs);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$lhs `,` $rhs `:` type($lhs) `,` type($rhs) attr-dict";
+}
+
+// Arithmetic binary ops.  Each def supplies only a mnemonic and a summary;
+// operands, result, verifier hook, traits and assembly format all come from
+// KunIr_BinaryElemwiseOp.
+def KunIr_AddOp : KunIr_BinaryElemwiseOp<"add"> {
+  let summary = "Element-wise addition of two time series";
+}
+def KunIr_SubOp : KunIr_BinaryElemwiseOp<"sub"> {
+  let summary = "Element-wise subtraction of two time series";
+}
+def KunIr_MulOp : KunIr_BinaryElemwiseOp<"mul"> {
+  let summary = "Element-wise multiplication of two time series";
+}
+def KunIr_DivOp : KunIr_BinaryElemwiseOp<"div"> {
+  let summary = "Element-wise division of two time series";
+}
+def KunIr_MaxOp : KunIr_BinaryElemwiseOp<"max"> {
+  let summary = "Element-wise maximum of two time series";
+}
+def KunIr_MinOp : KunIr_BinaryElemwiseOp<"min"> {
+  let summary = "Element-wise minimum of two time series";
+}
+
+//===----------------------------------------------------------------------===//
+// Element-wise unary ops
+//
+// The input may have any maxLookback. The result always has maxLookback = 1.
+//
+// Caller-supplied traits are concatenated ahead of the fixed list (Pure,
+// InferTypeOpInterface, the KunIrElemwiseTsResultType native trait, and the
+// UnaryArithInterface method declarations).  The assembly form spells only
+// the input type; the result type is not written.
+//===----------------------------------------------------------------------===//
+
+class KunIr_UnaryElemwiseOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        InferTypeOpInterface,
+        NativeOpTrait<"KunIrElemwiseTsResultType">,
+        DeclareOpInterfaceMethods<KunIr_UnaryArithInterface>
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$input);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$input `:` type($input) attr-dict";
+}
+
+// Arithmetic unary ops.  Each def supplies only a mnemonic and a summary;
+// everything else is inherited from KunIr_UnaryElemwiseOp.
+def KunIr_AbsOp  : KunIr_UnaryElemwiseOp<"abs">  {
+  let summary = "Element-wise absolute value";
+}
+def KunIr_LogOp  : KunIr_UnaryElemwiseOp<"log">  {
+  let summary = "Element-wise natural logarithm";
+}
+def KunIr_ExpOp  : KunIr_UnaryElemwiseOp<"exp">  {
+  let summary = "Element-wise natural exponential";
+}
+def KunIr_SqrtOp : KunIr_UnaryElemwiseOp<"sqrt"> {
+  let summary = "Element-wise square root";
+}
+def KunIr_SignOp : KunIr_UnaryElemwiseOp<"sign"> {
+  let summary = "Element-wise sign (-1, 0, 1)";
+}
+
+//===----------------------------------------------------------------------===//
+// Logical ops on i1 ts
+//
+// And/Or are binary, Not is unary.  Operand element types must be i1 and the
+// result is always ts<i1, 1>.  Verifier enforces the i1 constraint; the
+// elemwise traits above already give us the right result-type inference
+// (result = ts<input.elem, 1> = ts<i1, 1> when input is i1).
+//
+// NOTE(review): these defs reuse the arithmetic elemwise base classes, whose
+// operands are plain KunIr_AnyTs, so the i1 restriction is not expressed in
+// ODS here; it depends entirely on the C++ verifier, which is not visible in
+// this file.
+//===----------------------------------------------------------------------===//
+
+def KunIr_AndOp : KunIr_BinaryElemwiseOp<"and"> {
+  let summary = "Element-wise logical AND on ts<i1>";
+}
+def KunIr_OrOp  : KunIr_BinaryElemwiseOp<"or"> {
+  let summary = "Element-wise logical OR on ts<i1>";
+}
+def KunIr_NotOp : KunIr_UnaryElemwiseOp<"not"> {
+  let summary = "Element-wise logical NOT on ts<i1>";
+}
+
+//===----------------------------------------------------------------------===//
+// Comparison ops
+//
+// Operands are two ts values with matching element types; result is always
+// !kunir.ts<i1, 1>.  Lowering dispatches arith.cmpf for float operands and
+// arith.cmpi for integer operands.
+//
+// Same shape as the binary elemwise base class, except that result-type
+// inference comes from the KunIrCmpTsResultType native trait.  The assembly
+// form spells only the two operand types.
+//===----------------------------------------------------------------------===//
+
+class KunIr_BinaryCmpOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        InferTypeOpInterface,
+        NativeOpTrait<"KunIrCmpTsResultType">,
+        DeclareOpInterfaceMethods<KunIr_BinaryArithInterface>
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$lhs, KunIr_AnyTs:$rhs);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$lhs `,` $rhs `:` type($lhs) `,` type($rhs) attr-dict";
+}
+
+// Comparison ops.  Each def supplies only a mnemonic and a summary;
+// everything else is inherited from KunIr_BinaryCmpOp.
+def KunIr_GreaterOp      : KunIr_BinaryCmpOp<"gt"> {
+  let summary = "Element-wise greater-than (lhs > rhs)";
+}
+def KunIr_GreaterEqualOp : KunIr_BinaryCmpOp<"ge"> {
+  let summary = "Element-wise greater-or-equal (lhs >= rhs)";
+}
+def KunIr_LessOp         : KunIr_BinaryCmpOp<"lt"> {
+  let summary = "Element-wise less-than (lhs < rhs)";
+}
+def KunIr_LessEqualOp    : KunIr_BinaryCmpOp<"le"> {
+  let summary = "Element-wise less-or-equal (lhs <= rhs)";
+}
+def KunIr_EqualOp        : KunIr_BinaryCmpOp<"eq"> {
+  let summary = "Element-wise equality (lhs == rhs)";
+}
+
+//===----------------------------------------------------------------------===//
+// Select op — cond ? true_value : false_value
+//
+// cond is ts<i1, *>; true_value / false_value share the same element type T.
+// Result is ts<T, 1>.
+//
+// Unlike the elemwise and comparison ops, select declares the
+// InferTypeOpInterface methods itself instead of using one of the shared
+// result-type traits.  The assembly form spells the three operand types only.
+//===----------------------------------------------------------------------===//
+
+def KunIr_SelectOp : KunIr_Op<"select", [
+    Pure,
+    DeclareOpInterfaceMethods<InferTypeOpInterface>
+]> {
+  let summary = "Element-wise select on three ts values";
+  let arguments = (ins KunIr_AnyTs:$cond,
+                       KunIr_AnyTs:$true_value,
+                       KunIr_AnyTs:$false_value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$cond `,` $true_value `,` $false_value `:` type($cond) `,` "
+    "type($true_value) `,` type($false_value) attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Constant op
+//
+// A scalar value lifted into a ts<T, 1> at every time step.  The value
+// attribute is f64; kunir-to-kungpu converts it to an arith.constant of
+// the result's element type (f32, f64, i1, ...).  Pass NaN by storing
+// `0x7FF8000000000000` (the quiet-NaN bit pattern) into the f64 attr.
+//
+// The op has no operands, so the result type is spelled explicitly in the
+// assembly form (type($result)).
+//===----------------------------------------------------------------------===//
+
+def KunIr_ConstantOp : KunIr_Op<"constant", [Pure]> {
+  let summary = "Scalar constant broadcast to every time step";
+  let arguments = (ins F64Attr:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$value `:` type($result) attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Accumulator / SetAccumulator
+//
+// A scalar "register" that survives across time steps within one kernel
+// invocation.  Modeled as a ts<T, 1> handle so existing ts.get / ts.put
+// load and store its current value.  The handle storage is owned by a
+// single LLVM alloca after kungpu-to-llvm lowering.
+//
+// `kunir.accumulator` is NOT Pure: each op identifies a distinct slot
+// with its own `init_val`, so CSE-ing two accumulators to one slot would
+// silently merge their state.
+//
+// `kunir.set_accumulator` is NOT Pure: writing to the slot is a side
+// effect and must not be CSE'd or hoisted past dependent reads.  When the
+// scalar `mask` is true at the current time step the accumulator is
+// overwritten with `value`; otherwise it retains the previous value.
+//===----------------------------------------------------------------------===//
+
+// `init_val` is an f64 attribute defaulting to 0.0.  It is not named in the
+// assembly format, so in textual IR it is carried by attr-dict.
+def KunIr_AccumulatorOp : KunIr_Op<"accumulator"> {
+  let summary = "Stateful single-slot scalar register (read via ts.get @0)";
+  let arguments = (ins StrAttr:$name,
+                       DefaultValuedAttr<F64Attr, "0.0">:$init_val);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat = "$name `:` type($result) attr-dict";
+}
+
+// Three ts operands (acc, mask, value) and one ts result, all constrained only
+// to KunIr_AnyTs in ODS; tighter checks are left to the C++ verifier.  No
+// traits are attached, and all four types are spelled in the assembly form.
+def KunIr_SetAccumulatorOp : KunIr_Op<"set_accumulator"> {
+  let summary = "Conditionally overwrite an accumulator slot";
+  let description = [{
+    `acc` must be the result of a `kunir.accumulator`.  When the scalar
+    `mask` is true at the current time step, stores `value` into the
+    accumulator slot; otherwise the slot is unchanged.  Side-effecting
+    (NOT Pure): never CSE / dedup.
+
+    Returns the slot's new value for the current step — i.e.
+    `mask ? value : prev_accumulator`.  Downstream consumers can read
+    this directly without a second `ts.get` of the slot.
+  }];
+  let arguments = (ins KunIr_AnyTs:$acc,
+                       KunIr_AnyTs:$mask,
+                       KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let hasVerifier = 1;
+  let assemblyFormat =
+    "$acc `,` $mask `,` $value `:` type($acc) `,` type($mask) `,` type($value) `->` type($result) attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Cross-sectional ops
+//
+// kunir has no first-class cross-sectional op.  Cross-sectional kernels
+// (currently only cs_rank) are routed by the Python frontend
+// (KunQuant.passes.CodegenMLIR._maybe_external_partition) directly to a
+// pre-compiled CUmodule bundled with the runtime — they never appear
+// in kunir IR or the MLIR pipeline.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// WindowedOutput op  (KunIr_WindowedOutputOp, defined after OutputRef below)
+//
+// Materialises a lookback window: takes a ts of any maxLookback and produces
+// a ts with a specific maxLookback.  The element types must match.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// OutputRef op
+//
+// Expose a graph-output buffer as a ts handle so the same kernel can
+// both write to it and read trailing values from it.  `$value` is the
+// scalar to write at the current step.  Lowering hoists the write to
+// the op site so reads at offset 0 see the just-written scalar.  At
+// most one output_ref per name.
+//===----------------------------------------------------------------------===//
+
+// Binds the graph output called `name` to a ts handle; `value` is the
+// scalar written at the current step.
+// NOTE(review): the op is marked Pure although the section comment above
+// describes lowering emitting a write to the output buffer — confirm that
+// CSE/DCE of an output_ref is intended.
+def KunIr_OutputRefOp : KunIr_Op<"output_ref", [Pure]> {
+  let summary = "ts handle to a graph-output buffer";
+  let arguments = (ins
+    StrAttr:$name,
+    KunIr_AnyTs:$value
+  );
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat =
+    "$name `,` $value `:` type($value) `->` type($result) attr-dict";
+  let hasVerifier = 1;
+}
+
+// Re-types a ts so that its maxLookback equals `length`; the constraints
+// listed in the description are checked by the hand-written verifier.
+def KunIr_WindowedOutputOp : KunIr_Op<"windowed_output", [Pure]> {
+  let summary = "Store a segment of a time-series stream with fixed lookback";
+  let description = [{
+    Takes an input ts with any maxLookback and produces a ts whose
+    maxLookback equals the `length` attribute.  This op is used to
+    explicitly bound how far back subsequent windowed ops may look.
+
+    Constraints:
+      - result element type == input element type
+      - result.maxLookback == length
+  }];
+  let arguments = (ins
+    KunIr_AnyTs:$input,
+    I64Attr:$length
+  );
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat =
+    "$input `[` `length` `=` $length `]` `:` type($input) `->` type($result) attr-dict";
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Region terminator
+//===----------------------------------------------------------------------===//
+
+// Terminator for kunir region bodies; forwards zero or more values.
+def KunIr_YieldOp : KunIr_Op<"yield", [Pure, Terminator, ReturnLike]> {
+  let summary = "Yield zero or more scalar values from a kunir region body";
+  let arguments = (ins Variadic<AnyType>:$values);
+  // Operand-less builder: ensureTerminator needs to be able to create an
+  // empty yield.
+  let builders = [OpBuilder<(ins), [{}]>];
+  let assemblyFormat = "($values^ `:` type($values))? attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// ForEachBackWindow op
+//
+// Accepts N input ts values, iterates over their shared back window, and
+// produces M ts<elemType, 1> results via kunir.reduce_* ops and kunir.yield.
+//
+// Body block:
+//   - Has exactly N arguments, one per input.
+//   - Block arg i has type !kunir.ts<input_i.elemType, 1>  (the current slice).
+//   - May contain kunir.reduce_* ops that accumulate the block arguments.
+//   - Must terminate with kunir.yield returning M ts<elemType, 1> values,
+//     one per result of this op.
+//
+// Constraints:
+//   - window > 0
+//   - For each input i: input_i.maxLookback >= window (verified)
+//   - body block arg count == inputs count
+//   - results count == yield operands count
+//   - Each result has maxLookback == 1
+//===----------------------------------------------------------------------===//
+
+// Windowed reduction container: N ts inputs, one single-block region that
+// is implicitly terminated by kunir.yield, M ts results.  Parsing and
+// printing are hand-written (hasCustomAssemblyFormat).
+def KunIr_ForEachBackWindowOp : KunIr_Op<"for_each_back_window",
+    [Pure, SingleBlockImplicitTerminator<"::kunir::YieldOp">]> {
+  let summary = "Apply reductions over a sliding back window of N time series";
+  let description = [{
+    For each time step t, iterates over the window [t-window+1 .. t] of every
+    input ts and applies the reductions described in the region body.
+    Each input ts contributes one block argument of type ts<elemType_i, 1>
+    representing the current window slice.
+    kunir.reduce_* ops accumulate those values; kunir.yield collects the
+    results which become the M outputs of this op, each of type ts<elemType, 1>.
+
+    Every input's maxLookback must be >= window (or inf).
+  }];
+  let arguments = (ins
+    Variadic<KunIr_AnyTs>:$inputs,
+    I64Attr:$window
+  );
+  let results = (outs Variadic<KunIr_AnyTs>:$results);
+  let regions = (region SizedRegion<1>:$body);
+  let hasCustomAssemblyFormat = 1;
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Reduction ops (must appear inside a ForEachBackWindow body)
+//
+// Each op takes a !kunir.ts<elemType, 1> value (the current window slice
+// from the enclosing ForEachBackWindow block argument) and returns the
+// accumulated !kunir.ts<elemType, 1> result after the window is traversed.
+// SameOperandsAndResultType enforces that input and result types match.
+//===----------------------------------------------------------------------===//
+
+// Shared shape of the single-state reductions: one ts operand, one ts result
+// of the same type, plus the ReduceArithInterface methods.  Caller-supplied
+// traits come first in the final trait list.
+class KunIr_ReduceOp<string mnemonic, list<Trait> traits = []>
+    : KunIr_Op<mnemonic, !listconcat(traits, [
+        Pure,
+        SameOperandsAndResultType,
+        DeclareOpInterfaceMethods<KunIr_ReduceArithInterface>
+    ])> {
+  let arguments = (ins KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  // Only the operand type is printed; the result type is the same type
+  // (SameOperandsAndResultType).
+  let assemblyFormat = "$value `:` type($value) attr-dict";
+  let hasVerifier = 1;
+}
+
+// Concrete single-state reductions.  Each record only supplies a mnemonic
+// and a summary; operands, results, traits and format come from
+// KunIr_ReduceOp.
+def KunIr_ReduceAddOp : KunIr_ReduceOp<"reduce_add"> {
+  let summary = "Sum reduction over the back window";
+}
+def KunIr_ReduceMulOp : KunIr_ReduceOp<"reduce_mul"> {
+  let summary = "Product reduction over the back window";
+}
+def KunIr_ReduceMaxOp : KunIr_ReduceOp<"reduce_max"> {
+  let summary = "Maximum reduction over the back window";
+}
+def KunIr_ReduceMinOp : KunIr_ReduceOp<"reduce_min"> {
+  let summary = "Minimum reduction over the back window";
+}
+
+//===----------------------------------------------------------------------===//
+// Multi-accumulator reductions — NaN-propagating, used by TsArgMin/Max/TsRank.
+// These don't fit the single-state `ReduceArithInterface` (argmin/max track
+// both the running best value *and* its window-relative position; rank tracks
+// less-count + equal-count).  The FBW lowering in `KunIrToKunGpu.cpp`
+// special-cases them to emit the matching N-iter-arg scf.for body.
+//===----------------------------------------------------------------------===//
+
+// ArgMin over the window.  Derives from KunIr_Op directly rather than
+// KunIr_ReduceOp, so it does not declare the ReduceArithInterface methods.
+def KunIr_ReduceArgMinOp : KunIr_Op<"reduce_argmin",
+    [Pure, SameOperandsAndResultType]> {
+  let summary = "ArgMin reduction — window-relative index (window-1-w) of "
+                  "the smallest element; NaN propagates.";
+  let arguments = (ins KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat = "$value `:` type($value) attr-dict";
+  let hasVerifier = 1;
+}
+// ArgMax over the window.  Derives from KunIr_Op directly rather than
+// KunIr_ReduceOp, so it does not declare the ReduceArithInterface methods.
+def KunIr_ReduceArgMaxOp : KunIr_Op<"reduce_argmax",
+    [Pure, SameOperandsAndResultType]> {
+  let summary = "ArgMax reduction — window-relative index (window-1-w) of "
+                  "the largest element; NaN propagates.";
+  let arguments = (ins KunIr_AnyTs:$value);
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat = "$value `:` type($value) attr-dict";
+  let hasVerifier = 1;
+}
+// Rank reduction with two operands: the iterated window value and the
+// `current` value it is ranked against.  Only `value` and `result` are tied
+// together by AllTypesMatch; the type of `current` is written out separately
+// in the assembly.
+def KunIr_ReduceRankOp : KunIr_Op<"reduce_rank",
+    [Pure, AllTypesMatch<["value", "result"]>]> {
+  let summary = "Per-window cross-sectional rank of `current` against the "
+                  "iterated window values: less_count + (eq_count + 1) / 2.";
+  let arguments = (ins
+    KunIr_AnyTs:$value,
+    KunIr_AnyTs:$current
+  );
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat =
+    "$value `,` $current `:` type($value) `,` type($current) attr-dict";
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// WindowLoopIndex — current FBW step index, 0 = oldest, window-1 = newest.
+// Result is the integer index converted to the function's element type.
+//===----------------------------------------------------------------------===//
+
+// Operand-less op: only a result type appears in the textual form.
+def KunIr_WindowLoopIndexOp : KunIr_Op<"window_loop_index", [Pure]> {
+  let summary = "Current step index of the enclosing for_each_back_window";
+  let results = (outs KunIr_AnyTs:$result);
+  let assemblyFormat = "`:` type($result) attr-dict";
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// BackRef — read input value at t - window
+//
+// At time step t, returns input[t - window]; semantically NaN when t < window
+// (the upstream pipeline writes the high time-domain region first, so cells
+// before t = window-1 should be ignored by the user).
+//
+// The op carries through `kunir-to-kungpu` in two forms:
+//   * Source kunir:  result = !kunir.ts<T, 1>
+//   * Lowered form:  result = T (scalar) — a single ts.get on the ts handle
+//
+// Constraints:
+//   * input.maxLookback >= window + 1   (or inf — function args)
+//   * window > 0
+//   * if result is ts: must be ts<input.elemType, 1>
+//   * if result is scalar: must equal input.elemType
+//===----------------------------------------------------------------------===//
+
+// The result is AnyType on purpose: per the section comment the op exists
+// both with a ts result (source form) and a scalar result (lowered form);
+// the verifier is what ties the result type to the input.
+def KunIr_BackRefOp : KunIr_Op<"back_ref", [Pure]> {
+  let summary = "Reference the input value `window` time steps in the past";
+  let arguments = (ins
+    KunIr_AnyTs:$input,
+    I64Attr:$window
+  );
+  let results = (outs AnyType:$result);
+  let assemblyFormat =
+    "$input `[` `window` `=` $window `]` `:` type($input) `->` type($result) attr-dict";
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// FastWindowedSum — running sum over the trailing `window` time steps
+//
+// Equivalent to KunQuant's CPU `FastWindowedSum<T, stride, window>` (see
+// cpp/Kun/Ops.hpp:325).  Per time step:
+//
+//     v_t = v_{t-1}                              (init 0, Kahan-corrected)
+//         - input[t-window]   if input[t-window] not NaN
+//         + input[t]          if input[t]        not NaN
+//     out_t = (num_nans == 0) ? v_t : NaN
+//
+// The op is preserved past `kunir-to-kungpu` (with ts→scalar result rewrite)
+// because its 4-element per-thread state (v, compAdd, compSub, num_nans) is
+// most naturally materialised as LLVM allocas in the kungpu-to-llvm pass.
+// See FastWindowedSumPattern there.
+//
+// Constraints (same as BackRef): input.maxLookback >= window + 1, window > 0.
+//===----------------------------------------------------------------------===//
+
+// Same operand/attribute/result layout and textual form as back_ref; the
+// result is AnyType because the op outlives the ts-to-scalar result rewrite
+// described in the section comment.
+def KunIr_FastWindowedSumOp : KunIr_Op<"fast_windowed_sum", [Pure]> {
+  let summary = "Stateful rolling-window sum with Kahan compensation + NaN tracking";
+  let arguments = (ins
+    KunIr_AnyTs:$input,
+    I64Attr:$window
+  );
+  let results = (outs AnyType:$result);
+  let assemblyFormat =
+    "$input `[` `window` `=` $window `]` `:` type($input) `->` type($result) attr-dict";
+  let hasVerifier = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// FuncOp — function with named I/O and hardware target spec
+//
+// Custom assembly format:
+//
+//   kunir.func @name(%arg0: type0, ...)
+//       inputs {%arg0 = "name0", ...}
+//       outputs {"out0", ...}            // non-void: one string per result
+//       target {occupancy = V, warps_per_cta = V, smem_size = V}
+//       unreliable_count = N
+//       -> (result_type0, ...) {
+//     body
+//   }
+//
+//   kunir.func @name(%arg0: type0, %arg1: type1)
+//       inputs {%arg0 = "input0"}
+//       outputs {%arg1 = "output0"}      // void: %argN = "name" form
+//       target {...}
+//       unreliable_count = N {
+//     body
+//   }
+//
+// Constraints (void case):   len(inputs) + len(outputs) == len(block_args)
+// Constraints (non-void):    len(inputs) == len(block_args),
+//                            len(outputs) == num_results
+//
+// `unreliable_count` is the per-partition warmup depth: max over all
+// outputs of (sum of windowed op windows from any input to that output).
+// Required because callers (runtime) need it to size the time-axis
+// chunk grid and to thread the kernel's warmup arg.
+//===----------------------------------------------------------------------===//
+
+// Function-like container op.  The default builders are suppressed; the only
+// builder is the six-argument one declared below (its body is not defined
+// in this record).  Parsing/printing are hand-written.
+def KunIr_FuncOp : KunIr_Op<"func",
+    [IsolatedFromAbove, Symbol,
+     SingleBlockImplicitTerminator<"::kunir::ReturnOp">]> {
+  let summary = "KunQuant function with named I/O and hardware target spec";
+
+  let arguments = (ins
+    StrAttr:$sym_name,
+    TypeAttr:$function_type,
+    ArrayAttr:$input_names,
+    ArrayAttr:$output_names,
+    KunIr_TargetSpecAttr:$target_spec,
+    SI64Attr:$unreliable_count
+  );
+  let regions = (region SizedRegion<1>:$body);
+
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+
+  let skipDefaultBuilders = 1;
+  let builders = [
+    OpBuilder<(ins "::llvm::StringRef":$name,
+                   "::mlir::FunctionType":$type,
+                   "::mlir::ArrayAttr":$inputNames,
+                   "::mlir::ArrayAttr":$outputNames,
+                   "::kunir::TargetSpecAttr":$targetSpec,
+                   "int64_t":$unreliableCount)>
+  ];
+
+  let extraClassDeclaration = [{
+    // getFunctionType() is generated by tablegen and returns mlir::Type.
+    // This typed helper casts it for callers that need mlir::FunctionType.
+    ::mlir::FunctionType getFunctionTypeTyped() {
+      return llvm::cast<::mlir::FunctionType>(getFunctionType());
+    }
+    ::mlir::Block &getBodyBlock() { return getBody().front(); }
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// ReturnOp — terminator for kunir.func body
+//===----------------------------------------------------------------------===//
+
+// Terminator that is only legal directly inside kunir.func (HasParent).
+def KunIr_ReturnOp : KunIr_Op<"return",
+    [Pure, Terminator, ReturnLike, HasParent<"::kunir::FuncOp">]> {
+  let summary = "Return from a kunir.func body";
+  let arguments = (ins Variadic<AnyType>:$operands);
+  // Operand-less builder so SingleBlockImplicitTerminator (ensureTerminator)
+  // can create an empty return.
+  let builders = [OpBuilder<(ins), [{}]>];
+  let assemblyFormat = "($operands^ `:` type($operands))? attr-dict";
+  let hasVerifier = 1;
+}
+
+#endif // KUNIR_OPS_TD
diff --git a/mlir/include/KunIr/KunIrTypes.h b/mlir/include/KunIr/KunIrTypes.h
new file mode 100644
index 0000000..eaa424a
--- /dev/null
+++ b/mlir/include/KunIr/KunIrTypes.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
+
+// Emit class declarations (storage struct is only forward-declared here;
+// the full definition lives in KunIrTypes.cpp via KunIrOpsTypes.cpp.inc).
+#define GET_TYPEDEF_CLASSES
+#include "KunIr/KunIrOpsTypes.h.inc"
diff --git a/mlir/include/KunIr/KunIrTypes.td b/mlir/include/KunIr/KunIrTypes.td
new file mode 100644
index 0000000..b885bb3
--- /dev/null
+++ b/mlir/include/KunIr/KunIrTypes.td
@@ -0,0 +1,35 @@
+#ifndef KUNIR_TYPES_TD
+#define KUNIR_TYPES_TD
+
+include "KunIr/KunIrDialect.td"
+include "mlir/IR/BuiltinTypeInterfaces.td"
+
+// Base class for kunir types: a TypeDef in KunIr_Dialect whose mnemonic is
+// taken from `typeMnemonic`.
+class KunIr_Type<string name, string typeMnemonic, list<Trait> traits = []>
+    : TypeDef<KunIr_Dialect, name, traits> {
+  let mnemonic = typeMnemonic;
+}
+
+// !kunir.ts — parameterised by an element type and a uint64 maxLookback.
+// The textual form is hand-written (hasCustomAssemblyFormat).
+def KunIr_TsType : KunIr_Type<"Ts", "ts"> {
+  let summary = "Time series type for financial data";
+  let description = [{
+    Represents a time series of financial data across all stocks.
+    Logically a [num_stocks x num_time] array; both dimensions are
+    determined at runtime. The ts type is a pure value flowing between
+    kunir ops — it carries no load/store semantics at this level.
+
+    The maxLookback parameter (uint64) controls how many past time steps
+    this stream retains:
+      1        — only the current value (no history)
+      N        — up to N past values available for windowed ops
+      UINT64_MAX — unlimited history (printed/parsed as "inf"), used for
+                   function input parameters
+  }];
+  let parameters = (ins
+    "::mlir::Type":$elementType,
+    "uint64_t":$maxLookback
+  );
+  let hasCustomAssemblyFormat = 1;
+}
+
+// Type constraint for use in op definitions: satisfied by any
+// ::kunir::TsType, whatever its element type or maxLookback.
+def KunIr_AnyTs : Type<CPred<"::llvm::isa<::kunir::TsType>($_self)">,
+                        "kunir time series type", "::kunir::TsType">;
+
+#endif // KUNIR_TYPES_TD
diff --git a/mlir/include/KunIr/Passes.h b/mlir/include/KunIr/Passes.h
new file mode 100644
index 0000000..6bc4c3e
--- /dev/null
+++ b/mlir/include/KunIr/Passes.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <memory>
+
+namespace mlir { class Pass; }
+
+namespace kunir {
+// Presumably registers the kunir-to-kungpu pass with MLIR's pass registry —
+// the definition is not in this header; confirm in the implementation.
+void registerKunIrToKunGpuPass();
+// Returns a newly allocated pass instance owned by the caller.  Only
+// mlir::Pass is forward-declared here, so callers that use the result need
+// the full MLIR pass header themselves.
+std::unique_ptr<::mlir::Pass> createKunIrToKunGpuPass();
+} // namespace kunir
diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt
new file mode 100644
index 0000000..f2c6f50
--- /dev/null
+++ b/mlir/lib/CMakeLists.txt
@@ -0,0 +1,4 @@
+# One sub-directory per component; each one is configured by its own
+# CMakeLists.txt.
+add_subdirectory(KunIr)
+add_subdirectory(KunCuda)
+add_subdirectory(KunGpu)
+add_subdirectory(Python)
diff --git a/mlir/lib/KunCuda/CMakeLists.txt b/mlir/lib/KunCuda/CMakeLists.txt
new file mode 100644
index 0000000..ffea52e
--- /dev/null
+++ b/mlir/lib/KunCuda/CMakeLists.txt
@@ -0,0 +1,117 @@
+# libKunCudaRuntime.so — CUDA runtime, decoupled from the MLIR compiler
+# library and the Python binding.
+
+# ── CUDA language setup ──────────────────────────────────────────────
+# The root CMakeLists discovers CUDAToolkit once when KUN_BUILD_MLIR is
+# enabled.  Reuse its CMAKE_CUDA_COMPILER / CUDA::cuda_driver target here.
+enable_language(CUDA)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+# sm_75 (Turing) baseline.  cs_rank — and any other bundled kernel
+# we add to kernels/ — uses no arch-specific instructions, so sm_75
+# covers every NVIDIA GPU CUDA 13 still supports.  Later GPUs run
+# the PTX via driver-JIT (cached system-wide in ~/.nv/ComputeCache).
+set(CMAKE_CUDA_ARCHITECTURES 75)
+
+# ── Recipe: compile `.cu` → `.ptx` → embed as a C array ──────────────
+# Drop a new `kernels/<name>.cu` and the foreach below picks it up —
+# the kernel becomes available to Runtime.cpp via
+# `#include "<name>_ptx.inc"` exposing `kun_<name>_ptx[]`.
+#
+# nvcc stamps a `.version` directive on every emitted PTX matching
+# its own toolkit release, which the deployed CUDA driver may not
+# yet understand.  Our kernels here all use baseline features
+# available since PTX 7.x, so EmbedFile.cmake's PTX_VERSION knob
+# rewrites the `.version` directive to 7.8 before embedding —
+# accepted by every driver from R520 (CUDA 11.8) onwards, the release
+# that introduced PTX ISA 7.8.  The same embed step also rewrites the
+# `.target` directive to sm_61 through the SM_VERSION knob.  Same
+# header-rewrite trick Triton uses when its LLVM emits new ISA.
+set(_kun_ptx_embed_includes "")
+set(_kun_embed_cmake "${CMAKE_CURRENT_SOURCE_DIR}/EmbedFile.cmake")
+
+# kun_add_bundled_ptx_kernel(<cu_path>)
+#
+# Compiles one `.cu` file to PTX and adds a custom command that embeds the
+# PTX as `kun_<stem>_ptx[]` in `<current binary dir>/<stem>_ptx.inc`.  The
+# path of that generated file is appended to `_kun_ptx_embed_includes` in
+# the caller's scope.
+function(kun_add_bundled_ptx_kernel cu_path)
+  get_filename_component(_kernel "${cu_path}" NAME_WE)
+  set(_ptx_target "KunPtxObj_${_kernel}")
+  set(_embed_sym  "kun_${_kernel}_ptx")
+  set(_embed_out  "${CMAKE_CURRENT_BINARY_DIR}/${_kernel}_ptx.inc")
+
+  # OBJECT library with CUDA_PTX_COMPILATION: CMake runs nvcc with `-ptx`,
+  # so the target's "objects" are .ptx files, one per source.
+  add_library(${_ptx_target} OBJECT "${cu_path}")
+  set_target_properties(${_ptx_target} PROPERTIES CUDA_PTX_COMPILATION ON)
+  target_compile_options(${_ptx_target} PRIVATE
+      $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
+
+  # Run EmbedFile.cmake in script mode at build time.
+  # `$<TARGET_OBJECTS:...>` is only resolved at generate time, which is why
+  # it is used both as the INPUT value and as a dependency.  Besides
+  # embedding, the script rewrites `.version` to 7.8 and `.target` to sm_61.
+  add_custom_command(
+      OUTPUT  "${_embed_out}"
+      COMMAND "${CMAKE_COMMAND}"
+              -DINPUT=$<TARGET_OBJECTS:${_ptx_target}>
+              -DOUTPUT=${_embed_out}
+              -DSYMBOL=${_embed_sym}
+              -DPTX_VERSION=7.8
+              -DSM_VERSION=61
+              -P "${_kun_embed_cmake}"
+      DEPENDS ${_ptx_target} $<TARGET_OBJECTS:${_ptx_target}> "${_kun_embed_cmake}"
+      COMMENT "Embedding ${_kernel}.ptx as ${_embed_sym}[] (downgrading to ISA 7.8)"
+      VERBATIM
+      COMMAND_EXPAND_LISTS)
+
+  # The function sees a copy of the caller's list: extend the copy, then
+  # publish the extended value back to the caller.
+  list(APPEND _kun_ptx_embed_includes "${_embed_out}")
+  set(_kun_ptx_embed_includes "${_kun_ptx_embed_includes}" PARENT_SCOPE)
+endfunction()
+
+# All bundled kernels live in kernels/.  CONFIGURE_DEPENDS makes
+# ninja/make re-run cmake when a new .cu is added — no manual re-config.
+file(GLOB _kun_cu_sources CONFIGURE_DEPENDS
+     "${CMAKE_CURRENT_SOURCE_DIR}/kernels/*.cu")
+foreach(_cu IN LISTS _kun_cu_sources)
+  kun_add_bundled_ptx_kernel("${_cu}")
+endforeach()
+
+# The generated *_ptx.inc files are listed as sources so that the library
+# depends on the add_custom_command outputs that produce them; they are
+# consumed via #include rather than compiled on their own.
+add_library(KunCudaRuntime SHARED
+    ExecutableData.cpp
+    Runtime.cpp
+    RuntimeCudaGraph.cpp
+    ${_kun_ptx_embed_includes})
+
+# Project-wide compile flags set -fvisibility=hidden + inlines-hidden to
+# minimise the size of MLIR static libs.  This shared runtime needs to
+# export its public class methods so downstream .so's (KunMLIR, host
+# runners, …) can resolve them at load time.
+#
+# We also put the .so next to the KunMLIR Python module
+# (INSTALL_RPATH=$ORIGIN at the top level): all co-distributed shared libs
+# live in one directory and find each other as siblings.
+set_target_properties(KunCudaRuntime PROPERTIES
+    CXX_VISIBILITY_PRESET default
+    VISIBILITY_INLINES_HIDDEN OFF
+    LIBRARY_OUTPUT_DIRECTORY "${KUN_MLIR_PYTHON_PACKAGE_DIR}")
+
+target_include_directories(KunCudaRuntime PUBLIC
+    "${PROJECT_SOURCE_DIR}/mlir/include")
+
+# Runtime.cpp #include's the generated <kernel>_ptx.inc files.
+target_include_directories(KunCudaRuntime PRIVATE
+    "${CMAKE_CURRENT_BINARY_DIR}")
+
+# ExecutableData.cpp uses nlohmann/json for artifact metadata.  This is a
+# header-only third-party dependency kept private to the runtime library.
+target_include_directories(KunCudaRuntime SYSTEM PRIVATE
+    "${PROJECT_SOURCE_DIR}/3rdparty/nlohmann_json/include")
+
+# CUDA Driver API.  `CUDA::cuda_driver` is FindCUDAToolkit's imported
+# target wrapping `lib64/stubs/libcuda.so` with the right INCLUDE
+# INTERFACE — gets cuda.h + the link-time stub in one go.  This dep
+# is PRIVATE: downstream consumers see only Runtime.h, which never
+# includes <cuda.h>.
+target_link_libraries(KunCudaRuntime PRIVATE CUDA::cuda_driver)
+
+# Like the Python module, this shared library has no CPython symbols to
+# satisfy the global -Wl,-z,defs check; but we *do* link the CUDA stub
+# so all libcuda symbols *are* resolved.  Nothing to strip here.
diff --git a/mlir/lib/KunCuda/EmbedFile.cmake b/mlir/lib/KunCuda/EmbedFile.cmake
new file mode 100644
index 0000000..6c71895
--- /dev/null
+++ b/mlir/lib/KunCuda/EmbedFile.cmake
@@ -0,0 +1,50 @@
+# Embed a binary file as a C++ unsigned-char array.
+#
+# Usage: cmake -DINPUT=foo.ptx -DOUTPUT=foo_ptx.inc -DSYMBOL=kun_cs_rank_ptx
+#              [-DPTX_VERSION=7.8]   # optional: rewrite `.version` directives
+#              [-DSM_VERSION=61]     # optional: rewrite `.target sm_NN` directives
+#              -P EmbedFile.cmake
+#
+# Produces (in OUTPUT):
+#   static const unsigned char SYMBOL[] = { 0x12, 0x34, ..., 0x00 };
+#   static const unsigned int  SYMBOL_len = N;
+#
+# One extra 0x00 byte is appended after the N payload bytes (presumably so
+# the PTX text can be passed to an API that expects a NUL-terminated
+# string); SYMBOL_len counts the payload only.
+#
+# If PTX_VERSION is set, the input is read as text and every
+# `.version X.Y` match is replaced before encoding — useful when nvcc
+# emits a newer ISA than the deployed CUDA driver supports.  If SM_VERSION
+# is set as well, every `.target sm_NN` match is rewritten to
+# sm_<SM_VERSION> in the same pass.  SM_VERSION has no effect unless
+# PTX_VERSION is set.
+
+if(NOT INPUT OR NOT OUTPUT OR NOT SYMBOL)
+  message(FATAL_ERROR
+      "EmbedFile.cmake: INPUT, OUTPUT and SYMBOL must all be set")
+endif()
+
+if(PTX_VERSION)
+  file(READ "${INPUT}" text_content)
+  # `\t` (single backslash) is the CMake-language tab escape, so the bracket
+  # expression contains a real tab.  A regex-level `\\t` would match the
+  # letter `t`, not a tab.
+  string(REGEX REPLACE "\\.version[ \t]+[0-9.]+" ".version ${PTX_VERSION}"
+                       text_content "${text_content}")
+  # Retarget only on request: with SM_VERSION unset the replacement would
+  # otherwise write a truncated `.target sm_` directive.
+  if(SM_VERSION)
+    string(REGEX REPLACE "\\.target[ \t]+sm_[0-9]+" ".target sm_${SM_VERSION}"
+                         text_content "${text_content}")
+  endif()
+  # Round-trip through a scratch file to obtain the hex encoding of the
+  # patched text.
+  set(_patched "${OUTPUT}.raw.ptx")
+  file(WRITE "${_patched}" "${text_content}")
+  file(READ "${_patched}" hex_content HEX)
+  file(REMOVE "${_patched}")
+else()
+  file(READ "${INPUT}" hex_content HEX)
+endif()
+string(LENGTH "${hex_content}" hex_len)
+math(EXPR n_bytes "${hex_len} / 2")
+
+# "abcd" → "0xab,0xcd,"
+string(REGEX REPLACE "(..)" "0x\\1," byte_list "${hex_content}")
+# Trim the trailing comma.
+string(REGEX REPLACE ",$" "" byte_list "${byte_list}")
+# Insert a newline every 16 bytes to keep the generated file diffable.
+string(REGEX REPLACE "(0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,0x..,)"
+                     "\\1\n  " byte_list "${byte_list}")
+
+# Append the terminating NUL.  An empty input has no payload bytes, so the
+# separating comma must be omitted to keep the initializer valid C++.
+if(hex_len EQUAL 0)
+  set(initializer "0x00")
+else()
+  set(initializer "${byte_list},0x00")
+endif()
+
+file(WRITE "${OUTPUT}"
+"// Generated from \"${INPUT}\".  Do not edit by hand.
+static const unsigned char ${SYMBOL}[] = {
+  ${initializer}
+};
+static const unsigned int ${SYMBOL}_len = ${n_bytes};
+")
diff --git a/mlir/lib/KunCuda/ExecutableData.cpp b/mlir/lib/KunCuda/ExecutableData.cpp
new file mode 100644
index 0000000..b47c333
--- /dev/null
+++ b/mlir/lib/KunCuda/ExecutableData.cpp
@@ -0,0 +1,449 @@
+//===- ExecutableData.cpp - serialize kun_cuda executable artifacts -------===//
+//
+// Stores ExecutableData as a JSON metadata file plus a sibling cubin binary.
+// The public API is name-based: callers provide only a directory and artifact
+// name, and the implementation owns the `<name>.json` / `<name>.cubin`
+// convention.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunCuda/Runtime.h"
+
+#include <nlohmann/json.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace kun_cuda {
+namespace {
+
+using json = nlohmann::ordered_json;
+namespace fs = std::filesystem;
+
+constexpr const char *kFormat = "kun_cuda_executable_data";
+constexpr int64_t kVersion = 1;
+
+// Throws std::runtime_error when `name` is empty or contains '/' or '\\'.
+static void validateArtifactName(const std::string &name) {
+  if (name.empty()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: artifact name must be non-empty");
+  }
+
+  const bool hasSeparator = name.find_first_of("/\\") != std::string::npos;
+  if (hasSeparator) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: artifact name must not contain path "
+        "separators");
+  }
+}
+
+// Joins `dir` and `fileName` with std::filesystem path semantics and returns
+// the result as a string.
+static std::string joinArtifactPath(const std::string &dir,
+                                    const std::string &fileName) {
+  return (fs::path(dir) / fileName).string();
+}
+
+// File name of the JSON metadata for artifact `name`.
+static std::string jsonFileName(const std::string &name) {
+  std::string fileName(name);
+  fileName += ".json";
+  return fileName;
+}
+
+// File name of the cubin binary for artifact `name`.
+static std::string cubinFileName(const std::string &name) {
+  std::string fileName(name);
+  fileName += ".cubin";
+  return fileName;
+}
+
+// Creates `dir` together with any missing parents.  An empty `dir` is a
+// no-op; a failure reported by the filesystem is rethrown as
+// std::runtime_error.
+static void ensureDirectory(const std::string &dir) {
+  if (!dir.empty()) {
+    std::error_code status;
+    fs::create_directories(dir, status);
+    if (status) {
+      throw std::runtime_error(
+          "kun_cuda::ExecutableData: failed to create directory '" + dir +
+          "': " + status.message());
+    }
+  }
+}
+
+// Writes `text` to `path`, replacing any existing content.  The stream is
+// opened in binary mode, so the bytes are written unchanged.  Throws
+// std::runtime_error if the file cannot be opened or written.
+static void writeTextFile(const std::string &path, const std::string &text) {
+  std::ofstream out(path, std::ios::binary);
+  if (out.fail()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path +
+        "' for writing");
+  }
+
+  const auto length = static_cast<std::streamsize>(text.size());
+  out.write(text.data(), length);
+  if (out.fail()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to write '" + path + "'");
+  }
+}
+
+// Writes `bytes` to `path`, replacing any existing content.  An empty vector
+// still creates (or truncates) the file; write() is simply skipped.  Throws
+// std::runtime_error if the file cannot be opened or written.
+static void writeBinaryFile(const std::string &path,
+                            const std::vector<char> &bytes) {
+  std::ofstream out(path, std::ios::binary);
+  if (out.fail()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path +
+        "' for writing");
+  }
+
+  if (!bytes.empty()) {
+    const auto length = static_cast<std::streamsize>(bytes.size());
+    out.write(bytes.data(), length);
+  }
+  if (out.fail()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to write '" + path + "'");
+  }
+}
+
+// Reads the whole of `path` into a string (binary mode, no translation).
+// Throws std::runtime_error if the file cannot be opened, or if the stream
+// reports a failure that is not end-of-file after the read.
+static std::string readTextFile(const std::string &path) {
+  std::ifstream in(path, std::ios::binary);
+  if (in.fail()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path +
+        "' for reading");
+  }
+
+  std::string contents;
+  contents.assign(std::istreambuf_iterator<char>(in),
+                  std::istreambuf_iterator<char>());
+  if (in.fail() && !in.eof()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to read '" + path + "'");
+  }
+  return contents;
+}
+
+// Reads the whole of `path` into a byte vector.  Throws std::runtime_error
+// if the file cannot be opened, or if the stream reports a failure that is
+// not end-of-file after the read.
+static std::vector<char> readBinaryFile(const std::string &path) {
+  std::ifstream in(path, std::ios::binary);
+  if (in.fail()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to open '" + path +
+        "' for reading");
+  }
+
+  std::vector<char> contents;
+  contents.assign(std::istreambuf_iterator<char>(in),
+                  std::istreambuf_iterator<char>());
+  if (in.fail() && !in.eof()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: failed to read '" + path + "'");
+  }
+  return contents;
+}
+
+// Serialized spelling of a Datatype.  Throws std::runtime_error for a value
+// outside the enumerators handled below.
+static const char *toString(Datatype dtype) {
+  const char *label = nullptr;
+  switch (dtype) {
+  case Datatype::Float:
+    label = "f32";
+    break;
+  case Datatype::Double:
+    label = "f64";
+    break;
+  }
+  if (label == nullptr)
+    throw std::runtime_error("kun_cuda::ExecutableData: unknown datatype");
+  return label;
+}
+
+// Inverse of toString(Datatype).  `jsonPath` is only used in the error
+// message raised for an unrecognised spelling.
+static Datatype parseDatatype(const std::string &text,
+                              const std::string &jsonPath) {
+  const bool isFloat = (text == "f32");
+  if (isFloat || text == "f64")
+    return isFloat ? Datatype::Float : Datatype::Double;
+
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: unsupported dtype '" + text +
+      "' in '" + jsonPath + "'");
+}
+
+// Serialized spelling of a KernelKind.  Throws std::runtime_error for a
+// value outside the enumerators handled below.
+static const char *toString(KernelKind kind) {
+  const char *label = nullptr;
+  switch (kind) {
+  case KernelKind::Jit:
+    label = "jit";
+    break;
+  case KernelKind::ExtCsRankF32:
+    label = "ext_cs_rank_f32";
+    break;
+  case KernelKind::ExtCsRankF64:
+    label = "ext_cs_rank_f64";
+    break;
+  case KernelKind::ExtCsScaleF32:
+    label = "ext_cs_scale_f32";
+    break;
+  case KernelKind::ExtCsScaleF64:
+    label = "ext_cs_scale_f64";
+    break;
+  }
+  if (label == nullptr)
+    throw std::runtime_error("kun_cuda::ExecutableData: unknown kernel kind");
+  return label;
+}
+
+// Inverse of toString(KernelKind).  `jsonPath` and `fieldPath` are only used
+// in the error message raised for an unrecognised spelling.
+static KernelKind parseKernelKind(const std::string &text,
+                                  const std::string &jsonPath,
+                                  const std::string &fieldPath) {
+  static const std::pair<const char *, KernelKind> kKnownKinds[] = {
+      {"jit", KernelKind::Jit},
+      {"ext_cs_rank_f32", KernelKind::ExtCsRankF32},
+      {"ext_cs_rank_f64", KernelKind::ExtCsRankF64},
+      {"ext_cs_scale_f32", KernelKind::ExtCsScaleF32},
+      {"ext_cs_scale_f64", KernelKind::ExtCsScaleF64},
+  };
+  for (const auto &entry : kKnownKinds) {
+    if (text == entry.first)
+      return entry.second;
+  }
+
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: unsupported kernel kind '" + text +
+      "' at " + fieldPath + " in '" + jsonPath + "'");
+}
+
+// Returns `value` unchanged when it is a JSON object; otherwise throws
+// std::runtime_error naming `fieldPath` and `jsonPath`.
+static const json &requireObject(const json &value,
+                                 const std::string &jsonPath,
+                                 const std::string &fieldPath) {
+  if (value.is_object())
+    return value;
+
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: expected object at " + fieldPath +
+      " in '" + jsonPath + "'");
+}
+
+// Looks up member `fieldName` of `object`.  Throws std::runtime_error when
+// `object` is not a JSON object or does not have that member.
+static const json &requireField(const json &object,
+                                const std::string &jsonPath,
+                                const std::string &fieldPath,
+                                const char *fieldName) {
+  requireObject(object, jsonPath, fieldPath);
+
+  const auto entry = object.find(fieldName);
+  if (entry != object.end())
+    return *entry;
+
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: missing field " + fieldPath + "." +
+      fieldName + " in '" + jsonPath + "'");
+}
+
+// Extracts a string from `value`; throws std::runtime_error when `value` is
+// not a JSON string.
+static std::string getString(const json &value,
+                             const std::string &jsonPath,
+                             const std::string &fieldPath) {
+  if (value.is_string())
+    return value.get<std::string>();
+
+  throw std::runtime_error(
+      "kun_cuda::ExecutableData: expected string at " + fieldPath +
+      " in '" + jsonPath + "'");
+}
+
+// Extracts a signed 64-bit integer from `value`.
+//
+// Throws std::runtime_error when `value` is not a JSON integer, or when it is
+// an unsigned integer larger than INT64_MAX (which get<int64_t>() would
+// otherwise convert to a negative number without any diagnostic).
+static int64_t getInt64(const json &value,
+                        const std::string &jsonPath,
+                        const std::string &fieldPath) {
+  if (!value.is_number_integer())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected integer at " + fieldPath +
+        " in '" + jsonPath + "'");
+
+  // is_number_integer() is also true for unsigned values, so the range has
+  // to be checked separately before narrowing to int64_t.
+  constexpr uint64_t kMaxSigned =
+      static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
+  if (value.is_number_unsigned() && value.get<uint64_t>() > kMaxSigned)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: integer out of range at " + fieldPath +
+        " in '" + jsonPath + "'");
+
+  return value.get<int64_t>();
+}
+
+// Converts a JSON array of strings into a vector.  Throws std::runtime_error
+// when `value` is not an array or when an element is not a string; in the
+// latter case the message carries the element's index.
+static std::vector<std::string>
+getStringArray(const json &value, const std::string &jsonPath,
+               const std::string &fieldPath) {
+  if (!value.is_array()) {
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected array at " + fieldPath +
+        " in '" + jsonPath + "'");
+  }
+
+  std::vector<std::string> strings;
+  strings.reserve(value.size());
+  size_t index = 0;
+  for (const json &element : value) {
+    const std::string elementPath =
+        fieldPath + "[" + std::to_string(index) + "]";
+    strings.push_back(getString(element, jsonPath, elementPath));
+    ++index;
+  }
+  return strings;
+}
+
+/// Converts a JSON object whose members are integers into a name -> int64
+/// map.  Throws std::runtime_error if `value` is not an object or (via
+/// getInt64) a member is not an integer.
+static std::unordered_map<std::string, int64_t>
+getStringIntMap(const json &value, const std::string &jsonPath,
+                const std::string &fieldPath) {
+  requireObject(value, jsonPath, fieldPath); // same "expected object" error
+  std::unordered_map<std::string, int64_t> entries;
+  for (auto member = value.begin(); member != value.end(); ++member) {
+    const std::string &key = member.key();
+    entries.emplace(key, getInt64(member.value(), jsonPath,
+                                  fieldPath + "." + key));
+  }
+  return entries;
+}
+
+/// Builds a JSON array holding the elements of `strings` in order.
+static json toJSONArray(const std::vector<std::string> &strings) {
+  json out = json::array();
+  for (const auto &text : strings) out.push_back(text);
+  return out;
+}
+
+static json toJSON(const KernelMeta &kernel) { // one entry of "kernels"
+  json obj = json::object();
+  obj["name"] = kernel.kernelName;
+  obj["kind"] = toString(kernel.kind); // kind is stored via toString()
+  obj["inputs"] = toJSONArray(kernel.inputNames);
+  obj["outputs"] = toJSONArray(kernel.outputNames);
+  obj["unreliable_count"] = kernel.unreliableCount;
+  return obj; // same member names that parseKernelMeta requires
+}
+
+/// Parses one kernel entry into a KernelMeta.  The members "name", "kind",
+/// "inputs", "outputs" and "unreliable_count" are all required; a missing
+/// or mistyped one throws std::runtime_error through the helpers.
+static KernelMeta parseKernelMeta(const json &value,
+                                  const std::string &jsonPath,
+                                  const std::string &fieldPath) {
+  requireObject(value, jsonPath, fieldPath);
+  // Fetches a required member of `value`.
+  auto member = [&](const char *key) -> const json & {
+    return requireField(value, jsonPath, fieldPath, key);
+  };
+  KernelMeta meta;
+  meta.kernelName = getString(member("name"), jsonPath, fieldPath + ".name");
+  const std::string kindPath = fieldPath + ".kind";
+  const std::string kindText = getString(member("kind"), jsonPath, kindPath);
+  meta.kind = parseKernelKind(kindText, jsonPath, kindPath);
+  meta.inputNames =
+      getStringArray(member("inputs"), jsonPath, fieldPath + ".inputs");
+  meta.outputNames =
+      getStringArray(member("outputs"), jsonPath, fieldPath + ".outputs");
+  meta.unreliableCount = getInt64(member("unreliable_count"), jsonPath,
+                                  fieldPath + ".unreliable_count");
+  return meta;
+}
+
+struct Metadata { // top-level members of the metadata JSON, as parsed
+  std::string format;   // "format"; loadFromFiles requires == kFormat
+  int64_t version = 0;  // "version"; loadFromFiles requires == kVersion
+  std::string cubin;    // "cubin"; loadFromFiles requires the cubin file name
+  int64_t warpsPerCta = 1; // "warps_per_cta"
+  int64_t vectorSize = 1;  // "vector_size"
+  std::string dtype;       // "dtype"; decoded by parseDatatype on load
+  std::vector<KernelMeta> kernels;       // "kernels"
+  std::vector<std::string> graphInputs;  // "graph_inputs"
+  std::vector<std::string> graphOutputs; // "graph_outputs"
+  std::unordered_map<std::string, int64_t> outputUnreliable;
+};
+
+/// Builds the JSON metadata document for `data`; `cubinName` is stored
+/// under "cubin".  output_unreliable members are inserted in sorted order.
+static json metadataToJSON(const ExecutableData &data,
+                           const std::string &cubinName) {
+  json kernelList = json::array();
+  for (size_t i = 0; i < data.kernels.size(); ++i)
+    kernelList.push_back(toJSON(data.kernels[i]));
+  // Copy the map keys out and sort them before filling the JSON object.
+  std::vector<std::string> sortedNames;
+  sortedNames.reserve(data.outputUnreliable.size());
+  for (const auto &entry : data.outputUnreliable)
+    sortedNames.push_back(entry.first);
+  std::sort(sortedNames.begin(), sortedNames.end());
+  json unreliableByName = json::object();
+  for (const std::string &outName : sortedNames)
+    unreliableByName[outName] = data.outputUnreliable.at(outName);
+  json doc = json::object();
+  doc["format"] = kFormat;
+  doc["version"] = kVersion;
+  doc["cubin"] = cubinName;
+  doc["warps_per_cta"] = data.warpsPerCta;
+  doc["vector_size"] = data.vectorSize;
+  doc["dtype"] = toString(data.dtype);
+  doc["kernels"] = std::move(kernelList);
+  doc["graph_inputs"] = toJSONArray(data.graphInputs);
+  doc["graph_outputs"] = toJSONArray(data.graphOutputs);
+  doc["output_unreliable"] = std::move(unreliableByName);
+  return doc;
+}
+
+/// Parses the text of a metadata JSON file into a Metadata record.
+///
+/// `jsonPath` is only used to label error messages; the JSON itself comes
+/// from `jsonText`.  Every top-level member read below is required.
+/// Malformed JSON, a non-object root, or a missing / mistyped member throws
+/// std::runtime_error.  The format, version and cubin values are returned
+/// as read; they are not validated here.
+static Metadata parseMetadata(const std::string &jsonPath,
+                              const std::string &jsonText) {
+  json root;
+  try {
+    root = json::parse(jsonText);
+  } catch (const json::parse_error &e) {
+    throw std::runtime_error("kun_cuda::ExecutableData: failed to parse '" +
+                             jsonPath + "': " + e.what());
+  }
+
+  requireObject(root, jsonPath, "$");
+
+  // Fetches a required member of the root object ("$" in error paths).
+  auto top = [&](const char *key) -> const json & {
+    return requireField(root, jsonPath, "$", key);
+  };
+
+  // Members are read in a fixed order, so the first problem in that order
+  // is the one reported.
+  Metadata parsed;
+  parsed.format = getString(top("format"), jsonPath, "$.format");
+  parsed.version = getInt64(top("version"), jsonPath, "$.version");
+  parsed.cubin = getString(top("cubin"), jsonPath, "$.cubin");
+  parsed.warpsPerCta =
+      getInt64(top("warps_per_cta"), jsonPath, "$.warps_per_cta");
+  parsed.vectorSize = getInt64(top("vector_size"), jsonPath, "$.vector_size");
+  parsed.dtype = getString(top("dtype"), jsonPath, "$.dtype");
+  parsed.graphInputs =
+      getStringArray(top("graph_inputs"), jsonPath, "$.graph_inputs");
+  parsed.graphOutputs =
+      getStringArray(top("graph_outputs"), jsonPath, "$.graph_outputs");
+  parsed.outputUnreliable = getStringIntMap(
+      top("output_unreliable"), jsonPath, "$.output_unreliable");
+
+  // "kernels" must be an array; each element is parsed as one KernelMeta.
+  const json &kernelList = top("kernels");
+  if (!kernelList.is_array())
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: expected array at $.kernels in '" +
+        jsonPath + "'");
+  parsed.kernels.reserve(kernelList.size());
+  for (size_t i = 0; i < kernelList.size(); ++i) {
+    const std::string elemPath = "$.kernels[" + std::to_string(i) + "]";
+    parsed.kernels.push_back(
+        parseKernelMeta(kernelList[i], jsonPath, elemPath));
+  }
+  return parsed;
+}
+
+} // namespace
+
+/// Writes this executable into `dir` as a cubin file plus a JSON metadata
+/// file, both named from `name`.  The cubin is written first; the JSON is
+/// serialized with dump(2) and a trailing newline.
+void ExecutableData::saveToFiles(const std::string &dir,
+                                 const std::string &name) const {
+  validateArtifactName(name);
+  ensureDirectory(dir);
+  const std::string metaFile = jsonFileName(name);
+  const std::string cubinFile = cubinFileName(name);
+  const std::string metaPath = joinArtifactPath(dir, metaFile);
+  writeBinaryFile(joinArtifactPath(dir, cubinFile), cubin);
+  writeTextFile(metaPath, metadataToJSON(*this, cubinFile).dump(2) + "\n");
+}
+
+/// Loads the executable stored in `dir` under `name`.  The metadata JSON is
+/// read and checked first (format tag, version and recorded cubin file name
+/// must match, else std::runtime_error); the cubin bytes are read afterwards.
+std::shared_ptr<ExecutableData>
+ExecutableData::loadFromFiles(const std::string &dir,
+                              const std::string &name) {
+  validateArtifactName(name);
+  const std::string metaFile = jsonFileName(name);
+  const std::string expectedCubin = cubinFileName(name);
+  const std::string jsonPath = joinArtifactPath(dir, metaFile);
+  const std::string cubinPath = joinArtifactPath(dir, expectedCubin);
+  Metadata meta = parseMetadata(jsonPath, readTextFile(jsonPath));
+  if (meta.format != kFormat)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: unsupported metadata format '" +
+        meta.format + "' in '" + jsonPath + "'");
+  if (meta.version != kVersion)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: unsupported metadata version " +
+        std::to_string(meta.version) + " in '" + jsonPath + "'");
+  if (meta.cubin != expectedCubin)
+    throw std::runtime_error(
+        "kun_cuda::ExecutableData: metadata cubin field in '" + jsonPath +
+        "' must be '" + expectedCubin + "'");
+  auto loaded = std::make_shared<ExecutableData>();
+  loaded->cubin = readBinaryFile(cubinPath);
+  loaded->warpsPerCta = meta.warpsPerCta;
+  loaded->vectorSize = meta.vectorSize;
+  loaded->dtype = parseDatatype(meta.dtype, jsonPath);
+  loaded->kernels = std::move(meta.kernels);
+  loaded->graphInputs = std::move(meta.graphInputs);
+  loaded->graphOutputs = std::move(meta.graphOutputs);
+  loaded->outputUnreliable = std::move(meta.outputUnreliable);
+  return loaded;
+}
+
+} // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/Runtime.cpp b/mlir/lib/KunCuda/Runtime.cpp
new file mode 100644
index 0000000..218f6e6
--- /dev/null
+++ b/mlir/lib/KunCuda/Runtime.cpp
@@ -0,0 +1,1241 @@
+//===- Runtime.cpp - kun_cuda::Executable implementation ---------------===//
+//
+// The ctor pipeline is split into focused helpers — each step is small
+// enough to read top-to-bottom on its own:
+//
+//   buildBufferIndices   — assign integer indices to every named buffer
+//   resolveKernelIO      — translate per-kernel name lists to indices,
+//                           build producer-of-each-buffer table
+//   validateGraph        — check single producer, graph_outputs all
+//                           produced, no kernel writes a graph_input
+//   topoSort             — Kahn's algorithm; rejects self-dependency, cycles
+//   planSlots            — refcount + LIFO free pool over the topo order
+//
+// The shared launch helpers live behind RuntimeUtil.h so the traditional
+// launcher and RuntimeCudaGraph.cpp use the same validation, buffer-pointer
+// resolution, chunk planning, and kernel argument construction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunCuda/Runtime.h"
+#include "RuntimeUtil.h"
+
+#include <cuda.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <unordered_map>
+#include <utility>
+
+// Pre-compiled cross-sectional PTX, embedded by EmbedFile.cmake.
+#include "cs_rank_ptx.inc"
+#include "cs_scale_ptx.inc"
+
+namespace kun_cuda {
+
+//===----------------------------------------------------------------------===//
+// CUDA driver helpers
+//===----------------------------------------------------------------------===//
+
+void checkCu(CUresult r, const char *what) {
+  if (r == CUDA_SUCCESS)
+    return;
+  const char *s = nullptr;
+  cuGetErrorString(r, &s);
+  throw std::runtime_error(std::string(what) + ": " +
+                            (s ? s : "unknown CUDA error"));
+}
+
+/// Joins the strings in `v` with ", " between consecutive elements.
+/// An empty `v` gives an empty string.
+std::string joinNames(const std::vector<std::string> &v) {
+  std::string joined;
+  for (auto it = v.begin(); it != v.end(); ++it)
+    joined += (it == v.begin() ? "" : ", ") + *it;
+  return joined;
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Plan-building helpers — small POD intermediates so each helper is
+// independent and trivially testable.
+//===----------------------------------------------------------------------===//
+
+struct BufTable {
+  int numBuffers      = 0;  // total: graph inputs + graph outputs + intermediates
+  int numGraphInputs  = 0;  // indices [0, numGraphInputs) are graph inputs
+  int numGraphOutputs = 0;  // the next numGraphOutputs indices are outputs
+  // Name → index for *every* buffer (graph IO + intermediates).  Used by
+  // resolveKernelIO; the per-role maps below are kept around for the
+  // launch-time user args dict lookup.
+  std::unordered_map<std::string, int> nameToIdx;
+  std::unordered_map<std::string, int> graphInputIdx;   // graph inputs only
+  std::unordered_map<std::string, int> graphOutputIdx;  // graph outputs only
+};
+
+struct KernelIO {
+  std::vector<std::vector<int>> kernelInputBufs;   // [kernel][argv pos]
+  std::vector<std::vector<int>> kernelOutputBufs;  // [kernel][output pos]
+  // producerKernel[bufIdx] = kernel index that writes that buffer, or
+  // -1 if no kernel writes it (e.g. a graph input).
+  std::vector<int> producerKernel;
+};
+
+struct SlotPlan {
+  std::vector<int> intermediateBufToSlot; // by buffer index; -1 = no slot
+  int peakIntermediateSlots = 0;          // number of distinct slots handed out
+};
+
+/// Step 1 — assign buffer indices.  Layout:
+///   [0 .. numGraphInputs)                        graph inputs
+///   [numGraphInputs .. numGraphInputs+numGraphOutputs)  graph outputs
+///   [..numBuffers)                               intermediates
+/// Intermediates are everything a kernel produces that isn't a
+/// graph_output; they get consecutive indices in first-seen order.
+/// Throws std::runtime_error on a name repeated in graph_inputs/outputs.
+BufTable buildBufferIndices(const std::vector<std::string> &graphInputs,
+                              const std::vector<std::string> &graphOutputs,
+                              const std::vector<KernelMeta> &kernels) {
+  BufTable t;
+  // The next free index is the number of names registered so far.  It is
+  // always read *before* the insertion it feeds, so the result never
+  // depends on how the compiler orders the two sides of an assignment.
+  auto nextIdx = [&t] { return static_cast<int>(t.nameToIdx.size()); };
+
+  for (const auto &n : graphInputs) {
+    const int idx = nextIdx();
+    if (!t.nameToIdx.emplace(n, idx).second)
+      throw std::runtime_error(
+          "kun_cuda::Executable: duplicate name in graph_inputs: '" + n + "'");
+    t.graphInputIdx[n] = idx;
+  }
+  t.numGraphInputs = nextIdx();
+
+  for (const auto &n : graphOutputs) {
+    const int idx = nextIdx();
+    if (!t.nameToIdx.emplace(n, idx).second)
+      throw std::runtime_error(
+          "kun_cuda::Executable: name '" + n +
+          "' appears in both graph_inputs and graph_outputs (or twice in "
+          "one of them)");
+    t.graphOutputIdx[n] = idx;
+  }
+  t.numGraphOutputs = nextIdx() - t.numGraphInputs;
+
+  // Remaining kernel outputs are intermediates.  emplace() leaves an
+  // already-registered name untouched, so an output shared by two kernels
+  // is not re-indexed here; validateGraph reports that case.
+  for (const auto &k : kernels)
+    for (const auto &outName : k.outputNames)
+      t.nameToIdx.emplace(outName, nextIdx());
+
+  t.numBuffers = nextIdx();
+  return t;
+}
+
+/// Step 2 — resolve each kernel's I/O name list to int indices, plus
+/// build the producer-of-each-buffer table.  Throws on a kernel input
+/// whose name is not in `tbl.nameToIdx`.
+KernelIO resolveKernelIO(const std::vector<KernelMeta> &kernels,
+                           const BufTable &tbl) {
+  const int numKernels = static_cast<int>(kernels.size());
+  KernelIO kio;
+  kio.kernelInputBufs.resize(kernels.size());
+  kio.kernelOutputBufs.resize(kernels.size());
+  kio.producerKernel.assign(tbl.numBuffers, -1);
+
+  for (int kIdx = 0; kIdx < numKernels; ++kIdx) {
+    const KernelMeta &kern = kernels[kIdx];
+    std::vector<int> &ins = kio.kernelInputBufs[kIdx];
+    std::vector<int> &outs = kio.kernelOutputBufs[kIdx];
+    ins.reserve(kern.inputNames.size());
+    for (const auto &name : kern.inputNames) {
+      const auto found = tbl.nameToIdx.find(name);
+      if (found == tbl.nameToIdx.end())
+        throw std::runtime_error(
+            "kun_cuda::Executable: kernel '" + kern.kernelName +
+            "' consumes '" + name +
+            "' which is neither a graph_input nor produced by any kernel");
+      ins.push_back(found->second);
+    }
+    outs.reserve(kern.outputNames.size());
+    for (const auto &name : kern.outputNames) {
+      const int buf = tbl.nameToIdx.at(name); // added by buildBufferIndices
+      outs.push_back(buf);
+      kio.producerKernel[buf] = kIdx; // last writer wins; see validateGraph
+    }
+  }
+  return kio;
+}
+
+/// Step 3 — graph-level validation.  Throws std::runtime_error when:
+///   * two kernels claim to produce the same buffer
+///   * a graph_output is declared but never produced
+///   * a graph_input is also produced by a kernel (overlap is silly)
+/// Reads `kio.producerKernel`, so resolveKernelIO must have run first.
+void validateGraph(const std::vector<KernelMeta> &kernels,
+                     const std::vector<std::string> &graphOutputs,
+                     const BufTable &tbl,
+                     const KernelIO &kio) {
+  // Multi-producer: remember the first kernel seen writing each name; a
+  // second writer of the same name is an error.
+  std::unordered_map<std::string, std::string> firstWriter;
+  for (const auto &kern : kernels) {
+    for (const auto &outName : kern.outputNames) {
+      const auto inserted = firstWriter.emplace(outName, kern.kernelName);
+      if (inserted.second)
+        continue;
+      throw std::runtime_error(
+          "kun_cuda::Executable: name '" + outName +
+          "' is produced by both kernel '" + inserted.first->second +
+          "' and kernel '" + kern.kernelName + "'");
+    }
+  }
+
+  // graph_outputs must be produced.
+  for (const auto &outName : graphOutputs) {
+    const int producer = kio.producerKernel[tbl.nameToIdx.at(outName)];
+    if (producer < 0)
+      throw std::runtime_error(
+          "kun_cuda::Executable: graph_output '" + outName +
+          "' is not produced by any kernel");
+  }
+
+  // graph_inputs must NOT be produced by any kernel — an input is by
+  // definition supplied by the caller.
+  for (const auto &input : tbl.graphInputIdx) {
+    if (kio.producerKernel[input.second] < 0)
+      continue;
+    throw std::runtime_error(
+        "kun_cuda::Executable: graph_input '" + input.first +
+        "' is also produced by a kernel; use a different name for the "
+        "kernel output");
+  }
+}
+
+/// Step 4 — Kahn topological sort over kernel-to-kernel edges.  An edge
+/// `producer → consumer` exists whenever consumer reads any buffer that
+/// producer writes.  Multi-edges between the same pair count as one.
+/// Throws on a kernel reading its own output, and on a cycle.
+std::vector<int> topoSort(const KernelIO &kio, int numKernels) {
+  std::vector<int> indeg(numKernels, 0);          // unmet producers per kernel
+  std::vector<std::vector<int>> succ(numKernels); // consumers of each kernel
+
+  // Build edges, deduped per (producer, consumer) pair.
+  for (int kIdx = 0; kIdx < numKernels; ++kIdx) {
+    std::vector<int> deps;                   // distinct producers of kIdx
+    for (int b : kio.kernelInputBufs[kIdx]) {
+      int p = kio.producerKernel[b];
+      if (p < 0) continue;                   // no producer (graph input)
+      if (p == kIdx)
+        throw std::runtime_error(
+            "kun_cuda::Executable: kernel index " + std::to_string(kIdx) +
+            " depends on its own output");
+      bool seen = false;
+      for (int d : deps) if (d == p) { seen = true; break; }
+      if (!seen) deps.push_back(p);
+    }
+    indeg[kIdx] = static_cast<int>(deps.size());
+    for (int p : deps) succ[p].push_back(kIdx);
+  }
+
+  std::vector<int> order;
+  order.reserve(numKernels);
+  std::vector<int> ready;                    // used as a stack (LIFO)
+  for (int i = 0; i < numKernels; ++i)
+    if (indeg[i] == 0) ready.push_back(i);
+  while (!ready.empty()) {
+    int k = ready.back();
+    ready.pop_back();
+    order.push_back(k);
+    for (int n : succ[k])
+      if (--indeg[n] == 0)                   // last producer of n has run
+        ready.push_back(n);
+  }
+  if (static_cast<int>(order.size()) != numKernels) // some kernel never ready
+    throw std::runtime_error(
+        "kun_cuda::Executable: cycle detected in kernel dependency graph");
+  return order;
+}
+
+/// Step 5 — slot allocation for intermediates.  Refcount = number of
+/// kernel-input slots that reference the buffer, plus +1 for graph
+/// outputs (so we never try to recycle them).  Walking the topo order:
+///   * before launching kernel K, allocate a fresh slot for each
+///     intermediate output of K (drawn from the LIFO free pool when
+///     possible),
+///   * after, decrement refcounts on K's inputs; any intermediate that
+///     hits zero returns its slot to the free pool.
+SlotPlan planSlots(const std::vector<int> &launchOrder,
+                    const BufTable &tbl,
+                    const KernelIO &kio) {
+  SlotPlan plan;
+  plan.intermediateBufToSlot.assign(tbl.numBuffers, -1);
+  const int firstIntermediate = tbl.numGraphInputs + tbl.numGraphOutputs;
+
+  // Initial refcounts: one per occurrence in any kernel's input list.
+  std::vector<int> refcount(tbl.numBuffers, 0);
+  for (const auto &ins : kio.kernelInputBufs)
+    for (int b : ins)
+      refcount[b]++;
+  // graph_outputs are externally visible — pin them so we never try to
+  // reuse them (they don't have slots anyway, but this keeps the loop
+  // free of special cases).
+  for (int i = tbl.numGraphInputs; i < firstIntermediate; ++i)
+    refcount[i]++;
+
+  std::vector<int> freePool;  // released slots, reused last-in first-out
+  int nextNew = 0;            // next never-used slot number
+
+  auto allocSlot = [&]() -> int {
+    if (!freePool.empty()) { int s = freePool.back(); freePool.pop_back(); return s; }
+    int s = nextNew++;
+    if (nextNew > plan.peakIntermediateSlots) plan.peakIntermediateSlots = nextNew;
+    return s;
+  };
+
+  for (int kIdx : launchOrder) {
+    // Allocate slots for this kernel's intermediate outputs first, so they
+    // never share a slot with one of its inputs.  Outputs that ARE
+    // graph_outputs use caller-owned buffers and don't need a slot.
+    for (int b : kio.kernelOutputBufs[kIdx]) {
+      if (b < firstIntermediate) continue;
+      plan.intermediateBufToSlot[b] = allocSlot();
+    }
+    // Release inputs: an intermediate returns its slot once its last read
+    // is counted.  (An intermediate that no kernel reads is never released.)
+    for (int b : kio.kernelInputBufs[kIdx]) {
+      if (--refcount[b] == 0 && b >= firstIntermediate) {
+        int s = plan.intermediateBufToSlot[b];
+        if (s >= 0) freePool.push_back(s);
+      }
+    }
+  }
+  return plan;
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Launch helpers — pure functions used by launchOnStream below.
+//===----------------------------------------------------------------------===//
+
+KernelLaunchDesc::KernelLaunchDesc( // stores the launch scalars and pointers
+    int kernelIndex, KernelKind kind, CUfunction fn, bool isKernelNode,
+    int32_t timeLenI32, int32_t numStocksI32, int32_t maskI32,
+    int32_t chunkSizeI32, int32_t warmupI32,
+    std::vector<CUdeviceptr> ptrs)
+    : kernelIndex(kernelIndex),
+      kind(kind),
+      isKernelNode(isKernelNode),
+      timeLenI32(timeLenI32),
+      numStocksI32(numStocksI32),
+      maskI32(maskI32),
+      chunkSizeI32(chunkSizeI32),
+      warmupI32(warmupI32),
+      ptrs_(std::move(ptrs)) { // takes over the caller's pointer list
+  params = {};                 // clear every field before filling in defaults
+  params.func = fn;
+  params.gridDimX = 1;         // default shape: 1x1x1 grid of 1x1x1 blocks
+  params.gridDimY = 1;
+  params.gridDimZ = 1;
+  params.blockDimX = 1;
+  params.blockDimY = 1;
+  params.blockDimZ = 1;
+  params.sharedMemBytes = 0;
+  rebuildKernelParamPointers(); // sets params.kernelParams (kernel nodes only)
+}
+
+KernelLaunchDesc::KernelLaunchDesc(KernelLaunchDesc &&other) noexcept
+    : kernelIndex(other.kernelIndex),
+      kind(other.kind),
+      isKernelNode(other.isKernelNode),
+      timeLenI32(other.timeLenI32),
+      numStocksI32(other.numStocksI32),
+      maskI32(other.maskI32),
+      chunkSizeI32(other.chunkSizeI32),
+      warmupI32(other.warmupI32),
+      params(other.params),            // copied; kernelParams is reset below
+      ptrs_(std::move(other.ptrs_)) {  // only the pointer list is moved
+  rebuildKernelParamPointers();        // argPtrs_ must point into *this*
+}
+
+KernelLaunchDesc &
+KernelLaunchDesc::operator=(KernelLaunchDesc &&other) noexcept {
+  if (this == &other) // self-assignment: nothing to do
+    return *this;
+  kernelIndex = other.kernelIndex;
+  kind = other.kind;
+  isKernelNode = other.isKernelNode;
+  timeLenI32 = other.timeLenI32;
+  numStocksI32 = other.numStocksI32;
+  maskI32 = other.maskI32;
+  chunkSizeI32 = other.chunkSizeI32;
+  warmupI32 = other.warmupI32;
+  params = other.params;          // copied; kernelParams is reset below
+  ptrs_ = std::move(other.ptrs_); // only the pointer list is moved
+  rebuildKernelParamPointers();   // argPtrs_ must point into *this*
+  return *this;
+}
+
+/// Refreshes params.kernelParams.  For kernel nodes argPtrs_ is rebuilt to
+/// point at this object's scalar fields followed by each entry of ptrs_;
+/// for other nodes kernelParams stays null.  params.extra is always nulled.
+void KernelLaunchDesc::rebuildKernelParamPointers() {
+  argPtrs_.clear();
+  params.kernelParams = nullptr;
+  params.extra = nullptr;
+  if (!isKernelNode)
+    return;
+  // Jit kernels take five leading scalars, every other kind only two.
+  const bool isJit = kind == KernelKind::Jit;
+  argPtrs_.reserve((isJit ? 5 : 2) + ptrs_.size());
+  argPtrs_.push_back(&timeLenI32);
+  argPtrs_.push_back(&numStocksI32);
+  if (isJit) {
+    argPtrs_.push_back(&maskI32);
+    argPtrs_.push_back(&chunkSizeI32);
+    argPtrs_.push_back(&warmupI32);
+  }
+  for (auto &devPtr : ptrs_)
+    argPtrs_.push_back(&devPtr);
+  params.kernelParams = argPtrs_.data();
+}
+
+int firstIntermediateBuffer(const GraphPlan &plan) noexcept {
+  return plan.numGraphInputs + plan.numGraphOutputs; // graph IO comes first
+}
+
+/// Throws std::runtime_error on sizes, mask or launch metadata kernels can't use.
+void validateLaunchInputs(const ExecutableData &data, int64_t timeLength,
+                          int64_t numStocks, int64_t mask) {
+  if (timeLength < 0 || numStocks < 0 || timeLength > INT32_MAX ||
+      numStocks > INT32_MAX)
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: time_length / num_stocks out of i32 "
+        "range (kernel signature uses i32, i32)");
+  if (mask < 0 || (timeLength > 0 && mask >= timeLength))
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: mask must be in [0, time_length), got " +
+        std::to_string(mask) + " for time_length=" + std::to_string(timeLength));
+  if (data.warpsPerCta <= 0)
+    throw std::runtime_error("kun_cuda::launchOnStream: warps_per_cta is " +
+                             std::to_string(data.warpsPerCta));
+  if (data.vectorSize <= 0) // also a divisor of the launch geometry
+    throw std::runtime_error("kun_cuda::launchOnStream: vector_size is " +
+                             std::to_string(data.vectorSize));
+}
+
+/// Maps the user-supplied {name → device_ptr} args onto a flat
+/// buffer-index → pointer array for graph inputs/outputs; intermediate
+/// entries stay 0.  Throws on an unknown name or a missing input/output.
+std::vector<uintptr_t> resolveExternalBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+  std::vector<uintptr_t> bufPtrs(plan.numBuffers, 0);
+  std::vector<bool> filled(plan.numBuffers, false);
+  for (const auto &arg : args) {
+    int idx = -1;
+    const auto asInput = plan.graphInputIdx.find(arg.first);
+    if (asInput != plan.graphInputIdx.end()) {
+      idx = asInput->second;
+    } else {
+      const auto asOutput = plan.graphOutputIdx.find(arg.first);
+      if (asOutput == plan.graphOutputIdx.end())
+        throw std::runtime_error(
+            "kun_cuda::launchOnStream: unexpected argument '" + arg.first +
+            "' (expected: " + joinNames(data.graphInputs) + " | " +
+            joinNames(data.graphOutputs) + ")");
+      idx = asOutput->second;
+    }
+    bufPtrs[idx] = arg.second; // repeated name: last pointer wins
+    filled[idx] = true;
+  }
+  // Confirm every graph_input + graph_output was supplied.
+  for (int i = 0; i < plan.numGraphInputs + plan.numGraphOutputs; ++i) {
+    if (filled[i]) continue;
+    std::string missing;
+    for (const auto &kv : plan.graphInputIdx)
+      if (kv.second == i) missing = kv.first;
+    if (missing.empty())
+      for (const auto &kv : plan.graphOutputIdx)
+        if (kv.second == i) missing = kv.first;
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: missing argument '" + missing + "'");
+  }
+  return bufPtrs;
+}
+
+/// Full buffer-index → pointer table for one launch: graph inputs/outputs
+/// come from `args` (via resolveExternalBufferPointers, which throws on
+/// unknown or missing names); each intermediate takes the pointer of its
+/// planned slot in `slotBufs`.  Slot indices are not range-checked here.
+std::vector<uintptr_t> resolveBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    const std::vector<uintptr_t> &slotBufs) {
+  std::vector<uintptr_t> table =
+      resolveExternalBufferPointers(plan, data, args);
+
+  // Intermediates start right after the graph inputs and outputs.
+  const int firstIntermediate = firstIntermediateBuffer(plan);
+  for (int buf = firstIntermediate; buf < plan.numBuffers; ++buf) {
+    const int slot = plan.intermediateBufToSlot[buf];
+    table[buf] = slotBufs[slot];
+  }
+  return table;
+}
+
+/// Puts `desc.params` back to the default launch shape: every grid and
+/// block dimension becomes 1 and sharedMemBytes becomes 0.  Other fields
+/// of `desc` are not touched.
+static void resetLaunchShape(KernelLaunchDesc &desc) {
+  auto &shape = desc.params;
+  shape.gridDimX = shape.gridDimY = shape.gridDimZ = 1;
+  shape.blockDimX = shape.blockDimY = shape.blockDimZ = 1;
+  shape.sharedMemBytes = 0;
+}
+
+/// Sets blockDimX = warpsPerCta * 32, gridDimX = ceil(numStocks /
+/// (blockDimX * vectorSize)) and gridDimY = numChunks on `desc.params`.
+static void computeJitLaunchShape(KernelLaunchDesc &desc, int64_t numStocks,
+                                  int64_t warpsPerCta, int64_t vectorSize,
+                                  unsigned numChunks) {
+  const unsigned blockThreads = static_cast<unsigned>(warpsPerCta * 32);
+  const uint64_t perBlock = // divisor below; must be non-zero
+      static_cast<uint64_t>(blockThreads) * static_cast<uint64_t>(vectorSize);
+  const uint64_t blocks =
+      (static_cast<uint64_t>(numStocks) + perBlock - 1) / perBlock;
+  desc.params.blockDimX = blockThreads;
+  desc.params.gridDimX = static_cast<unsigned>(blocks);
+  desc.params.gridDimY = numChunks;
+}
+
+/// Chunk plan for a single JIT kernel.  `chunkSize` is the time-axis
+/// width of every chunk (last chunk gets clipped to `timeLength` by
+/// kungpu.time_ub at runtime, so we don't have to special-case that
+/// here).  `numChunks` is the y-dim of the launch grid.
+///
+/// Decision tree (per kernel, since per-partition `unreliableCount`
+/// varies):
+///
+///   1. target chunks   = ceil(smFillFactor * numSMs / stockTiles), ≥ 1
+///   2. cap by warmup   = floor(T / (factor * unreliableCount))
+///        — bounds the per-chunk overhead of chunks ≥ 1, which redo the
+///          trailing `unreliableCount` time steps to prime windowed
+///          rolling state.  mask is NOT included here: it's a one-time
+///          chunk-0 skip, not a per-chunk overhead.
+///   3. cap by mask     = floor((T - 1) / mask)
+///        — chunks ≥ 1 write output[t - mask] for t ∈ [cy*chunk_size, …);
+///          if chunk_size ≤ mask, chunk 1's first output index is
+///          negative (out-of-bounds gmem write).  Enforce chunk_size >
+///          mask by capping num_chunks here.
+///   4. numChunks       = clamp(target, 1, min(cap_warmup, cap_mask))
+///   5. chunkSize       = ceil(T / numChunks)
+///
+/// When both unreliable == 0 and mask == 0, the only cap is T itself.
+/// Single-chunk fallback ({T, 1}): T ≤ 0, numSMs ≤ 0 (Executor couldn't
+/// query the device), smFillFactor ≤ 0, or unreliableCount < 0.
+ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
+                           int64_t warpsPerCta, int64_t vectorSize,
+                           int64_t unreliableCount, int64_t mask,
+                           int minChunkWarmupFactor,
+                           double smFillFactor, int numSMs) {
+  if (timeLength <= 0)
+    return {timeLength, 1u};
+  if (numSMs <= 0 || smFillFactor <= 0.0)
+    return {timeLength, 1u};
+  // unreliableCount = -1 sentinel → whole time history required, single
+  // chunk only.  Any other negative value is rejected by the IR verifier;
+  // we don't try to interpret it.
+  if (unreliableCount < 0)
+    return {timeLength, 1u};
+
+  int64_t blockX = warpsPerCta * 32;
+  int64_t stocksPerBlock = blockX * vectorSize; // divisor below: must be != 0
+  int64_t stockTiles =
+      (numStocks + stocksPerBlock - 1) / stocksPerBlock;
+  if (stockTiles <= 0) stockTiles = 1;
+
+  // Target chunks just to fill the GPU.  Round up so we don't under-fill.
+  int64_t targetChunks = static_cast<int64_t>(
+      std::ceil(smFillFactor * static_cast<double>(numSMs) /
+                  static_cast<double>(stockTiles)));
+  if (targetChunks < 1) targetChunks = 1;
+
+  // Caps on numChunks.  Start at T (degenerate upper bound: ≥ 1 step per
+  // chunk) and tighten with each constraint; clamp to ≥ 1 once at the end.
+  int64_t cap = timeLength;
+
+  // Per-chunk warmup overhead bound (chunks ≥ 1 only).  Skipped when
+  // either unreliableCount or minChunkWarmupFactor is not positive.
+  if (unreliableCount > 0 && minChunkWarmupFactor > 0)
+    cap = std::min<int64_t>(
+        cap,
+        timeLength /
+            (static_cast<int64_t>(minChunkWarmupFactor) * unreliableCount));
+
+  // chunkSize > mask: chunks ≥ 1 compute output index t - mask, which
+  // must be ≥ 0 for their writes.  chunk_size = ceil(T / numChunks);
+  // we want ceil(T / numChunks) > mask, equivalently numChunks ≤
+  // (T - 1) / mask.
+  if (mask > 0)
+    cap = std::min<int64_t>(cap, (timeLength - 1) / mask);
+
+  if (cap < 1) cap = 1; // either bound above can reach 0
+
+  int64_t numChunks = std::min<int64_t>(targetChunks, cap);
+  if (numChunks < 1) numChunks = 1;
+
+  int64_t chunkSize = (timeLength + numChunks - 1) / numChunks; // ceil(T / n)
+  return {chunkSize, static_cast<unsigned>(numChunks)};
+}
+
+/// External cross-sectional launch.
+///
+/// Block / grid both auto-tuned — these kernels are cross-sectional, so the
+/// graph-wide `warps_per_cta` hint doesn't apply.
+///
+///   blockX = clamp(round_up(numStocks, 32), 32, 1024)
+///       Each thread owns roughly one stock; when numStocks > 1024 the
+///       kernel falls back to its built-in `for (i = tid; i < S; i +=
+///       blockDim.x)` stride loop.
+///
+///   gridX  = min(timeLength, ceil(smFillFactor * numSMs))
+///       The kernel does a contiguous time-axis slice per CTA via a
+///       grid-stride loop (see kernels/cs_*.cu).  For small T the
+///       min clamps to 1 CTA per timestep (matches the pre-tuning
+///       launch shape); for large T fewer CTAs each do more time
+///       steps, reducing launch / scheduling overhead.
+///
+///   smem   = numStocks * sizeof(T)  (one cross-section, reused across
+///                                     the CTA's time slice); cs_scale
+///                                     kinds reserve one extra element
+///
+/// Falls back to gridX = timeLength (one CTA per timestep) when the
+/// executor couldn't query `numSMs` from the device or `smFillFactor`
+/// is not positive; blockX is still derived from numStocks as above.
+static bool isF64ExternalKind(KernelKind kind) {
+  // Only the two external f64 variants qualify; everything else
+  // (including Jit) is reported as non-f64.
+  switch (kind) {
+  case KernelKind::ExtCsRankF64:
+  case KernelKind::ExtCsScaleF64:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// True for the cs_rank external kernel kinds (either element width).
+static bool isCsRankKind(KernelKind kind) {
+  const bool rankF32 = kind == KernelKind::ExtCsRankF32;
+  const bool rankF64 = kind == KernelKind::ExtCsRankF64;
+  return rankF32 || rankF64;
+}
+
+/// True for the cs_scale external kernel kinds (either element width).
+static bool isCsScaleKind(KernelKind kind) {
+  if (kind == KernelKind::ExtCsScaleF32)
+    return true;
+  return kind == KernelKind::ExtCsScaleF64;
+}
+
+/// Fill the block / grid / dynamic-smem fields of `desc.params` for an
+/// external cross-sectional kernel.
+///
+/// Throws std::runtime_error when `devMaxSmemBytes` is not positive or the
+/// required dynamic shared memory exceeds it.  Leaves `desc` untouched when
+/// `timeLength <= 0` (the smem checks still run first).
+static void computeExtCsLaunchShape(KernelLaunchDesc &desc,
+                                    KernelKind kind,
+                                    const std::string &kernelName,
+                                    int64_t timeLength,
+                                    int64_t numStocks,
+                                    int devMaxSmemBytes,
+                                    double smFillFactor,
+                                    int numSMs) {
+  // Dynamic smem request: one element per stock, plus one more for the
+  // cs_scale kinds.
+  const size_t elemBytes = isF64ExternalKind(kind) ? 8u : 4u;
+  const uint64_t elemCount =
+      static_cast<uint64_t>(numStocks) + (isCsScaleKind(kind) ? 1u : 0u);
+  const uint64_t requestBytes = elemCount * static_cast<uint64_t>(elemBytes);
+
+  if (devMaxSmemBytes <= 0) {
+    std::string msg =
+        "kun_cuda::launchOnStream: external cross-sectional kernel '";
+    msg += kernelName;
+    msg += "' requires Executor's devMaxSmemBytes to be set; got 0.  "
+           "Construct the Executable through Executor::runGraph, or pass "
+           "devMaxSmemBytes when calling launchOnStream directly.";
+    throw std::runtime_error(msg);
+  }
+  if (requestBytes > static_cast<uint64_t>(devMaxSmemBytes)) {
+    std::string msg =
+        "kun_cuda::launchOnStream: cross-sectional dynamic smem "
+        "(elements=";
+    msg += std::to_string(elemCount);
+    msg += " * sizeof(T)=";
+    msg += std::to_string(elemBytes);
+    msg += " = ";
+    msg += std::to_string(requestBytes);
+    msg += " bytes) exceeds this GPU's MAX_SHARED_MEMORY_PER_BLOCK_OPTIN (";
+    msg += std::to_string(devMaxSmemBytes);
+    msg += " bytes).  Reduce num_stocks or run on a GPU with a larger smem budget.";
+    throw std::runtime_error(msg);
+  }
+
+  // Empty time chunk — nothing to launch.
+  if (timeLength <= 0)
+    return;
+
+  // blockX: numStocks (at least 1) rounded up to a whole warp, capped at
+  // 1024 threads.
+  constexpr int64_t kWarpThreads = 32;
+  constexpr int64_t kBlockLimit = 1024;
+  const int64_t stocksAtLeastOne = std::max<int64_t>(numStocks, 1);
+  const int64_t warpCount = (stocksAtLeastOne + kWarpThreads - 1) / kWarpThreads;
+  const int64_t blockThreads =
+      std::min<int64_t>(warpCount * kWarpThreads, kBlockLimit);
+  desc.params.blockDimX = static_cast<unsigned>(blockThreads);
+
+  // gridX: default to one CTA per timestep; when both the SM count and the
+  // fill factor are positive, aim for ceil(fill * SMs) CTAs clamped to
+  // [1, timeLength].
+  int64_t gridBlocks = timeLength;
+  if (numSMs > 0 && smFillFactor > 0.0) {
+    gridBlocks = static_cast<int64_t>(
+        std::ceil(smFillFactor * static_cast<double>(numSMs)));
+    if (gridBlocks < 1)
+      gridBlocks = 1;
+    if (gridBlocks > timeLength)
+      gridBlocks = timeLength;
+  }
+  desc.params.gridDimX = static_cast<unsigned>(gridBlocks);
+  desc.params.sharedMemBytes = static_cast<unsigned>(requestBytes);
+}
+
+/// Bring `ptrs` in line with the device pointers of a kernel's buffers
+/// (all of `ins` first, then all of `outs`), looked up in `bufPtrs`.
+///
+/// Returns {changed, sizeChanged}: `changed` is true when any entry was
+/// rewritten, `sizeChanged` when `ptrs` had to be resized.  `ptrs` is left
+/// untouched and {false, false} is returned when it already matches.
+static std::pair<bool, bool>
+updateKernelArgPtrs(std::vector<CUdeviceptr> &ptrs,
+                    const std::vector<int> &ins,
+                    const std::vector<int> &outs,
+                    const std::vector<uintptr_t> &bufPtrs) {
+  const size_t inputCount = ins.size();
+  const size_t total = inputCount + outs.size();
+
+  // Expected pointer for flat argument slot `slot`.
+  auto expectedAt = [&](size_t slot) -> CUdeviceptr {
+    const int buf = slot < inputCount ? ins[slot] : outs[slot - inputCount];
+    return static_cast<CUdeviceptr>(bufPtrs[buf]);
+  };
+
+  const bool resized = ptrs.size() != total;
+  bool differs = resized;
+  // Only compare element-wise when the sizes already agree; stop at the
+  // first mismatch.
+  for (size_t slot = 0; !differs && slot < total; ++slot)
+    differs = ptrs[slot] != expectedAt(slot);
+
+  if (!differs)
+    return {false, false};
+
+  ptrs.resize(total);
+  for (size_t slot = 0; slot < total; ++slot)
+    ptrs[slot] = expectedAt(slot);
+  return {true, resized};
+}
+
+/// Refresh only the buffer-pointer arguments of kernel `kIdx` from
+/// `bufPtrs`.  Returns true when at least one pointer changed.
+///
+/// Throws std::runtime_error if the stored argument count no longer matches
+/// the plan's input + output buffer count for this kernel.
+bool KernelLaunchDesc::updateBuffer(
+    const GraphPlan &plan,
+    int kIdx,
+    const std::vector<uintptr_t> &bufPtrs) {
+  const auto &inputBufs = plan.kernelInputBufs[kIdx];
+  const auto &outputBufs = plan.kernelOutputBufs[kIdx];
+
+  const size_t expectedArgs = inputBufs.size() + outputBufs.size();
+  if (ptrs_.size() != expectedArgs) {
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream(cuda_graph): kernel buffer argument count "
+        "changed without graph rebuild");
+  }
+
+  // The size was just verified, so the helper cannot resize here; only the
+  // "changed" half of its result matters.
+  const std::pair<bool, bool> outcome =
+      updateKernelArgPtrs(ptrs_, inputBufs, outputBufs, bufPtrs);
+  return outcome.first;
+}
+
+/// Recompute every launch-time field of this descriptor for kernel `kIdx`:
+/// scalar arguments, function handle, buffer pointers, chunking values and
+/// the launch shape.
+///
+/// May throw via computeExtCsLaunchShape when an external kernel's dynamic
+/// shared-memory requirement cannot be met.
+void KernelLaunchDesc::update(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<CUfunction> &cuFuncs,
+    int kIdx,
+    const CudaGraphLaunchParams &launch) {
+  const auto &ins  = plan.kernelInputBufs[kIdx];
+  const auto &outs = plan.kernelOutputBufs[kIdx];
+  const auto &meta = data.kernels[kIdx];
+
+  kernelIndex = kIdx;
+  kind = meta.kind;
+  // JIT kernels always count as kernel nodes; external kernels only when
+  // there is a positive time length to process.
+  isKernelNode = meta.kind == KernelKind::Jit || launch.timeLength > 0;
+  // 64-bit launch values are narrowed to the 32-bit scalars stored here.
+  timeLenI32 = static_cast<int32_t>(launch.timeLength);
+  numStocksI32 = static_cast<int32_t>(launch.numStocks);
+  maskI32 = static_cast<int32_t>(launch.mask);
+  params.func = cuFuncs[kIdx];
+  // `reallocated` reports that ptrs_ was resized; it is consumed at the end
+  // of this function.
+  auto [changed, reallocated] =
+      updateKernelArgPtrs(ptrs_, ins, outs, launch.bufPtrs);
+  (void)changed;
+
+  // Chunking applies to JIT kernels only; other kinds keep one chunk and
+  // zeroed chunk-size / warmup scalars.
+  unsigned numChunks = 1;
+  chunkSizeI32 = 0;
+  warmupI32 = 0;
+  if (meta.kind == KernelKind::Jit) {
+    ChunkPlan cp = computeChunkPlan(
+        launch.timeLength, launch.numStocks, data.warpsPerCta,
+        data.vectorSize, meta.unreliableCount, launch.mask,
+        launch.minChunkWarmupFactor, launch.smFillFactor, launch.numSMs);
+    numChunks = cp.numChunks;
+    chunkSizeI32 = static_cast<int32_t>(cp.chunkSize);
+    // Negative unreliableCount values are clamped to a warmup of 0.
+    warmupI32 = static_cast<int32_t>(
+        std::max<int64_t>(meta.unreliableCount, 0));
+  }
+
+  // NOTE(review): resetLaunchShape is defined elsewhere; presumably it
+  // restores default grid/block/smem values before the kind-specific
+  // computation below — confirm.
+  resetLaunchShape(*this);
+  if (meta.kind == KernelKind::Jit) {
+    computeJitLaunchShape(*this, launch.numStocks, data.warpsPerCta,
+                          data.vectorSize, numChunks);
+  } else if (isKernelNode) {
+    computeExtCsLaunchShape(*this, meta.kind, meta.kernelName,
+                            launch.timeLength, launch.numStocks,
+                            launch.devMaxSmemBytes,
+                            launch.smFillFactor, launch.numSMs);
+  }
+
+  // Only when ptrs_ was resized are the kernel-parameter pointers rebuilt.
+  if (reallocated)
+    rebuildKernelParamPointers();
+}
+
+/// Build one fully-updated KernelLaunchDesc per kernel, in the plan's
+/// launch order, from the given per-launch values.
+static std::vector<KernelLaunchDesc> buildKernelLaunchDescs(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<CUfunction> &cuFuncs,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<uintptr_t> &bufPtrs,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor,
+    int devMaxSmemBytes,
+    int numSMs) {
+  // Bundle the per-launch values into the struct KernelLaunchDesc::update
+  // expects.
+  CudaGraphLaunchParams bundle;
+  bundle.bufPtrs = bufPtrs;
+  bundle.timeLength = timeLength;
+  bundle.numStocks = numStocks;
+  bundle.mask = mask;
+  bundle.minChunkWarmupFactor = minChunkWarmupFactor;
+  bundle.smFillFactor = smFillFactor;
+  bundle.devMaxSmemBytes = devMaxSmemBytes;
+  bundle.numSMs = numSMs;
+
+  const std::vector<int> &order = plan.launchOrder;
+  std::vector<KernelLaunchDesc> result;
+  result.reserve(order.size());
+  for (size_t pos = 0; pos < order.size(); ++pos) {
+    KernelLaunchDesc current;
+    current.update(plan, data, cuFuncs, order[pos], bundle);
+    result.emplace_back(std::move(current));
+  }
+  return result;
+}
+
+/// Enqueue the kernel described by `desc` on `stream`.  Descriptors that
+/// are not kernel nodes are skipped.  Driver failures surface through
+/// checkCu.
+void launchKernelDesc(const KernelLaunchDesc &desc, CUstream stream) {
+  if (desc.isKernelNode) {
+    const CUDA_KERNEL_NODE_PARAMS &kp = desc.params;
+    const CUresult rc = cuLaunchKernel(kp.func,
+                                       kp.gridDimX, kp.gridDimY, kp.gridDimZ,
+                                       kp.blockDimX, kp.blockDimY, kp.blockDimZ,
+                                       kp.sharedMemBytes, stream,
+                                       kp.kernelParams, kp.extra);
+    // Label the error context by kernel family.
+    const bool isJit = desc.kind == KernelKind::Jit;
+    checkCu(rc, isJit ? "cuLaunchKernel" : "cuLaunchKernel(external_cs)");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Kernel-module / kernel-symbol helpers — read ExecutableData, mutate
+// the CUmodule and CUfunction handles the ctor is populating.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Load the JIT'd cubin if non-empty; otherwise sanity-check that no
+/// kernel actually needs it (every `kind == Jit` requires a cubin).
+static void loadJitCubin(const ExecutableData &data, CUmodule &outModule) {
+  if (data.cubin.empty()) {
+    // No cubin: acceptable only if no kernel is a JIT kernel.
+    for (size_t i = 0; i < data.kernels.size(); ++i) {
+      const auto &meta = data.kernels[i];
+      if (meta.kind != KernelKind::Jit)
+        continue;
+      throw std::runtime_error(
+          "kun_cuda::Executable: JIT kernel '" + meta.kernelName +
+          "' declared but no cubin supplied — this is a compile-side bug");
+    }
+    return;
+  }
+
+  const CUresult rc = cuModuleLoadData(&outModule, data.cubin.data());
+  checkCu(rc, "cuModuleLoadData");
+}
+
+/// Lazy-load bundled external cross-sectional PTX modules iff any
+/// kernel uses them.  The driver JITs PTX → SASS on first load (cached
+/// system-wide in ~/.nv/ComputeCache), so this is sub-ms after the
+/// first run on a given GPU.
+static void loadExternalCsPtxIfNeeded(const std::vector<KernelMeta> &kernels,
+                                      CUmodule &csRankModule,
+                                      CUmodule &csScaleModule) {
+  // Scan until both families are known to be needed or the list ends.
+  bool wantRank = false;
+  bool wantScale = false;
+  for (auto it = kernels.begin();
+       it != kernels.end() && !(wantRank && wantScale); ++it) {
+    wantRank = wantRank || isCsRankKind(it->kind);
+    wantScale = wantScale || isCsScaleKind(it->kind);
+  }
+
+  if (wantRank) {
+    const CUresult rc = cuModuleLoadData(&csRankModule, kun_cs_rank_ptx);
+    checkCu(rc, "cuModuleLoadData(cs_rank.ptx)");
+  }
+  if (wantScale) {
+    const CUresult rc = cuModuleLoadData(&csScaleModule, kun_cs_scale_ptx);
+    checkCu(rc, "cuModuleLoadData(cs_scale.ptx)");
+  }
+}
+
+/// Pick the right CUmodule + symbol name for a kernel and resolve it.
+static CUfunction resolveOneKernelSymbol(const KernelMeta &k,
+                                          CUmodule jitModule,
+                                          CUmodule csRankModule,
+                                          CUmodule csScaleModule) {
+  // Owning module and entry-point name; both stay null for a kind that
+  // matches none of the branches, in which case the lookup below reports
+  // the driver error through checkCu.
+  CUmodule owner = nullptr;
+  const char *entry = nullptr;
+
+  if (k.kind == KernelKind::Jit) {
+    // JIT kernels are looked up by their own name in the JIT module.
+    owner = jitModule;
+    entry = k.kernelName.c_str();
+  } else if (isCsRankKind(k.kind)) {
+    owner = csRankModule;
+    entry = isF64ExternalKind(k.kind) ? "kun_cs_rank_f64" : "kun_cs_rank_f32";
+  } else if (isCsScaleKind(k.kind)) {
+    owner = csScaleModule;
+    entry = isF64ExternalKind(k.kind) ? "kun_cs_scale_f64"
+                                      : "kun_cs_scale_f32";
+  }
+
+  CUfunction resolved = nullptr;
+  const CUresult rc = cuModuleGetFunction(&resolved, owner, entry);
+  checkCu(rc, "cuModuleGetFunction");
+  return resolved;
+}
+
+/// Opt every external (non-Jit) function into the device's full
+/// dynamic-smem budget up-front.  The attribute is purely a permission
+/// cap — raising it doesn't change the carveout or per-launch smem
+/// cost, so we do it eagerly here rather than per-launch.  No-op if
+/// there are no external kernels.
+static void optInExternalSmemMax(const std::vector<KernelMeta> &kernels,
+                                   const std::vector<CUfunction> &funcs) {
+  // Bail out before touching the driver when every kernel is a JIT kernel.
+  size_t firstExternal = 0;
+  while (firstExternal < kernels.size() &&
+         kernels[firstExternal].kind == KernelKind::Jit)
+    ++firstExternal;
+  if (firstExternal == kernels.size())
+    return;
+
+  CUdevice device = 0;
+  checkCu(cuCtxGetDevice(&device), "cuCtxGetDevice");
+
+  int optInLimit = 0;
+  const CUresult limitRc = cuDeviceGetAttribute(
+      &optInLimit, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device);
+  checkCu(limitRc,
+          "cuDeviceGetAttribute(MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)");
+
+  for (size_t idx = 0; idx < funcs.size(); ++idx) {
+    if (kernels[idx].kind != KernelKind::Jit) {
+      // The dynamic cap is what remains of the opt-in limit after the
+      // function's static shared memory, floored at zero.
+      int staticBytes = 0;
+      checkCu(cuFuncGetAttribute(&staticBytes,
+                                 CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                                 funcs[idx]),
+              "cuFuncGetAttribute(SHARED_SIZE_BYTES)");
+      const int dynamicCap =
+          optInLimit - staticBytes < 0 ? 0 : optInLimit - staticBytes;
+      checkCu(cuFuncSetAttribute(
+                  funcs[idx],
+                  CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                  dynamicCap),
+              "cuFuncSetAttribute(MAX_DYNAMIC_SHARED_SIZE_BYTES)");
+    }
+  }
+}
+
+/// Per-kernel-kind I/O arity check.  External cross-sectional kernels have a
+/// fixed signature `(T_in, T_out)` — the kernel signature is set in
+/// stone by `kernels/cs_*.cu`, so we know the wiring is wrong (not
+/// just unusual) the moment we see any other shape.  Static property
+/// of the graph, so done at construction.
+static void validateKernelIO(const std::vector<KernelMeta> &kernels,
+                               const std::vector<std::vector<int>> &kernelInputBufs,
+                               const std::vector<std::vector<int>> &kernelOutputBufs) {
+  size_t idx = 0;
+  for (const KernelMeta &meta : kernels) {
+    const size_t inputs = kernelInputBufs[idx].size();
+    const size_t outputs = kernelOutputBufs[idx].size();
+    ++idx;
+
+    // Only the four external cross-sectional kinds have a fixed arity;
+    // JIT kernels may have any number of inputs and outputs.
+    const bool external = isCsRankKind(meta.kind) || isCsScaleKind(meta.kind);
+    if (!external)
+      continue;
+    if (inputs == 1 && outputs == 1)
+      continue;
+
+    throw std::runtime_error(
+        "kun_cuda::Executable: external cross-sectional kernel '" +
+        meta.kernelName +
+        "' must have exactly 1 input and 1 output (have " +
+        std::to_string(inputs) + " / " + std::to_string(outputs) + ")");
+  }
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Executable
+//===----------------------------------------------------------------------===//
+
+/// Build the runtime plan for `dataIn` and load every module / kernel
+/// symbol it needs.
+///
+/// Throws std::runtime_error when the data pointer is null, no CUDA context
+/// is current, or the data has no kernels, graph inputs or graph outputs.
+/// Failures from the plan-building helpers and from the driver (via
+/// checkCu) propagate as well; any module loaded before such a failure is
+/// unloaded before the exception is rethrown.
+LoadedExecutable::LoadedExecutable(std::shared_ptr<const ExecutableData> dataIn)
+    : data(std::move(dataIn)) {
+  if (!data)
+    throw std::runtime_error(
+        "kun_cuda::LoadedExecutable: ExecutableData pointer is null");
+  const ExecutableData &d = *data;
+
+  // Require a primary context to already exist on the calling thread —
+  // the caller's job to set one up (e.g. by allocating any device memory
+  // through cupy / cudaMalloc).
+  CUcontext cur = nullptr;
+  checkCu(cuCtxGetCurrent(&cur), "cuCtxGetCurrent");
+  if (!cur)
+    throw std::runtime_error(
+        "kun_cuda::LoadedExecutable: no current CUDA context.  Initialise the "
+        "driver first (e.g. allocate any device memory via cupy or "
+        "cudaMalloc) before constructing an Executable.");
+  if (d.kernels.empty())
+    throw std::runtime_error(
+        "kun_cuda::Executable: ExecutableData has no kernels");
+  if (d.graphInputs.empty())
+    throw std::runtime_error(
+        "kun_cuda::Executable: graph_inputs must be non-empty");
+  if (d.graphOutputs.empty())
+    throw std::runtime_error(
+        "kun_cuda::Executable: graph_outputs must be non-empty");
+
+  // ── Build the runtime plan ───────────────────────────────────────
+  BufTable tbl  = buildBufferIndices(d.graphInputs, d.graphOutputs,
+                                       d.kernels);
+  KernelIO kio  = resolveKernelIO(d.kernels, tbl);
+  validateGraph(d.kernels, d.graphOutputs, tbl, kio);
+  std::vector<int> order = topoSort(kio, static_cast<int>(d.kernels.size()));
+  SlotPlan slots = planSlots(order, tbl, kio);
+
+  // The helpers above are done with tbl / kio / order / slots, so their
+  // containers can be moved into the plan from here on.
+  plan.numBuffers          = tbl.numBuffers;
+  plan.numGraphInputs      = tbl.numGraphInputs;
+  plan.numGraphOutputs     = tbl.numGraphOutputs;
+  plan.graphInputIdx       = std::move(tbl.graphInputIdx);
+  plan.graphOutputIdx      = std::move(tbl.graphOutputIdx);
+  plan.kernelInputBufs     = std::move(kio.kernelInputBufs);
+  plan.kernelOutputBufs    = std::move(kio.kernelOutputBufs);
+  plan.producerKernel      = std::move(kio.producerKernel);
+  plan.launchOrder         = std::move(order);
+  plan.intermediateBufToSlot = std::move(slots.intermediateBufToSlot);
+  plan.peakIntermediateSlots = slots.peakIntermediateSlots;
+
+  // ── Per-kernel I/O arity validation ──────────────────────────────
+  // Catches mis-wired external kernels (which have a fixed signature)
+  // at construction time, well before the launch path.
+  validateKernelIO(d.kernels, plan.kernelInputBufs, plan.kernelOutputBufs);
+
+  // ── Load cubin(s) + resolve every kernel symbol ──────────────────
+  try {
+    loadJitCubin(d, cuModule);
+    loadExternalCsPtxIfNeeded(d.kernels, csRankModule, csScaleModule);
+
+    cuFuncs.resize(d.kernels.size(), nullptr);
+    for (size_t i = 0; i < d.kernels.size(); ++i) {
+      cuFuncs[i] = resolveOneKernelSymbol(d.kernels[i],
+                                          cuModule, csRankModule,
+                                          csScaleModule);
+    }
+
+    // ── Opt external kernels into the device's full dynamic smem cap ──
+    optInExternalSmemMax(d.kernels, cuFuncs);
+  } catch (...) {
+    // The destructor does not run when the constructor throws, so unload
+    // whatever was loaded here; unload results are deliberately ignored.
+    if (cuModule)
+      cuModuleUnload(cuModule);
+    if (csRankModule)
+      cuModuleUnload(csRankModule);
+    if (csScaleModule)
+      cuModuleUnload(csScaleModule);
+    cuModule = nullptr;
+    csRankModule = nullptr;
+    csScaleModule = nullptr;
+    throw;
+  }
+}
+
+/// Unload every module that was loaded (JIT, cs_rank, cs_scale — in that
+/// order).  Unload results are ignored.
+LoadedExecutable::~LoadedExecutable() noexcept {
+  const CUmodule loadedModules[] = {cuModule, csRankModule, csScaleModule};
+  for (CUmodule handle : loadedModules) {
+    if (handle != nullptr)
+      cuModuleUnload(handle);
+  }
+}
+
+/// Construct from executable data, creating a fresh LoadedExecutable for it.
+/// `loaded_` is built from the already-initialised `data_` member.
+/// NOTE(review): this relies on `data_` being declared before `loaded_` in
+/// the class — confirm in the header.
+Executable::Executable(std::shared_ptr<const ExecutableData> data)
+    : data_(std::move(data)),
+      loaded_(std::make_shared<LoadedExecutable>(data_)) {
+}
+
+/// Construct from executable data plus an existing LoadedExecutable.
+/// Throws std::runtime_error if either pointer is null.
+Executable::Executable(std::shared_ptr<const ExecutableData> data,
+                       std::shared_ptr<LoadedExecutable> loaded)
+    : data_(std::move(data)), loaded_(std::move(loaded)) {
+  if (data_ == nullptr) {
+    throw std::runtime_error(
+        "kun_cuda::Executable: ExecutableData pointer is null");
+  }
+  if (loaded_ == nullptr) {
+    throw std::runtime_error(
+        "kun_cuda::Executable: LoadedExecutable pointer is null");
+  }
+}
+
+/// Create another Executable that shares this one's ExecutableData and
+/// LoadedExecutable.
+std::unique_ptr<Executable> Executable::clone() const {
+  std::unique_ptr<Executable> copy(new Executable(data_, loaded_));
+  return copy;
+}
+
+/// Release this executable's per-instance resources: CUDA-graph state
+/// first, then the intermediate slot pool.
+Executable::~Executable() {
+  // Best-effort cleanup; we deliberately don't propagate driver errors
+  // out of a destructor.
+  resetCudaGraphState();
+  freeSlotPool();
+}
+
+/// Free every allocated intermediate slot and forget the cached shape, so
+/// the next ensureSlotPool() call reallocates.  cuMemFree results are
+/// ignored.
+void Executable::freeSlotPool() {
+  for (size_t slot = 0; slot < slotBufs_.size(); ++slot) {
+    const uintptr_t raw = slotBufs_[slot];
+    // Zero marks a slot that was never allocated.
+    if (raw != 0)
+      cuMemFree(static_cast<CUdeviceptr>(raw));
+  }
+  slotBufs_.clear();
+  cachedS_ = -1;
+  cachedT_ = -1;
+}
+
+/// Make sure `slotBufs_` holds `peakIntermediateSlots` device buffers sized
+/// for a (timeLength x numStocks) problem, reallocating whenever the cached
+/// shape or the slot count no longer matches.
+///
+/// Throws (via checkCu) if a device allocation fails.  Slots allocated
+/// before the failure remain in `slotBufs_` and are released by the next
+/// freeSlotPool(); the cached shape stays invalid, so the next call retries.
+void Executable::ensureSlotPool(int64_t timeLength, int64_t numStocks) {
+  const int wantedSlots = loaded_->plan.peakIntermediateSlots;
+  if (timeLength == cachedT_ && numStocks == cachedS_ &&
+      static_cast<int>(slotBufs_.size()) == wantedSlots)
+    return;
+  freeSlotPool();
+  if (wantedSlots == 0) {
+    // Nothing to allocate; just remember the shape.
+    cachedT_ = timeLength;
+    cachedS_ = numStocks;
+    return;
+  }
+  size_t bytesPerSlot = static_cast<size_t>(timeLength) *
+                          static_cast<size_t>(numStocks) *
+                          bytesPerElem(data_->dtype);
+  // cuMemAlloc rejects a zero-byte request with CUDA_ERROR_INVALID_VALUE.
+  // An empty shape (timeLength == 0 or numStocks == 0) therefore gets
+  // 1-byte placeholder slots, the same clamp the CUDA-graph path applies
+  // to its intermediate allocations.
+  if (bytesPerSlot == 0)
+    bytesPerSlot = 1;
+  slotBufs_.resize(wantedSlots, 0);
+  for (int i = 0; i < wantedSlots; ++i) {
+    CUdeviceptr p = 0;
+    checkCu(cuMemAlloc(&p, bytesPerSlot), "cuMemAlloc(intermediate slot)");
+    slotBufs_[i] = static_cast<uintptr_t>(p);
+  }
+  // Record the shape only after every slot was allocated successfully.
+  cachedT_ = timeLength;
+  cachedS_ = numStocks;
+}
+
+//===----------------------------------------------------------------------===//
+// Out-of-line plan accessors (header forward-declares GraphPlan)
+//===----------------------------------------------------------------------===//
+
+// Kernel indices in the order the plan launches them.
+const std::vector<int> &Executable::launchOrder() const noexcept {
+  const GraphPlan &p = loaded_->plan;
+  return p.launchOrder;
+}
+
+// Total number of buffers in the plan.
+int Executable::numBuffers() const noexcept {
+  const GraphPlan &p = loaded_->plan;
+  return p.numBuffers;
+}
+
+// Number of intermediate slots the plan needs at its peak.
+int Executable::peakIntermediateSlots() const noexcept {
+  const GraphPlan &p = loaded_->plan;
+  return p.peakIntermediateSlots;
+}
+
+/// Launch the whole kernel plan through `exec`.
+///
+/// Validates the inputs, then either hands off to the CUDA-graph launcher
+/// (mode == LaunchMode::CudaGraph) or enqueues each kernel on the
+/// executor's stream in launch order.  Throws std::runtime_error when
+/// `exec` is null; validation and driver errors propagate from the helpers.
+void Executable::launchOnStream(
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor,
+    LaunchMode mode) {
+  if (exec == nullptr) {
+    throw std::runtime_error(
+        "kun_cuda::launchOnStream: Executor pointer is null");
+  }
+
+  validateLaunchInputs(*data_, timeLength, numStocks, mask);
+
+  if (mode == LaunchMode::CudaGraph) {
+    launchCudaGraphOnStream(exec, timeLength, numStocks, args,
+                            mask, minChunkWarmupFactor, smFillFactor);
+    return;
+  }
+
+  // Plain stream mode: read the executor's settings first.
+  CUstream targetStream = exec->stream();
+  const int smemLimit = exec->devMaxSmemBytes();
+  const int smCount = exec->numSMs();
+
+  // Grow or reuse the intermediate slot pool for this shape.
+  ensureSlotPool(timeLength, numStocks);
+
+  // Flat buffer-index -> device pointer table (user args + slot pool).
+  const GraphPlan &plan = loaded_->plan;
+  const std::vector<uintptr_t> bufPtrs =
+      resolveBufferPointers(plan, *data_, args, slotBufs_);
+
+  const std::vector<KernelLaunchDesc> launches =
+      buildKernelLaunchDescs(plan, *data_, loaded_->cuFuncs,
+                             timeLength, numStocks, bufPtrs,
+                             mask, minChunkWarmupFactor, smFillFactor,
+                             smemLimit, smCount);
+  for (size_t i = 0; i < launches.size(); ++i)
+    launchKernelDesc(launches[i], targetStream);
+}
+
+//===----------------------------------------------------------------------===//
+// Executor — thin CUstream wrapper, mirrors the CPU `kun::Executor` shape.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Query the current CUcontext's device for a single integer attribute.
+/// Returns 0 if no context is current — callers gate use on 0 == "unknown".
+int queryDevAttr(CUdevice_attribute attr) {
+  // Any failing step (no current context, device lookup, attribute query)
+  // yields 0.
+  CUcontext ctx = nullptr;
+  const bool haveCtx = cuCtxGetCurrent(&ctx) == CUDA_SUCCESS && ctx != nullptr;
+  if (!haveCtx)
+    return 0;
+
+  CUdevice device = 0;
+  if (cuCtxGetDevice(&device) != CUDA_SUCCESS)
+    return 0;
+
+  int value = 0;
+  const CUresult rc = cuDeviceGetAttribute(&value, attr, device);
+  return rc == CUDA_SUCCESS ? value : 0;
+}
+} // namespace
+
+// Default executor: a null stream handle; the device queries are shared
+// with the stream-taking constructor through delegation.
+Executor::Executor() : Executor(static_cast<CUstream>(nullptr)) {}
+
+// Wrap `stream` and cache the current device's opt-in shared-memory limit
+// and SM count.  queryDevAttr yields 0 for either when the query fails.
+Executor::Executor(CUstream stream)
+    : stream_(stream),
+      devMaxSmemBytes_(
+          queryDevAttr(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN)),
+      numSMs_(queryDevAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) {}
+
+Executor::~Executor() = default;
+
+/// Run `exe` with this executor: forwards every argument, plus `this`, to
+/// Executable::launchOnStream.
+void Executor::runGraph(
+    Executable &exe, int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask, int minChunkWarmupFactor, double smFillFactor,
+    LaunchMode mode) {
+  Executor *self = this;
+  exe.launchOnStream(self,
+                     timeLength, numStocks,
+                     args,
+                     mask,
+                     minChunkWarmupFactor,
+                     smFillFactor,
+                     mode);
+}
+
+/// Block until all work queued on this executor's stream has finished;
+/// driver errors surface through checkCu.
+void Executor::synchronize() {
+  const CUresult rc = cuStreamSynchronize(stream_);
+  checkCu(rc, "cuStreamSynchronize");
+}
+
+} // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/RuntimeCudaGraph.cpp b/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
new file mode 100644
index 0000000..8162aaa
--- /dev/null
+++ b/mlir/lib/KunCuda/RuntimeCudaGraph.cpp
@@ -0,0 +1,377 @@
+//===- RuntimeCudaGraph.cpp - CUDA Graph launcher for kun_cuda ------------===//
+//
+// CUDA Graph mode builds the real producer/consumer node DAG instead of
+// enqueueing kernels in a topo-linear loop.  Intermediate buffers are
+// graph-owned allocations:
+//
+//   alloc(intermediate) -> producer kernel -> all consumer kernels -> free
+//
+// User-visible graph inputs/outputs remain caller-owned pointers supplied at
+// launch time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunCuda/Runtime.h"
+#include "RuntimeUtil.h"
+
+#include <cuda.h>
+
+#include <algorithm>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace kun_cuda {
+
+namespace {
+
+// Append `n` to `nodes` unless it is null or already present.
+void addUniqueNode(std::vector<CUgraphNode> &nodes, CUgraphNode n) {
+  if (n == nullptr)
+    return;
+  for (CUgraphNode existing : nodes) {
+    if (existing == n)
+      return;
+  }
+  nodes.push_back(n);
+}
+
+// A buffer index is intermediate when it is not below the plan's first
+// intermediate buffer index.
+bool isIntermediate(const GraphPlan &plan, int bufIdx) {
+  const auto firstIntermediate = firstIntermediateBuffer(plan);
+  return !(bufIdx < firstIntermediate);
+}
+
+// Refuse to touch the CUDA graph while a previous launch is still running.
+// Clears `state.hasLaunch` once the completion event reports success and
+// throws std::runtime_error (mentioning `action`) if it is not ready yet.
+// Any other driver result is passed to checkCu.
+void ensureNoInFlight(CudaGraphLaunchState &state, const char *action) {
+  const bool tracked = state.hasLaunch && state.completionEvent;
+  if (!tracked)
+    return;
+
+  const CUresult status = cuEventQuery(state.completionEvent);
+  switch (status) {
+  case CUDA_SUCCESS:
+    state.hasLaunch = false;
+    return;
+  case CUDA_ERROR_NOT_READY: {
+    std::string msg("kun_cuda::launchOnStream(cuda_graph): previous CUDA "
+                    "graph launch is still in flight; call synchronize() "
+                    "before ");
+    msg += action;
+    msg += " the executable's CUDA graph";
+    throw std::runtime_error(msg);
+  }
+  default:
+    checkCu(status, "cuEventQuery(cuda graph completion)");
+    return;
+  }
+}
+
+// Byte size of one intermediate buffer (timeLength * numStocks elements of
+// the executable's dtype), never less than 1.
+size_t intermediateBytes(const ExecutableData &data,
+                         int64_t timeLength, int64_t numStocks) {
+  const size_t rows = static_cast<size_t>(timeLength);
+  const size_t cols = static_cast<size_t>(numStocks);
+  const size_t total = rows * cols * bytesPerElem(data.dtype);
+  if (total == 0)
+    return 1;
+  return total;
+}
+
+// True when every launch parameter other than the buffer-pointer table is
+// equal in `a` and `b` (exact comparison, including the double factor).
+bool sameLaunchParamsExceptBuffers(const CudaGraphLaunchParams &a,
+                                   const CudaGraphLaunchParams &b) {
+  if (a.timeLength != b.timeLength)
+    return false;
+  if (a.numStocks != b.numStocks)
+    return false;
+  if (a.mask != b.mask)
+    return false;
+  if (a.minChunkWarmupFactor != b.minChunkWarmupFactor)
+    return false;
+  if (!(a.smFillFactor == b.smFillFactor))
+    return false;
+  if (a.devMaxSmemBytes != b.devMaxSmemBytes)
+    return false;
+  return a.numSMs == b.numSMs;
+}
+
+// Full equality of two launch-parameter sets, buffer pointers included.
+bool sameLaunchParams(const CudaGraphLaunchParams &a,
+                      const CudaGraphLaunchParams &b) {
+  if (!sameLaunchParamsExceptBuffers(a, b))
+    return false;
+  return a.bufPtrs == b.bufPtrs;
+}
+
+// Assemble a CudaGraphLaunchParams from the caller's values, the
+// executor's device limits and an already-resolved buffer-pointer table.
+CudaGraphLaunchParams makeLaunchParams(
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor,
+    std::vector<uintptr_t> bufPtrs) {
+  CudaGraphLaunchParams out;
+
+  // Caller-supplied values.
+  out.timeLength = timeLength;
+  out.numStocks = numStocks;
+  out.mask = mask;
+  out.minChunkWarmupFactor = minChunkWarmupFactor;
+  out.smFillFactor = smFillFactor;
+
+  // Device limits as reported by the executor.
+  out.devMaxSmemBytes = exec->devMaxSmemBytes();
+  out.numSMs = exec->numSMs();
+
+  out.bufPtrs = std::move(bufPtrs);
+  return out;
+}
+
+// Buffer-index -> pointer table for CUDA-graph mode: external entries come
+// from `args`, intermediate entries from the graph-owned allocations
+// recorded in `state`.
+std::vector<uintptr_t> resolveCudaGraphBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const CudaGraphLaunchState &state,
+    const std::vector<std::pair<std::string, uintptr_t>> &args) {
+  std::vector<uintptr_t> table =
+      resolveExternalBufferPointers(plan, data, args);
+
+  int buf = firstIntermediateBuffer(plan);
+  while (buf < plan.numBuffers) {
+    table[buf] = state.graphAllocBufPtrs[buf];
+    ++buf;
+  }
+  return table;
+}
+
+// Convenience overload: resolve the buffer-pointer table from `args` and
+// the graph state first, then assemble the launch parameters.
+CudaGraphLaunchParams makeLaunchParams(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const CudaGraphLaunchState &state,
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor) {
+  std::vector<uintptr_t> resolved =
+      resolveCudaGraphBufferPointers(plan, data, state, args);
+  return makeLaunchParams(exec, timeLength, numStocks, mask,
+                          minChunkWarmupFactor, smFillFactor,
+                          std::move(resolved));
+}
+
+// Add a memory-allocation node of `bytes` bytes on `device` to `graph`,
+// depending on `deps`.  The allocated device address is written to
+// `outPtr`; the new node is returned.  Driver errors surface through
+// checkCu.
+CUgraphNode addAllocNode(CUgraph graph,
+                         CUdevice device,
+                         size_t bytes,
+                         const std::vector<CUgraphNode> &deps,
+                         CUdeviceptr &outPtr) {
+  CUDA_MEM_ALLOC_NODE_PARAMS allocParams{};
+  allocParams.bytesize = bytes;
+  auto &pool = allocParams.poolProps;
+  pool.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
+  pool.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
+  pool.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  pool.location.id = device;
+
+  // The driver gets a null dependency array when there are none.
+  const CUgraphNode *depArray = deps.empty() ? nullptr : deps.data();
+
+  CUgraphNode created = nullptr;
+  const CUresult rc = cuGraphAddMemAllocNode(&created, graph, depArray,
+                                             deps.size(), &allocParams);
+  checkCu(rc, "cuGraphAddMemAllocNode");
+
+  // The driver reports the allocation's address back through the params.
+  outPtr = allocParams.dptr;
+  return created;
+}
+
+// Graph nodes kernel `kIdx` must wait for: the (deduplicated) nodes of the
+// kernels producing its input buffers.  Inputs without a producer kernel
+// contribute nothing.
+std::vector<CUgraphNode> kernelInputDeps(const GraphPlan &plan,
+                                         const CudaGraphLaunchState &state,
+                                         int kIdx) {
+  const auto &inputs = plan.kernelInputBufs[kIdx];
+  std::vector<CUgraphNode> waitFor;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const int producerIdx = plan.producerKernel[inputs[i]];
+    if (producerIdx < 0)
+      continue;
+    addUniqueNode(waitFor, state.kernelNodes[producerIdx]);
+  }
+  return waitFor;
+}
+
+// For every intermediate output buffer of kernel `kIdx`, add a graph
+// allocation node (depending on `inputDeps`) and record both the node and
+// the allocated address in `state` and `bufPtrs`.
+void addOutputAllocNodes(const GraphPlan &plan,
+                         CudaGraphLaunchState &state,
+                         CUdevice device,
+                         size_t bytes,
+                         int kIdx,
+                         const std::vector<CUgraphNode> &inputDeps,
+                         std::vector<uintptr_t> &bufPtrs) {
+  const auto &outputs = plan.kernelOutputBufs[kIdx];
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    const int buf = outputs[i];
+    if (isIntermediate(plan, buf)) {
+      CUdeviceptr address = 0;
+      state.allocNodes[buf] =
+          addAllocNode(state.graph, device, bytes, inputDeps, address);
+      const uintptr_t raw = static_cast<uintptr_t>(address);
+      state.graphAllocBufPtrs[buf] = raw;
+      bufPtrs[buf] = raw;
+    }
+  }
+}
+
+// Add the graph node for kernel `kIdx`.  Its dependencies are `deps` plus
+// the allocation nodes of its intermediate outputs.  A descriptor that is
+// not a kernel node becomes an empty node, so dependents still have
+// something to wait on.
+void addOneKernelNode(const GraphPlan &plan,
+                      const ExecutableData &data,
+                      const std::vector<CUfunction> &cuFuncs,
+                      CudaGraphLaunchState &state,
+                      const CudaGraphLaunchParams &launch,
+                      int kIdx,
+                      std::vector<CUgraphNode> deps) {
+  const auto &outputs = plan.kernelOutputBufs[kIdx];
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    const int buf = outputs[i];
+    if (!isIntermediate(plan, buf))
+      continue;
+    addUniqueNode(deps, state.allocNodes[buf]);
+  }
+
+  // Refresh the descriptor stored in `state`; the kernel node is created
+  // from its params.
+  KernelLaunchDesc &descriptor = state.descs[kIdx];
+  descriptor.update(plan, data, cuFuncs, kIdx, launch);
+
+  const CUgraphNode *depArray = deps.empty() ? nullptr : deps.data();
+  const size_t depCount = deps.size();
+
+  CUgraphNode created = nullptr;
+  if (!descriptor.isKernelNode) {
+    checkCu(cuGraphAddEmptyNode(&created, state.graph, depArray, depCount),
+            "cuGraphAddEmptyNode");
+  } else {
+    checkCu(cuGraphAddKernelNode(&created, state.graph, depArray, depCount,
+                                 &descriptor.params),
+            "cuGraphAddKernelNode");
+  }
+  state.kernelNodes[kIdx] = created;
+  state.kernelNodeIsKernel[kIdx] = descriptor.isKernelNode;
+}
+
+// Appends one memory-free node per intermediate buffer (indices from
+// firstIntermediateBuffer(plan) up to plan.numBuffers).
+//
+// A free node waits on every kernel node that lists the buffer among its
+// inputs.  When no kernel reads the buffer, it waits on the producing kernel
+// instead, if the plan records one.  The freed address is the one stored in
+// `state.graphAllocBufPtrs`; the node handle goes into `state.freeNodes`.
+void addFreeNodes(const GraphPlan &plan,
+                  CudaGraphLaunchState &state) {
+  const int kernelCount = static_cast<int>(plan.kernelInputBufs.size());
+
+  for (int buf = firstIntermediateBuffer(plan); buf < plan.numBuffers; ++buf) {
+    // Collect the consumers of `buf`.
+    std::vector<CUgraphNode> waitFor;
+    for (int k = 0; k < kernelCount; ++k) {
+      const auto &reads = plan.kernelInputBufs[k];
+      const bool consumesBuf =
+          std::find(reads.begin(), reads.end(), buf) != reads.end();
+      if (consumesBuf)
+        addUniqueNode(waitFor, state.kernelNodes[k]);
+    }
+
+    // Unread buffer: fall back to its producer so the free still follows the
+    // write.
+    if (waitFor.empty()) {
+      const int producer = plan.producerKernel[buf];
+      if (producer >= 0)
+        addUniqueNode(waitFor, state.kernelNodes[producer]);
+    }
+
+    const CUgraphNode *depList = waitFor.empty() ? nullptr : waitFor.data();
+    const auto devAddr = static_cast<CUdeviceptr>(state.graphAllocBufPtrs[buf]);
+
+    CUgraphNode released = nullptr;
+    checkCu(cuGraphAddMemFreeNode(&released, state.graph, depList,
+                                  waitFor.size(), devAddr),
+            "cuGraphAddMemFreeNode");
+    state.freeNodes[buf] = released;
+  }
+}
+
+// Builds and instantiates a fresh CUDA graph for `plan` inside `state`.
+//
+// Steps, in order: create the graph, size the per-buffer / per-kernel
+// bookkeeping tables, compute the launch parameters, add allocation and
+// kernel nodes following plan.launchOrder, add the free nodes, instantiate
+// the graph, and finally cache the launch parameters (including the buffer
+// addresses filled in while adding allocation nodes).
+void buildCudaGraphState(const GraphPlan &plan,
+                         const ExecutableData &data,
+                         const std::vector<CUfunction> &cuFuncs,
+                         CudaGraphLaunchState &state,
+                         Executor *exec,
+                         int64_t timeLength, int64_t numStocks,
+                         const std::vector<std::pair<std::string, uintptr_t>> &args,
+                         int64_t mask,
+                         int minChunkWarmupFactor,
+                         double smFillFactor) {
+  checkCu(cuGraphCreate(&state.graph, 0), "cuGraphCreate");
+
+  // Per-buffer tables.
+  const int bufferCount = plan.numBuffers;
+  state.graphAllocBufPtrs.assign(bufferCount, 0);
+  state.allocNodes.assign(bufferCount, nullptr);
+  state.freeNodes.assign(bufferCount, nullptr);
+
+  // Per-kernel tables.
+  const int kernelCount = static_cast<int>(data.kernels.size());
+  state.kernelNodes.assign(kernelCount, nullptr);
+  state.kernelNodeIsKernel.assign(kernelCount, false);
+  state.descs.resize(kernelCount);
+
+  CUdevice device = 0;
+  checkCu(cuCtxGetDevice(&device), "cuCtxGetDevice");
+  const size_t bytes = intermediateBytes(data, timeLength, numStocks);
+
+  CudaGraphLaunchParams launch = makeLaunchParams(
+      exec, timeLength, numStocks, mask, minChunkWarmupFactor, smFillFactor,
+      resolveExternalBufferPointers(plan, data, args));
+
+  for (const int kIdx : plan.launchOrder) {
+    // The same input dependencies gate both the output allocations and the
+    // kernel node itself.
+    std::vector<CUgraphNode> inputDeps = kernelInputDeps(plan, state, kIdx);
+    addOutputAllocNodes(plan, state, device, bytes, kIdx, inputDeps,
+                        launch.bufPtrs);
+    addOneKernelNode(plan, data, cuFuncs, state, launch, kIdx,
+                     std::move(inputDeps));
+  }
+
+  addFreeNodes(plan, state);
+
+  checkCu(cuGraphInstantiate(&state.graphExec, state.graph, 0),
+          "cuGraphInstantiate");
+  state.cachedLaunchParams = std::move(launch);
+}
+
+// Patches the kernel nodes of the instantiated graph for a new launch.
+//
+// When `cached` and `launch` differ only in buffer pointers, each descriptor
+// is refreshed through updateBuffer() and untouched descriptors are skipped.
+// Otherwise every descriptor is fully recomputed.  A descriptor whose
+// kernel-vs-empty classification no longer matches the node recorded at build
+// time cannot be patched in place and raises std::runtime_error.
+void updateCudaGraphKernelParams(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<CUfunction> &cuFuncs,
+    CudaGraphLaunchState &state,
+    const CudaGraphLaunchParams &cached,
+    const CudaGraphLaunchParams &launch) {
+  const bool onlyBuffersDiffer = sameLaunchParamsExceptBuffers(cached, launch);
+
+  for (const int kIdx : plan.launchOrder) {
+    KernelLaunchDesc &desc = state.descs[kIdx];
+
+    if (onlyBuffersDiffer) {
+      // Nothing to push to the graph when no pointer of this kernel moved.
+      if (!desc.updateBuffer(plan, kIdx, launch.bufPtrs))
+        continue;
+    } else {
+      desc.update(plan, data, cuFuncs, kIdx, launch);
+    }
+
+    const int nodeIdx = desc.kernelIndex;
+    if (state.kernelNodeIsKernel[nodeIdx] != desc.isKernelNode)
+      throw std::runtime_error(
+          "kun_cuda::launchOnStream(cuda_graph): kernel/empty node shape "
+          "changed without graph rebuild");
+
+    // Empty nodes carry no parameters.
+    if (desc.isKernelNode) {
+      checkCu(cuGraphExecKernelNodeSetParams(
+                  state.graphExec, state.kernelNodes[nodeIdx], &desc.params),
+              "cuGraphExecKernelNodeSetParams");
+    }
+  }
+}
+
+} // namespace
+
+// Tears down the graph resources held by this state.
+//
+// If a launch was recorded, the completion event is synchronised first so the
+// executable graph is not destroyed while that launch may still be running.
+// All driver return codes are deliberately ignored: the destructor is
+// noexcept.
+CudaGraphLaunchState::~CudaGraphLaunchState() noexcept {
+  const bool launchRecorded = completionEvent != nullptr && hasLaunch;
+  if (launchRecorded) {
+    (void)cuEventSynchronize(completionEvent);
+  }
+  if (graphExec != nullptr) {
+    (void)cuGraphExecDestroy(graphExec);
+  }
+  if (graph != nullptr) {
+    (void)cuGraphDestroy(graph);
+  }
+  if (completionEvent != nullptr) {
+    (void)cuEventDestroy(completionEvent);
+  }
+}
+
+// Launches this executable on exec->stream() through a cached CUDA graph.
+//
+// The graph is (re)built when there is no instantiated graph, no cached
+// launch parameters, or when timeLength / numStocks differ from the cached
+// values.  Otherwise the existing graph is reused and its kernel-node
+// parameters are patched in place if the launch parameters changed.
+//
+// After a successful launch a completion event is recorded on the stream and
+// `hasLaunch` is set so later calls (and the state destructor) can wait for
+// it.
+//
+// Failures propagate as exceptions, e.g. the std::runtime_error raised by
+// updateCudaGraphKernelParams.  If an in-place update fails part-way, the
+// cached launch parameters are dropped so that the next call rebuilds the
+// graph instead of launching a half-patched one.
+void Executable::launchCudaGraphOnStream(
+    Executor *exec,
+    int64_t timeLength, int64_t numStocks,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    int64_t mask,
+    int minChunkWarmupFactor,
+    double smFillFactor) {
+  if (!cudaGraphState_)
+    cudaGraphState_ = std::make_unique<CudaGraphLaunchState>();
+
+  const bool needRebuild =
+      !cudaGraphState_->graphExec ||
+      !cudaGraphState_->cachedLaunchParams ||
+      cudaGraphState_->cachedLaunchParams->timeLength != timeLength ||
+      cudaGraphState_->cachedLaunchParams->numStocks != numStocks;
+
+  if (needRebuild) {
+    ensureNoInFlight(*cudaGraphState_, "rebuilding");
+    resetCudaGraphState();
+    cudaGraphState_ = std::make_unique<CudaGraphLaunchState>();
+    buildCudaGraphState(loaded_->plan, *data_, loaded_->cuFuncs,
+                        *cudaGraphState_,
+                        exec, timeLength, numStocks, args,
+                        mask, minChunkWarmupFactor, smFillFactor);
+  } else {
+    ensureNoInFlight(*cudaGraphState_, "updating");
+    CudaGraphLaunchParams launch = makeLaunchParams(
+        loaded_->plan, *data_, *cudaGraphState_, exec, timeLength, numStocks, args,
+        mask, minChunkWarmupFactor, smFillFactor);
+    if (!sameLaunchParams(*cudaGraphState_->cachedLaunchParams, launch)) {
+      try {
+        updateCudaGraphKernelParams(loaded_->plan, *data_, loaded_->cuFuncs,
+                                    *cudaGraphState_,
+                                    *cudaGraphState_->cachedLaunchParams,
+                                    launch);
+      } catch (...) {
+        // The update may already have rewritten some descriptors and graph
+        // nodes.  The cache no longer describes the graph, so invalidate it;
+        // the next launch then takes the rebuild path.
+        cudaGraphState_->cachedLaunchParams.reset();
+        throw;
+      }
+      cudaGraphState_->cachedLaunchParams = std::move(launch);
+    }
+  }
+
+  // Create the completion event before launching: if event creation failed
+  // after cuGraphLaunch, the launch would be in flight with nothing tracking
+  // it.
+  if (!cudaGraphState_->completionEvent)
+    checkCu(cuEventCreate(&cudaGraphState_->completionEvent,
+                          CU_EVENT_DISABLE_TIMING),
+            "cuEventCreate(cuda graph completion)");
+
+  checkCu(cuGraphLaunch(cudaGraphState_->graphExec, exec->stream()),
+          "cuGraphLaunch");
+  checkCu(cuEventRecord(cudaGraphState_->completionEvent, exec->stream()),
+          "cuEventRecord(cuda graph completion)");
+  cudaGraphState_->hasLaunch = true;
+}
+
+// Drops the cached CUDA graph state, if any.  Destroying the state runs
+// ~CudaGraphLaunchState, which waits for a recorded launch before releasing
+// the graph objects.
+void Executable::resetCudaGraphState() noexcept {
+  cudaGraphState_ = nullptr;
+}
+
+} // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/RuntimeUtil.h b/mlir/lib/KunCuda/RuntimeUtil.h
new file mode 100644
index 0000000..c827883
--- /dev/null
+++ b/mlir/lib/KunCuda/RuntimeUtil.h
@@ -0,0 +1,175 @@
+//===- RuntimeUtil.h - private kun_cuda runtime helpers ------------------===//
+//
+// This header is private to libKunCudaRuntime.  It holds the pieces shared by
+// the traditional sequential launcher and the CUDA Graph launcher.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "KunCuda/Runtime.h"
+
+#include <cuda.h>
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace kun_cuda {
+
+// Runtime-resolved schedule + memory plan.  The public header forward-declares
+// this type so Executable can keep it behind a pImpl.
+struct GraphPlan {
+  // Total number of logical buffers; buffer indices run over
+  // [0, numBuffers).
+  int numBuffers       = 0;
+  int numGraphInputs   = 0;
+  int numGraphOutputs  = 0;
+
+  // Name -> index lookup for graph inputs / outputs.
+  // NOTE(review): presumably the mapped value is a buffer index — confirm
+  // against the code that fills these maps.
+  std::unordered_map<std::string, int> graphInputIdx;
+  std::unordered_map<std::string, int> graphOutputIdx;
+
+  // kernelInputBufs[k] / kernelOutputBufs[k]: buffer indices read / written
+  // by kernel k.
+  std::vector<std::vector<int>> kernelInputBufs;
+  std::vector<std::vector<int>> kernelOutputBufs;
+
+  // producerKernel[bufIdx] = kernel that writes the buffer, or -1 for a graph
+  // input.
+  std::vector<int> producerKernel;
+
+  // Topo order used by the sequential launcher and as a construction order for
+  // CUDA Graph kernel nodes.
+  std::vector<int> launchOrder;
+
+  // Sequential-mode intermediate slot assignment.  Graph mode uses one
+  // allocation node per logical intermediate instead.
+  std::vector<int> intermediateBufToSlot;
+  int peakIntermediateSlots = 0;
+};
+
+// Context-local immutable runtime state shared by cloned Executables.
+// Per-Executable mutable state (intermediate slot buffers and CUDA Graph
+// launch cache) intentionally stays on Executable.
+struct LoadedExecutable {
+  explicit LoadedExecutable(std::shared_ptr<const ExecutableData> data);
+  ~LoadedExecutable() noexcept;
+
+  // Neither copyable nor movable: the object owns raw CUDA module handles.
+  LoadedExecutable(const LoadedExecutable &) = delete;
+  LoadedExecutable &operator=(const LoadedExecutable &) = delete;
+  LoadedExecutable(LoadedExecutable &&) = delete;
+  LoadedExecutable &operator=(LoadedExecutable &&) = delete;
+
+  // Serialized executable description this object was loaded from.
+  std::shared_ptr<const ExecutableData> data;
+  // Schedule and memory plan used by the launchers.
+  GraphPlan plan;
+
+  // CUDA module handles.
+  // NOTE(review): csRankModule / csScaleModule presumably hold the embedded
+  // cs_rank / cs_scale PTX modules — confirm in the constructor.
+  CUmodule cuModule = nullptr;
+  CUmodule csRankModule = nullptr;
+  CUmodule csScaleModule = nullptr;
+  std::vector<CUfunction> cuFuncs;  ///< parallel to data->kernels
+};
+
+// Result of computeChunkPlan(): a chunk size and the number of chunks.
+// NOTE(review): the axis being chunked and the units of chunkSize are not
+// visible here — confirm against computeChunkPlan's definition.
+struct ChunkPlan {
+  int64_t chunkSize = 0;
+  unsigned numChunks = 1;
+};
+
+// Everything a CUDA Graph launch is parameterised by.  The graph launcher
+// caches one instance next to the instantiated graph and compares it against
+// the parameters of each new launch.
+struct CudaGraphLaunchParams {
+  // A change in either of these two forces a full graph rebuild.
+  int64_t timeLength = 0;
+  int64_t numStocks = 0;
+  int64_t mask = 0;
+  int minChunkWarmupFactor = 0;
+  double smFillFactor = 0.0;
+  // NOTE(review): presumably device properties queried at launch time
+  // (max shared memory per block, SM count) — confirm in makeLaunchParams.
+  int devMaxSmemBytes = 0;
+  int numSMs = 0;
+  // Device address per buffer, indexed by buffer index.
+  std::vector<uintptr_t> bufPtrs;
+};
+
+// Launch descriptor for a single kernel of the plan.  Holds the scalar
+// arguments, the buffer pointers and the CUDA_KERNEL_NODE_PARAMS handed to
+// the graph API.  Move-only.
+struct KernelLaunchDesc {
+  KernelLaunchDesc() = default;
+  KernelLaunchDesc(int kernelIndex, KernelKind kind, CUfunction fn,
+                   bool isKernelNode,
+                   int32_t timeLenI32, int32_t numStocksI32,
+                   int32_t maskI32, int32_t chunkSizeI32,
+                   int32_t warmupI32,
+                   std::vector<CUdeviceptr> ptrs);
+  KernelLaunchDesc(KernelLaunchDesc &&other) noexcept;
+  KernelLaunchDesc &operator=(KernelLaunchDesc &&other) noexcept;
+  // Copying is disabled.
+  // NOTE(review): presumably because `params` points into the private
+  // storage below, which would dangle after a member-wise copy — confirm in
+  // the move operations' definitions.
+  KernelLaunchDesc(const KernelLaunchDesc &) = delete;
+  KernelLaunchDesc &operator=(const KernelLaunchDesc &) = delete;
+
+  // Recomputes the whole descriptor for kernel `kernelIndex` from `launch`.
+  void update(const GraphPlan &plan,
+              const ExecutableData &data,
+              const std::vector<CUfunction> &cuFuncs,
+              int kernelIndex,
+              const CudaGraphLaunchParams &launch);
+  // Refreshes only the buffer pointers.  The graph launcher treats the return
+  // value as "something changed" and skips the node update when it is false.
+  bool updateBuffer(const GraphPlan &plan,
+                    int kernelIndex,
+                    const std::vector<uintptr_t> &bufPtrs);
+
+  int kernelIndex = -1;
+  KernelKind kind = KernelKind::Jit;
+  // false => the graph launcher inserts an empty (dependency-only) node
+  // instead of a kernel node for this entry.
+  bool isKernelNode = true;
+
+  // Scalar kernel arguments, narrowed to 32 bits.
+  int32_t timeLenI32 = 0;
+  int32_t numStocksI32 = 0;
+  int32_t maskI32 = 0;
+  int32_t chunkSizeI32 = 0;
+  int32_t warmupI32 = 0;
+
+  // Passed to cuGraphAddKernelNode / cuGraphExecKernelNodeSetParams.
+  CUDA_KERNEL_NODE_PARAMS params{};
+
+private:
+  // NOTE(review): presumably the backing storage that params.kernelParams
+  // points into — confirm in rebuildKernelParamPointers().
+  std::vector<CUdeviceptr> ptrs_;
+  std::vector<void *> argPtrs_;
+
+  void rebuildKernelParamPointers();
+};
+
+// Per-Executable cache for the CUDA Graph launcher: the graph, its
+// instantiated form, and the node handles needed to patch parameters later.
+struct CudaGraphLaunchState {
+  // Waits for a recorded launch (if any), then destroys the executable
+  // graph, the graph and the completion event.
+  ~CudaGraphLaunchState() noexcept;
+
+  CUgraph graph = nullptr;
+  CUgraphExec graphExec = nullptr;
+  // Recorded on the launch stream right after each graph launch.
+  CUevent completionEvent = nullptr;
+  // True once a launch has been issued and its completion event recorded.
+  bool hasLaunch = false;
+
+  // Parameters the instantiated graph currently reflects; empty until a
+  // graph has been built.
+  std::optional<CudaGraphLaunchParams> cachedLaunchParams;
+
+  // Indexed by buffer index (sized to plan.numBuffers at build time).
+  std::vector<uintptr_t> graphAllocBufPtrs;
+  std::vector<CUgraphNode> allocNodes;
+  // Indexed by kernel index (sized to data.kernels.size() at build time).
+  std::vector<CUgraphNode> kernelNodes;
+  // kernelNodeIsKernel[k]: whether kernelNodes[k] is a real kernel node
+  // rather than an empty node.
+  std::vector<bool> kernelNodeIsKernel;
+  // Indexed by buffer index.
+  std::vector<CUgraphNode> freeNodes;
+  // Indexed by kernel index.
+  std::vector<KernelLaunchDesc> descs;
+};
+
+// Checks the result of a CUDA driver call; `what` labels the call site.
+// NOTE(review): presumably throws on any non-success result — the definition
+// is not in this header.
+void checkCu(CUresult r, const char *what);
+
+std::string joinNames(const std::vector<std::string> &v);
+
+// Index of the first intermediate buffer; intermediates occupy
+// [firstIntermediateBuffer(plan), plan.numBuffers).
+int firstIntermediateBuffer(const GraphPlan &plan) noexcept;
+
+void validateLaunchInputs(const ExecutableData &data,
+                          int64_t timeLength, int64_t numStocks,
+                          int64_t mask);
+
+// Resolves buffer addresses from the caller-supplied (name, pointer) pairs.
+// Used by the CUDA Graph launcher, where intermediate addresses are filled in
+// later from graph allocation nodes.
+std::vector<uintptr_t> resolveExternalBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args);
+
+// Variant that additionally takes the slot buffers (`slotBufs`).
+// NOTE(review): presumably maps intermediates through
+// plan.intermediateBufToSlot for the sequential launcher — confirm.
+std::vector<uintptr_t> resolveBufferPointers(
+    const GraphPlan &plan,
+    const ExecutableData &data,
+    const std::vector<std::pair<std::string, uintptr_t>> &args,
+    const std::vector<uintptr_t> &slotBufs);
+
+ChunkPlan computeChunkPlan(int64_t timeLength, int64_t numStocks,
+                           int64_t warpsPerCta, int64_t vectorSize,
+                           int64_t unreliableCount, int64_t mask,
+                           int minChunkWarmupFactor,
+                           double smFillFactor, int numSMs);
+
+void launchKernelDesc(const KernelLaunchDesc &desc, CUstream stream);
+
+} // namespace kun_cuda
diff --git a/mlir/lib/KunCuda/kernels/cs_rank.cu b/mlir/lib/KunCuda/kernels/cs_rank.cu
new file mode 100644
index 0000000..dea2203
--- /dev/null
+++ b/mlir/lib/KunCuda/kernels/cs_rank.cu
@@ -0,0 +1,132 @@
+// cs_rank.cu — cross-sectional rank kernel, pre-compiled to PTX and
+// embedded into libKunCudaRuntime as a separate CUmodule.
+//
+// Signature matches the project's launch convention so the executor can
+// pass the same `(i32 time_length, i32 num_stocks, in_ptr, out_ptr)`
+// arg tuple it uses for JIT'd kernels.
+//
+// Launch shape (chosen by the executor):
+//   gridDim.x  = min(T, ceil(sm_fill_factor * numSMs))   // time chunks
+//   blockDim.x = clamp(round_up(num_stocks, 32), 32, 1024)
+//   smem       = num_stocks * sizeof(T)
+//
+// Each CTA processes a contiguous slice of time (`ceil(T/gridDim.x)`
+// steps), reusing its smem across the slice — time-contiguous so gmem
+// reads / writes stream cleanly through L2.
+//
+// Algorithm — pairwise O(N^2) per timestep:
+//   For each stock i with non-NaN value v,
+//     less  = #{ j : !isnan(u[j]) && u[j]  < v }
+//     equal = #{ j : !isnan(u[j]) && u[j] == v }    (includes i itself)
+//     valid = #{ j : !isnan(u[j]) }
+//   Output is the average-rank normalised to (0, 1]:
+//     out = (2*less + equal + 1) / (2 * valid)
+//   This matches cpp/Kun/Rank.hpp's equal_range formula exactly:
+//     sum = (start + end + 1) * (end - start) / 2
+//     out = sum / (end - start) / valid
+//   with start = less, end = less + equal.
+//
+// NaN policy: NaN inputs produce NaN outputs and don't contribute to
+// any count.
+
+#include <cuda_runtime.h>
+#include <math_constants.h>      // CUDART_NAN, CUDART_NAN_F
+
+// Dynamic shared memory base.  Declared at file scope (no anonymous
+// namespace) so it gets a stable, internal symbol rather than nvcc's
+// "extern .shared" with mangled-namespace linkage — the latter survives
+// to the PTX as an unresolved extern, which the driver JIT cannot link
+// when this PTX is loaded standalone via cuModuleLoadData.  Both
+// kun_cs_rank_f32 and kun_cs_rank_f64 reinterpret_cast<T*>(raw_smem)
+// from a single base — fine, since each kernel launch supplies its own
+// physical smem allocation.
+extern __shared__ unsigned char kun_cs_rank_smem[];
+
+namespace {
+
+// kun_nan<T>(): the CUDART NaN constant for element type T.  Only the float
+// and double specialisations below are defined.
+template <typename T>
+__device__ static inline T kun_nan();
+
+template <>
+__device__ inline float kun_nan<float>() { return CUDART_NAN_F; }
+
+template <>
+__device__ inline double kun_nan<double>() { return CUDART_NAN; }
+
+// Templated body — each CTA processes a contiguous time-axis slice;
+// threads cooperate across the cross-section for every timestep in
+// the slice.
+//
+// `in` and `out` are indexed as [t * num_stocks + i].  The dynamic shared
+// memory supplied at launch must hold at least num_stocks elements of T,
+// because one full row is staged there per timestep.
+template <typename T>
+__device__ static void cs_rank_body(const T* __restrict__ in,
+                                    T* __restrict__ out,
+                                    int time_length,
+                                    int num_stocks) {
+    // Even split of [0, time_length) across gridDim.x.  Last CTA may
+    // have fewer (or zero) timesteps when gridDim.x doesn't divide T.
+    int time_per_cta = (time_length + gridDim.x - 1) / gridDim.x;
+    int t0 = blockIdx.x * time_per_cta;
+    int t1 = t0 + time_per_cta;
+    if (t1 > time_length) t1 = time_length;
+    // t0 / t1 depend only on blockIdx, so the whole CTA exits together and
+    // no thread is left waiting at a barrier below.
+    if (t0 >= t1) return;
+
+    T* smem = reinterpret_cast<T*>(kun_cs_rank_smem);
+
+    for (int t = t0; t < t1; ++t) {
+        const T* row_in  = in  + static_cast<size_t>(t) * num_stocks;
+        T*       row_out = out + static_cast<size_t>(t) * num_stocks;
+
+        // 1) Cooperative load of this timestep's cross-section into smem.
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            smem[i] = row_in[i];
+        }
+        __syncthreads();
+
+        // 2) Per-stock pairwise count.  Each thread owns a stride of
+        //    stocks (= 1 stock when blockDim.x ≥ num_stocks).
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            T v = smem[i];
+            if (isnan(v)) {
+                row_out[i] = kun_nan<T>();
+                continue;
+            }
+
+            // Branch-free counting: the comparisons yield 0/1 and are masked
+            // by is_valid (comparisons against NaN are false anyway).
+            int less  = 0;
+            int equal = 0;
+            int valid = 0;
+            for (int j = 0; j < num_stocks; ++j) {
+                T u = smem[j];
+                int is_valid = !isnan(u);
+                valid += is_valid;
+                less  += (is_valid & (u <  v));
+                equal += (is_valid & (u == v));
+            }
+
+            // Defensive only: v itself is non-NaN and is counted at j == i,
+            // so valid >= 1 whenever this point is reached.
+            if (valid == 0) {
+                row_out[i] = kun_nan<T>();
+                continue;
+            }
+            // Average-rank percentile, matching the CPU reference.
+            T num = static_cast<T>(2 * less + equal + 1);
+            T den = static_cast<T>(2 * valid);
+            row_out[i] = num / den;
+        }
+        // Re-sync before the next iteration overwrites smem.
+        __syncthreads();
+    }
+}
+
+} // anonymous namespace
+
+// float entry point for the cross-sectional rank kernel.  extern "C" keeps
+// the symbol name unmangled; the work is done by cs_rank_body<float>.
+extern "C" __global__
+void kun_cs_rank_f32(int time_length, int num_stocks,
+                     const float* __restrict__ in,
+                     float* __restrict__ out) {
+    cs_rank_body<float>(in, out, time_length, num_stocks);
+}
+
+// double entry point for the cross-sectional rank kernel.  extern "C" keeps
+// the symbol name unmangled; the work is done by cs_rank_body<double>.
+extern "C" __global__
+void kun_cs_rank_f64(int time_length, int num_stocks,
+                     const double* __restrict__ in,
+                     double* __restrict__ out) {
+    cs_rank_body<double>(in, out, time_length, num_stocks);
+}
diff --git a/mlir/lib/KunCuda/kernels/cs_scale.cu b/mlir/lib/KunCuda/kernels/cs_scale.cu
new file mode 100644
index 0000000..647bc7b
--- /dev/null
+++ b/mlir/lib/KunCuda/kernels/cs_scale.cu
@@ -0,0 +1,110 @@
+// cs_scale.cu — cross-sectional scale kernel, pre-compiled to PTX and
+// embedded into libKunCudaRuntime as a separate CUmodule.
+//
+// Signature matches cs_rank and the executor's external-kernel launch
+// convention:
+//   (i32 time_length, i32 num_stocks, in_ptr, out_ptr)
+//
+// For each timestep:
+//   sum = Σ abs(x_i), ignoring NaNs
+//   out_i = x_i / sum
+// except all-zero valid rows follow the CPU ScaleStocks behavior and
+// produce NaN for zero inputs.
+
+#include <cuda_runtime.h>
+#include <math_constants.h>
+
+extern __shared__ unsigned char kun_cs_scale_smem[];
+
+namespace {
+
+// kun_nan<T>(): the CUDART NaN constant for element type T.  Only the float
+// and double specialisations below are defined.
+template <typename T>
+__device__ static inline T kun_nan();
+
+template <>
+__device__ inline float kun_nan<float>() { return CUDART_NAN_F; }
+
+template <>
+__device__ inline double kun_nan<double>() { return CUDART_NAN; }
+
+// kun_abs<T>(v): absolute value, dispatching to fabsf for float and fabs for
+// double.  Only these two specialisations are defined.
+template <typename T>
+__device__ static inline T kun_abs(T v);
+
+template <>
+__device__ inline float kun_abs<float>(float v) { return fabsf(v); }
+
+template <>
+__device__ inline double kun_abs<double>(double v) { return fabs(v); }
+
+// Sums `v` across the 32 lanes of a warp using shuffle-down steps of 16, 8,
+// 4, 2 and 1.  Lane 0 ends up with the total; the values left in the other
+// lanes are partial sums.  The full-warp mask means all 32 lanes must call
+// this together.
+template <typename T>
+__device__ static inline T warp_sum(T v) {
+    T acc = v;
+#pragma unroll
+    for (int delta = 16; delta >= 1; delta /= 2) {
+        const T neighbour = __shfl_down_sync(0xffffffffu, acc, delta);
+        acc += neighbour;
+    }
+    return acc;
+}
+
+// Shared body of the cross-sectional scale kernels.
+//
+// Each CTA handles a contiguous slice of timesteps.  Per timestep the row is
+// staged in shared memory, the first warp sums the absolute values of the
+// non-NaN entries, and every thread then writes x_i / sum.  An entry that is
+// exactly zero in a row whose sum is zero is written as NaN.
+//
+// `in` and `out` are indexed as [t * num_stocks + i].  The dynamic shared
+// memory supplied at launch must hold num_stocks + 1 elements of T: the row
+// plus one slot for the row sum.
+// NOTE(review): the reduction calls warp_sum with a full-warp mask from
+// threads 0..31, which assumes blockDim.x >= 32 — confirm at the launch site.
+template <typename T>
+__device__ static void cs_scale_body(const T* __restrict__ in,
+                                     T* __restrict__ out,
+                                     int time_length,
+                                     int num_stocks) {
+    // Ceil-divide the time axis across the grid; trailing CTAs may get a
+    // short or empty slice.
+    int time_per_cta = (time_length + gridDim.x - 1) / gridDim.x;
+    int t0 = blockIdx.x * time_per_cta;
+    int t1 = t0 + time_per_cta;
+    if (t1 > time_length) t1 = time_length;
+    // Uniform per CTA, so no thread is left behind at a barrier.
+    if (t0 >= t1) return;
+
+    T* smem = reinterpret_cast<T*>(kun_cs_scale_smem);
+    // Single-element slot placed right after the staged row.
+    T* row_sum = smem + num_stocks;
+
+    for (int t = t0; t < t1; ++t) {
+        const T* row_in  = in  + static_cast<size_t>(t) * num_stocks;
+        T*       row_out = out + static_cast<size_t>(t) * num_stocks;
+
+        // 1) Stage the row in shared memory.
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            smem[i] = row_in[i];
+        }
+        __syncthreads();
+
+        // 2) First warp only: each lane accumulates a 32-strided partial sum
+        //    of |x| over non-NaN entries, then the lanes are combined.
+        if (threadIdx.x < 32) {
+            int lane = threadIdx.x;
+            T lane_sum = static_cast<T>(0);
+            for (int i = lane; i < num_stocks; i += 32) {
+                T v = smem[i];
+                if (!isnan(v))
+                    lane_sum += kun_abs(v);
+            }
+            T sum = warp_sum(lane_sum);
+            if (lane == 0)
+                *row_sum = sum;
+        }
+        // Publish row_sum to every thread of the CTA.
+        __syncthreads();
+
+        // 3) Scale.  NaN inputs stay NaN through the division.
+        T sum = *row_sum;
+        for (int i = threadIdx.x; i < num_stocks; i += blockDim.x) {
+            T v = smem[i];
+            row_out[i] = (v == static_cast<T>(0) && sum == static_cast<T>(0))
+                             ? kun_nan<T>()
+                             : v / sum;
+        }
+        // Keep smem / row_sum intact until every thread has finished reading.
+        __syncthreads();
+    }
+}
+
+} // anonymous namespace
+
+// float entry point for the cross-sectional scale kernel.  extern "C" keeps
+// the symbol name unmangled; the work is done by cs_scale_body<float>.
+extern "C" __global__
+void kun_cs_scale_f32(int time_length, int num_stocks,
+                      const float* __restrict__ in,
+                      float* __restrict__ out) {
+    cs_scale_body<float>(in, out, time_length, num_stocks);
+}
+
+// double entry point for the cross-sectional scale kernel.  extern "C" keeps
+// the symbol name unmangled; the work is done by cs_scale_body<double>.
+extern "C" __global__
+void kun_cs_scale_f64(int time_length, int num_stocks,
+                      const double* __restrict__ in,
+                      double* __restrict__ out) {
+    cs_scale_body<double>(in, out, time_length, num_stocks);
+}
diff --git a/mlir/lib/KunGpu/CMakeLists.txt b/mlir/lib/KunGpu/CMakeLists.txt
new file mode 100644
index 0000000..2950b8b
--- /dev/null
+++ b/mlir/lib/KunGpu/CMakeLists.txt
@@ -0,0 +1,54 @@
+# KunGpu dialect library: dialect and op definitions, the windowed-temp
+# memory-planning pass, the lowering to LLVM, pass pipelines and the PTX
+# backend.
+add_mlir_dialect_library(MLIRKunGpuDialect
+  KunGpuDialect.cpp
+  KunGpuOps.cpp
+  KunGpuMemoryPlanning.cpp
+  KunGpuToLLVM.cpp
+  Pipelines.cpp
+  PtxBackend.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/mlir/include
+
+  # Generated (TableGen) headers that must exist before these sources build.
+  DEPENDS
+  MLIRKunGpuOpsIncGen
+  MLIRKunGpuPassIncGen
+  MLIRKunIrOpsIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRKunIrDialect
+)
+
+# Upstream MLIR components used by the dialect, its lowering and the PTX
+# backend.
+mlir_target_link_libraries(MLIRKunGpuDialect PUBLIC
+  MLIRIR
+  MLIRPass
+  MLIRFuncDialect
+  MLIRFuncTransforms
+  MLIRArithDialect
+  MLIRGPUDialect
+  MLIRLLVMDialect
+  MLIRTransforms
+  MLIRTransformUtils
+  MLIRSideEffectInterfaces
+  MLIRSCFToControlFlow
+  MLIRControlFlowToLLVM
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRIndexToLLVM
+  MLIRGPUToNVVMTransforms
+  MLIRNVVMDialect
+
+  # Main path: kunir → llvm dialect → upstream gpu-module-to-binary.
+  # NVVMTarget supplies the serializeToObject impl that handles libdevice
+  # linking + LLVM optimization + PTX emission + ptxas invocation.
+  MLIRGPUTransforms
+  MLIRNVVMTarget
+  MLIRTargetLLVM
+
+  # LLVM IR translation registrations consumed transitively by
+  # NVVMTargetAttrImpl::serializeToObject.
+  MLIRTargetLLVMIRExport
+  MLIRBuiltinToLLVMIRTranslation
+  MLIRLLVMToLLVMIRTranslation
+  MLIRNVVMToLLVMIRTranslation
+  MLIRGPUToLLVMIRTranslation
+)
diff --git a/mlir/lib/KunGpu/KunGpuDialect.cpp b/mlir/lib/KunGpu/KunGpuDialect.cpp
new file mode 100644
index 0000000..9b2f94d
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuDialect.cpp
@@ -0,0 +1,19 @@
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/KunGpuOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+
+using namespace mlir;
+using namespace kungpu;
+
+//===----------------------------------------------------------------------===//
+// KunGpu dialect
+//===----------------------------------------------------------------------===//
+
+#include "KunGpu/KunGpuOpsDialect.cpp.inc"
+
+// Registers the dialect's operations.  With GET_OP_LIST defined, the
+// generated KunGpuOps.cpp.inc expands to the comma-separated list of op
+// classes, which becomes the template argument list of addOperations<>.
+void KunGpuDialect::initialize() {
+  addOperations<
+#define GET_OP_LIST
+#include "KunGpu/KunGpuOps.cpp.inc"
+  >();
+}
diff --git a/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
new file mode 100644
index 0000000..74fb79f
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuMemoryPlanning.cpp
@@ -0,0 +1,146 @@
+//===- KunGpuMemoryPlanning.cpp - Windowed-temp shared/local memory plan --===//
+//
+// Assigns each kungpu.windowed_temp op a "use shared memory" flag stored as
+// the discardable attribute "kungpu.smem" (BoolAttr).  The pass itself does
+// not mutate IR structure; the subsequent to-LLVM lowering consults the attr
+// to pick an address space.
+//
+// Strategy: sort windowed_temp ops by ascending window size (smaller buffers
+// fit more easily into shared memory) and greedily assign shared memory until
+// the per-block budget is exhausted.
+//
+// Budget (from the enclosing kunir.func target_spec):
+//   budget_per_block = target_spec.smem_size / target_spec.occupancy
+//   num_threads      = target_spec.warps_per_cta * 32
+//   bytes_per_buf    = N * num_threads * target_spec.vector_size * elem_bytes
+//
+//===----------------------------------------------------------------------===//
+
+// MLIR and local headers must come before GEN_PASS_DEF_* so that ::kunir
+// is fully declared when Passes.h.inc is expanded.
+#include "KunGpu/KunGpuOps.h"
+#include "KunGpu/Passes.h"
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+// Pull in the generated PassBase scaffolding after all declarations are in scope.
+#define GEN_PASS_DEF_WINDOWEDTEMPMEMORYPLANNING
+#include "KunGpu/Passes.h.inc"
+
+#define DEBUG_TYPE "kungpu-memory-planning"
+
+using namespace mlir;
+using namespace kunir;
+using namespace kungpu;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Helper: byte width of a floating-point element type
+//===----------------------------------------------------------------------===//
+
+// Size in bytes of one element of type `t`: the float bit width rounded up
+// to whole bytes, or 4 for anything that is not a FloatType.
+static unsigned elemBytes(Type t) {
+  auto floatTy = dyn_cast<FloatType>(t);
+  if (!floatTy)
+    return 4; // conservative fallback
+  return (floatTy.getWidth() + 7) / 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass
+//===----------------------------------------------------------------------===//
+
+// Pass over kunir.func that marks each kungpu.windowed_temp op as living in
+// shared memory or not (via setSmem).  Ops are visited from the smallest
+// lookback to the largest and placed in shared memory for as long as the
+// per-block budget derived from target_spec is not exceeded.
+struct WindowedTempMemoryPlanningPass
+    : ::impl::WindowedTempMemoryPlanningBase<WindowedTempMemoryPlanningPass> {
+
+  void runOnOperation() override {
+    kunir::FuncOp fn = getOperation();
+
+    // --- Hardware parameters -------------------------------------------
+    // The per-block budget is the SM's shared memory divided by the
+    // requested occupancy; a non-positive occupancy disables shared memory.
+    auto spec = fn.getTargetSpec();
+    int64_t occupancy = spec.getOccupancy();
+    int64_t budgetPerBlock = 0;
+    if (occupancy > 0)
+      budgetPerBlock = spec.getSmemSize() / occupancy;
+    int64_t numThreads = spec.getWarpsPerCta() * 32;
+    int64_t vectorSize = spec.getVectorSize();
+
+    // --- Candidates ----------------------------------------------------
+    SmallVector<WindowedTempOp> windowedTemps;
+    fn.walk([&windowedTemps](WindowedTempOp temp) {
+      windowedTemps.push_back(temp);
+    });
+    if (windowedTemps.empty())
+      return;
+
+    // Smaller windows first; the stable sort keeps walk order among equals.
+    auto lookbackOf = [](WindowedTempOp temp) {
+      return llvm::cast<TsType>(temp.getType()).getMaxLookback();
+    };
+    std::stable_sort(windowedTemps.begin(), windowedTemps.end(),
+                     [&lookbackOf](WindowedTempOp lhs, WindowedTempOp rhs) {
+                       return lookbackOf(lhs) < lookbackOf(rhs);
+                     });
+
+    // --- Greedy placement ----------------------------------------------
+    int64_t smemInUse = 0;
+    for (WindowedTempOp temp : windowedTemps) {
+      auto tsTy = llvm::cast<TsType>(temp.getType());
+      uint64_t lookback = tsTy.getMaxLookback();
+
+      // An unbounded lookback has no static size, so it never goes to
+      // shared memory.
+      if (lookback == std::numeric_limits<uint64_t>::max()) {
+        temp.setSmem(false);
+        continue;
+      }
+
+      int64_t bytes = static_cast<int64_t>(lookback) * numThreads *
+                      vectorSize * elemBytes(tsTy.getElementType());
+
+      bool fits = false;
+      if (budgetPerBlock > 0 && smemInUse + bytes <= budgetPerBlock) {
+        fits = true;
+        smemInUse += bytes;
+      }
+      temp.setSmem(fits);
+
+      LLVM_DEBUG(llvm::dbgs()
+                 << "[kungpu-memory-planning] windowed_temp N=" << lookback
+                 << " bytes=" << bytes << " -> "
+                 << (fits ? "smem" : "local") << "\n");
+    }
+
+    LLVM_DEBUG(llvm::dbgs() << "[kungpu-memory-planning] total smem used="
+                            << smemInUse << " / budget=" << budgetPerBlock
+                            << "\n");
+  }
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Public factory function
+//===----------------------------------------------------------------------===//
+
+namespace kungpu {
+
+// Returns a new instance of the windowed-temp memory-planning pass.
+std::unique_ptr<mlir::Pass> createWindowedTempMemoryPlanningPass() {
+  return std::make_unique<WindowedTempMemoryPlanningPass>();
+}
+
+} // namespace kungpu
diff --git a/mlir/lib/KunGpu/KunGpuOps.cpp b/mlir/lib/KunGpu/KunGpuOps.cpp
new file mode 100644
index 0000000..ba08734
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuOps.cpp
@@ -0,0 +1,57 @@
+#include "KunGpu/KunGpuOps.h"
+#include "KunIr/KunIrTypes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpImplementation.h"
+
+using namespace mlir;
+using namespace kungpu;
+
+// Emits op class method implementations (verifyInvariantsImpl, print, parse, etc.)
+#define GET_OP_CLASSES
+#include "KunGpu/KunGpuOps.cpp.inc"
+
+// Returns true when `v` is an acceptable `ts` operand for ts.get / ts.put:
+// either a block argument, or the result of a windowed_temp or accumulator
+// op.
+// NOTE(review): any block argument is accepted, not only arguments of a
+// function's entry block — confirm whether that is intended.
+static bool isValidTsSource(Value v) {
+  // A value with no defining op can only qualify as a block argument.
+  Operation *producer = v.getDefiningOp();
+  if (!producer)
+    return isa<BlockArgument>(v);
+  return isa<WindowedTempOp, AccumulatorOp>(producer);
+}
+
+//===----------------------------------------------------------------------===//
+// TsGetOp
+//===----------------------------------------------------------------------===//
+
+// Verifies ts.get: the result type must equal the element type of the ts
+// operand, and the ts operand must come from an accepted source (see
+// isValidTsSource).  The type check is reported first.
+LogicalResult TsGetOp::verify() {
+  auto elemTy = llvm::cast<kunir::TsType>(getTs().getType()).getElementType();
+  auto resultTy = getResult().getType();
+
+  if (elemTy != resultTy) {
+    return emitOpError("result type '")
+           << resultTy << "' must match ts element type '" << elemTy << "'";
+  }
+
+  if (isValidTsSource(getTs()))
+    return success();
+
+  return emitOpError("ts operand must be a function argument or "
+                     "the result of 'kungpu.windowed_temp' / "
+                     "'kungpu.accumulator'");
+}
+
+//===----------------------------------------------------------------------===//
+// TsPutOp
+//===----------------------------------------------------------------------===//
+
+// Verifies ts.put: the stored value's type must equal the element type of
+// the ts operand, and the ts operand must come from an accepted source (see
+// isValidTsSource).  The type check is reported first.
+LogicalResult TsPutOp::verify() {
+  auto elemTy = llvm::cast<kunir::TsType>(getTs().getType()).getElementType();
+  auto valueTy = getValue().getType();
+
+  if (elemTy != valueTy) {
+    return emitOpError("value type '")
+           << valueTy << "' must match ts element type '" << elemTy << "'";
+  }
+
+  if (isValidTsSource(getTs()))
+    return success();
+
+  return emitOpError("ts operand must be a function argument or "
+                     "the result of 'kungpu.windowed_temp' / "
+                     "'kungpu.accumulator'");
+}
diff --git a/mlir/lib/KunGpu/KunGpuToLLVM.cpp b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
new file mode 100644
index 0000000..ff0b4d3
--- /dev/null
+++ b/mlir/lib/KunGpu/KunGpuToLLVM.cpp
@@ -0,0 +1,1027 @@
+//===- KunGpuToLLVM.cpp - Lower kungpu + kunir.func → gpu.func + LLVM ---===//
+//
+// Assumes the input module is a `gpu.module` (or that the kunir.func lives
+// inside one).  Two-phase pass.
+//
+// Phase 1 (convertFuncSignature, simple imperative helper):
+//   kunir.func @f(%a: !kunir.ts<…>, …)
+//     → gpu.func @f(<5 × i32>, %a: !kunir.ts<…>, …) kernel
+//   inserted into the same gpu.module that contained the kunir.func.
+//   The five prepended i32 arguments are time_length, num_stocks, mask,
+//   chunk_size and warmup (i32 because 64-bit ops are slow on GPUs; the
+//   linear gmem address is still computed in i64).  ts arg types are
+//   preserved here — phase 2 converts them to !llvm.ptr via the standard
+//   signature-conversion pattern.  target_spec, input_names, output_names
+//   and unreliable_count are moved to discardable attributes (see
+//   KunGpuUtils.h accessors).  kunir.return → gpu.return.
+//
+// Phase 2 (applyPartialConversion, one OpConversionPattern per op):
+//   TypeConverter:  !kunir.ts<T,N> → !llvm.ptr
+//
+// Op semantics (post-redesign):
+//   ts.put %ts, %v          : append %v at the tail of %ts.
+//   ts.get %ts[%offset_i32] : read %ts at tail-relative offset (0 = latest).
+//
+// Lowering of windowed_temp head state — single i32 alloca holding the
+// next-writable position (modeled on cpp/Kun/Ops.hpp::OutputWindow):
+//
+//   on put(v):  buf[pos] = v;
+//               pos = (pos + 1 >= N) ? 0 : pos + 1;     // no modulo
+//
+//   on get(off):
+//               adj = off + 1;                          // off=0 → most-recent put
+//               idx = (pos >= adj) ? pos - adj : pos + N - adj;
+//               return buf[idx];
+//
+// Lowering for global ts (function-arg pointer, TxS layout):
+//   the "tail" is the current time step, given by the enclosing scf.for iv.
+//   put :   gmem[iv * num_stocks + sid]            = v
+//   get :   load gmem[(iv - off) * num_stocks + sid]
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunGpu/KunGpuOps.h"
+#include "KunGpu/KunGpuUtils.h"
+#include "KunGpu/Passes.h"
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Transforms/FuncConversions.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GEN_PASS_DEF_CONVERTKUNGPUTOLLVM
+#include "KunGpu/Passes.h.inc"
+
+using namespace mlir;
+using namespace kunir;
+using namespace kungpu;
+
+namespace {
+
+// Per-windowed_temp / per-accumulator side state.
+//   posPtr — i32 alloca holding the next-writable circular position.
+//            NULL means the entry is an accumulator (single slot, no
+//            circular wrap; ts.get / ts.put always touch slot 0).
+//   stride — slot stride counted in elements of T (not bytes):
+//              1 for local (alloca buffer is per-thread)
+//              K for shared (slot-major across the K threads in a block);
+//                K = warps_per_cta * 32, a compile-time constant.
+// Keyed on the original windowed_temp / accumulator result Value so the
+// ts.get / ts.put patterns can find it.
+struct WTDesc {
+  Value posPtr;     // null → accumulator (no position counter)
+  int64_t stride;   // 1 → no multiply at access time
+};
+using WTDescMap = llvm::DenseMap<Value, WTDesc>;
+
+// Per-function cache for the write-start SSA value shared across
+// multiple output-store rewrites.  Each gpu.func builds (at most) one
+// write_start; subsequent ts.put rewrites against an output arg reuse
+// it, so we don't lean on a downstream CSE pass.
+//
+// `writeStart` is index-typed (not i32) because it's compared against
+// the scf.for IV which is index-typed.  The runtime scalar args it uses
+// (mask, chunk_size) come in as i32; getOrCreateWriteStart inserts the
+// i32 → index cast once at function entry.
+//
+//   writeStart   : (block_id y == 0) ? mask : block_id y * chunk_size.
+//                  Output stores below this time-index are suppressed —
+//                  they fall in the warmup-overlap region.
+//
+// Emitted at the very top of the function entry block so it dominates
+// every store site, regardless of how deeply nested.
+struct ChunkContext {
+  Value writeStart; // null until getOrCreateWriteStart first builds it
+};
+using ChunkCtxMap = llvm::DenseMap<Operation *, ChunkContext>; // key: gpu.func
+
+//===----------------------------------------------------------------------===//
+// Helper: stock_id = blockIdx.x * blockDim.x + threadIdx.x  (index-typed)
+// Defined here so phase 1 (`convertFuncSignature` below) can reuse it
+// for the active-thread guard, in addition to the conversion patterns.
+//===----------------------------------------------------------------------===//
+
+static Value emitStockId(OpBuilder &b, Location loc, Type idxTy) {
+  Value lane = gpu::ThreadIdOp::create(b, loc, idxTy, gpu::Dimension::x);
+  Value block = gpu::BlockIdOp::create(b, loc, idxTy, gpu::Dimension::x);
+  Value width = gpu::BlockDimOp::create(b, loc, idxTy, gpu::Dimension::x);
+  Value blockBase = arith::MulIOp::create(b, loc, block, width);
+  return arith::AddIOp::create(b, loc, blockBase, lane); // block * width + lane
+}
+
+//===----------------------------------------------------------------------===//
+// Phase 1: kunir.func → gpu.func (signature + tail-block guard)
+//===----------------------------------------------------------------------===//
+
+// Rewrites `fn` in place as a gpu.func kernel with five leading i32 args and
+// wraps the moved body in `scf.if (stock_id < num_stocks)`.  Erases `fn`.
+static LogicalResult convertFuncSignature(kunir::FuncOp fn) {
+  auto *ctx  = fn.getContext();
+  Location loc = fn.getLoc();
+  auto i32Ty = IntegerType::get(ctx, 32);
+  auto idxTy = IndexType::get(ctx);
+
+  // We only support vector_size = 1 right now.  When vector_size > 1 a
+  // single thread handles `vector_size` consecutive stocks; if those
+  // straddle the num_stocks boundary, the kernel either has to:
+  //   - clamp the lane index to min(base + k, num_stocks - 1) on
+  //     every gmem load (safe re-read), and per-lane predicate the
+  //     gmem stores to skip the out-of-range cells;
+  //   - or refuse non-aligned num_stocks at launch time.
+  // TODO(vector_size>1): implement the clamp scheme above and remove
+  // this check.  See discussion in KunGpuToLLVM history for why
+  // PTX vector loads can't mask individual lanes.
+  auto tsAttr = fn.getTargetSpecAttr();
+  int64_t vectorSize = tsAttr ? tsAttr.getVectorSize() : 1;
+  if (vectorSize != 1)
+    return fn.emitError("convert-kungpu-to-llvm: vector_size = ")
+            << vectorSize << " not yet supported (only vector_size = 1). "
+            << "TODO: implement clamp + per-lane store predicate for the "
+            << "tail block.";
+
+  FunctionType oldFT = fn.getFunctionTypeTyped();
+  // Prepend (time_length, num_stocks, mask, chunk_size, warmup) — all
+  // i32.  time_length / num_stocks shape the linear gmem indexing;
+  // mask / chunk_size / warmup feed the multi-chunk time-axis path
+  // (kungpu.time_lb / time_ub / output-store gating).  64-bit math is
+  // slow on GPUs, so we keep them as i32 and cast to index only at the
+  // few places that need it.
+  SmallVector<Type> newArgTypes = {i32Ty, i32Ty, i32Ty, i32Ty, i32Ty};
+  for (Type t : oldFT.getInputs())
+    newArgTypes.push_back(t);
+
+  // Build gpu.func right before the kunir.func — both live inside the
+  // enclosing gpu.module.
+  OpBuilder b(fn);
+  auto newFunc = gpu::GPUFuncOp::create(
+      b, loc, fn.getSymName(), FunctionType::get(ctx, newArgTypes, {}));
+  // Mark as a kernel (sets the op-level `kernel` attribute) so that
+  // convert-gpu-to-nvvm tags the resulting llvm.func with `nvvm.kernel`.
+  newFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
+                            UnitAttr::get(ctx));
+  setFuncTargetSpec (newFunc, tsAttr);
+  setFuncInputNames (newFunc, fn.getInputNames());
+  setFuncOutputNames(newFunc, fn.getOutputNames());
+  setFuncUnreliableCount(newFunc, fn.getUnreliableCount());
+
+  // gpu.func's auto-created entry block is replaced with the kunir.func
+  // body.  Block-arg types initially still match the kunir.func signature;
+  // phase 2's signature-conversion pattern reconciles them with the new
+  // gpu.func type (ts → !llvm.ptr).
+  newFunc.getBody().takeBody(fn.getBody());
+  Block &entry = newFunc.getBody().front();
+  entry.insertArgument(0u, i32Ty, loc); // time_length
+  entry.insertArgument(1u, i32Ty, loc); // num_stocks
+  entry.insertArgument(2u, i32Ty, loc); // mask
+  entry.insertArgument(3u, i32Ty, loc); // chunk_size
+  entry.insertArgument(4u, i32Ty, loc); // warmup
+
+  SmallVector<kunir::ReturnOp> returns;
+  newFunc.walk([&](kunir::ReturnOp r) { returns.push_back(r); });
+  for (kunir::ReturnOp r : returns) {
+    OpBuilder rb(r);
+    gpu::ReturnOp::create(rb, r.getLoc());
+    r.erase();
+  }
+  fn.erase();
+
+  // ── Tail-block guard ────────────────────────────────────────────────
+  // grid_x is sized as ceil(num_stocks / block_x), so the last block
+  // contains threads with stock_id ≥ num_stocks.  Without a guard those
+  // threads do gmem GEPs at out-of-bounds addresses (UB).  Compute
+  // stock_id at the top of the kernel and wrap the original body in
+  // `scf.if (stock_id < num_stocks)`.  Inactive threads fall through to
+  // gpu.return without touching gmem; their smem column is sized to the
+  // block (not num_stocks), so leaving it uninitialised is safe.
+  //
+  // For vector_size = 1 this is the entire fix; vector_size > 1 is
+  // gated above.
+  // Check emptiness before asking for the terminator: Block::getTerminator()
+  // is not valid on a block without operations.
+  if (entry.empty() || &entry.front() == entry.getTerminator())
+    return success(); // nothing besides the terminator — nothing to guard
+  Operation *gpuRet = entry.getTerminator();
+  Operation *origFirst = &entry.front();
+
+  OpBuilder pb(ctx);
+  pb.setInsertionPointToStart(&entry);
+  Value sidIdx = emitStockId(pb, loc, idxTy);
+  Value sidI32 = arith::IndexCastOp::create(pb, loc, i32Ty, sidIdx);
+  Value numStocks = entry.getArgument(1); // i32
+  Value active = arith::CmpIOp::create(pb, loc, arith::CmpIPredicate::slt,
+                                            sidI32, numStocks);
+  auto ifOp = scf::IfOp::create(pb, loc, /*resultTypes=*/TypeRange{},
+                                     active, /*withElseRegion=*/false);
+
+  // Move all original ops (everything between the prologue we just
+  // inserted and the gpu.return) into the scf.if's then-region, before
+  // its implicit scf.yield.
+  Block &thenBlk = ifOp.getThenRegion().front();
+  Operation *thenYield = thenBlk.getTerminator();
+  thenBlk.getOperations().splice(thenYield->getIterator(),
+                                   entry.getOperations(),
+                                   origFirst->getIterator(),
+                                   gpuRet->getIterator());
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Helpers used inside conversion patterns
+//===----------------------------------------------------------------------===//
+
+// num_stocks is the i32 argument #1 of the enclosing gpu.func.  This helper
+// sign-extends it to i64 for the linear gmem address computation; a fresh
+// extension is emitted at every call site.
+static Value getNumStocksI64(OpBuilder &b, Operation *op, Location loc) {
+  auto kernel = op->getParentOfType<gpu::GPUFuncOp>();
+  Value numStocksI32 = kernel.getBody().front().getArgument(1);
+  return arith::ExtSIOp::create(b, loc, b.getI64Type(), numStocksI32);
+}
+static Value getCurrentTimeIdx(Operation *op) {
+  // Global ts accesses (function-arg or graph intermediate) are indexed by
+  // the per-thread time loop, i.e. the OUTERMOST scf.for around `op`.
+  // Inner scf.for's come from for_each_back_window bodies: their IV is the
+  // window step `w ∈ [0, window)`, not the time `t`, so the nearest
+  // enclosing loop is the wrong one to use.
+  //
+  // Hop from each scf.for to the next enclosing one; the last loop visited
+  // is the outermost.  Yields a null Value when no scf.for encloses `op`.
+  scf::ForOp timeLoop;
+  for (auto loop = op->getParentOfType<scf::ForOp>(); loop;
+       loop = loop->getParentOfType<scf::ForOp>())
+    timeLoop = loop;
+  return timeLoop ? timeLoop.getInductionVar() : Value();
+}
+
+// GEP to base[(timeIdx - offsetIdx) * num_stocks + stock_id]; null offset = 0.
+static Value gmemGEPWithOffset(OpBuilder &b, Location loc, Type elemTy,
+                                LLVM::LLVMPointerType ptrTy, Value basePt,
+                                Value timeIdx, Value offsetIdx,
+                                Value numStocksI64, Type idxTy, Type i64Ty) {
+  Value row = timeIdx;
+  if (offsetIdx) row = arith::SubIOp::create(b, loc, timeIdx, offsetIdx);
+  Value rowI64 = arith::IndexCastOp::create(b, loc, i64Ty, row);
+  Value sidI64 =
+      arith::IndexCastOp::create(b, loc, i64Ty, emitStockId(b, loc, idxTy));
+  Value rowBase = arith::MulIOp::create(b, loc, rowI64, numStocksI64);
+  Value linear = arith::AddIOp::create(b, loc, rowBase, sidI64);
+  return LLVM::GEPOp::create(b, loc, ptrTy, elemTy, basePt, ValueRange{linear});
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns
+//===----------------------------------------------------------------------===//
+
+struct TimeLengthPattern : OpConversionPattern<TimeLengthOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(TimeLengthOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn) // same guard as AccumulatorPattern: no kernel args to read
+      return rewriter.notifyMatchFailure(op, "must be inside a gpu.func");
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(
+        op, rewriter.getIndexType(), fn.getBody().front().getArgument(0));
+    return success(); // replaced by index_cast(arg[0]) — the i32 time_length
+  }
+};
+
+// time_lb = (block_id y == 0) ? 0 : block_id y * chunk_size - warmup
+// All arithmetic happens in i32 (64-bit ops are slow on GPU); one final
+// index_cast yields the index-typed result.  chunk_size / warmup are
+// gpu.func args[3] / args[4]; the op has no operands at the kungpu level.
+struct TimeLbPattern : OpConversionPattern<TimeLbOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(TimeLbOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto i32Ty = rewriter.getI32Type();
+    auto idxTy = rewriter.getIndexType();
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn) // same guard as AccumulatorPattern: no kernel args to read
+      return rewriter.notifyMatchFailure(op, "must be inside a gpu.func");
+    Value chunkSize = fn.getBody().front().getArgument(3);
+    Value warmup    = fn.getBody().front().getArgument(4);
+    Value cyIdx = gpu::BlockIdOp::create(rewriter, loc, idxTy, gpu::Dimension::y);
+    Value cy = arith::IndexCastOp::create(rewriter, loc, i32Ty, cyIdx);
+    Value c0 = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value isFirst = arith::CmpIOp::create(
+        rewriter, loc, arith::CmpIPredicate::eq, cy, c0);
+    Value off = arith::MulIOp::create(rewriter, loc, cy, chunkSize);
+    Value offMinusW = arith::SubIOp::create(rewriter, loc, off, warmup);
+    Value lbI32 = arith::SelectOp::create(rewriter, loc, isFirst, c0, offMinusW);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, idxTy, lbI32);
+    return success();
+  }
+};
+
+// time_ub = umin((block_id y + 1) * chunk_size, time_length), evaluated in
+// i32 from gpu.func args[3] / args[0] with one final cast to index.
+struct TimeUbPattern : OpConversionPattern<TimeUbOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult matchAndRewrite(TimeUbOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto i32Ty = rewriter.getI32Type();
+    auto idxTy = rewriter.getIndexType();
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn) // same guard as AccumulatorPattern: no kernel args to read
+      return rewriter.notifyMatchFailure(op, "must be inside a gpu.func");
+    Value timeLen   = fn.getBody().front().getArgument(0);
+    Value chunkSize = fn.getBody().front().getArgument(3);
+    Value cyIdx = gpu::BlockIdOp::create(rewriter, loc, idxTy, gpu::Dimension::y);
+    Value cy = arith::IndexCastOp::create(rewriter, loc, i32Ty, cyIdx);
+    Value c1 = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value next = arith::AddIOp::create(rewriter, loc, cy, c1);
+    Value end = arith::MulIOp::create(rewriter, loc, next, chunkSize);
+    Value ubI32 = arith::MinUIOp::create(rewriter, loc, end, timeLen);
+    rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, idxTy, ubI32);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Chunk-context lazy helper.  See ChunkContext above.
+//
+// mask / chunk_size / warmup come in as i32 func args (positions 2 / 3 /
+// 4 after time_length / num_stocks).  We build writeStart from the i32
+// mask + chunk_size args, then cast to index once and cache.  Emitted
+// at the very top of the function entry block so the resulting SSA
+// value dominates every store-site inside the kernel.
+//===----------------------------------------------------------------------===//
+
+static Value getOrCreateWriteStart(Operation *op, ChunkCtxMap &map,
+                                     ConversionPatternRewriter &rewriter) {
+  // One cached write-start per gpu.func; reuse it when already built.
+  auto kernel = op->getParentOfType<gpu::GPUFuncOp>();
+  Value &cached = map[kernel.getOperation()].writeStart;
+  if (cached) return cached;
+
+  Block &entryBlock = kernel.getBody().front();
+  Value mask = entryBlock.getArgument(2);
+  Value chunkSize = entryBlock.getArgument(3);
+  Location loc = kernel.getLoc();
+  auto i32 = rewriter.getI32Type();
+  auto index = rewriter.getIndexType();
+
+  // Built at the top of the entry block, in i32, then cast once to index.
+  OpBuilder::InsertionGuard restore(rewriter);
+  rewriter.setInsertionPointToStart(&entryBlock);
+  Value chunkIdx = gpu::BlockIdOp::create(rewriter, loc, index, gpu::Dimension::y);
+  Value chunk = arith::IndexCastOp::create(rewriter, loc, i32, chunkIdx);
+  Value zero = arith::ConstantOp::create(
+      rewriter, loc, i32, rewriter.getI32IntegerAttr(0));
+  Value firstChunk = arith::CmpIOp::create(
+      rewriter, loc, arith::CmpIPredicate::eq, chunk, zero);
+  Value chunkStart = arith::MulIOp::create(rewriter, loc, chunk, chunkSize);
+  Value startI32 =
+      arith::SelectOp::create(rewriter, loc, firstChunk, mask, chunkStart);
+  cached = arith::IndexCastOp::create(rewriter, loc, index, startI32);
+  return cached;
+}
+
+struct StockIdPattern : OpConversionPattern<StockIdOp> {
+  using OpConversionPattern::OpConversionPattern;
+  // The op becomes blockIdx.x * blockDim.x + threadIdx.x, index-typed.
+  LogicalResult matchAndRewrite(StockIdOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    Value stockId = emitStockId(rewriter, op.getLoc(), rewriter.getIndexType());
+    rewriter.replaceOp(op, stockId);
+    return success();
+  }
+};
+
+struct BlockStockCountPattern : OpConversionPattern<BlockStockCountOp> {
+  using OpConversionPattern::OpConversionPattern;
+  // Lowers to the x block dimension, produced as an index value.
+  LogicalResult matchAndRewrite(BlockStockCountOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto indexTy = rewriter.getIndexType();
+    rewriter.replaceOpWithNewOp<gpu::BlockDimOp>(op, indexTy, gpu::Dimension::x);
+    return success();
+  }
+};
+
+// Each windowed_temp lowers to:
+//   %buf = llvm.alloca N x elemTy   (or smem GEP slice)
+//   %pos = llvm.alloca 1 x i32
+//   llvm.store 0 : i32, %pos        (next writable position starts at 0)
+struct WindowedTempPattern : OpConversionPattern<WindowedTempOp> {
+  WTDescMap &descMap;
+  int &smemCounter; // running suffix that keeps smem global names unique
+
+  WindowedTempPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m, int &sc)
+      : OpConversionPattern(tc, ctx), descMap(m), smemCounter(sc) {}
+
+  LogicalResult
+  matchAndRewrite(WindowedTempOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx     = op.getContext();
+    Location loc  = op.getLoc();
+    auto i32Ty    = rewriter.getI32Type();
+    auto idxTy    = rewriter.getIndexType();
+    auto ptrTy    = LLVM::LLVMPointerType::get(ctx);
+
+    auto tsTy   = llvm::cast<TsType>(op.getType());
+    int64_t N   = static_cast<int64_t>(tsTy.getMaxLookback());
+    Type elemTy = tsTy.getElementType();
+
+    // Buffer (alloca or smem slice).  All counters/offsets are i32.
+    //
+    // Local memory:
+    //   bufPtr = alloca [N x T]  (per-thread, contiguous)  — stride = 1
+    //
+    // Shared memory (slot-major, bank-conflict-free):
+    //   global [N * K x T] (addr_space=3), K = warps_per_cta * 32
+    //   (32 when no target_spec).  Slot j of thread t lives at j * K + t.
+    //   bufPtr = smem + tid                                 — stride = K
+    //   ts.put/get use bufPtr[idx * K], landing on
+    //     smem + tid + idx*K = slot_idx*K + tid  (correct).
+    Value bufPtr;
+    int64_t stride;
+
+    if (op.isSmem()) {
+      auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+      auto gpuModule = op->getParentOfType<gpu::GPUModuleOp>();
+      auto tsAttr = getFuncTargetSpec(fn);
+      int64_t blockSize = tsAttr ? (tsAttr.getWarpsPerCta() * 32) : 32;
+      stride = blockSize;
+
+      std::string name =
+          ("__smem_" + fn.getName() + "_" +
+           llvm::Twine(smemCounter++)).str();
+      {
+        OpBuilder::InsertionGuard g(rewriter);
+        Block *modBody = &gpuModule.getBodyRegion().front();
+        rewriter.setInsertionPoint(modBody, modBody->begin());
+        LLVM::GlobalOp::create(
+            rewriter, loc, LLVM::LLVMArrayType::get(elemTy, N * blockSize), false,
+            LLVM::Linkage::Internal, name, Attribute{}, 0, 3);
+      }
+      Value raw = LLVM::AddressOfOp::create(
+          rewriter, loc, LLVM::LLVMPointerType::get(ctx, 3), name);
+      Value gen    = LLVM::AddrSpaceCastOp::create(rewriter, loc, ptrTy, raw);
+      Value tid    = gpu::ThreadIdOp::create(rewriter, loc, idxTy, gpu::Dimension::x);
+      Value tidI32 = arith::IndexCastOp::create(rewriter, loc, i32Ty, tid);
+      // bufPtr = smem + tid  (slot-major: slot j thread t lives at j*K + t)
+      bufPtr = LLVM::GEPOp::create(rewriter, loc, ptrTy, elemTy, gen,
+                                             ValueRange{tidI32});
+    } else {
+      stride = 1;
+      Value nCst = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      bufPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, elemTy, nCst);
+    }
+
+    // Single i32 cell tracking next-writable position; init to 0.
+    Value c1_i32 = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value posPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, i32Ty, c1_i32);
+    Value zeroI32 = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    LLVM::StoreOp::create(rewriter, loc, zeroI32, posPtr);
+
+    // Side state, keyed on the original (pre-replacement) ts Value.
+    descMap[op.getResult()] = {posPtr, stride};
+
+    rewriter.replaceOp(op, bufPtr);
+    return success();
+  }
+};
+
+// kungpu.accumulator → single-slot alloca, initialised to init_val.  Modeled
+// in descMap with a null posPtr so the ts.get / ts.put dispatch can recognise
+// it and emit a plain load/store at slot 0 (no circular wrap, no position
+// counter).  Reads MUST use offset = 0 — TsGetPattern rejects anything else
+// (ts.put takes no offset).
+struct AccumulatorPattern : OpConversionPattern<kungpu::AccumulatorOp> {
+  WTDescMap &descMap;
+  AccumulatorPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m)
+      : OpConversionPattern(tc, ctx), descMap(m) {}
+
+  LogicalResult
+  matchAndRewrite(kungpu::AccumulatorOp op, OpAdaptor /*a*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+
+    auto tsTy   = llvm::cast<TsType>(op.getType());
+    Type elemTy = tsTy.getElementType();
+
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn)
+      return rewriter.notifyMatchFailure(
+          op, "kungpu.accumulator must be inside a gpu.func");
+
+    // Alloca + init_val-init at function entry so the slot is well-defined
+    // before the time loop begins.
+    Value bufPtr;
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      Block &entry = fn.getBody().front();
+      rewriter.setInsertionPointToStart(&entry);
+      Value c1_i32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      bufPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, elemTy, c1_i32);
+      double initVal = op.getInitVal().convertToDouble();
+      Value initCst = LLVM::ConstantOp::create(
+          rewriter, loc, elemTy, rewriter.getFloatAttr(elemTy, initVal));
+      LLVM::StoreOp::create(rewriter, loc, initCst, bufPtr);
+    }
+
+    // posPtr = null → ts.get / ts.put treat as accumulator (slot 0 only).
+    descMap[op.getResult()] = {Value(), 1};
+    rewriter.replaceOp(op, bufPtr);
+    return success();
+  }
+};
+
+// Scale an i32 index by a compile-time stride; stride 1 emits nothing.
+static Value applyStride(OpBuilder &b, Location loc, Value idx, int64_t stride,
+                          Type i32Ty) {
+  if (stride != 1) {
+    Value scale = LLVM::ConstantOp::create(b, loc, i32Ty, b.getI32IntegerAttr(stride));
+    idx = LLVM::MulOp::create(b, loc, idx, scale);
+  }
+  return idx;
+}
+
+// ts.get lowering.  Accumulator → load slot 0; windowed_temp → circular
+// read; otherwise a global-series load at (time - offset), NaN if time < offset.
+struct TsGetPattern : OpConversionPattern<TsGetOp> {
+  WTDescMap &descMap;
+
+  TsGetPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m)
+      : OpConversionPattern(tc, ctx), descMap(m) {}
+
+  // True iff `v` comes from an arith / LLVM constant op holding integer 0.
+  static bool isZeroConstant(Value v) {
+    Attribute attr;
+    if (auto a = v.getDefiningOp<arith::ConstantOp>())
+      attr = a.getValue();
+    else if (auto l = v.getDefiningOp<LLVM::ConstantOp>())
+      attr = l.getValue();
+    auto intAttr = llvm::dyn_cast_if_present<IntegerAttr>(attr);
+    return intAttr && intAttr.getInt() == 0;
+  }
+
+  LogicalResult matchAndRewrite(TsGetOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto i64Ty   = rewriter.getI64Type();
+    auto idxTy   = rewriter.getIndexType();
+    auto ptrTy   = LLVM::LLVMPointerType::get(op.getContext());
+    Type elemTy  = op.getType();
+    Value tsPtr     = adaptor.getTs();      // !llvm.ptr
+    Value offsetI32 = adaptor.getOffset();  // i32
+
+    auto it = descMap.find(op.getTs());
+    if (it != descMap.end()) {
+      const WTDesc &desc = it->second;
+      // ── accumulator: single-slot load.  offset must be 0. ─────────
+      if (!desc.posPtr) {
+        if (!isZeroConstant(offsetI32))
+          return rewriter.notifyMatchFailure(
+              op, "ts.get on accumulator must use offset = 0");
+        rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, tsPtr);
+        return success();
+      }
+      // ── windowed_temp: circular get without modulo ────────────────
+      //   adj = offset + 1                  (offset=0 → most-recent put)
+      //   idx = pos >= adj ? pos - adj : pos + N - adj
+      //   return buf[idx * stride]
+      int64_t N = static_cast<int64_t>(
+          llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
+      Value pos    = LLVM::LoadOp::create(rewriter, loc, i32Ty, desc.posPtr);
+      Value c1     = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value nCst   = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      Value adj    = LLVM::AddOp::create(rewriter, loc, offsetI32, c1);
+      Value cmp    = LLVM::ICmpOp::create(rewriter, loc, LLVM::ICmpPredicate::uge,
+                                                    pos, adj);
+      Value posMinusAdj = LLVM::SubOp::create(rewriter, loc, pos, adj);
+      Value posPlusN    = LLVM::AddOp::create(rewriter, loc, pos, nCst);
+      Value wrapped     = LLVM::SubOp::create(rewriter, loc, posPlusN, adj);
+      Value idx32       = LLVM::SelectOp::create(
+          rewriter, loc, cmp, posMinusAdj, wrapped);
+      // GEP takes any integer index type; stay in i32 (64-bit is slow on GPU).
+      Value gepIdx = applyStride(rewriter, loc, idx32, desc.stride, i32Ty);
+      Value gep = LLVM::GEPOp::create(
+          rewriter, loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
+      rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
+    } else {
+      // ── global ts (function arg, TxS layout) ──────────────────────
+      // Load gmem[(timeIdx - offset) * num_stocks + sid].  A known-zero
+      // offset skips the bounds guard; otherwise wrap in `scf.if (t >=
+      // offset)`, yielding NaN out of range (mirrors CPU InputTS::getWindow).
+      Value timeIdx = getCurrentTimeIdx(op);
+      if (!timeIdx) // no enclosing scf.for: there is no time index to use
+        return rewriter.notifyMatchFailure(
+            op, "ts.get on a global ts must be nested in an scf.for");
+      Value offsetIdx = arith::IndexCastOp::create(
+          rewriter, loc, idxTy, offsetI32);
+      bool offsetIsZero = isZeroConstant(offsetI32);
+      Value gep = gmemGEPWithOffset(rewriter, loc, elemTy, ptrTy, tsPtr,
+                                     timeIdx, offsetIdx,
+                                     getNumStocksI64(rewriter, op, loc),
+                                     idxTy, i64Ty);
+      if (offsetIsZero) {
+        rewriter.replaceOpWithNewOp<LLVM::LoadOp>(op, elemTy, gep);
+        return success();
+      }
+      Value inRange = arith::CmpIOp::create(
+          rewriter, loc, arith::CmpIPredicate::sge, timeIdx, offsetIdx);
+      auto ifOp = scf::IfOp::create(rewriter, loc, TypeRange{elemTy},
+                                       inRange, /*withElseRegion=*/true);
+      {
+        OpBuilder::InsertionGuard g(rewriter);
+        rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+        Value loaded = LLVM::LoadOp::create(rewriter, loc, elemTy, gep);
+        scf::YieldOp::create(rewriter, loc, loaded);
+      }
+      {
+        OpBuilder::InsertionGuard g(rewriter);
+        rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
+        auto nanAttr = rewriter.getFloatAttr(
+            elemTy, std::numeric_limits<double>::quiet_NaN());
+        Value nanV = LLVM::ConstantOp::create(
+            rewriter, loc, llvm::cast<FloatType>(elemTy), nanAttr);
+        scf::YieldOp::create(rewriter, loc, nanV);
+      }
+      rewriter.replaceOp(op, ifOp.getResult(0));
+    }
+    return success();
+  }
+};
+
+// Lowers ts.put.  A ts with a descMap entry is a windowed_temp / accumulator
+// buffer; every other ts is a global series written at the current time index.
+struct TsPutPattern : OpConversionPattern<TsPutOp> {
+  WTDescMap &descMap;     // queried with the original (unconverted) ts operand
+  ChunkCtxMap &chunkCtx;  // forwarded to getOrCreateWriteStart()
+  TsPutPattern(TypeConverter &tc, MLIRContext *ctx, WTDescMap &m,
+                ChunkCtxMap &c)
+      : OpConversionPattern(tc, ctx), descMap(m), chunkCtx(c) {}
+
+  LogicalResult
+  matchAndRewrite(TsPutOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto i64Ty   = rewriter.getI64Type();
+    auto idxTy   = rewriter.getIndexType();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+
+    Value tsPtr = adaptor.getTs();
+    Value v     = adaptor.getValue();
+    Type elemTy = v.getType();
+
+    auto it = descMap.find(op.getTs());
+    if (it != descMap.end()) {
+      const WTDesc &desc = it->second;
+      // ── accumulator: single-slot store, no pos counter to advance. ─
+      if (!desc.posPtr) {
+        LLVM::StoreOp::create(rewriter, loc, v, tsPtr);
+        rewriter.eraseOp(op);
+        return success();
+      }
+      // ── windowed_temp: store at buf[pos*stride], then advance pos ─
+      //   buf[pos * stride] = v
+      //   pos = (pos + 1 >= N) ? 0 : pos + 1
+      int64_t N = static_cast<int64_t>(
+          llvm::cast<TsType>(op.getTs().getType()).getMaxLookback());
+      Value pos = LLVM::LoadOp::create(rewriter, loc, i32Ty, desc.posPtr);
+
+      // Keep GEP index in i32 (cheap on GPU); LLVM accepts any int type.
+      Value gepIdx = applyStride(rewriter, loc, pos, desc.stride, i32Ty);
+      Value gep = LLVM::GEPOp::create(
+          rewriter, loc, ptrTy, elemTy, tsPtr, ValueRange{gepIdx});
+      LLVM::StoreOp::create(rewriter, loc, v, gep);
+
+      Value c1     = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value nCst   = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(N));
+      Value zero32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+      Value posP1  = LLVM::AddOp::create(rewriter, loc, pos, c1);
+      Value cmp    = LLVM::ICmpOp::create(rewriter, loc, LLVM::ICmpPredicate::uge,
+                                                    posP1, nCst);
+      Value newPos = LLVM::SelectOp::create(rewriter, loc, cmp, zero32, posP1);
+      LLVM::StoreOp::create(rewriter, loc, newPos, desc.posPtr);
+      rewriter.eraseOp(op);
+    } else {
+      // ── global ts: write at current time, gated by per-chunk write_start.
+      //    Output time dim == time_length (== input time dim); the warmup
+      //    region [0, mask) is just left unwritten by the kernel.
+      //
+      //   if (t >= write_start)
+      //     out[t, sid] = v
+      //
+      // The `t >= write_start` comparison is uniform across the CTA (all
+      // threads share the same scf.for IV), so the lowered branch is a
+      // single uniform predicate — no warp divergence at chunk boundaries.
+      Value timeIdx    = getCurrentTimeIdx(op);
+      Value writeStart = getOrCreateWriteStart(op, chunkCtx, rewriter);
+      Value doWrite = arith::CmpIOp::create(
+          rewriter, loc, arith::CmpIPredicate::sge, timeIdx, writeStart);
+      auto ifOp = scf::IfOp::create(rewriter, loc, /*resultTypes=*/TypeRange{},
+                                    doWrite, /*withElseRegion=*/false);
+      // Emit the store through `rewriter` (not a detached OpBuilder) so the
+      // conversion driver tracks the new ops and can legalize / roll them back.
+      OpBuilder::InsertionGuard g(rewriter);
+      rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      Value gep = gmemGEPWithOffset(
+          rewriter, loc, elemTy, ptrTy, tsPtr, timeIdx, /*offsetIdx=*/Value(),
+          getNumStocksI64(rewriter, op, loc), idxTy, i64Ty);
+      LLVM::StoreOp::create(rewriter, loc, v, gep);
+      rewriter.eraseOp(op);
+    }
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// FastWindowedSum — running sum with Kahan compensation + NaN tracking.
+//
+// Per-thread state (4 cells, alloca'd at function entry, promoted to
+// registers by mem2reg):
+//   v             — running sum                                    (T)
+//   compAdd       — Kahan compensation for the +cur step           (T)
+//   compSub       — Kahan compensation for the -old step           (T)
+//   numNans       — count of NaNs currently inside the trailing-N window (i32)
+//
+// Algorithm — direct port of cpp/Kun/Ops.hpp::FastWindowedSum::step:
+//
+//   cur = input[t]                                                 ts.get  off=0
+//   old = (t - loop_lb >= window) ? input[t - window] : NaN        ts.get  off=window  (guarded)
+//   old_is_nan = isnan(old)
+//   new_is_nan = isnan(cur)
+//   v = old_is_nan ? v : kahanAdd(v, -old, &compSub)               // subtract old
+//   v = new_is_nan ? v : kahanAdd(v, +cur, &compAdd)               // add cur
+//   numNans += (new_is_nan ? 1 : 0) - (old_is_nan ? 1 : 0)
+//   out = (numNans == 0) ? v : NaN
+//
+// Guard uses `t - loop_lb`, not bare `t`: the state is initialised once at
+// function entry (v / comp = 0, numNans = window), so each chunk needs its
+// own N-step warmup with old=NaN to build v up.  Chunk 0 has loop_lb = 0,
+// so the guard collapses to CPU's `t >= window`.
+//===----------------------------------------------------------------------===//
+
+struct FastWindowedSumPattern : OpConversionPattern<FastWindowedSumOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(FastWindowedSumOp op, OpAdaptor /*adaptor*/,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto *ctx    = op.getContext();
+    Location loc = op.getLoc();
+    auto i32Ty   = rewriter.getI32Type();
+    auto ptrTy   = LLVM::LLVMPointerType::get(ctx);
+
+    auto resultTy = op.getResult().getType();
+    auto floatTy  = llvm::dyn_cast<FloatType>(resultTy);
+    if (!floatTy)
+      return rewriter.notifyMatchFailure(
+          op, "fast_windowed_sum result must be a scalar float "
+              "(post kunir-to-kungpu lowering)");
+
+    int64_t window = op.getWindow();
+    Value origInput = op.getInput();
+
+    // ── 0. Match checks first, so a failed match never leaves new ops behind.
+    auto fn = op->getParentOfType<gpu::GPUFuncOp>();
+    if (!fn)
+      return rewriter.notifyMatchFailure(
+          op, "fast_windowed_sum must be inside a gpu.func");
+    auto forOp = op->getParentOfType<scf::ForOp>();
+    if (!forOp)
+      return rewriter.notifyMatchFailure(
+          op, "fast_windowed_sum must be inside a scf.for time loop");
+
+    // ── 1. Allocate state at function entry + initialise. ──────────
+    Value vPtr, addPtr, subPtr, nansPtr;
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      Block &entry = fn.getBody().front();
+      rewriter.setInsertionPointToStart(&entry);
+      Value c1_i32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+      Value zeroF = LLVM::ConstantOp::create(
+          rewriter, loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
+      Value windowI32 = LLVM::ConstantOp::create(
+          rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(window));
+
+      vPtr    = LLVM::AllocaOp::create(rewriter, loc, ptrTy, floatTy, c1_i32);
+      addPtr  = LLVM::AllocaOp::create(rewriter, loc, ptrTy, floatTy, c1_i32);
+      subPtr  = LLVM::AllocaOp::create(rewriter, loc, ptrTy, floatTy, c1_i32);
+      nansPtr = LLVM::AllocaOp::create(rewriter, loc, ptrTy, i32Ty,   c1_i32);
+
+      LLVM::StoreOp::create(rewriter, loc, zeroF,     vPtr);
+      LLVM::StoreOp::create(rewriter, loc, zeroF,     addPtr);
+      LLVM::StoreOp::create(rewriter, loc, zeroF,     subPtr);
+      LLVM::StoreOp::create(rewriter, loc, windowI32, nansPtr);
+    }
+
+    // ── 2. Read cur (off=0) and old (off=window, guarded). ─────────
+    Value zeroOff   = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value windowOff = arith::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(window));
+    Value cur = TsGetOp::create(rewriter, loc, floatTy, origInput, zeroOff);
+
+    Value timeIdx   = forOp.getInductionVar();
+    Value loopLb    = forOp.getLowerBound();
+    Value localT    = arith::SubIOp::create(rewriter, loc, timeIdx, loopLb);
+    Value windowIdx = arith::ConstantIndexOp::create(rewriter, loc, window);
+    Value tGeWindow = arith::CmpIOp::create(
+        rewriter, loc, arith::CmpIPredicate::sge, localT, windowIdx);
+
+    auto ifOp = scf::IfOp::create(
+        rewriter, loc, TypeRange{floatTy}, tGeWindow, /*withElseRegion=*/true);
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      Value loaded =
+          TsGetOp::create(rewriter, loc, floatTy, origInput, windowOff);
+      scf::YieldOp::create(rewriter, loc, loaded);
+    }
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      rewriter.setInsertionPointToStart(&ifOp.getElseRegion().front());
+      Value nanV = LLVM::ConstantOp::create(
+          rewriter, loc, floatTy,
+          rewriter.getFloatAttr(
+              floatTy, std::numeric_limits<double>::quiet_NaN()));
+      scf::YieldOp::create(rewriter, loc, nanV);
+    }
+    Value old = ifOp.getResult(0);
+
+    // ── 3. Algorithm step.  All arith is via LLVM ops at this phase. ──
+    auto fcmp_isnan = [&](Value x) {
+      // isnan(x) ⇔ x != x  (UNE catches NaN, == NaN is false)
+      return LLVM::FCmpOp::create(rewriter, loc, LLVM::FCmpPredicate::une, x, x);
+    };
+    Value oldIsNan = fcmp_isnan(old);
+    Value newIsNan = fcmp_isnan(cur);
+    // Loaded state.
+    Value v       = LLVM::LoadOp::create(rewriter, loc, floatTy, vPtr);
+    Value compAdd = LLVM::LoadOp::create(rewriter, loc, floatTy, addPtr);
+    Value compSub = LLVM::LoadOp::create(rewriter, loc, floatTy, subPtr);
+    Value numNans = LLVM::LoadOp::create(rewriter, loc, i32Ty,   nansPtr);
+
+    Value zeroF = LLVM::ConstantOp::create(
+        rewriter, loc, floatTy, rewriter.getFloatAttr(floatTy, 0.0));
+
+    // kahanAdd(isnan_small, sum, small, &comp):
+    //   y = small - comp;  t = sum + y;
+    //   newComp = (t - sum) - y;
+    //   comp = isnan_small ? comp : newComp;
+    //   return t
+    auto kahanAdd = [&](Value isnan_small, Value sum, Value small, Value &comp) {
+      Value y     = LLVM::FSubOp::create(rewriter, loc, small, comp);
+      Value t     = LLVM::FAddOp::create(rewriter, loc, sum, y);
+      Value tMs   = LLVM::FSubOp::create(rewriter, loc, t, sum);
+      Value newC  = LLVM::FSubOp::create(rewriter, loc, tMs, y);
+      comp = LLVM::SelectOp::create(rewriter, loc, isnan_small, comp, newC);
+      return t;
+    };
+
+    // v -= old  (skip when old is NaN)
+    Value negOld = LLVM::FSubOp::create(rewriter, loc, zeroF, old);
+    Value tSub   = kahanAdd(oldIsNan, v, negOld, compSub);
+    v = LLVM::SelectOp::create(rewriter, loc, oldIsNan, v, tSub);
+
+    // v += cur  (skip when cur is NaN)
+    Value tAdd   = kahanAdd(newIsNan, v, cur, compAdd);
+    v = LLVM::SelectOp::create(rewriter, loc, newIsNan, v, tAdd);
+
+    // numNans += (new_is_nan ? 1 : 0) - (old_is_nan ? 1 : 0)
+    Value oneI32  = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(1));
+    Value zeroI32 = LLVM::ConstantOp::create(
+        rewriter, loc, i32Ty, rewriter.getI32IntegerAttr(0));
+    Value oldDelta = LLVM::SelectOp::create(
+        rewriter, loc, oldIsNan, oneI32, zeroI32);
+    Value newDelta = LLVM::SelectOp::create(
+        rewriter, loc, newIsNan, oneI32, zeroI32);
+    numNans = LLVM::SubOp::create(rewriter, loc, numNans, oldDelta);
+    numNans = LLVM::AddOp::create(rewriter, loc, numNans, newDelta);
+
+    // result = (numNans == 0) ? v : NaN
+    Value isFull = LLVM::ICmpOp::create(
+        rewriter, loc, LLVM::ICmpPredicate::eq, numNans, zeroI32);
+    Value nanV = LLVM::ConstantOp::create(
+        rewriter, loc, floatTy,
+        rewriter.getFloatAttr(floatTy,
+                                std::numeric_limits<double>::quiet_NaN()));
+    Value out = LLVM::SelectOp::create(rewriter, loc, isFull, v, nanV);
+
+    // ── 4. Store back state. ────────────────────────────────────────
+    LLVM::StoreOp::create(rewriter, loc, v,       vPtr);
+    LLVM::StoreOp::create(rewriter, loc, compAdd, addPtr);
+    LLVM::StoreOp::create(rewriter, loc, compSub, subPtr);
+    LLVM::StoreOp::create(rewriter, loc, numNans, nansPtr);
+
+    rewriter.replaceOp(op, out);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Pass
+//===----------------------------------------------------------------------===//
+
+struct ConvertKunGpuToLLVMPass
+    : ::impl::ConvertKunGpuToLLVMBase<ConvertKunGpuToLLVMPass> {
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    ModuleOp root        = getOperation();
+
+    // ── Phase 1: convert every kunir.func signature (snapshot, then rewrite) ──
+    SmallVector<kunir::FuncOp> pending;
+    root.walk([&](kunir::FuncOp f) { pending.push_back(f); });
+    for (kunir::FuncOp f : pending) {
+      if (failed(convertFuncSignature(f)))
+        return signalPassFailure();
+    }
+
+    // ── Phase 2: partial conversion of the kungpu ops ─────────────────────
+    // Types pass through unchanged, except that a ts becomes an !llvm.ptr.
+    TypeConverter converter;
+    converter.addConversion([](Type ty) { return ty; });
+    converter.addConversion([](TsType ts) -> Type {
+      return LLVM::LLVMPointerType::get(ts.getContext());
+    });
+    auto castThrough = [](OpBuilder &builder, Type resultTy, ValueRange inputs,
+                          Location where) -> Value {
+      if (inputs.size() != 1)
+        return Value();
+      auto cast =
+          UnrealizedConversionCastOp::create(builder, where, resultTy, inputs);
+      return cast.getResult(0);
+    };
+    converter.addSourceMaterialization(castThrough);
+    converter.addTargetMaterialization(castThrough);
+
+    ConversionTarget legality(*context);
+    legality.addLegalOp<ModuleOp, UnrealizedConversionCastOp>();
+    legality.addLegalDialect<arith::ArithDialect, scf::SCFDialect,
+                             LLVM::LLVMDialect, gpu::GPUDialect>();
+    legality.addIllegalOp<WindowedTempOp, kungpu::AccumulatorOp, TsGetOp,
+                          TsPutOp, TimeLengthOp, TimeLbOp, TimeUbOp, StockIdOp,
+                          BlockStockCountOp, kunir::FastWindowedSumOp>();
+    // A gpu.func becomes legal only once the FunctionOpInterface pattern
+    // registered below has converted its signature from (...kunir.ts) to
+    // (...!llvm.ptr).  gpu.return is void in our IR, hence always legal.
+    legality.addDynamicallyLegalOp<gpu::GPUFuncOp>([&](gpu::GPUFuncOp func) {
+      const bool signatureOk = converter.isSignatureLegal(func.getFunctionType());
+      return signatureOk && converter.isLegal(&func.getBody());
+    });
+
+    int smemCounter = 0;
+    WTDescMap descMap;
+    ChunkCtxMap chunkCtx;
+
+    RewritePatternSet patterns(context);
+    populateFunctionOpInterfaceTypeConversionPattern<gpu::GPUFuncOp>(patterns,
+                                                                     converter);
+    patterns.add<TimeLengthPattern, TimeLbPattern, TimeUbPattern, StockIdPattern,
+                 BlockStockCountPattern>(converter, context);
+    patterns.add<WindowedTempPattern>(converter, context, descMap, smemCounter);
+    patterns.add<AccumulatorPattern, TsGetPattern>(converter, context, descMap);
+    patterns.add<TsPutPattern>(converter, context, descMap, chunkCtx);
+    patterns.add<FastWindowedSumPattern>(converter, context);
+
+    if (failed(applyPartialConversion(root, legality, std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+namespace kungpu {  // public entry point: factory for ConvertKunGpuToLLVMPass
+std::unique_ptr<mlir::Pass> createConvertKunGpuToLLVMPass() {
+  return std::make_unique<ConvertKunGpuToLLVMPass>();  // new instance per call, owned by the caller
+}
+} // namespace kungpu
diff --git a/mlir/lib/KunGpu/Pipelines.cpp b/mlir/lib/KunGpu/Pipelines.cpp
new file mode 100644
index 0000000..27249e8
--- /dev/null
+++ b/mlir/lib/KunGpu/Pipelines.cpp
@@ -0,0 +1,128 @@
+//===- Pipelines.cpp - kunir → LLVM lowering pipeline --------------------===//
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunGpu/Passes.h"
+#include "KunGpu/Pipelines.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/Passes.h"
+
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
+#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
+#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace kungpu {
+
+void buildKunIrToLLVMPipeline(OpPassManager &pm) {
+  // `pm` is anchored on builtin.module, but the kernels sit one level below
+  // it, inside a gpu.module: kunir.func / gpu.func passes nest through that.
+
+  // ── 1–2.  Per-kunir.func passes (nested: gpu.module → kunir.func) ────
+  //       All three share a single nested kunir.func pass manager.
+  {
+    OpPassManager &kunirFuncPM =
+        pm.nest<gpu::GPUModuleOp>().nest<::kunir::FuncOp>();
+    kunirFuncPM.addPass(::kunir::createKunIrToKunGpuPass());
+    // CSE right away — kungpu.ts.get is Pure, so duplicates produced by the
+    // lowering above (e.g. two back_refs reading the same input at the same
+    // offset, or any other path where distinct kunir SSA values yield an
+    // identical ts.get) fold into one load before
+    // windowed-temp-memory-planning + convert-kungpu-to-llvm see them.
+    kunirFuncPM.addPass(createCSEPass());
+    kunirFuncPM.addPass(::kungpu::createWindowedTempMemoryPlanningPass());
+  }
+
+  // ── 3.  kunir.func → gpu.func + kungpu ops → LLVM (module-level) ─────
+  pm.addPass(::kungpu::createConvertKunGpuToLLVMPass());
+
+  // ── 4.  LICM per gpu.func (nested: gpu.module → gpu.func) ────────────
+  //       Step 3 runs at module level, so this opens a new gpu.module nest.
+  pm.nest<gpu::GPUModuleOp>().addNestedPass<gpu::GPUFuncOp>(
+      createLoopInvariantCodeMotionPass());
+
+  // ── 5–6.  Generic cleanup ─────────────────────────────────────────────
+  pm.addPass(createCanonicalizerPass());
+  pm.addPass(createCSEPass());
+
+  // ── 7.  scf → cf (control flow) ───────────────────────────────────────
+  pm.addPass(createSCFToControlFlowPass());
+
+  // ── 8.  index / arith / cf → LLVM, in that order.  The device-side body
+  //       of gpu.func is lowered here, ahead of gpu-to-nvvm, which is then
+  //       left with only the gpu ops + the gpu.func wrapper.
+  pm.addPass(createConvertIndexToLLVMPass());
+  pm.addPass(createArithToLLVMConversionPass());
+  pm.addPass(createConvertControlFlowToLLVMPass());
+
+  // ── 9.  gpu.thread_id / block_id / block_dim → nvvm intrinsics, plus
+  //       gpu.func → llvm.func (with `nvvm.kernel`).
+  // The index bitwidth is deliberately left at the default (64) that the
+  // earlier arith/cf/index→LLVM passes used — mixing 32 and 64 leaves
+  // i32 → index → i64 unrealized_conversion_cast chains behind that
+  // reconcile-unrealized-casts can't fold.  The cost is a single sext
+  // after each NVVM intrinsic, which LLVM's later DCE/InstCombine
+  // erases.
+  pm.nest<gpu::GPUModuleOp>().addPass(createConvertGpuOpsToNVVMOps());
+
+  // ── 10.  func.func → llvm.func (host-side helpers, if any).
+  pm.addPass(createConvertFuncToLLVMPass());
+
+  // ── 11.  Resolve any leftover unrealized_conversion_casts ──────────
+  pm.addPass(createReconcileUnrealizedCastsPass());
+}
+
+namespace {
+
+// Test-only wrapper: exposes the whole pipeline as one `-kunir-to-llvm` pass.
+struct KunIrToLLVMPass
+    : PassWrapper<KunIrToLLVMPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(KunIrToLLVMPass)
+  StringRef getDescription() const override {
+    return "Lower kunir.func down to the LLVM dialect (test wrapper)";
+  }
+  StringRef getArgument() const override { return "kunir-to-llvm"; }
+  // Declares every dialect the nested pipeline may create or load.
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<LLVM::LLVMDialect, NVVM::NVVMDialect, gpu::GPUDialect,
+                    ::kungpu::KunGpuDialect, arith::ArithDialect,
+                    cf::ControlFlowDialect, func::FuncDialect,
+                    index::IndexDialect, math::MathDialect, scf::SCFDialect>();
+  }
+
+  void runOnOperation() override {
+    OpPassManager nested("builtin.module");
+    buildKunIrToLLVMPipeline(nested);
+    if (succeeded(runPipeline(nested, getOperation())))
+      return;
+    signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<mlir::Pass> createKunIrToLLVMPass() {
+  return std::make_unique<KunIrToLLVMPass>();  // new wrapper pass per call, owned by the caller
+}
+
+void registerKunIrToLLVMPass() {
+  ::mlir::registerPass([] { return createKunIrToLLVMPass(); });
+}
+
+} // namespace kungpu
diff --git a/mlir/lib/KunGpu/PtxBackend.cpp b/mlir/lib/KunGpu/PtxBackend.cpp
new file mode 100644
index 0000000..de01643
--- /dev/null
+++ b/mlir/lib/KunGpu/PtxBackend.cpp
@@ -0,0 +1,262 @@
+//===- PtxBackend.cpp - kunir → cubin (single upstream-pass pipeline) -===//
+
+#include "KunGpu/PtxBackend.h"
+#include "KunGpu/KunGpuUtils.h"
+#include "KunGpu/Pipelines.h"
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/PassManager.h"
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <optional>
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+
+namespace kungpu {
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Step 1: kunir → llvm dialect.  Same pipeline both compileKunIrToPtx and
+// compileKunIrToExecutable need before they hand off to upstream
+// gpu-module-to-binary.
+//===----------------------------------------------------------------------===//
+
+LogicalResult lowerKunIrToLLVMDialect(ModuleOp module) {
+  // Runs the shared kunir → LLVM-dialect pipeline in place on `module`.
+  PassManager lowering(module.getContext());
+  buildKunIrToLLVMPipeline(lowering);
+  if (succeeded(lowering.run(module)))
+    return success();
+  return module.emitError("compileKunIr*: kunir-to-llvm pipeline failed");
+}
+
+//===----------------------------------------------------------------------===//
+// Step 2: attach #nvvm.target to the (single) gpu.module so the upstream
+// pass knows what chip/O/etc. to compile for.  We do this by hand instead
+// of running `nvvm-attach-target` to keep the chip / O knobs typed and
+// avoid re-parsing the pass options string.
+//===----------------------------------------------------------------------===//
+
+LogicalResult attachNvvmTarget(ModuleOp module,
+                                 const PtxCompileOptions &opts) {
+  // Stop at the first gpu.module the walk reaches; only that one is tagged.
+  gpu::GPUModuleOp kernelModule;
+  module.walk([&kernelModule](gpu::GPUModuleOp candidate) {
+    kernelModule = candidate;
+    return WalkResult::interrupt();
+  });
+  if (!kernelModule)
+    return module.emitError(
+        "compileKunIr*: no gpu.module found after the kunir-to-llvm "
+        "pipeline");
+
+  MLIRContext *context = module.getContext();
+  const int optLevel = static_cast<int>(opts.optLevel);
+  Attribute nvvmTarget = NVVM::NVVMTargetAttr::get(
+      context, optLevel, /*triple=*/opts.targetTriple,
+      /*chip=*/opts.targetCpu, /*features=*/opts.targetFeatures);
+  kernelModule.setTargetsAttr(ArrayAttr::get(context, {nvvmTarget}));
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Step 3: run gpu-module-to-binary, then dig out the resulting object's
+// payload (PTX text or cubin bytes).
+//===----------------------------------------------------------------------===//
+
+LogicalResult runGpuModuleToBinary(ModuleOp module,
+                                     const std::string &compilationTarget,
+                                     const std::string &toolkitPath,
+                                     std::string &outBytes) {
+  // Resolving an empty toolkit path is the job of the Python wrapper
+  // (`KunQuant.jit.cuda.find_cuda_toolkit`).  An empty path arriving here
+  // means the C++ API was called directly without resolving it by hand;
+  // it is forwarded as-is and the upstream pass tries its own (limited)
+  // defaults.
+  GpuModuleToBinaryPassOptions binaryOpts;
+  binaryOpts.toolkitPath       = toolkitPath;
+  binaryOpts.compilationTarget = compilationTarget;   // "isa" (PTX) | "bin" (cubin)
+
+  PassManager serializer(module.getContext());
+  serializer.addPass(createGpuModuleToBinaryPass(binaryOpts));
+  if (failed(serializer.run(module)))
+    return module.emitError(
+        "compileKunIr*: gpu-module-to-binary{format=")
+        << compilationTarget << "} failed";
+
+  // Each gpu.module has now been replaced by a gpu.binary with one
+  // gpu.object per target attribute.  Exactly one target was attached, so
+  // one binary holding one object is expected; its bytes are the result.
+  gpu::BinaryOp firstBinary;
+  module.walk([&firstBinary](gpu::BinaryOp candidate) {
+    firstBinary = candidate;
+    return WalkResult::interrupt();
+  });
+  if (!firstBinary)
+    return module.emitError(
+        "compileKunIr*: gpu-module-to-binary produced no gpu.binary "
+        "(target attr missing on gpu.module?)");
+
+  ArrayAttr objects = firstBinary.getObjectsAttr();
+  if (!objects || objects.empty())
+    return module.emitError(
+        "compileKunIr*: gpu.binary has no objects");
+  auto firstObject = llvm::dyn_cast<gpu::ObjectAttr>(objects[0]);
+  if (!firstObject)
+    return module.emitError(
+        "compileKunIr*: gpu.binary's first object is not a #gpu.object");
+
+  StringRef payload = firstObject.getObject().getValue();
+  outBytes.assign(payload.begin(), payload.end());
+  return success();
+}
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Public: PTX (debug / inspection)
+//===----------------------------------------------------------------------===//
+
+LogicalResult compileKunIrToPtx(ModuleOp module,
+                                  const PtxCompileOptions &options,
+                                  std::string &ptxOut) {
+  const bool lowered = succeeded(lowerKunIrToLLVMDialect(module)) &&
+                       succeeded(attachNvvmTarget(module, options));
+  if (!lowered) return failure();  // the failing helper already emitted the diagnostic
+  return runGpuModuleToBinary(module, "isa", options.toolkitPath, ptxOut);
+}
+
+//===----------------------------------------------------------------------===//
+// Public: kunir → cubin + per-kernel name metadata
+//===----------------------------------------------------------------------===//
+
+LogicalResult compileKunIrToExecutable(ModuleOp module,
+                                        const PtxCompileOptions &options,
+                                        ::kun_cuda::ExecutableData &out) {
+  using ::kun_cuda::Datatype;
+  auto dtypeName = [](Datatype d) {
+    return d == Datatype::Double ? "f64" : "f32";
+  };
+  // Read the element type off the kunir.func signatures now — the
+  // kunir → llvm lowering below erases it.  The same scan enforces
+  // graph-wide uniformity: every ts input of every kunir.func must use
+  // one dtype, otherwise the slot pool has no single byte-size to use.
+  std::optional<Datatype> graphDtype;
+  std::string firstOwner;
+  WalkResult scan = module.walk([&](kunir::FuncOp func) -> WalkResult {
+    for (Type argTy : func.getFunctionTypeTyped().getInputs()) {
+      auto tsTy = dyn_cast<kunir::TsType>(argTy);
+      if (!tsTy) continue;
+      Type elemTy = tsTy.getElementType();
+      if (!elemTy.isF32() && !elemTy.isF64()) {
+        func.emitError("compileKunIrToExecutable: unsupported ts element "
+                       "type — only f32 and f64");
+        return WalkResult::interrupt();
+      }
+      const Datatype here = elemTy.isF32() ? Datatype::Float : Datatype::Double;
+      if (!graphDtype) {
+        graphDtype = here;
+        firstOwner = func.getSymName().str();
+        continue;
+      }
+      if (*graphDtype == here) continue;
+      func.emitError("compileKunIrToExecutable: kunir.func '")
+          << func.getSymName() << "' has dtype " << dtypeName(here)
+          << " but earlier '" << firstOwner << "' had "
+          << dtypeName(*graphDtype);
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  if (scan.wasInterrupted()) return failure();
+
+  // 1.  kunir → llvm dialect.  Afterwards the gpu.module body is fully
+  //     lowered and the discardable kungpu.* attrs sit on llvm.func ops.
+  if (failed(lowerKunIrToLLVMDialect(module))) return failure();
+
+  // 2.  Collect name + I/O lists of every kernel function (the ones
+  //     carrying kungpu.target_spec).  This has to run BEFORE
+  //     gpu-module-to-binary, which swaps the gpu.module for a gpu.binary
+  //     that has no llvm.func left to walk.
+  std::vector<::kun_cuda::KernelMeta> kernels;
+  std::vector<std::pair<int64_t, int64_t>> specs;  // (warps, vector)
+  std::vector<std::string> specOwners;             // for diagnostics
+  module.walk([&](LLVM::LLVMFuncOp func) {
+    if (!func->hasAttr(kFuncTargetSpecAttr))
+      return;
+
+    ::kun_cuda::KernelMeta meta;
+    meta.kernelName = func.getSymName().str();
+    if (auto inputs = getFuncInputNames(func))
+      for (auto name : inputs)
+        meta.inputNames.push_back(llvm::cast<StringAttr>(name).str());
+    if (auto outputs = getFuncOutputNames(func))
+      for (auto name : outputs)
+        meta.outputNames.push_back(llvm::cast<StringAttr>(name).str());
+    meta.unreliableCount = getFuncUnreliableCount(func);
+
+    int64_t warps = 1, vecSize = 1;
+    if (auto spec = getFuncTargetSpec(func)) {
+      warps   = spec.getWarpsPerCta();
+      vecSize = spec.getVectorSize();
+    }
+    specs.emplace_back(warps, vecSize);
+    specOwners.push_back(meta.kernelName);
+    kernels.push_back(std::move(meta));
+  });
+
+  // A graph without any JIT kernel is legal — every partition may be an
+  // externally-dispatched kernel (e.g. a graph that is only cs_rank).
+  // Cubin generation is skipped entirely in that case; the caller
+  // (pyCompile) is expected to inject external KernelMetas and to supply
+  // warpsPerCta out-of-band.
+  out = ::kun_cuda::ExecutableData{};
+  if (kernels.empty())
+    return success();
+
+  // 3.  The target spec must be graph-wide: compare every kernel to the first.
+  const auto [warpsPerCta, vectorSize] = specs.front();
+  for (size_t i = 1, e = specs.size(); i != e; ++i) {
+    const auto &[w, v] = specs[i];
+    if (w == warpsPerCta && v == vectorSize) continue;
+    return module.emitError(
+        "compileKunIrToExecutable: kernels disagree on warps_per_cta / "
+        "vector_size — graph-wide target spec required (")
+        << "kernel '" << specOwners[i] << "': warps_per_cta="
+        << w << " vector_size=" << v
+        << "; expected warps_per_cta=" << warpsPerCta
+        << " vector_size=" << vectorSize << ")";
+  }
+
+  // 4.  Attach #nvvm.target + run gpu-module-to-binary{format=bin}.
+  if (failed(attachNvvmTarget(module, options))) return failure();
+  std::string cubin;
+  if (failed(runGpuModuleToBinary(module, /*compilationTarget=*/"bin",
+                                    options.toolkitPath, cubin)))
+    return failure();
+
+  // 5.  Fill in `out`.  graphInputs / graphOutputs stay empty here — the
+  //     caller supplies them after this function returns.
+  out.cubin.assign(cubin.begin(), cubin.end());
+  out.warpsPerCta = warpsPerCta;
+  out.vectorSize  = vectorSize;
+  out.dtype       = graphDtype.value_or(Datatype::Float);
+  out.kernels     = std::move(kernels);
+  return success();
+}
+
+} // namespace kungpu
diff --git a/mlir/lib/KunIr/CMakeLists.txt b/mlir/lib/KunIr/CMakeLists.txt
new file mode 100644
index 0000000..5e2877a
--- /dev/null
+++ b/mlir/lib/KunIr/CMakeLists.txt
@@ -0,0 +1,50 @@
+# KunIr dialect library: dialect, type, interface, op and attribute sources.
+add_mlir_dialect_library(MLIRKunIrDialect
+  KunIrDialect.cpp
+  KunIrTypes.cpp
+  KunIrInterfaces.cpp
+  KunIrOps.cpp
+  KunIrAttrs.cpp
+
+  # Not every .cpp in this directory belongs to this target
+  # (KunIrToKunGpu.cpp is compiled into its own library).
+  PARTIAL_SOURCES_INTENDED
+
+  ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/mlir/include
+
+  # Targets that must be built before these sources; the *IncGen names are
+  # presumably the TableGen outputs included as *.inc — defined elsewhere.
+  DEPENDS
+  MLIRKunIrOpsIncGen
+  MLIRKunIrInterfacesIncGen
+  MLIRKunIrAttrsIncGen
+)
+
+# KunIrOps.cpp creates arith.* and math.* ops directly (the buildScalarOp /
+# buildAccumOp implementations), so the Arith and Math dialect libraries are
+# linked here explicitly instead of being picked up transitively from whatever
+# executable happens to link this library (which breaks shared-library builds).
+mlir_target_link_libraries(MLIRKunIrDialect PUBLIC
+  MLIRArithDialect
+  MLIRFuncDialect
+  MLIRIR
+  MLIRMathDialect
+  MLIRSideEffectInterfaces
+)
+
+# Library built from KunIrToKunGpu.cpp; it links against both the KunIr and
+# KunGpu dialect libraries.
+add_mlir_library(MLIRKunIrToKunGpu
+  KunIrToKunGpu.cpp
+
+  # The remaining sources in this directory belong to MLIRKunIrDialect.
+  PARTIAL_SOURCES_INTENDED
+
+  ADDITIONAL_HEADER_DIRS
+  ${PROJECT_SOURCE_DIR}/mlir/include
+
+  # Targets that must be built before this source is compiled.
+  DEPENDS
+  MLIRKunIrOpsIncGen
+  MLIRKunIrInterfacesIncGen
+  MLIRKunGpuOpsIncGen
+
+  # Project-local dialect libraries.
+  LINK_LIBS PUBLIC
+  MLIRKunIrDialect
+  MLIRKunGpuDialect
+)
+
+# Upstream MLIR libraries linked into MLIRKunIrToKunGpu.
+mlir_target_link_libraries(MLIRKunIrToKunGpu PUBLIC
+  MLIRFuncDialect
+  MLIRArithDialect
+  MLIRMathDialect
+  MLIRSCFDialect
+  MLIRIR
+  MLIRPass
+)
diff --git a/mlir/lib/KunIr/KunIrAttrs.cpp b/mlir/lib/KunIr/KunIrAttrs.cpp
new file mode 100644
index 0000000..081e377
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrAttrs.cpp
@@ -0,0 +1,55 @@
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+using namespace mlir;
+using namespace kunir;
+
+#define GET_ATTRDEF_CLASSES
+#include "KunIr/KunIrOpsAttrDefs.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetSpecAttr — custom assembly format
+//
+// Inline format (used inside kunir.func):
+//   {occupancy = V, warps_per_cta = V, smem_size = V, vector_size = V}
+//
+// Canonical MLIR attribute form (used stand-alone):
+//   #kunir.target_spec<{occupancy = V, warps_per_cta = V, smem_size = V,
+//                       vector_size = V}>
+//
+// All four fields are required and must appear in this order.
+//===----------------------------------------------------------------------===//
+
+/// Parses the brace-delimited body
+///   `{occupancy = N, warps_per_cta = N, smem_size = N, vector_size = N}`.
+/// Every field is mandatory and the order is fixed.  Returns a null attribute
+/// when any token fails to parse.
+Attribute TargetSpecAttr::parse(AsmParser &parser, Type) {
+  int64_t occupancy = 0;
+  int64_t warpsPerCta = 0;
+  int64_t smemSize = 0;
+  int64_t vectorSize = 1;
+
+  // Parses one `<keyword> = <integer>` entry into `slot`.
+  auto parseField = [&](StringRef keyword, int64_t &slot) -> ParseResult {
+    if (parser.parseKeyword(keyword) || parser.parseEqual() ||
+        parser.parseInteger(slot))
+      return failure();
+    return success();
+  };
+
+  if (parser.parseLBrace() ||
+      parseField("occupancy", occupancy) || parser.parseComma() ||
+      parseField("warps_per_cta", warpsPerCta) || parser.parseComma() ||
+      parseField("smem_size", smemSize) || parser.parseComma() ||
+      parseField("vector_size", vectorSize) || parser.parseRBrace())
+    return {};
+
+  return TargetSpecAttr::get(parser.getContext(), occupancy, warpsPerCta,
+                              smemSize, vectorSize);
+}
+
+/// Prints the attribute body in the same field order that parse() expects.
+void TargetSpecAttr::print(AsmPrinter &printer) const {
+  printer << "{occupancy = " << getOccupancy();
+  printer << ", warps_per_cta = " << getWarpsPerCta();
+  printer << ", smem_size = " << getSmemSize();
+  printer << ", vector_size = " << getVectorSize();
+  printer << "}";
+}
+
+//===----------------------------------------------------------------------===//
+// Dialect attr registration
+//===----------------------------------------------------------------------===//
+
+/// Registers the dialect's attributes.  With GET_ATTRDEF_LIST defined, the
+/// included .inc file supplies the template argument list for addAttributes.
+void KunIrDialect::registerAttrs() {
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "KunIr/KunIrOpsAttrDefs.cpp.inc"
+  >();
+}
diff --git a/mlir/lib/KunIr/KunIrDialect.cpp b/mlir/lib/KunIr/KunIrDialect.cpp
new file mode 100644
index 0000000..ee03c47
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrDialect.cpp
@@ -0,0 +1,22 @@
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrDialect.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+using namespace mlir;
+using namespace kunir;
+
+//===----------------------------------------------------------------------===//
+// KunIr dialect
+//===----------------------------------------------------------------------===//
+
+#include "KunIr/KunIrOpsDialect.cpp.inc"
+
+/// Dialect initialization hook: registers the op list taken from the included
+/// .inc file (with GET_OP_LIST defined), then the dialect's types and
+/// attributes via registerTypes() / registerAttrs().
+void KunIrDialect::initialize() {
+  addOperations<
+#define GET_OP_LIST
+#include "KunIr/KunIrOps.cpp.inc"
+  >();
+  registerTypes();
+  registerAttrs();
+}
diff --git a/mlir/lib/KunIr/KunIrInterfaces.cpp b/mlir/lib/KunIr/KunIrInterfaces.cpp
new file mode 100644
index 0000000..9e6b4d6
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrInterfaces.cpp
@@ -0,0 +1,6 @@
+#include "KunIr/KunIrInterfaces.h"
+
+using namespace mlir;
+using namespace kunir;
+
+#include "KunIr/KunIrInterfaces.cpp.inc"
diff --git a/mlir/lib/KunIr/KunIrOps.cpp b/mlir/lib/KunIr/KunIrOps.cpp
new file mode 100644
index 0000000..32c1069
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrOps.cpp
@@ -0,0 +1,905 @@
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrInterfaces.h"
+#include "KunIr/KunIrTypes.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpImplementation.h"
+#include <limits>
+
+using namespace mlir;
+using namespace kunir;
+
+// Sentinel maxLookback value: the verifiers in this file skip their
+// "lookback >= window" checks when a ts type's maxLookback equals it.
+static constexpr uint64_t kInfLookback = std::numeric_limits<uint64_t>::max();
+
+//===----------------------------------------------------------------------===//
+// Interface table (generated)
+//
+// NOTE(review): KunIrInterfaces.cpp includes this same .cpp.inc and is built
+// into the same library (see mlir/lib/KunIr/CMakeLists.txt).  If the generated
+// file holds non-inline definitions, two including translation units risk
+// duplicate symbols — confirm that only one of the two includes is needed.
+//===----------------------------------------------------------------------===//
+
+#include "KunIr/KunIrInterfaces.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// Generated op definitions
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "KunIr/KunIrOps.cpp.inc"
+
+//===----------------------------------------------------------------------===//
+// YieldOp — manual zero-arg build (declared by OpBuilder<(ins), [{}]>)
+//===----------------------------------------------------------------------===//
+
+/// Zero-argument builder: leaves the OperationState untouched, so the created
+/// yield has no operands.
+void kunir::YieldOp::build(mlir::OpBuilder &, mlir::OperationState &) {
+  // Empty build: produces a zero-operand yield for ensureTerminator.
+}
+
+/// Zero-argument builder: leaves the OperationState untouched, so the created
+/// return has no operands.
+void kunir::ReturnOp::build(mlir::OpBuilder &, mlir::OperationState &) {
+  // Empty build: produces a zero-operand return for ensureTerminator.
+}
+
+//===----------------------------------------------------------------------===//
+// Binary elemwise ops — verify only (inferReturnTypes is in ElemwiseTsResultType)
+//===----------------------------------------------------------------------===//
+
+// Shared verifier for two-operand ops: both ts operands must carry the same
+// element type (only the element type is compared).  Emits the diagnostic on
+// `op` when they differ.
+static LogicalResult verifyBinaryElemwise(Operation *op,
+                                          Value lhs, Value rhs) {
+  Type lhsElem = llvm::cast<TsType>(lhs.getType()).getElementType();
+  Type rhsElem = llvm::cast<TsType>(rhs.getType()).getElementType();
+  if (lhsElem == rhsElem)
+    return success();
+  return op->emitOpError("lhs element type '")
+         << lhsElem << "' must match rhs element type '" << rhsElem << "'";
+}
+
+// Arithmetic and comparison ops all defer to the shared element-type check.
+LogicalResult AddOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult SubOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult MulOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult DivOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult MaxOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult MinOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult EqualOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult GreaterOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult GreaterEqualOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult LessOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+LogicalResult LessEqualOp::verify() {
+  return verifyBinaryElemwise(*this, getLhs(), getRhs());
+}
+
+// Logical binary ops: on top of the matching-element-type rule, the (shared)
+// element type must be i1.  Checking lhs is enough once both sides match.
+static LogicalResult verifyLogicalBinary(Operation *op, Value lhs, Value rhs) {
+  if (failed(verifyBinaryElemwise(op, lhs, rhs)))
+    return failure();
+  Type sharedElem = llvm::cast<TsType>(lhs.getType()).getElementType();
+  if (sharedElem.isInteger(1))
+    return success();
+  return op->emitOpError("operand element type must be i1, got '")
+         << sharedElem << "'";
+}
+LogicalResult AndOp::verify() {
+  return verifyLogicalBinary(*this, getLhs(), getRhs());
+}
+LogicalResult OrOp::verify() {
+  return verifyLogicalBinary(*this, getLhs(), getRhs());
+}
+
+//===----------------------------------------------------------------------===//
+// Unary elemwise ops — verify only
+//===----------------------------------------------------------------------===//
+
+// These verifiers accept unconditionally: no checks are performed here for
+// abs / log / exp / sqrt / sign.
+LogicalResult AbsOp::verify()  { return success(); }
+LogicalResult LogOp::verify()  { return success(); }
+LogicalResult ExpOp::verify()  { return success(); }
+LogicalResult SqrtOp::verify() { return success(); }
+LogicalResult SignOp::verify() { return success(); }
+
+// Logical negation requires an i1 element type on its input.
+LogicalResult NotOp::verify() {
+  Type inputElem = llvm::cast<TsType>(getInput().getType()).getElementType();
+  if (inputElem.isInteger(1))
+    return success();
+  return emitOpError("operand element type must be i1, got '")
+         << inputElem << "'";
+}
+
+//===----------------------------------------------------------------------===//
+// SelectOp — cond must be ts<i1, *>; true/false must share elem type.
+//===----------------------------------------------------------------------===//
+
+// Checks, in order: cond has element type i1; true_value and false_value have
+// the same element type.
+LogicalResult SelectOp::verify() {
+  Type condElem = llvm::cast<TsType>(getCond().getType()).getElementType();
+  Type trueElem =
+      llvm::cast<TsType>(getTrueValue().getType()).getElementType();
+  Type falseElem =
+      llvm::cast<TsType>(getFalseValue().getType()).getElementType();
+
+  if (!condElem.isInteger(1))
+    return emitOpError("cond element type must be i1, got '")
+           << condElem << "'";
+
+  if (trueElem == falseElem)
+    return success();
+  return emitOpError("true_value element type '")
+         << trueElem << "' must match false_value element type '"
+         << falseElem << "'";
+}
+
+// OpaqueProperties -> PropertyRef
+// (original author note; presumably tracks an upstream API rename — confirm.)
+//
+// Result type: ts<true_value.elem, 1>.
+//
+// Inference can run before the op has been verified (e.g. while the op is
+// being built or parsed), so the operand list is validated here instead of
+// being cast unconditionally: a short operand list or a non-ts true_value
+// yields a diagnostic and failure rather than an assertion.
+LogicalResult SelectOp::inferReturnTypes(
+    MLIRContext *ctx, std::optional<Location> loc, ValueRange operands,
+    DictionaryAttr, OpaqueProperties , RegionRange,
+    SmallVectorImpl<Type> &inferred) {
+  // operands[1] is true_value; it alone determines the result element type.
+  if (operands.size() < 2)
+    return emitOptionalError(
+        loc, "cannot infer result type: expected at least 2 operands, got ",
+        operands.size());
+  auto trueTy = llvm::dyn_cast<TsType>(operands[1].getType());
+  if (!trueTy)
+    return emitOptionalError(
+        loc, "cannot infer result type: true_value must be a kunir ts type, "
+             "got '",
+        operands[1].getType(), "'");
+  inferred.push_back(TsType::get(ctx, trueTy.getElementType(), 1));
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// OutputRefOp
+//===----------------------------------------------------------------------===//
+
+// The result must keep the value's element type, and the output must be named.
+LogicalResult OutputRefOp::verify() {
+  Type valueElem = llvm::cast<TsType>(getValue().getType()).getElementType();
+  Type resultElem =
+      llvm::cast<TsType>(getResult().getType()).getElementType();
+  if (valueElem != resultElem)
+    return emitOpError("result element type '")
+           << resultElem << "' must match value element type '" << valueElem
+           << "'";
+  if (!getName().empty())
+    return success();
+  return emitOpError("output name must be non-empty");
+}
+
+//===----------------------------------------------------------------------===//
+// WindowedOutputOp
+//===----------------------------------------------------------------------===//
+
+// Checks, in order: result/input element types agree; `length` is positive;
+// the result type's maxLookback equals `length`.
+LogicalResult WindowedOutputOp::verify() {
+  auto srcTy = llvm::cast<TsType>(getInput().getType());
+  auto dstTy = llvm::cast<TsType>(getResult().getType());
+
+  Type srcElem = srcTy.getElementType();
+  Type dstElem = dstTy.getElementType();
+  if (srcElem != dstElem)
+    return emitOpError("result element type '")
+           << dstElem << "' must match input element type '" << srcElem
+           << "'";
+
+  int64_t length = getLength();
+  if (length <= 0)
+    return emitOpError("length must be positive, got ") << length;
+
+  if (dstTy.getMaxLookback() == static_cast<uint64_t>(length))
+    return success();
+  return emitOpError("result maxLookback (")
+         << dstTy.getMaxLookback() << ") must equal length attribute ("
+         << length << ")";
+}
+
+//===----------------------------------------------------------------------===//
+// Reduce ops — verify they are inside a ForEachBackWindow body
+//
+// Uses Operation* directly (no template); SameOperandsAndResultType already
+// enforces input == result type, so only the parent check is needed.
+//===----------------------------------------------------------------------===//
+
+// Succeeds only when `op`'s immediate parent is a ForEachBackWindowOp; an op
+// with no parent, or one nested any deeper, is rejected.
+static LogicalResult verifyInsideForEachBackWindow(Operation *op) {
+  Operation *parent = op->getParentOp();
+  if (parent && llvm::isa<ForEachBackWindowOp>(parent))
+    return success();
+  return op->emitOpError(
+      "must be directly inside a 'kunir.for_each_back_window' region");
+}
+
+// Reduce ops only need the parent check.
+LogicalResult ReduceAddOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+LogicalResult ReduceMulOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+LogicalResult ReduceMaxOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+LogicalResult ReduceMinOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+LogicalResult ReduceArgMinOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+LogicalResult ReduceArgMaxOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+// Parent check first, then `current` and `value` must share an element type.
+LogicalResult ReduceRankOp::verify() {
+  if (failed(verifyInsideForEachBackWindow(*this)))
+    return failure();
+  Type valueElem = llvm::cast<TsType>(getValue().getType()).getElementType();
+  Type currentElem =
+      llvm::cast<TsType>(getCurrent().getType()).getElementType();
+  if (currentElem == valueElem)
+    return success();
+  return emitOpError("current element type must match value element type");
+}
+// Only valid as a direct child of a ForEachBackWindowOp.
+LogicalResult WindowLoopIndexOp::verify() {
+  return verifyInsideForEachBackWindow(*this);
+}
+
+//===----------------------------------------------------------------------===//
+// BackRef + FastWindowedSum — share a verifier (same shape / constraints)
+//===----------------------------------------------------------------------===//
+
+// Shared verifier for windowed ops whose result is either a ts value or a
+// bare scalar.  Checks, in order:
+//   1. `window` is positive;
+//   2. the input's maxLookback is the kInfLookback sentinel or >= window + 1;
+//   3. the result is ts<inputElem, 1>, or exactly the input element type.
+static LogicalResult
+verifyWindowedScalarOrTsResultOp(Operation *op, Value input, int64_t window,
+                                  Type resultTy) {
+  auto srcTy = llvm::cast<TsType>(input.getType());
+  if (window <= 0)
+    return op->emitOpError("window must be positive, got ") << window;
+
+  // Both the current value and the one `window` steps back are read, hence
+  // window + 1 retained time steps.
+  const uint64_t required = static_cast<uint64_t>(window) + 1;
+  const uint64_t available = srcTy.getMaxLookback();
+  if (available != kInfLookback && available < required)
+    return op->emitOpError("input.maxLookback (")
+           << available << ") must be >= window+1 (" << required << ")";
+
+  Type srcElem = srcTy.getElementType();
+  auto tsResult = llvm::dyn_cast<TsType>(resultTy);
+
+  // Lowered form (after kunir-to-kungpu): the result is the bare element type.
+  if (!tsResult) {
+    if (resultTy == srcElem)
+      return success();
+    return op->emitOpError(
+               "scalar result type must equal input element type '")
+           << srcElem << "', got '" << resultTy << "'";
+  }
+
+  // Source form: the result is ts<inputElemType, 1>.
+  if (tsResult.getElementType() != srcElem)
+    return op->emitOpError("result element type '")
+           << tsResult.getElementType()
+           << "' must match input element type '" << srcElem << "'";
+  if (tsResult.getMaxLookback() != 1)
+    return op->emitOpError("result maxLookback must be 1, got ")
+           << tsResult.getMaxLookback();
+  return success();
+}
+
+// Both ops forward their input, window and result type to the shared
+// windowed-op verifier.
+LogicalResult BackRefOp::verify() {
+  return verifyWindowedScalarOrTsResultOp(*this, getInput(), getWindow(),
+                                            getResult().getType());
+}
+LogicalResult FastWindowedSumOp::verify() {
+  return verifyWindowedScalarOrTsResultOp(*this, getInput(), getWindow(),
+                                            getResult().getType());
+}
+
+//===----------------------------------------------------------------------===//
+// ConstantOp — result must be ts<T, 1>.  The value attr is f64; we don't
+// pre-check finiteness so that quiet-NaN (0x7FF8...) can flow through.
+//===----------------------------------------------------------------------===//
+
+// Only the result's maxLookback is checked; the value attribute is not
+// inspected here.
+LogicalResult ConstantOp::verify() {
+  const auto lookback =
+      llvm::cast<TsType>(getResult().getType()).getMaxLookback();
+  if (lookback == 1)
+    return success();
+  return emitOpError("result maxLookback must be 1, got ") << lookback;
+}
+
+//===----------------------------------------------------------------------===//
+// AccumulatorOp / SetAccumulatorOp
+//===----------------------------------------------------------------------===//
+
+// The result must be ts<_, 1> and the accumulator must be named.
+LogicalResult AccumulatorOp::verify() {
+  const auto lookback =
+      llvm::cast<TsType>(getResult().getType()).getMaxLookback();
+  if (lookback != 1)
+    return emitOpError("accumulator result maxLookback must be 1, got ")
+           << lookback;
+  if (!getName().empty())
+    return success();
+  return emitOpError("accumulator name must be non-empty");
+}
+
+// Checks, in order:
+//   1. `acc` is produced directly by an AccumulatorOp;
+//   2. `value` has the accumulator's element type;
+//   3. `mask` has element type i1;
+//   4. the result has the accumulator's element type and maxLookback 1.
+LogicalResult SetAccumulatorOp::verify() {
+  // Null both for block arguments and for other defining ops.
+  if (!getAcc().getDefiningOp<AccumulatorOp>())
+    return emitOpError(
+        "first operand must be the result of a 'kunir.accumulator'");
+
+  Type accElem = llvm::cast<TsType>(getAcc().getType()).getElementType();
+  Type maskElem = llvm::cast<TsType>(getMask().getType()).getElementType();
+  Type valueElem = llvm::cast<TsType>(getValue().getType()).getElementType();
+
+  if (accElem != valueElem)
+    return emitOpError("value element type '")
+           << valueElem << "' must match accumulator element type '"
+           << accElem << "'";
+
+  if (!maskElem.isInteger(1))
+    return emitOpError("mask element type must be i1, got '")
+           << maskElem << "'";
+
+  auto outTy = llvm::cast<TsType>(getResult().getType());
+  if (outTy.getElementType() != accElem)
+    return emitOpError("result element type '")
+           << outTy.getElementType()
+           << "' must match accumulator element type '" << accElem << "'";
+  if (outTy.getMaxLookback() != 1)
+    return emitOpError("result maxLookback must be 1, got ")
+           << outTy.getMaxLookback();
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// ForEachBackWindowOp — verifier + custom assembly format
+//
+// Format:
+//   %r = kunir.for_each_back_window
+//       (%in0 : !kunir.ts<f32, 10>, %in1 : !kunir.ts<f32, 10>)
+//       [window = 5]
+//       (%cur0 : !kunir.ts<f32, 1>, %cur1 : !kunir.ts<f32, 1>)
+//       -> (!kunir.ts<f32, 1>) {
+//     %s = kunir.reduce_add %cur0 : !kunir.ts<f32, 1>
+//     kunir.yield %s : !kunir.ts<f32, 1>
+//   }
+//===----------------------------------------------------------------------===//
+
+// Structural checks, in order:
+//   1. `window` is positive;
+//   2. each input's maxLookback is the kInfLookback sentinel or >= window;
+//   3. the body block has one argument per input, typed ts<inputElem, 1>;
+//   4. the body terminates with a YieldOp that has one operand per result;
+//   5. every result is a ts type with maxLookback 1;
+//   6. every yield operand is a ts type with maxLookback 1 and has exactly
+//      the type of the matching result.
+LogicalResult ForEachBackWindowOp::verify() {
+  const int64_t window = getWindow();
+  if (window <= 0)
+    return emitOpError("window must be positive, got ") << window;
+
+  auto inputs = getInputs();
+  Block &body = getBody().front();
+
+  // (2) Lookback of every input versus the window.
+  for (auto [idx, input] : llvm::enumerate(inputs)) {
+    const uint64_t lookback =
+        llvm::cast<TsType>(input.getType()).getMaxLookback();
+    const bool tooShort =
+        lookback != kInfLookback && lookback < static_cast<uint64_t>(window);
+    if (tooShort)
+      return emitOpError("input #")
+             << idx << " maxLookback (" << lookback
+             << ") must be >= window (" << window << ")";
+  }
+
+  // (3) Block-argument arity, then per-argument types.
+  if (body.getNumArguments() != inputs.size())
+    return emitOpError("body block has ")
+           << body.getNumArguments()
+           << " argument(s) but op has " << inputs.size() << " input(s)";
+
+  for (auto [idx, input] : llvm::enumerate(inputs)) {
+    Type inputElem = llvm::cast<TsType>(input.getType()).getElementType();
+    Type wantTy = TsType::get(getContext(), inputElem, 1);
+    Type gotTy = body.getArgument(idx).getType();
+    if (gotTy != wantTy)
+      return emitOpError("body block argument #")
+             << idx << " must have type '" << wantTy
+             << "', got '" << gotTy << "'";
+  }
+
+  // (4) Terminator kind and operand count.
+  auto yield = llvm::dyn_cast<YieldOp>(body.getTerminator());
+  if (!yield)
+    return emitOpError("body must terminate with 'kunir.yield'");
+
+  const unsigned numResults = getNumResults();
+  if (yield.getValues().size() != numResults)
+    return emitOpError("yield operands count (")
+           << yield.getValues().size()
+           << ") must match op results count (" << numResults << ")";
+
+  // (5) Results.
+  for (auto [idx, res] : llvm::enumerate(getResults())) {
+    auto resTs = llvm::dyn_cast<TsType>(res.getType());
+    if (!resTs)
+      return emitOpError("result #") << idx << " must be a kunir ts type";
+    if (resTs.getMaxLookback() != 1)
+      return emitOpError("result #") << idx << " maxLookback must be 1, got "
+             << resTs.getMaxLookback();
+  }
+
+  // (6) Yield operands, each compared with the result at the same index.
+  for (auto [idx, yielded] : llvm::enumerate(yield.getValues())) {
+    Type yieldedTy = yielded.getType();
+    auto yieldedTs = llvm::dyn_cast<TsType>(yieldedTy);
+    if (!yieldedTs)
+      return emitOpError("yield operand #") << idx << " must be a kunir ts type";
+    if (yieldedTs.getMaxLookback() != 1)
+      return emitOpError("yield operand #") << idx
+             << " maxLookback must be 1, got " << yieldedTs.getMaxLookback();
+    Type resultTy = getResult(idx).getType();
+    if (yieldedTy != resultTy)
+      return emitOpError("yield operand #")
+             << idx << " type '" << yieldedTy
+             << "' must match result type '" << resultTy << "'";
+  }
+
+  return success();
+}
+
+/// Parses the custom form
+///   (%in : type, ...) [window = N] (%cur : type, ...) -> (types) | type { body }
+///
+/// Populates `result` with the resolved input operands, the `window`
+/// attribute (i64), the result types and the body region, whose entry block
+/// arguments are the parsed `%cur` list.  A missing terminator is added via
+/// ensureTerminator.  Returns failure if any piece fails to parse.
+///
+/// The parenthesised result list may be empty (`-> ()`): print() emits that
+/// spelling for an op with no results, so it has to parse back.
+ParseResult ForEachBackWindowOp::parse(OpAsmParser &parser,
+                                       OperationState &result) {
+  Builder &builder = parser.getBuilder();
+
+  // (%in0 : type0, %in1 : type1, ...)
+  SmallVector<OpAsmParser::UnresolvedOperand> inputOperands;
+  SmallVector<Type> inputTypes;
+  if (parser.parseLParen())
+    return failure();
+  if (parser.parseOptionalRParen().failed()) {
+    do {
+      OpAsmParser::UnresolvedOperand operand;
+      Type type;
+      if (parser.parseOperand(operand) || parser.parseColonType(type))
+        return failure();
+      inputOperands.push_back(operand);
+      inputTypes.push_back(type);
+    } while (parser.parseOptionalComma().succeeded());
+    if (parser.parseRParen())
+      return failure();
+  }
+  if (parser.resolveOperands(inputOperands, inputTypes,
+                             parser.getCurrentLocation(), result.operands))
+    return failure();
+
+  // [window = <integer>]
+  int64_t window = 0;
+  if (parser.parseLSquare() || parser.parseKeyword("window") ||
+      parser.parseEqual() || parser.parseInteger(window) ||
+      parser.parseRSquare())
+    return failure();
+  result.addAttribute("window", builder.getI64IntegerAttr(window));
+
+  // (%cur0 : ts0, %cur1 : ts1, ...)
+  SmallVector<OpAsmParser::Argument> blockArgs;
+  if (parser.parseArgumentList(blockArgs, OpAsmParser::Delimiter::Paren,
+                               /*allowType=*/true, /*allowAttrs=*/false))
+    return failure();
+
+  // -> (types) or -> type
+  SmallVector<Type> resultTypes;
+  if (parser.parseArrow())
+    return failure();
+  if (parser.parseOptionalLParen().succeeded()) {
+    // parseTypeList needs at least one type, so an immediately closing ')'
+    // (the zero-result spelling) is consumed here instead.
+    if (parser.parseOptionalRParen().failed() &&
+        (parser.parseTypeList(resultTypes) || parser.parseRParen()))
+      return failure();
+  } else {
+    Type singleTy;
+    if (parser.parseType(singleTy))
+      return failure();
+    resultTypes.push_back(singleTy);
+  }
+  result.addTypes(resultTypes);
+
+  // { body }
+  Region *body = result.addRegion();
+  if (parser.parseRegion(*body, blockArgs))
+    return failure();
+  ForEachBackWindowOp::ensureTerminator(*body, builder, result.location);
+  return success();
+}
+
+/// Prints the custom form accepted by parse():
+///   (%in : type, ...) [window = N] (%cur : type, ...) -> results { body }
+/// A single result type is printed bare; any other count is parenthesised.
+void ForEachBackWindowOp::print(OpAsmPrinter &printer) {
+  // (%in0 : type0, ...) [window = N] (
+  printer << " (";
+  llvm::interleaveComma(getInputs(), printer, [&](Value in) {
+    printer << in << " : " << in.getType();
+  });
+  printer << ") [window = " << getWindow() << "] (";
+
+  // %cur0 : ts0, ... ) ->
+  Block &entry = getBody().front();
+  llvm::interleaveComma(entry.getArguments(), printer,
+                        [&](BlockArgument cur) {
+                          printer.printRegionArgument(cur);
+                        });
+  printer << ") -> ";
+
+  // Result types.
+  auto results = getResultTypes();
+  if (results.size() != 1) {
+    printer << "(";
+    llvm::interleaveComma(results, printer);
+    printer << ")";
+  } else {
+    printer << results[0];
+  }
+
+  // The entry-block arguments were printed above, so they are suppressed in
+  // the region; terminators are always printed.
+  printer << " ";
+  printer.printRegion(getBody(), /*printEntryBlockArgs=*/false,
+                      /*printBlockTerminators=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// BinaryArithInterface implementations
+//===----------------------------------------------------------------------===//
+
+// Each op maps to one arith floating-point op built on the scalar operands.
+// NOTE(review): only the *F (float) arith ops are created here, while the
+// verifiers above only require matching element types — presumably integer
+// element types never reach these builders; confirm.
+Value AddOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::AddFOp::create(b, loc, lhs, rhs);
+}
+Value SubOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::SubFOp::create(b, loc, lhs, rhs);
+}
+Value MulOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::MulFOp::create(b, loc, lhs, rhs);
+}
+Value DivOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::DivFOp::create(b, loc, lhs, rhs);
+}
+Value MaxOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::MaximumFOp::create(b, loc, lhs, rhs);
+}
+Value MinOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::MinimumFOp::create(b, loc, lhs, rhs);
+}
+
+// Shared comparison builder: float-typed operands get arith.cmpf with
+// predicate `fp`; every other operand type gets arith.cmpi with predicate
+// `ip`.  Only lhs is inspected — the verifier guarantees
+// lhs.type == rhs.type.
+static Value buildCmpScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs,
+                              arith::CmpFPredicate fp,
+                              arith::CmpIPredicate ip) {
+  const bool isFloat = llvm::isa<FloatType>(lhs.getType());
+  if (!isFloat)
+    return arith::CmpIOp::create(b, loc, ip, lhs, rhs);
+  return arith::CmpFOp::create(b, loc, fp, lhs, rhs);
+}
+// Each comparison pairs an ordered float predicate with the signed integer
+// predicate (or `eq`) used when the operands are not floats.
+Value GreaterOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs,
+                               Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs, arith::CmpFPredicate::OGT,
+                          arith::CmpIPredicate::sgt);
+}
+Value GreaterEqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs,
+                                    Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs, arith::CmpFPredicate::OGE,
+                          arith::CmpIPredicate::sge);
+}
+Value LessOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs,
+                            Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs, arith::CmpFPredicate::OLT,
+                          arith::CmpIPredicate::slt);
+}
+Value LessEqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs,
+                                 Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs, arith::CmpFPredicate::OLE,
+                          arith::CmpIPredicate::sle);
+}
+Value EqualOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs,
+                             Value rhs) {
+  return buildCmpScalarOp(b, loc, lhs, rhs, arith::CmpFPredicate::OEQ,
+                          arith::CmpIPredicate::eq);
+}
+
+// Logical binary ops on i1: built as arith.andi / arith.ori on the scalar
+// operands (the verifiers above restrict these ops to i1 element types).
+Value AndOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::AndIOp::create(b, loc, lhs, rhs);
+}
+Value OrOp::buildScalarOp(OpBuilder &b, Location loc, Value lhs, Value rhs) {
+  return arith::OrIOp::create(b, loc, lhs, rhs);
+}
+
+//===----------------------------------------------------------------------===//
+// UnaryArithInterface implementations
+//===----------------------------------------------------------------------===//
+
+// Each op maps one-to-one onto a math dialect op applied to the scalar
+// operand: absf, log, exp, sqrt.
+Value AbsOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return math::AbsFOp::create(b, loc, operand);
+}
+Value LogOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return math::LogOp::create(b, loc, operand);
+}
+Value ExpOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return math::ExpOp::create(b, loc, operand);
+}
+Value SqrtOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  return math::SqrtOp::create(b, loc, operand);
+}
+Value SignOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  // sign(x) is approximated by copysign(1.0, x): a constant 1.0 of the
+  // operand's type takes the operand's sign, so the result always has
+  // magnitude 1 (including for a zero operand).
+  Type scalarTy = operand.getType();
+  FloatAttr unitAttr = b.getFloatAttr(scalarTy, 1.0);
+  Value unit = arith::ConstantOp::create(b, loc, scalarTy, unitAttr);
+  return math::CopySignOp::create(b, loc, unit, operand);
+}
+Value NotOp::buildScalarOp(OpBuilder &b, Location loc, Value operand) {
+  // Negation on i1 is an XOR with the constant 1.
+  IntegerType boolTy = b.getI1Type();
+  Value trueBit =
+      arith::ConstantOp::create(b, loc, boolTy, b.getIntegerAttr(boolTy, 1));
+  return arith::XOrIOp::create(b, loc, operand, trueBit);
+}
+
+//===----------------------------------------------------------------------===//
+// ReduceArithInterface implementations
+//===----------------------------------------------------------------------===//
+
+// Each reduce op supplies an initial accumulator value and the arith op that
+// combines the accumulator with one element.
+
+// Sum: starts at 0, combines with addf.
+TypedAttr ReduceAddOp::getInitValue(FloatType elemType) {
+  return FloatAttr::get(elemType, 0.0);
+}
+Value ReduceAddOp::buildAccumOp(OpBuilder &b, Location loc, Value acc,
+                                Value elem) {
+  return arith::AddFOp::create(b, loc, acc, elem);
+}
+
+// Product: starts at 1, combines with mulf.
+TypedAttr ReduceMulOp::getInitValue(FloatType elemType) {
+  return FloatAttr::get(elemType, 1.0);
+}
+Value ReduceMulOp::buildAccumOp(OpBuilder &b, Location loc, Value acc,
+                                Value elem) {
+  return arith::MulFOp::create(b, loc, acc, elem);
+}
+
+// Maximum: starts at -infinity, combines with maximumf.
+TypedAttr ReduceMaxOp::getInitValue(FloatType elemType) {
+  const double negInf = -std::numeric_limits<double>::infinity();
+  return FloatAttr::get(elemType, negInf);
+}
+Value ReduceMaxOp::buildAccumOp(OpBuilder &b, Location loc, Value acc,
+                                Value elem) {
+  return arith::MaximumFOp::create(b, loc, acc, elem);
+}
+
+// Minimum: starts at +infinity, combines with minimumf.
+TypedAttr ReduceMinOp::getInitValue(FloatType elemType) {
+  const double posInf = std::numeric_limits<double>::infinity();
+  return FloatAttr::get(elemType, posInf);
+}
+Value ReduceMinOp::buildAccumOp(OpBuilder &b, Location loc, Value acc,
+                                Value elem) {
+  return arith::MinimumFOp::create(b, loc, acc, elem);
+}
+
+//===----------------------------------------------------------------------===//
+// FuncOp
+//===----------------------------------------------------------------------===//
+
+/// Builds a kunir func: records the symbol name, function type, input/output
+/// name arrays, target spec and unreliable count (stored as a signed 64-bit
+/// integer attribute) on `result`, then adds a body region whose single
+/// block has one argument per function input, located at `result.location`.
+void FuncOp::build(OpBuilder &b, OperationState &result,
+                   StringRef name, FunctionType type,
+                   ArrayAttr inputNames, ArrayAttr outputNames,
+                   TargetSpecAttr targetSpec, int64_t unreliableCount) {
+  const OperationName opName = result.name;
+
+  result.addAttribute(getSymNameAttrName(opName), b.getStringAttr(name));
+  result.addAttribute(getFunctionTypeAttrName(opName), TypeAttr::get(type));
+  result.addAttribute(getInputNamesAttrName(opName), inputNames);
+  result.addAttribute(getOutputNamesAttrName(opName), outputNames);
+  result.addAttribute(getTargetSpecAttrName(opName), targetSpec);
+
+  Type si64 = b.getIntegerType(64, /*isSigned=*/true);
+  result.addAttribute(getUnreliableCountAttrName(opName),
+                      b.getIntegerAttr(si64, unreliableCount));
+
+  // Entry block: arguments mirror the function inputs, in order.
+  Block &entry = result.addRegion()->emplaceBlock();
+  for (Type inputTy : type.getInputs())
+    entry.addArgument(inputTy, result.location);
+}
+
+// Verifies the op's structural invariants: entry-block signature, name
+// counts and kinds, target_spec ranges and the unreliable_count sentinel.
+// The first violation found is reported through emitOpError.
+LogicalResult FuncOp::verify() {
+  FunctionType funcTy = getFunctionTypeTyped();
+  Block &entry = getBodyBlock();
+  const unsigned numArgs = funcTy.getNumInputs();
+  const unsigned numResults = funcTy.getNumResults();
+
+  // 1. The entry block signature must mirror the function inputs.
+  if (entry.getNumArguments() != numArgs)
+    return emitOpError("body block has ") << entry.getNumArguments()
+           << " args but function type has " << numArgs << " inputs";
+  for (unsigned i = 0; i < numArgs; ++i)
+    if (entry.getArgument(i).getType() != funcTy.getInput(i))
+      return emitOpError("block arg #") << i << " type mismatch";
+
+  // 2. Name counts.  A function with results names every argument as an
+  //    input and every result as an output; a void function splits its
+  //    arguments between input_names and output_names.
+  auto inNames = getInputNames();
+  auto outNames = getOutputNames();
+  const bool isVoid = numResults == 0;
+  if (isVoid && inNames.size() + outNames.size() != numArgs)
+    return emitOpError("void func: input_names + output_names count (")
+           << (inNames.size() + outNames.size())
+           << ") != num args (" << numArgs << ")";
+  if (!isVoid && inNames.size() != numArgs)
+    return emitOpError("non-void func: input_names count (")
+           << inNames.size() << ") != num args (" << numArgs << ")";
+  if (!isVoid && outNames.size() != numResults)
+    return emitOpError("non-void func: output_names count (")
+           << outNames.size() << ") != num results (" << numResults << ")";
+
+  // 3. Every name entry must be a string attribute.
+  for (unsigned i = 0, e = inNames.size(); i < e; ++i)
+    if (!llvm::isa<StringAttr>(inNames[i]))
+      return emitOpError("input_names[") << i << "] is not a StringAttr";
+  for (unsigned i = 0, e = outNames.size(); i < e; ++i)
+    if (!llvm::isa<StringAttr>(outNames[i]))
+      return emitOpError("output_names[") << i << "] is not a StringAttr";
+
+  // 4. target_spec: occupancy and warps_per_cta strictly positive,
+  //    smem_size non-negative.
+  auto spec = getTargetSpec();
+  if (auto occupancy = spec.getOccupancy(); occupancy <= 0)
+    return emitOpError("target occupancy must be positive, got ")
+           << occupancy;
+  if (auto warps = spec.getWarpsPerCta(); warps <= 0)
+    return emitOpError("target warps_per_cta must be positive, got ")
+           << warps;
+  if (auto smem = spec.getSmemSize(); smem < 0)
+    return emitOpError("target smem_size must be non-negative, got ")
+           << smem;
+
+  // 5. unreliable_count.  `-1` is a sentinel meaning "whole time history
+  //    required" — the runtime collapses such functions to a single
+  //    chunk.  Any other negative value is rejected.
+  if (auto unreliable = getUnreliableCount(); unreliable < -1)
+    return emitOpError("unreliable_count must be -1 (whole-time) or "
+                       "non-negative, got ")
+           << unreliable;
+
+  return success();
+}
+
+// Parses the custom assembly form (the inverse of FuncOp::print):
+//   @name (%arg: type, ...) inputs {...} outputs {...} target <spec>
+//   unreliable_count = N [-> results] <body region>
+ParseResult FuncOp::parse(OpAsmParser &parser, OperationState &result) {
+  Builder &b = parser.getBuilder();
+  // @sym_name — stored directly into result.attributes by parseSymbolName.
+  StringAttr nameAttr;
+  if (parser.parseSymbolName(nameAttr, getSymNameAttrName(result.name),
+                             result.attributes))
+    return failure();
+
+  // (%arg0 : type0, ...) — types are required, per-argument attrs rejected.
+  SmallVector<OpAsmParser::Argument> blockArgs;
+  if (parser.parseArgumentList(blockArgs, OpAsmParser::Delimiter::Paren,
+                               /*allowType=*/true, /*allowAttrs=*/false))
+    return failure();
+
+  // inputs { %name = "str", ... }  (NOTE(review): %name is parsed, not checked)
+  SmallVector<Attribute> inputNameAttrs;
+  if (parser.parseKeyword("inputs") || parser.parseLBrace())
+    return failure();
+  if (parser.parseOptionalRBrace().failed()) {
+    do {
+      OpAsmParser::UnresolvedOperand argRef;
+      StringAttr nameStr;
+      if (parser.parseOperand(argRef) || parser.parseEqual() ||
+          parser.parseAttribute(nameStr))
+        return failure();
+      inputNameAttrs.push_back(nameStr);
+    } while (parser.parseOptionalComma().succeeded());
+    if (parser.parseRBrace()) return failure();
+  }
+
+  // outputs { "str", ... } or { %name = "str", ... }; decided per entry.
+  SmallVector<Attribute> outputNameAttrs;
+  if (parser.parseKeyword("outputs") || parser.parseLBrace())
+    return failure();
+  if (parser.parseOptionalRBrace().failed()) {
+    do {
+      // Try %name = "str" form; if no %, fall through to "str" form
+      OpAsmParser::UnresolvedOperand argRef;
+      auto optArg = parser.parseOptionalOperand(argRef);
+      if (optArg.has_value()) {
+        if (failed(*optArg) || parser.parseEqual()) return failure();
+      }
+      StringAttr nameStr;
+      if (parser.parseAttribute(nameStr)) return failure();
+      outputNameAttrs.push_back(nameStr);
+    } while (parser.parseOptionalComma().succeeded());
+    if (parser.parseRBrace()) return failure();
+  }
+
+  // target { occupancy = V, warps_per_cta = V, smem_size = V }
+  if (parser.parseKeyword("target")) return failure();
+  auto targetSpec = TargetSpecAttr::parse(parser, Type{});
+  if (!targetSpec) return failure();
+  result.addAttribute(getTargetSpecAttrName(result.name), targetSpec);
+  // unreliable_count = N — stored as a signed 64-bit integer attribute.
+  if (parser.parseKeyword("unreliable_count") || parser.parseEqual())
+    return failure();
+  int64_t unrelVal = 0;
+  if (parser.parseInteger(unrelVal)) return failure();
+  result.addAttribute(getUnreliableCountAttrName(result.name),
+                       b.getIntegerAttr(b.getIntegerType(64, /*isSigned=*/true),
+                                         unrelVal));
+
+  // Optional: -> type | -> (type, ...) | -> ().  No arrow means no results.
+  SmallVector<Type> resultTypes;
+  if (parser.parseOptionalArrow().succeeded()) {
+    if (parser.parseOptionalLParen().succeeded()) {
+      if (!parser.parseOptionalRParen().succeeded()) {
+        if (parser.parseTypeList(resultTypes) || parser.parseRParen())
+          return failure();
+      }
+    } else {
+      Type singleTy;
+      if (parser.parseType(singleTy)) return failure();
+      resultTypes.push_back(singleTy);
+    }
+  }
+  // Build function type from block arg types + result types
+  SmallVector<Type> inputTypes;
+  for (auto &arg : blockArgs) inputTypes.push_back(arg.type);
+  auto funcType = FunctionType::get(result.getContext(), inputTypes, resultTypes);
+  result.addAttribute(getFunctionTypeAttrName(result.name),
+                      TypeAttr::get(funcType));
+  result.addAttribute(getInputNamesAttrName(result.name),
+                      b.getArrayAttr(inputNameAttrs));
+  result.addAttribute(getOutputNamesAttrName(result.name),
+                      b.getArrayAttr(outputNameAttrs));
+
+  // Body region; blockArgs are passed as the entry block's arguments.
+  Region *body = result.addRegion();
+  if (parser.parseRegion(*body, blockArgs)) return failure();
+  FuncOp::ensureTerminator(*body, b, result.location);
+  return success();
+}
+
+// Prints the custom assembly accepted by FuncOp::parse:
+//   @name(%arg: type, ...) inputs {...} outputs {...} target <spec>
+//   unreliable_count = N [-> results] { body }
+// Argument names are printed once in the signature, so the body region is
+// printed without its entry-block argument list.
+void FuncOp::print(OpAsmPrinter &p) {
+  Block &block = getBodyBlock();
+  FunctionType ft = getFunctionTypeTyped();
+
+  // @name — printSymbolName quotes/escapes names that are not bare
+  // identifiers, so the output can always be read back by parseSymbolName.
+  p << " ";
+  p.printSymbolName(getSymName());
+
+  // (%arg0: type0, ...)
+  p << "(";
+  llvm::interleaveComma(block.getArguments(), p, [&](BlockArgument arg) {
+    p << arg << ": " << arg.getType();
+  });
+  p << ")";
+
+  // inputs {%arg0 = "name0", ...}: the i-th input name labels block arg i.
+  auto inputNames = getInputNames();
+  unsigned numInputs = inputNames.size();
+  p << " inputs {";
+  for (unsigned i = 0; i < numInputs; ++i) {
+    if (i) p << ", ";
+    p << block.getArgument(i) << " = "
+      << llvm::cast<StringAttr>(inputNames[i]);
+  }
+  p << "}";
+
+  // outputs {...}
+  auto outputNames = getOutputNames();
+  p << " outputs {";
+  if (ft.getNumResults() == 0) {
+    // void: outputs are the block args that follow the inputs, printed in
+    // %argN = "name" form.
+    for (unsigned i = 0; i < outputNames.size(); ++i) {
+      if (i) p << ", ";
+      p << block.getArgument(numInputs + i) << " = "
+        << llvm::cast<StringAttr>(outputNames[i]);
+    }
+  } else {
+    // non-void: outputs are results, so only the "name" strings are printed.
+    llvm::interleaveComma(outputNames, p,
+                          [&](Attribute a) { p << llvm::cast<StringAttr>(a); });
+  }
+  p << "}";
+
+  // target {occupancy = ..., ...}
+  p << " target ";
+  getTargetSpec().print(p);
+
+  // unreliable_count = N
+  p << " unreliable_count = " << getUnreliableCount();
+
+  // [-> results]: nothing for a void function; a single result is printed
+  // bare unless it is itself a function type (then it is parenthesised so
+  // the arrow stays unambiguous); several results are parenthesised.
+  p.printOptionalArrowTypeList(ft.getResults());
+
+  // body
+  p << " ";
+  p.printRegion(getBody(), /*printEntryBlockArgs=*/false,
+                /*printBlockTerminators=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// ReturnOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult ReturnOp::verify() {
+  FunctionType funcTy =
+      llvm::cast<FuncOp>((*this)->getParentOp()).getFunctionTypeTyped();
+  ArrayRef<Type> expected = funcTy.getResults();
+  const size_t numReturned = getOperands().size();
+  if (numReturned != expected.size())
+    return emitOpError("returns ") << numReturned
+           << " value(s) but function has " << expected.size()
+           << " result type(s)";
+
+  // A ts<T, inf> operand may stand in for a ts<T, 1> result (graph output
+  // passed through a ts handle: function arg / output_ref).  The lowering
+  // scalarizes via ts.get @ offset 0 before ts.put.
+  auto isInfToUnitTs = [](Type from, Type to) {
+    auto fromTs = llvm::dyn_cast<TsType>(from);
+    auto toTs = llvm::dyn_cast<TsType>(to);
+    return fromTs && toTs &&
+           fromTs.getElementType() == toTs.getElementType() &&
+           fromTs.getMaxLookback() == kInfLookback && toTs.getMaxLookback() == 1;
+  };
+  for (size_t i = 0; i < numReturned; ++i) {
+    Type actual = getOperands()[i].getType();
+    if (actual != expected[i] && !isInfToUnitTs(actual, expected[i]))
+      return emitOpError("operand #") << i << " type '" << actual
+             << "' does not match function result type '" << expected[i] << "'";
+  }
+  return success();
+}
diff --git a/mlir/lib/KunIr/KunIrToKunGpu.cpp b/mlir/lib/KunIr/KunIrToKunGpu.cpp
new file mode 100644
index 0000000..e669684
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrToKunGpu.cpp
@@ -0,0 +1,821 @@
+//===- KunIrToKunGpu.cpp - Lower kunir ops to kungpu + scf + arith --------===//
+//
+// Lowers a kunir.func whose body contains kunir ops into a form that uses:
+//   - kungpu.time_length / kungpu.ts.get / kungpu.ts.put  for ts I/O
+//   - scf.for for the outer time loop and inner back-window loops
+//   - arith.* / math.* for scalar arithmetic
+//
+// Assumptions / limitations:
+//   - The function body is a single block.
+//   - ts-typed return values are converted to output parameters (void return).
+//   - All inputs to kunir.for_each_back_window must be ts handles (function
+//     arguments or kunir.windowed_output results).
+//   - Each yield operand of for_each_back_window must come from a reduce_* op.
+//   - Cross-sectional kernels (cs_rank) never enter kunir — the Python
+//     frontend (CodegenMLIR._maybe_external_partition) routes them
+//     directly to a pre-compiled CUmodule bundled with the runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "KunGpu/KunGpuOps.h"
+#include "KunIr/KunIrInterfaces.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+
+#include <limits>
+
+using namespace mlir;
+using namespace kunir;
+using namespace kungpu;
+
+// KUN_ASSIGN_OR_FAIL(T x, expr) — for functions returning LogicalResult /
+// FailureOr<U>.  Evaluates `expr` (a FailureOr<T>) once into a __COUNTER__-
+// uniqued temporary: on failure `return failure();`, else `T x = *tmp`.
+// Multi-statement expansion; do not use without braces in if/while/for bodies.
+#define KUN_DETAIL_CAT_(a, b) a##b
+#define KUN_DETAIL_CAT(a, b)  KUN_DETAIL_CAT_(a, b)
+#define KUN_ASSIGN_OR_FAIL_IMPL(decl, expr, tmp)        \
+    auto tmp = (expr);                                  \
+    if (::mlir::failed(tmp)) return ::mlir::failure();  \
+    decl = *std::move(tmp)
+#define KUN_ASSIGN_OR_FAIL(decl, expr)                  \
+    KUN_ASSIGN_OR_FAIL_IMPL(decl, expr,                 \
+        KUN_DETAIL_CAT(_kunOrFail_, __COUNTER__))
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Value tracking
+//
+// Two disjoint maps, both keyed by kunir SSA Values:
+//   tsMap     : ts SSA value  -> the ts handle SSA value to load from.
+//               Populated by: function-arg seeding, WindowedOutputOp
+//               (handle = windowed_temp), inner LowerHelper copy of outer.tsMap.
+//   scalarMap : SSA value     -> the scalar SSA value already materialised
+//               for it.  Populated by: arith/select/constant op results,
+//               window_loop_index, BackRefOp result, for_each_back_window
+//               result, FastWindowedSumOp result, getScalar's offset-0 cache.
+// Running reduce accumulators are kept in LowerHelper::multiAccs instead.
+// A given SSA value lives in at most one map at construction time, but a ts
+// handle can become "additionally" represented in scalarMap once it has been
+// read at offset 0 — that's the getScalar cache.
+//===----------------------------------------------------------------------===//
+
+using HandleMap = llvm::DenseMap<Value, Value>;
+
+// One LowerHelper per scope (outer function body / for_each_back_window body).
+// `zeroOffsetI32` is the function-scope i32 zero constant created once before
+// the outer scf.for; all LowerHelper instances share it.
+//
+// `outerTimeIdx` / `outerLoopLb` are the outer scf.for time loop's induction
+// variable and lower bound (index type).  BackRef's warmup guard
+// (`t - loop_lb < window` → NaN) needs both; threading them through
+// LowerHelper keeps the guard available inside for_each_back_window bodies
+// as well, since `t` there is still the OUTER time index.
+struct LowerHelper {
+  HandleMap tsMap;
+  HandleMap scalarMap;
+  Value zeroOffsetI32;
+  Value outerTimeIdx;   // outer scf.for induction var (index)
+  Value outerLoopLb;    // outer scf.for lower bound (index)
+  // Inside a for_each_back_window body: the current window step offset
+  // (window-1-w).  Used by argmin/argmax to record the position index.
+  Value windowedOffsetI32;
+  // Inside a for_each_back_window body: the raw step index `w` (0 to
+  // window-1, 0 = oldest).  Used by `kunir.window_loop_index`.
+  Value windowIdxI32;
+  // Running accumulators for each reduce op in the enclosing FBW body.
+  // Single-state reduce: 1 entry; argmin/max: {best_val, best_idx};
+  // rank: {less_count, eq_count}.  Seeded by FBW pre-loop, updated by
+  // each reduce step, read by scf.yield.
+  llvm::DenseMap<Value, SmallVector<Value, 2>> multiAccs;
+
+  // Shared util: look up `v` (a ts SSA value) in tsMap, emit
+  // ts.get(handle, offsetI32), return the loaded scalar.  Does NOT touch
+  // scalarMap — callers decide whether/where to cache the result.  Returns
+  // failure (with an in-flight diagnostic at `loc`) if `v` is not a
+  // registered ts handle.
+  //
+  // Used by:
+  //   - getScalar (offset = zeroOffsetI32)
+  //   - BackRefOp branch (offset = constant(window))
+  //   - for_each_back_window block-arg pre-load (offset = window-1-w)
+  FailureOr<Value> getScalarUncached(Value v, Value offsetI32,
+                                      OpBuilder &b, Location loc) {
+    auto entry = tsMap.find(v);
+    if (entry != tsMap.end()) {
+      auto elemTy = llvm::cast<TsType>(v.getType()).getElementType();
+      return TsGetOp::create(b, loc, elemTy, entry->second, offsetI32)
+          .getResult();
+    }
+    return emitError(loc,
+        "kunir-to-kungpu: value is not a registered ts handle in tsMap");
+  }
+
+  // Offset-0 read with scalarMap caching.  Looks up scalarMap first; on miss
+  // loads at offset 0 via getScalarUncached and caches the result.  This is
+  // the standard "current time step" read used by all in-body operand
+  // lookups inside lowerBlock.
+  FailureOr<Value> getScalar(Value v, OpBuilder &b, Location loc) {
+    // Cache hit: a scalar was already materialised for `v`.
+    if (auto hit = scalarMap.find(v); hit != scalarMap.end())
+      return hit->second;
+    KUN_ASSIGN_OR_FAIL(Value loaded, getScalarUncached(v, zeroOffsetI32, b, loc));
+    scalarMap.try_emplace(v, loaded);
+    return loaded;
+  }
+
+  // One step of a multi-state reduce (argmin/argmax/rank).  Mirrors
+  // cpp/Kun/Ops.hpp's step() exactly so CPU and GPU match bit-for-bit
+  // (modulo reduction-order changes).  Updates the two running accumulators
+  // in multiAccs[result]; emits an error on `op` and fails when `op` is not
+  // a multi-state reduce, was not pre-seeded, or (argmin/argmax) no window
+  // offset is set.
+  LogicalResult lowerMultiReduce(Operation *op, OpBuilder &b, Location ol) {
+    const bool isArgMin = isa<kunir::ReduceArgMinOp>(op);
+    const bool isArgMax = isa<kunir::ReduceArgMaxOp>(op);
+    if (!isArgMin && !isArgMax && !isa<kunir::ReduceRankOp>(op))
+      return op->emitError("kunir-to-kungpu: not a multi-state reduce op");
+
+    KUN_ASSIGN_OR_FAIL(Value elem, getScalar(op->getOperand(0), b, ol));
+    FloatType elemTy = llvm::cast<FloatType>(elem.getType());
+    // Checked with find() + a diagnostic instead of operator[] + assert:
+    // the guard has to survive NDEBUG builds (an empty vector would be
+    // indexed below) and must not insert an entry for an un-seeded result.
+    auto accIt = multiAccs.find(op->getResult(0));
+    if (accIt == multiAccs.end() || accIt->second.size() != 2)
+      return op->emitError("kunir-to-kungpu: multi-state reduce must be "
+                           "pre-seeded with 2 iter_args");
+    auto &accs = accIt->second;
+    // argmin/argmax record the window offset, which is only set inside a
+    // for_each_back_window body.
+    if ((isArgMin || isArgMax) && !windowedOffsetI32)
+      return op->emitError("kunir-to-kungpu: argmin/argmax reduce requires "
+                           "a window offset (for_each_back_window body)");
+
+    auto fconst = [&](double v) {
+      return arith::ConstantOp::create(b, ol, elemTy,
+                                          b.getFloatAttr(elemTy, v))
+          .getResult();
+    };
+    auto cmpF = [&](arith::CmpFPredicate pred, Value lhs, Value rhs) {
+      return arith::CmpFOp::create(b, ol, pred, lhs, rhs).getResult();
+    };
+    auto sel = [&](Value cond, Value onTrue, Value onFalse) {
+      return arith::SelectOp::create(b, ol, cond, onTrue, onFalse).getResult();
+    };
+    // x != x (unordered-or-not-equal) holds exactly when x is NaN.
+    auto fIsNan = [&](Value v) { return cmpF(arith::CmpFPredicate::UNE, v, v); };
+    Value nanF = fconst(std::numeric_limits<double>::quiet_NaN());
+    Value one  = fconst(1.0);
+
+    if (isArgMin || isArgMax) {
+      // accs = {best_val, best_idx}.  Ordered compare so NaN doesn't
+      // trigger the update; NaN is propagated by the final selects.
+      Value bestVal = accs[0];
+      Value bestIdx = accs[1];
+      Value bestIsNan = fIsNan(bestVal);
+      Value elemIsNan = fIsNan(elem);
+      auto pred = isArgMin ? arith::CmpFPredicate::OGT
+                            : arith::CmpFPredicate::OLT;
+      Value cmp = cmpF(pred, bestVal, elem);
+      Value newVal = sel(cmp, elem, bestVal);
+      // Record the window-relative position (window-1-w) so
+      // TsArgMin = window - ReduceArgMin gives pandas's
+      // np.argmin()+1 convention (1=oldest, window=newest).
+      Value wIdxF = arith::SIToFPOp::create(b, ol, elemTy, windowedOffsetI32)
+                        .getResult();
+      Value newIdx = sel(cmp, wIdxF, bestIdx);
+      Value anyNan = arith::OrIOp::create(b, ol, bestIsNan, elemIsNan)
+                         .getResult();
+      accs[0] = sel(anyNan, nanF, newVal);
+      accs[1] = sel(anyNan, nanF, newIdx);
+      return success();
+    }
+
+    // ReduceRank: accs = {less_count, eq_count}; `current` is an
+    // outer-scope ts<f, 1> already in scalarMap.
+    KUN_ASSIGN_OR_FAIL(Value cur, getScalar(op->getOperand(1), b, ol));
+    Value lessCnt = accs[0];
+    Value eqCnt   = accs[1];
+    Value curIsNan  = fIsNan(cur);
+    Value elemIsNan = fIsNan(elem);
+    Value anyNan = arith::OrIOp::create(b, ol, curIsNan, elemIsNan).getResult();
+    Value cmpLess = cmpF(arith::CmpFPredicate::OLT, elem, cur);
+    Value cmpEq   = cmpF(arith::CmpFPredicate::OEQ, elem, cur);
+    Value lessP1 = arith::AddFOp::create(b, ol, lessCnt, one).getResult();
+    Value newLess = sel(cmpLess, lessP1, lessCnt);
+    // NaN routed only into less_count — the final rank extract
+    // (`less + (eq + 1) / 2`, computed after the scf.for) then
+    // propagates NaN out.
+    newLess = sel(anyNan, nanF, newLess);
+    Value eqP1 = arith::AddFOp::create(b, ol, eqCnt, one).getResult();
+    accs[0] = newLess;
+    accs[1] = sel(cmpEq, eqP1, eqCnt);
+    return success();
+  }
+
+  // Lower `ops` in order, emitting scalar code at `b`.  Handled here:
+  // binary/unary arith, reduce steps, window_loop_index, select, back_ref
+  // and constant ops.  Anything else: call handleUnknown if provided, else
+  // emit an error on the op and return failure.
+  LogicalResult lowerBlock(
+      llvm::ArrayRef<Operation *> ops, OpBuilder &b,
+      llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
+    for (Operation *op : ops) {
+      Location ol = op->getLoc();
+      if (auto iface = dyn_cast<BinaryArithInterface>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value lhs, getScalar(op->getOperand(0), b, ol));
+        KUN_ASSIGN_OR_FAIL(Value rhs, getScalar(op->getOperand(1), b, ol));
+        scalarMap[op->getResult(0)] = iface.buildScalarOp(b, ol, lhs, rhs);
+      } else if (auto iface = dyn_cast<UnaryArithInterface>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value operand, getScalar(op->getOperand(0), b, ol));
+        scalarMap[op->getResult(0)] = iface.buildScalarOp(b, ol, operand);
+      } else if (auto ri = dyn_cast<ReduceArithInterface>(op)) {
+        // Running acc lives in multiAccs[result][0] (see FBW lowering
+        // for the pre-seed); single- and multi-state reduces share
+        // the same storage so scf.yield reads them uniformly.
+        KUN_ASSIGN_OR_FAIL(Value elem, getScalar(op->getOperand(0), b, ol));
+        auto mit = multiAccs.find(op->getResult(0));
+        if (mit == multiAccs.end() || mit->second.size() != 1)
+          return op->emitError("kunir-to-kungpu: reduce result must be "
+                               "pre-seeded in multiAccs with current acc");
+        mit->second[0] = ri.buildAccumOp(b, ol, mit->second[0], elem);
+      } else if (isa<kunir::ReduceArgMinOp, kunir::ReduceArgMaxOp,
+                       kunir::ReduceRankOp>(op)) {
+        if (failed(lowerMultiReduce(op, b, ol)))
+          return failure();
+      } else if (auto wli = dyn_cast<kunir::WindowLoopIndexOp>(op)) {
+        // sitofp(w, elemTy) — `w` is the enclosing scf.for's IV.
+        if (!windowIdxI32)
+          return wli.emitError("kunir-to-kungpu: window_loop_index used "
+                               "outside a for_each_back_window body");
+        auto resTsTy = llvm::cast<TsType>(wli.getResult().getType());
+        auto elemTy = llvm::cast<FloatType>(resTsTy.getElementType());
+        scalarMap[wli.getResult()] =
+            arith::SIToFPOp::create(b, ol, elemTy, windowIdxI32).getResult();
+      } else if (auto sel = dyn_cast<SelectOp>(op)) {
+        KUN_ASSIGN_OR_FAIL(Value cond, getScalar(sel.getCond(),      b, ol));
+        KUN_ASSIGN_OR_FAIL(Value tv,   getScalar(sel.getTrueValue(), b, ol));
+        KUN_ASSIGN_OR_FAIL(Value fv,   getScalar(sel.getFalseValue(),b, ol));
+        scalarMap[sel.getResult()] =
+            arith::SelectOp::create(b, ol, cond, tv, fv).getResult();
+      } else if (auto br = dyn_cast<BackRefOp>(op)) {
+        // Warmup guard:  if   t - outer_loop_lb < window  →  NaN
+        //                else                            →  ts.get(window)
+        //
+        // Chunk 0 has loop_lb = 0 so the guard collapses to CPU's
+        // "first window-1 outputs are NaN".  Chunk k>=1's per-CTA state
+        // (e.g. fast-stat accumulators) is zero-initialised at function
+        // entry, so each chunk needs `window` add-only steps to rebuild
+        // the trailing-window state — gating the "remove" value with
+        // NaN here propagates through NaN-aware remove patterns
+        // (Equals(oldx, oldx) === false on NaN) and auto-suppresses the
+        // subtract step during warmup.
+        //
+        // The scf.if uses the manual-OpBuilder (not body-builder-lambda) form,
+        // so the enclosing function is still `lowerBlock` and
+        // KUN_ASSIGN_OR_FAIL can return failure from inside the region.
+        int64_t window = br.getWindow();
+        auto inputTs = llvm::cast<TsType>(br.getInput().getType());
+        auto floatTy = llvm::dyn_cast<FloatType>(inputTs.getElementType());
+        if (!floatTy)
+          return br.emitError("kunir-to-kungpu: back_ref input must have a "
+                              "float element type (NaN required for the "
+                              "warmup guard)");
+
+        Value delta = arith::SubIOp::create(b, ol, outerTimeIdx, outerLoopLb);
+        Value windowIdx = arith::ConstantIndexOp::create(b, ol, window);
+        Value inSteady = arith::CmpIOp::create(
+            b, ol, arith::CmpIPredicate::sge, delta, windowIdx);
+        auto ifOp = scf::IfOp::create(b, ol, TypeRange{floatTy}, inSteady,
+                                          /*withElseRegion=*/true);
+        {
+          OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
+          Value offset = arith::ConstantOp::create(
+              ib, ol, ib.getI32Type(), ib.getI32IntegerAttr(window));
+          KUN_ASSIGN_OR_FAIL(Value loaded,
+              getScalarUncached(br.getInput(), offset, ib, ol));
+          scf::YieldOp::create(ib, ol, loaded);
+        }
+        {
+          OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getElseRegion().front());
+          llvm::APFloat qnan =
+              llvm::APFloat::getQNaN(floatTy.getFloatSemantics());
+          Value nanV = arith::ConstantOp::create(
+              ib, ol, floatTy, FloatAttr::get(floatTy, qnan));
+          scf::YieldOp::create(ib, ol, nanV);
+        }
+        scalarMap[br.getResult()] = ifOp.getResult(0);
+      } else if (auto co = dyn_cast<ConstantOp>(op)) {
+        auto resTs = llvm::cast<TsType>(co.getResult().getType());
+        auto ft = llvm::dyn_cast<FloatType>(resTs.getElementType());
+        if (!ft)
+          return co.emitError("kunir-to-kungpu: constant must have a float "
+                              "element type");
+        // The op carries an f64 attribute; convert to the element type
+        // (f32 / f64) so arith.constant gets a type-matching attribute.
+        llvm::APFloat apv(co.getValue());
+        bool losesInfo = false;
+        apv.convert(ft.getFloatSemantics(),
+                    llvm::APFloat::rmNearestTiesToEven, &losesInfo);
+        scalarMap[co.getResult()] = arith::ConstantOp::create(
+            b, ol, ft, b.getFloatAttr(ft, apv));
+      } else if (handleUnknown) {
+        if (failed(handleUnknown(*op))) return failure();
+      } else {
+        return op->emitError("kunir-to-kungpu: cannot lower op in block");
+      }
+    }
+    return success();
+  }
+
+  LogicalResult lowerBlock(
+      Block &block, OpBuilder &b,
+      llvm::function_ref<LogicalResult(Operation &)> handleUnknown = nullptr) {
+    // Snapshot every op but the terminator, then use the ArrayRef overload.
+    SmallVector<Operation *> nonTerminators(
+        llvm::make_pointer_range(block.without_terminator()));
+    return lowerBlock(nonTerminators, b, handleUnknown);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Pass definition
+//===----------------------------------------------------------------------===//
+
+struct LowerKunIrToKunGpuPass
+    : PassWrapper<LowerKunIrToKunGpuPass, OperationPass<kunir::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerKunIrToKunGpuPass)
+  void runOnOperation() override;  // defined out of line, after the namespace
+  StringRef getArgument() const override { return "kunir-to-kungpu"; }
+  StringRef getDescription() const override
+  { return "Lower kunir ops to kungpu + scf + arith/math"; }
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<kungpu::KunGpuDialect, arith::ArithDialect,
+                    math::MathDialect, scf::SCFDialect>();
+  }
+};
+
+} // namespace
+
+void LowerKunIrToKunGpuPass::runOnOperation() {
+  kunir::FuncOp funcOp = getOperation();
+  MLIRContext *ctx = &getContext();
+  Location loc = funcOp.getLoc();
+
+  Block &entry = funcOp.getBody().front();
+
+  // ------------------------------------------------------------------
+  // 1. Extend function signature: ts return types → extra output params.
+  //    Runtime-scalar args (time_length, num_stocks, mask, chunk_size,
+  //    warmup) are added later by convert-kungpu-to-llvm's
+  //    convertFuncSignature, not here.
+  // ------------------------------------------------------------------
+  FunctionType oldFT = funcOp.getFunctionTypeTyped();
+  SmallVector<Type> newArgTys(oldFT.getInputs());
+  SmallVector<unsigned> tsRetIdx;
+  for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
+    if (isa<TsType>(ty)) tsRetIdx.push_back(i);
+
+  // Output buffer args are dense streams.  Type them as `ts<T, inf>` so
+  // a kunir.output_ref can expose them as windowed read sources without
+  // tripping windowed-input verifiers.
+  SmallVector<Value> outParams;
+  for (unsigned i : tsRetIdx) {
+    auto origTy = llvm::cast<TsType>(oldFT.getResult(i));
+    auto infTy  = TsType::get(ctx, origTy.getElementType(),
+                                 std::numeric_limits<uint64_t>::max());
+    outParams.push_back(entry.addArgument(infTy, loc));
+    newArgTys.push_back(infTy);
+  }
+  SmallVector<Type> newRetTys;
+  for (auto [i, ty] : llvm::enumerate(oldFT.getResults()))
+    if (!isa<TsType>(ty)) newRetTys.push_back(ty);
+  funcOp.setFunctionTypeAttr(
+      TypeAttr::get(FunctionType::get(ctx, newArgTys, newRetTys)));
+
+  // For each ts output: false = pending normal return-time write,
+  // true = already written by an output_ref (skip at return time).
+  // A missing entry means the name isn't a ts output of this func.
+  llvm::StringMap<bool> outNameToIsTakenOver;
+  if (auto outNamesAttr = funcOp.getOutputNames()) {
+    for (unsigned i : tsRetIdx) {
+      auto name = llvm::cast<StringAttr>(outNamesAttr[i]).getValue();
+      outNameToIsTakenOver[name] = false;
+    }
+  }
+
+
+  // ------------------------------------------------------------------
+  // 2. Snapshot original ops and find the original return.
+  // ------------------------------------------------------------------
+  SmallVector<Operation *> origOps;
+  kunir::ReturnOp retOp;
+  for (Operation &op : entry) origOps.push_back(&op);
+  for (Operation *op : origOps)
+    if (auto r = dyn_cast<kunir::ReturnOp>(op)) { retOp = r; break; }
+
+  // Collect ts return values from the original return.
+  SmallVector<Value> tsRetVals;
+  if (retOp)
+    for (Value v : retOp.getOperands())
+      if (isa<TsType>(v.getType())) tsRetVals.push_back(v);
+  assert(tsRetVals.size() == outParams.size());
+
+  // ------------------------------------------------------------------
+  // 3. Insert outer scf.for loop before the first original op.
+  //    windowed_temp ops are inserted before this loop (via `b`).
+  // ------------------------------------------------------------------
+  OpBuilder b(ctx);
+  b.setInsertionPoint(origOps.front());
+
+  // Per-chunk bounds.  Both ops are operandless — chunk_size / warmup /
+  // time_length all live as kernel scalar args added by
+  // convert-kungpu-to-llvm and are read at lowering time.  When the
+  // caller's launcher uses num_chunks = 1 it sets chunk_size =
+  // time_length so chunk 0 covers the full range.
+  Value lb = TimeLbOp::create(b, loc, b.getIndexType());
+  Value ub = TimeUbOp::create(b, loc, b.getIndexType());
+  Value c0 = arith::ConstantIndexOp::create(b, loc, 0);
+  Value c1 = arith::ConstantIndexOp::create(b, loc, 1);
+  // Outer-loop ts.get/put always reference the current time step, i.e.
+  // tail-relative offset = 0 (i32).  Created before outerFor so it dominates
+  // every use inside the loop body.
+  Value zeroOffsetI32 = arith::ConstantOp::create(
+      b, loc, b.getI32Type(), b.getI32IntegerAttr(0));
+  auto outerFor = scf::ForOp::create(b, loc, lb, ub, c1);
+
+  // Erase the implicit empty scf.yield (no iter_args → zero-operand yield).
+  outerFor.getBody()->back().erase();
+  OpBuilder fb = OpBuilder::atBlockEnd(outerFor.getBody());
+
+  // Point b before outerFor so windowed_temp ops land outside the time loop.
+  b.setInsertionPoint(outerFor);
+
+  // ------------------------------------------------------------------
+  // 4. Build outer LowerHelper; seed each ts-typed function argument into tsMap.
+  // ------------------------------------------------------------------
+  LowerHelper outer;
+  outer.zeroOffsetI32 = zeroOffsetI32;
+  outer.outerTimeIdx  = outerFor.getInductionVar();
+  outer.outerLoopLb   = outerFor.getLowerBound();
+  unsigned numOrigArgs = oldFT.getNumInputs();
+  for (unsigned i = 0; i < numOrigArgs; ++i) {
+    Value arg = entry.getArgument(i);
+    if (isa<TsType>(arg.getType()))
+      outer.tsMap[arg] = arg;
+  }
+
+  // ------------------------------------------------------------------
+  // 5. Lower original ops in definition order.
+  //
+  //    LowerHelper::lowerBlock handles binary/unary/reduce/select/back_ref ops.
+  //    windowed_output, for_each_back_window, fast_windowed_sum, and
+  //    func.return are outer-scope only and are handled by the callback
+  //    below.
+  // ------------------------------------------------------------------
+  auto outerHandler = [&](Operation &op) -> LogicalResult {
+    if (isa<kunir::ReturnOp>(op)) return success(); // handled in step 7
+
+    Location ol = op.getLoc();
+
+    // windowed_output → allocate windowed_temp outside the loop,
+    //                   fill circular buffer at each time step inside.
+    if (auto woOp = dyn_cast<WindowedOutputOp>(op)) {
+      auto wt = WindowedTempOp::create(b, ol, woOp.getResult().getType());
+      outer.tsMap[woOp.getResult()] = wt.getResult();
+      KUN_ASSIGN_OR_FAIL(Value inputScalar,
+                         outer.getScalar(woOp.getInput(), fb, ol));
+      TsPutOp::create(fb, ol, wt.getResult(), inputScalar);
+      return success();
+    }
+
+    // for_each_back_window → nested scf.for with iter_args.
+    if (auto fwOp = dyn_cast<ForEachBackWindowOp>(op)) {
+      int64_t window = fwOp.getWindow();
+      Block &body = fwOp.getBody().front();
+      auto yieldOp = llvm::cast<YieldOp>(body.getTerminator());
+
+      // Verify all inputs are ts handles (already in outer.tsMap).
+      for (Value inp : fwOp.getInputs()) {
+        if (!outer.tsMap.count(inp))
+          return op.emitError("kunir-to-kungpu: for_each_back_window input "
+                              "must be a ts handle");
+      }
+
+      // Build the iter_args layout: single-state reduce = 1 init,
+      // argmin/max = (best_val, best_idx), rank = (less, eq).
+      struct ReduceSlot {
+        int numAccs;
+        int startIdx;
+      };
+      SmallVector<ReduceSlot> slots; // parallel to yieldOp.getValues()
+      SmallVector<Value> initVals;
+      auto elemTyOf = [](Operation *defOp) -> FloatType {
+        return llvm::cast<FloatType>(
+            llvm::cast<TsType>(defOp->getOperand(0).getType()).getElementType());
+      };
+      auto pushConst = [&](FloatType elemTy, double v) {
+        initVals.push_back(arith::ConstantOp::create(
+            fb, ol, elemTy, fb.getFloatAttr(elemTy, v)));
+      };
+      for (Value yv : yieldOp.getValues()) {
+        auto *defOp = yv.getDefiningOp();
+        if (!defOp) {
+          return op.emitError("kunir-to-kungpu: for_each_back_window yield "
+                              "operand has no defining op");
+        }
+        ReduceSlot slot{0, (int)initVals.size()};
+        if (auto ri = dyn_cast<ReduceArithInterface>(defOp)) {
+          FloatType elemTy = elemTyOf(defOp);
+          initVals.push_back(arith::ConstantOp::create(
+              fb, ol, ri.getInitValue(elemTy)));
+          slot.numAccs = 1;
+        } else if (isa<kunir::ReduceArgMinOp>(defOp) ||
+                     isa<kunir::ReduceArgMaxOp>(defOp)) {
+          FloatType elemTy = elemTyOf(defOp);
+          double inf = std::numeric_limits<double>::infinity();
+          pushConst(elemTy, isa<kunir::ReduceArgMinOp>(defOp) ? inf : -inf);
+          pushConst(elemTy, 0.0);
+          slot.numAccs = 2;
+        } else if (isa<kunir::ReduceRankOp>(defOp)) {
+          FloatType elemTy = elemTyOf(defOp);
+          pushConst(elemTy, 0.0);
+          pushConst(elemTy, 0.0);
+          slot.numAccs = 2;
+        } else {
+          return op.emitError("kunir-to-kungpu: for_each_back_window yield "
+                              "operand must come from a reduce_* op");
+        }
+        slots.push_back(slot);
+      }
+
+      // Create inner scf.for %w = 0 to window step 1 iter_args(acc_i = init_i).
+      // The lambda form lets us emit a proper scf.yield as the body terminator
+      // without fighting the implicit yield created by ensureTerminator.
+      Value wBound  = arith::ConstantIndexOp::create(fb, ol, window);
+      Value wM1_i32 = arith::ConstantOp::create(
+          fb, ol, fb.getI32Type(), fb.getI32IntegerAttr(window - 1));
+
+      // Capture lowerBlock result since the lambda can't return LogicalResult.
+      bool innerOk = true;
+      auto innerFor = scf::ForOp::create(
+          fb, ol, c0, wBound, c1, initVals,
+          [&](OpBuilder &ib, Location il, Value w, ValueRange iterArgs) {
+            // Tail-relative offset for this window step.  Iterating w from 0
+            // to window-1 reads oldest-to-newest, i.e. offset = window-1-w.
+            Value w_i32 =
+                arith::IndexCastOp::create(ib, il, ib.getI32Type(), w);
+            Value windowedOffset =
+                arith::SubIOp::create(ib, il, wM1_i32, w_i32);
+
+            // Inner LowerHelper inherits the outer tsMap/scalarMap so reads
+            // inside the body can still reach outer-scope handles (e.g. a
+            // back_ref placed in the body) and outer-scope scalars, and
+            // shares the function-scope zero-offset constant.
+            //
+            // Pre-loads, written directly into inner.scalarMap (bypassing
+            // the offset-0 cache since these reads are at non-zero offsets):
+            //   - Each block arg = its corresponding ts input loaded at the
+            //     windowed offset (via getScalarUncached).
+            //   - Each reduce result = the matching iter_arg accumulator.
+            //
+            // After this setup the body has no non-zero-offset reads left;
+            // lowerBlock just uses offset 0 + scalarMap for everything.
+            LowerHelper inner{outer.tsMap, outer.scalarMap,
+                                outer.zeroOffsetI32,
+                                outer.outerTimeIdx, outer.outerLoopLb};
+            // Hand the inner helper the current window-step offset
+            // (window-1-w) so multi-state reductions (argmin/argmax)
+            // can use it as the recorded `index`, and the raw step
+            // index `w` for `kunir.window_loop_index`.
+            inner.windowedOffsetI32 = windowedOffset;
+            inner.windowIdxI32 = w_i32;
+            for (auto [i, arg] : llvm::enumerate(body.getArguments())) {
+              auto r = inner.getScalarUncached(fwOp.getInputs()[i],
+                                                windowedOffset, ib, il);
+              if (failed(r)) {
+                innerOk = false;
+                scf::YieldOp::create(ib, il, initVals);
+                return;
+              }
+              inner.scalarMap[arg] = *r;
+            }
+            // Pre-seed accumulators from iter_args.
+            for (auto [i, yv] : llvm::enumerate(yieldOp.getValues())) {
+              const auto &slot = slots[i];
+              SmallVector<Value, 2> accs;
+              accs.reserve(slot.numAccs);
+              for (int j = 0; j < slot.numAccs; ++j)
+                accs.push_back(iterArgs[slot.startIdx + j]);
+              inner.multiAccs[yv] = std::move(accs);
+            }
+
+            if (failed(inner.lowerBlock(body, ib))) {
+              innerOk = false;
+              scf::YieldOp::create(ib, il, initVals); // keep IR structurally valid
+              return;
+            }
+
+            // Yield the updated accumulators back into the iter_args.
+            SmallVector<Value> newAccs(initVals.size());
+            for (auto [i, yv] : llvm::enumerate(yieldOp.getValues())) {
+              const auto &slot = slots[i];
+              const auto &accs = inner.multiAccs[yv];
+              for (int j = 0; j < slot.numAccs; ++j)
+                newAccs[slot.startIdx + j] = accs[j];
+            }
+            scf::YieldOp::create(ib, il, newAccs);
+          });
+      if (!innerOk) return failure();
+
+      // Project each fwOp result from the inner-for's iter_arg slice:
+      // single-state passes through, argmin/max returns best_idx, rank
+      // computes less + (eq + 1) / 2.
+      OpBuilder::InsertionGuard guardPost(b);
+      b.setInsertionPointAfter(innerFor);
+      for (auto [i, res] : llvm::enumerate(fwOp.getResults())) {
+        const auto &slot = slots[i];
+        Value yv = yieldOp.getValues()[i];
+        auto *defOp = yv.getDefiningOp();
+        Value finalVal;
+        if (slot.numAccs == 1) {
+          finalVal = innerFor.getResult(slot.startIdx);
+        } else if (isa<kunir::ReduceArgMinOp>(defOp) ||
+                     isa<kunir::ReduceArgMaxOp>(defOp)) {
+          finalVal = innerFor.getResult(slot.startIdx + 1);
+        } else {
+          // ReduceRankOp:  less + (eq + 1) / 2
+          Value less = innerFor.getResult(slot.startIdx);
+          Value eq   = innerFor.getResult(slot.startIdx + 1);
+          auto elemTy = llvm::cast<FloatType>(less.getType());
+          Value one = arith::ConstantOp::create(
+              b, ol, elemTy, b.getFloatAttr(elemTy, 1.0));
+          Value two = arith::ConstantOp::create(
+              b, ol, elemTy, b.getFloatAttr(elemTy, 2.0));
+          Value eqp1 = arith::AddFOp::create(b, ol, eq, one);
+          Value half = arith::DivFOp::create(b, ol, eqp1, two);
+          finalVal = arith::AddFOp::create(b, ol, less, half);
+        }
+        outer.scalarMap[res] = finalVal;
+      }
+      return success();
+    }
+
+    // kunir.accumulator → kungpu.accumulator (allocated outside the time
+    // loop, like windowed_temp).  Stored in tsMap so that downstream reads
+    // (via getScalar → kungpu.ts.get @ offset 0) resolve to the slot.
+    if (auto acc = dyn_cast<kunir::AccumulatorOp>(op)) {
+      auto ka = kungpu::AccumulatorOp::create(
+          b, ol, acc.getResult().getType(), acc.getNameAttr(),
+          acc.getInitValAttr());
+      outer.tsMap[acc.getResult()] = ka.getResult();
+      return success();
+    }
+
+    // kunir.set_accumulator → scf.if (mask) { kungpu.ts.put %acc, %value }
+    // inside the outer time loop.  mask and value are loaded at offset 0
+    // (current time step) via the standard scalarMap-cached getScalar.
+    // The op's SSA result = `mask ? value : prev_slot` — emitted as an
+    // arith.select and stashed in scalarMap for downstream consumers.
+    if (auto sa = dyn_cast<kunir::SetAccumulatorOp>(op)) {
+      auto accIt = outer.tsMap.find(sa.getAcc());
+      if (accIt == outer.tsMap.end())
+        return op.emitError("kunir-to-kungpu: set_accumulator acc must come "
+                            "from a kunir.accumulator");
+      KUN_ASSIGN_OR_FAIL(Value maskScalar,
+                         outer.getScalar(sa.getMask(),  fb, ol));
+      KUN_ASSIGN_OR_FAIL(Value valueScalar,
+                         outer.getScalar(sa.getValue(), fb, ol));
+      KUN_ASSIGN_OR_FAIL(Value prevScalar,
+                         outer.getScalar(sa.getAcc(),   fb, ol));
+      Value newScalar = arith::SelectOp::create(
+          fb, ol, maskScalar, valueScalar, prevScalar);
+      auto ifOp = scf::IfOp::create(fb, ol, /*resultTypes=*/TypeRange{},
+                                         maskScalar, /*withElseRegion=*/false);
+      OpBuilder ib = OpBuilder::atBlockBegin(&ifOp.getThenRegion().front());
+      TsPutOp::create(ib, ol, accIt->second, valueScalar);
+      outer.scalarMap[sa.getResult()] = newScalar;
+      return success();
+    }
+
+    // output_ref → hoist the write here and register the output buffer
+    // as a ts handle so downstream reads use the same buffer.
+    if (auto oref = dyn_cast<kunir::OutputRefOp>(op)) {
+      auto name = oref.getName();
+      auto it = outNameToIsTakenOver.find(name);
+      if (it == outNameToIsTakenOver.end())
+        return op.emitError("kunir-to-kungpu: output_ref references "
+                            "unknown output '") << name << "'";
+      if (it->second)
+        return op.emitError("kunir-to-kungpu: duplicate output_ref for "
+                            "output '") << name << "'";
+      it->second = true;
+      // Resolve the matching gpu.func output arg by parallel scan over
+      // (output names, outParams).  Both arrays have one entry per ts
+      // output and follow `tsRetIdx` order.
+      auto outNamesAttr = funcOp.getOutputNames();
+      Value buf;
+      for (auto [k, i] : llvm::enumerate(tsRetIdx)) {
+        if (llvm::cast<StringAttr>(outNamesAttr[i]).getValue() == name) {
+          buf = outParams[k];
+          break;
+        }
+      }
+      KUN_ASSIGN_OR_FAIL(Value valueScalar,
+                         outer.getScalar(oref.getValue(), fb, ol));
+      TsPutOp::create(fb, ol, buf, valueScalar);
+      outer.tsMap[oref.getResult()] = buf;
+      return success();
+    }
+
+    // fast_windowed_sum → preserved as a kunir op with scalar result and
+    // ts-handle input.  The kungpu-to-llvm pass owns the actual lowering
+    // (per-thread state allocas + the Kahan-corrected step).
+    if (auto fws = dyn_cast<FastWindowedSumOp>(op)) {
+      auto inputTs = llvm::cast<TsType>(fws.getInput().getType());
+      auto inputIt = outer.tsMap.find(fws.getInput());
+      if (inputIt == outer.tsMap.end())
+        return op.emitError(
+            "kunir-to-kungpu: fast_windowed_sum input must be a ts handle");
+      auto newOp = FastWindowedSumOp::create(
+          fb, ol, /*resultType=*/inputTs.getElementType(),
+          /*input=*/inputIt->second, fws.getWindowAttr());
+      outer.scalarMap[fws.getResult()] = newOp.getResult();
+      return success();
+    }
+
+    return op.emitError("kunir-to-kungpu: unhandled op in outer block");
+  };
+
+  if (failed(outer.lowerBlock(origOps, fb, outerHandler)))
+    return signalPassFailure();
+
+  // ------------------------------------------------------------------
+  // 6. Emit ts.put for each ts return value, then close the outer for.
+  //    Outputs already written by an output_ref are skipped.
+  // ------------------------------------------------------------------
+  auto outNamesAttr = funcOp.getOutputNames();
+  for (auto [k, pair] : llvm::enumerate(llvm::zip(outParams, tsRetVals))) {
+    auto [outParam, rv] = pair;
+    if (outNamesAttr) {
+      auto name = llvm::cast<StringAttr>(outNamesAttr[tsRetIdx[k]]).getValue();
+      auto it = outNameToIsTakenOver.find(name);
+      if (it != outNameToIsTakenOver.end() && it->second) continue;
+    }
+    // Most return values are scalars produced inside the time loop.
+    // A ts<T, inf> operand (graph-input passthrough) is also accepted;
+    // resolve it to a scalar via ts.get @ 0.
+    Value scalarVal;
+    auto sit = outer.scalarMap.find(rv);
+    if (sit != outer.scalarMap.end()) {
+      scalarVal = sit->second;
+    } else {
+      auto res = outer.getScalar(rv, fb, loc);
+      if (failed(res)) return signalPassFailure();
+      scalarVal = *res;
+    }
+    TsPutOp::create(fb, loc, outParam, scalarVal);
+  }
+  scf::YieldOp::create(fb, loc);
+
+  // ------------------------------------------------------------------
+  // 7. Insert a replacement return before the original return op.
+  // ------------------------------------------------------------------
+  if (retOp) {
+    b.setInsertionPoint(retOp);
+    SmallVector<Value> nonTsRets;
+    for (Value v : retOp.getOperands())
+      if (!isa<TsType>(v.getType())) nonTsRets.push_back(v);
+    kunir::ReturnOp::create(b, loc, mlir::ValueRange(nonTsRets));
+  }
+
+  // ------------------------------------------------------------------
+  // 8. Erase original ops in reverse order.
+  // ------------------------------------------------------------------
+  for (Operation *op : llvm::reverse(origOps))
+    op->erase();
+}
+
+//===----------------------------------------------------------------------===//
+// Pass registration
+//===----------------------------------------------------------------------===//
+
+namespace kunir {
+
+/// Returns a newly allocated instance of LowerKunIrToKunGpuPass.
+std::unique_ptr<mlir::Pass> createKunIrToKunGpuPass() {
+  return std::make_unique<LowerKunIrToKunGpuPass>();
+}
+
+/// Registers LowerKunIrToKunGpuPass by constructing a temporary
+/// PassRegistration object for it.
+void registerKunIrToKunGpuPass() {
+  PassRegistration<LowerKunIrToKunGpuPass>();
+}
+
+} // namespace kunir
diff --git a/mlir/lib/KunIr/KunIrTypes.cpp b/mlir/lib/KunIr/KunIrTypes.cpp
new file mode 100644
index 0000000..d9f52e9
--- /dev/null
+++ b/mlir/lib/KunIr/KunIrTypes.cpp
@@ -0,0 +1,52 @@
+#include "KunIr/KunIrTypes.h"
+#include "KunIr/KunIrDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include <limits>
+
+using namespace mlir;
+using namespace kunir;
+
+// Emits full TsTypeStorage definition + TypeBase method implementations.
+#define GET_TYPEDEF_CLASSES
+#include "KunIr/KunIrOpsTypes.cpp.inc"
+
+// Sentinel max-lookback value: TsType::parse stores it when it reads the
+// `inf` keyword, and TsType::print renders it back as `inf`.
+static constexpr uint64_t kInfLookback = std::numeric_limits<uint64_t>::max();
+
+// Parses the custom assembly form of the type:
+//   !kunir.ts<elemType, N>   or   !kunir.ts<elemType, inf>
+// Returns a null Type as soon as any piece of the syntax fails to parse.
+mlir::Type TsType::parse(mlir::AsmParser &parser) {
+  // `<` elemType `,`
+  mlir::Type elementTy;
+  if (parser.parseLess() || parser.parseType(elementTy) ||
+      parser.parseComma())
+    return {};
+
+  // Either the `inf` keyword (keeps the sentinel) or an integer literal.
+  uint64_t lookback = kInfLookback;
+  if (parser.parseOptionalKeyword("inf").failed() &&
+      parser.parseInteger(lookback))
+    return {};
+
+  // `>`
+  if (parser.parseGreater())
+    return {};
+
+  return TsType::get(parser.getContext(), elementTy, lookback);
+}
+
+// Prints the parameter list `<elemType, N>`, spelling the lookback as `inf`
+// when it equals the kInfLookback sentinel.
+void TsType::print(mlir::AsmPrinter &printer) const {
+  const auto lookback = getMaxLookback();
+  printer << "<" << getElementType() << ", ";
+  if (lookback != kInfLookback)
+    printer << lookback;
+  else
+    printer << "inf";
+  printer << ">";
+}
+
+// Registers the dialect's types.  The template argument list of addTypes<>
+// is supplied by the GET_TYPEDEF_LIST section of the included
+// KunIrOpsTypes.cpp.inc.
+void KunIrDialect::registerTypes() {
+  addTypes<
+#define GET_TYPEDEF_LIST
+#include "KunIr/KunIrOpsTypes.cpp.inc"
+  >();
+}
diff --git a/mlir/lib/Python/CMakeLists.txt b/mlir/lib/Python/CMakeLists.txt
new file mode 100644
index 0000000..a11c2b6
--- /dev/null
+++ b/mlir/lib/Python/CMakeLists.txt
@@ -0,0 +1,77 @@
+# KunMLIR: Python extension module for the kunir → PTX → CUBIN → launch flow.
+#
+# The `nanobind_add_module` macro is already in scope here because nanobind
+# is added by the top-level KunQuant CMakeLists.
+
+# MLIR's HandleLLVMOptions puts `-Wl,-z,defs` on the global linker flags.
+# nanobind modules leave symbols such as PyObject_* / PyExc_* undefined so
+# the Python interpreter can resolve them at module-load time, which that
+# flag forbids — remove it from both flag sets for this subdirectory.
+foreach(_kun_flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS)
+  string(REPLACE "-Wl,-z,defs" "" ${_kun_flag_var} "${${_kun_flag_var}}")
+endforeach()
+
+# STABLE_ABI yields a single .abi3.so on CPython ≥ 3.12 and falls back to a
+# per-version module on older Pythons; this matches the runner binding
+# (cpp/Python).
+nanobind_add_module(KunMLIR STABLE_ABI
+  MlirBinding.cpp
+  IRBuilder.cpp
+  PyModule.cpp
+)
+
+# Emit the .so straight into the KunQuant-MLIR Python package directory.
+# libKunCudaRuntime.so is co-located there as well (see
+# mlir/lib/KunCuda/CMakeLists.txt), so the $ORIGIN rpath resolves that
+# sibling at load time.
+set_property(TARGET KunMLIR PROPERTY
+    LIBRARY_OUTPUT_DIRECTORY "${KUN_MLIR_PYTHON_PACKAGE_DIR}")
+
+target_link_libraries(KunMLIR PRIVATE
+  # cuda.h + libcuda stub: the binding's runGraph wrapper calls
+  # cuMemAlloc / cuMemFree directly to back output buffers the caller
+  # omitted.  CUDA::cuda_driver comes from the root KUN_BUILD_MLIR
+  # CUDAToolkit discovery.
+  CUDA::cuda_driver
+
+  # Compiler side.
+  MLIRKunIrDialect
+  MLIRKunGpuDialect
+  MLIRKunIrToKunGpu
+
+  # Runtime side: owns cuda.h + libcuda; this module only hands it
+  # ExecutableData and calls launch().
+  KunCudaRuntime
+)
+
+mlir_target_link_libraries(KunMLIR PRIVATE
+  # Compiler side: core MLIR libraries.
+  MLIRIR
+  MLIRParser
+  MLIRPass
+  MLIRSupport
+  MLIRTransforms
+  MLIRTransformUtils
+
+  # Dialects.
+  MLIRArithDialect
+  MLIRControlFlowDialect
+  MLIRFuncDialect
+  MLIRGPUDialect
+  MLIRIndexDialect
+  MLIRLLVMDialect
+  MLIRMathDialect
+  MLIRNVVMDialect
+  MLIRSCFDialect
+
+  # Conversions used by the kunir-to-llvm pipeline that runs inside
+  # lower_to_ptx.
+  MLIRSCFToControlFlow
+  MLIRControlFlowToLLVM
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRIndexToLLVM
+  MLIRGPUToNVVMTransforms
+  MLIRReconcileUnrealizedCasts
+
+  # Needed for `gpu-module-to-binary` (the upstream pass driven from
+  # PtxBackend.cpp) and the NVVM target serializer it dispatches to.
+  MLIRGPUTransforms
+  MLIRNVVMTarget
+)
diff --git a/mlir/lib/Python/IRBuilder.cpp b/mlir/lib/Python/IRBuilder.cpp
new file mode 100644
index 0000000..dcd4651
--- /dev/null
+++ b/mlir/lib/Python/IRBuilder.cpp
@@ -0,0 +1,463 @@
+//===- IRBuilder.cpp - Programmatic kunir module construction from Python ===//
+
+#include "IRBuilder.h"
+#include "PyModule.h"
+
+#include <limits>
+
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/unique_ptr.h>
+#include <nanobind/stl/vector.h>
+
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/IR/Value.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+
+#include "KunIr/KunIrAttrs.h"
+#include "KunIr/KunIrOps.h"
+#include "KunIr/KunIrTypes.h"
+
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace nb = nanobind;
+using namespace mlir;
+
+namespace kun_mlir_py {
+
+namespace {
+
+/// Stateful kunir builder.  Holds an MLIRContext + ModuleOp (via PyModule),
+/// a current OpBuilder insertion point, and a stack used by
+/// for_each_back_window's region nesting.
+class IRBuilder {
+public:
+  /// Creates an empty top-level module containing a single gpu.module named
+  /// "kungpu_kernels" and leaves the builder positioned at the start of that
+  /// gpu.module's body.
+  IRBuilder()
+      : pm_(std::make_unique<PyModule>()), b_(pm_->ctx.get()) {
+    const Location unknownLoc = b_.getUnknownLoc();
+
+    // Top-level builtin module, owned by the PyModule.
+    pm_->module = OwningOpRef<ModuleOp>(ModuleOp::create(unknownLoc));
+    Block *moduleBody = pm_->module.get().getBody();
+    b_.setInsertionPointToEnd(moduleBody);
+
+    // KunMLIR's pipeline expects exactly one container for all kunir.func
+    // ops, so each IRBuilder creates exactly one gpu.module.
+    gpuMod_ = gpu::GPUModuleOp::create(b_, unknownLoc, "kungpu_kernels");
+    Block &kernelBody = gpuMod_.getBodyRegion().front();
+    b_.setInsertionPointToStart(&kernelBody);
+  }
+
+  // ── Type construction ─────────────────────────────────────────────
+  /// Builds a kunir::TsType from an element dtype name and a lookback.
+  ///
+  /// \param elemDtype  One of "f32"/"float", "f64"/"double", "i1"/"bool".
+  /// \param lookback   Maximum lookback.  0 selects the uint64-max sentinel
+  ///                   (the value TsType prints as `inf`); negative values
+  ///                   are rejected.
+  /// \throws std::runtime_error on an unsupported dtype or a negative
+  ///         lookback.
+  Type tsType(const std::string &elemDtype, int64_t lookback) {
+    Type elem;
+    if (elemDtype == "f32" || elemDtype == "float")
+      elem = b_.getF32Type();
+    else if (elemDtype == "f64" || elemDtype == "double")
+      elem = b_.getF64Type();
+    else if (elemDtype == "i1" || elemDtype == "bool")
+      elem = b_.getI1Type();
+    else
+      throw std::runtime_error("IRBuilder.ts_type: unsupported elem dtype '" +
+                                 elemDtype + "' (expected f32/f64/i1)");
+    // A negative value would silently wrap to a huge unsigned lookback when
+    // cast below (and -1 would alias the `inf` sentinel), so reject it
+    // explicitly instead.
+    if (lookback < 0)
+      throw std::runtime_error(
+          "IRBuilder.ts_type: lookback must be >= 0 (0 means inf), got " +
+          std::to_string(lookback));
+    uint64_t lb = lookback == 0 ? std::numeric_limits<uint64_t>::max()
+                                  : static_cast<uint64_t>(lookback);
+    return kunir::TsType::get(pm_->ctx.get(), elem, lb);
+  }
+
+  // ── Function ──────────────────────────────────────────────────────
+  /// Opens a new kunir.func inside the gpu.module and moves the insertion
+  /// point to the start of its entry block.
+  ///
+  /// Throws std::runtime_error when a function is already open, when a name
+  /// list and its matching type list differ in length, or when
+  /// `unreliableCount` is below -1.  Returns the entry block's arguments.
+  std::vector<Value>
+  beginFunc(const std::string &name,
+              std::vector<Type> inputTypes,
+              std::vector<std::string> inputNames,
+              std::vector<std::string> outputNames,
+              int64_t occupancy, int64_t warpsPerCta,
+              int64_t smemSize, int64_t vectorSize,
+              int64_t unreliableCount,
+              std::vector<Type> resultTypes) {
+    // ---- Argument validation -------------------------------------------
+    if (curFunc_)
+      throw std::runtime_error(
+          "IRBuilder.begin_func: a function is already open — call "
+          "end_func() first");
+
+    const bool inputArityOk = inputNames.size() == inputTypes.size();
+    if (!inputArityOk)
+      throw std::runtime_error(
+          "IRBuilder.begin_func: input_types and input_names must have "
+          "the same length");
+
+    const bool outputArityOk = outputNames.size() == resultTypes.size();
+    if (!outputArityOk)
+      throw std::runtime_error(
+          "IRBuilder.begin_func: result_types and output_names must have "
+          "the same length (non-void form: outputs become result types)");
+
+    // -1 is the whole-time sentinel; anything below it is invalid.
+    if (unreliableCount < -1)
+      throw std::runtime_error(
+          "IRBuilder.begin_func: unreliable_count must be -1 (whole-time) "
+          "or non-negative, got "
+          + std::to_string(unreliableCount));
+
+    // ---- Position the builder ------------------------------------------
+    // Always append to the gpu.module body, wherever the builder was left.
+    Block &kernels = gpuMod_.getBodyRegion().front();
+    b_.setInsertionPointToEnd(&kernels);
+
+    // ---- Assemble the function's type and attributes -------------------
+    auto toNameArray = [&](const std::vector<std::string> &names) {
+      return b_.getArrayAttr(llvm::map_to_vector(
+          names, [&](const std::string &s) -> Attribute {
+            return b_.getStringAttr(s);
+          }));
+    };
+    auto target = kunir::TargetSpecAttr::get(pm_->ctx.get(), occupancy,
+                                             warpsPerCta, smemSize,
+                                             vectorSize);
+    auto outNamesAttr = toNameArray(outputNames);
+    auto inNamesAttr = toNameArray(inputNames);
+    auto funcType = b_.getFunctionType(inputTypes, resultTypes);
+
+    // ---- Create the op and descend into its entry block ----------------
+    curFunc_ = kunir::FuncOp::create(b_, b_.getUnknownLoc(), name, funcType,
+                                     inNamesAttr, outNamesAttr, target,
+                                     unreliableCount);
+    Block &entryBlock = curFunc_.getBodyBlock();
+    b_.setInsertionPointToStart(&entryBlock);
+
+    return std::vector<Value>(entryBlock.args_begin(), entryBlock.args_end());
+  }
+
+  /// Closes the currently open function by emitting a kunir::ReturnOp with
+  /// `returnValues`, then moves the insertion point back to the end of the
+  /// gpu.module body so a following begin_func appends a sibling.
+  ///
+  /// Throws std::runtime_error when no function is open or when
+  /// for_each_back_window regions are still open.
+  void endFunc(std::vector<Value> returnValues) {
+    if (!curFunc_)
+      throw std::runtime_error(
+          "IRBuilder.end_func: no open function — call begin_func() first");
+
+    if (const auto openLoops = loopStack_.size(); openLoops != 0)
+      throw std::runtime_error(
+          "IRBuilder.end_func: " + std::to_string(openLoops) +
+          " for_each_back_window region(s) still open — close them first");
+
+    kunir::ReturnOp::create(b_, b_.getUnknownLoc(), ValueRange(returnValues));
+
+    Block &kernels = gpuMod_.getBodyRegion().front();
+    b_.setInsertionPointToEnd(&kernels);
+    curFunc_ = nullptr;
+  }
+
+  // ── Elemwise ops (InferTypeOpInterface — no result type needed) ──
+  /// kunir::AddOp on (a, b), built through makeBin.
+  Value addOp(Value a, Value b) {
+    return makeBin<kunir::AddOp>(a, b);
+  }
+  /// kunir::SubOp on (a, b), built through makeBin.
+  Value subOp(Value a, Value b) {
+    return makeBin<kunir::SubOp>(a, b);
+  }
+  /// kunir::MulOp on (a, b), built through makeBin.
+  Value mulOp(Value a, Value b) {
+    return makeBin<kunir::MulOp>(a, b);
+  }
+  /// kunir::DivOp on (a, b), built through makeBin.
+  Value divOp(Value a, Value b) {
+    return makeBin<kunir::DivOp>(a, b);
+  }
+  /// kunir::MaxOp on (a, b), built through makeBin.
+  Value maxOp(Value a, Value b) {
+    return makeBin<kunir::MaxOp>(a, b);
+  }
+  /// kunir::MinOp on (a, b), built through makeBin.
+  Value minOp(Value a, Value b) {
+    return makeBin<kunir::MinOp>(a, b);
+  }
+
+  /// kunir::AbsOp on x, built through makeUn.
+  Value absOp(Value x) {
+    return makeUn<kunir::AbsOp>(x);
+  }
+  /// kunir::LogOp on x, built through makeUn.
+  Value logOp(Value x) {
+    return makeUn<kunir::LogOp>(x);
+  }
+  /// kunir::ExpOp on x, built through makeUn.
+  Value expOp(Value x) {
+    return makeUn<kunir::ExpOp>(x);
+  }
+  /// kunir::SqrtOp on x, built through makeUn.
+  Value sqrtOp(Value x) {
+    return makeUn<kunir::SqrtOp>(x);
+  }
+  /// kunir::SignOp on x, built through makeUn.
+  Value signOp(Value x) {
+    return makeUn<kunir::SignOp>(x);
+  }
+
+  // ── Comparison + logical (binary, return ts<i1, 1>) ─────────────
+  /// kunir::GreaterOp on (a, b), built through makeBin.
+  Value gtOp(Value a, Value b) {
+    return makeBin<kunir::GreaterOp>(a, b);
+  }
+  /// kunir::GreaterEqualOp on (a, b), built through makeBin.
+  Value geOp(Value a, Value b) {
+    return makeBin<kunir::GreaterEqualOp>(a, b);
+  }
+  /// kunir::LessOp on (a, b), built through makeBin.
+  Value ltOp(Value a, Value b) {
+    return makeBin<kunir::LessOp>(a, b);
+  }
+  /// kunir::LessEqualOp on (a, b), built through makeBin.
+  Value leOp(Value a, Value b) {
+    return makeBin<kunir::LessEqualOp>(a, b);
+  }
+  /// kunir::EqualOp on (a, b), built through makeBin.
+  Value eqOp(Value a, Value b) {
+    return makeBin<kunir::EqualOp>(a, b);
+  }
+  /// kunir::AndOp on (a, b), built through makeBin.
+  Value andOp(Value a, Value b) {
+    return makeBin<kunir::AndOp>(a, b);
+  }
+  /// kunir::OrOp on (a, b), built through makeBin.
+  Value orOp(Value a, Value b) {
+    return makeBin<kunir::OrOp>(a, b);
+  }
+  /// kunir::NotOp on x (unary), built through makeUn.
+  Value notOp(Value x) {
+    return makeUn<kunir::NotOp>(x);
+  }
+
+  // ── Select (cond, true_value, false_value) ──────────────────────
+  /// Creates a kunir::SelectOp from (cond, tv, fv) at an unknown location.
+  Value selectOp(Value cond, Value tv, Value fv) {
+    const Location unknownLoc = b_.getUnknownLoc();
+    return kunir::SelectOp::create(b_, unknownLoc, cond, tv, fv);
+  }
+
+  // ── Scalar constant lifted to ts<T, 1> ─────────────────────────
+  /// Creates a kunir::ConstantOp of type `tsTy` whose payload is `value`
+  /// wrapped in an F64 float attribute.
+  Value constantOp(double value, Type tsTy) {
+    const Location unknownLoc = b_.getUnknownLoc();
+    auto payload = b_.getF64FloatAttr(value);
+    return kunir::ConstantOp::create(b_, unknownLoc, tsTy, payload);
+  }
+
+  // ── Accumulator / SetAccumulator ───────────────────────────────
+  /// Creates a kunir::AccumulatorOp of type `tsTy`, carrying `name` as a
+  /// string attribute and `initVal` as an F64 float attribute.
+  Value accumulatorOp(std::string name, Type tsTy, double initVal) {
+    auto nameAttr = b_.getStringAttr(name);
+    auto initAttr = b_.getF64FloatAttr(initVal);
+    return kunir::AccumulatorOp::create(b_, b_.getUnknownLoc(), tsTy,
+                                        nameAttr, initAttr);
+  }
+  /// Creates a kunir::SetAccumulatorOp on (acc, mask, value); the result
+  /// type is taken from `acc`.
+  Value setAccumulatorOp(Value acc, Value mask, Value value) {
+    Type resultTy = acc.getType();
+    return kunir::SetAccumulatorOp::create(b_, b_.getUnknownLoc(), resultTy,
+                                           acc, mask, value);
+  }
+
+  // ── Graph-output buffer as ts handle ─────────────────────────────
+  /// Creates a kunir::OutputRefOp that ties `value` to the output `name`.
+  ///
+  /// The result type is a TsType with `value`'s element type and the
+  /// uint64-max lookback sentinel.
+  /// \throws std::runtime_error if `value` is not ts-typed.
+  Value outputRefOp(std::string name, Value value) {
+    // dyn_cast + explicit check: llvm::cast on a non-ts type would assert
+    // (or be undefined behaviour without assertions) rather than raise the
+    // std::runtime_error the other IRBuilder checks use.
+    auto vTs = llvm::dyn_cast<kunir::TsType>(value.getType());
+    if (!vTs)
+      throw std::runtime_error(
+          "IRBuilder.output_ref: value must be a kunir.ts-typed value");
+    auto resTy = kunir::TsType::get(pm_->ctx.get(), vTs.getElementType(),
+                                       std::numeric_limits<uint64_t>::max());
+    return kunir::OutputRefOp::create(b_, b_.getUnknownLoc(), resTy,
+                                          b_.getStringAttr(name), value);
+  }
+
+  // ── Windowed buffer materialization ───────────────────────────────
+  /// Creates a kunir::WindowedOutputOp over `x`.
+  ///
+  /// The result type is a TsType with `x`'s element type and a lookback of
+  /// `length`.
+  /// \throws std::runtime_error if `x` is not ts-typed or `length` is
+  ///         negative.
+  Value windowedOutputOp(Value x, int64_t length) {
+    // dyn_cast + explicit check: llvm::cast on a non-ts type would assert
+    // (or be undefined behaviour without assertions).
+    auto inTs = llvm::dyn_cast<kunir::TsType>(x.getType());
+    if (!inTs)
+      throw std::runtime_error(
+          "IRBuilder.windowed_output: input must be a kunir.ts-typed value");
+    // A negative length would wrap to a huge unsigned lookback in the cast
+    // below.
+    if (length < 0)
+      throw std::runtime_error(
+          "IRBuilder.windowed_output: length must be non-negative, got " +
+          std::to_string(length));
+    auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(),
+                                          static_cast<uint64_t>(length));
+    return kunir::WindowedOutputOp::create(b_, b_.getUnknownLoc(), resultTy, x,
+                                                length);
+  }
+
+  // ── Back-reference + Fast windowed sum (high-level: ts → ts<T,1>) ─
+  /// Creates a kunir::BackRefOp on `x` with the given `window`.
+  ///
+  /// The result type is a TsType with `x`'s element type and lookback 1.
+  /// \throws std::runtime_error if `x` is not ts-typed.
+  Value backRefOp(Value x, int64_t window) {
+    // dyn_cast + explicit check: llvm::cast on a non-ts type would assert
+    // (or be undefined behaviour without assertions).
+    auto inTs = llvm::dyn_cast<kunir::TsType>(x.getType());
+    if (!inTs)
+      throw std::runtime_error(
+          "IRBuilder.back_ref: input must be a kunir.ts-typed value");
+    auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(), 1);
+    return kunir::BackRefOp::create(b_, b_.getUnknownLoc(), resultTy, x, window);
+  }
+  /// Creates a kunir::FastWindowedSumOp on `x` with the given `window`.
+  ///
+  /// The result type is a TsType with `x`'s element type and lookback 1.
+  /// \throws std::runtime_error if `x` is not ts-typed.
+  Value fastWindowedSumOp(Value x, int64_t window) {
+    // dyn_cast + explicit check: llvm::cast on a non-ts type would assert
+    // (or be undefined behaviour without assertions).
+    auto inTs = llvm::dyn_cast<kunir::TsType>(x.getType());
+    if (!inTs)
+      throw std::runtime_error(
+          "IRBuilder.fast_windowed_sum: input must be a kunir.ts-typed value");
+    auto resultTy = kunir::TsType::get(pm_->ctx.get(), inTs.getElementType(), 1);
+    return kunir::FastWindowedSumOp::create(b_, b_.getUnknownLoc(), resultTy, x,
+                                                 window);
+  }
+
+  // ── For-each-back-window region ───────────────────────────────────
+  /// Opens a kunir::ForEachBackWindowOp region and descends into its body.
+  ///
+  /// The body block receives one argument per input, each typed as a TsType
+  /// with that input's element type and lookback 1.  The previous insertion
+  /// point and the loop op are pushed on ipStack_ / loopStack_ so that
+  /// endForEachBackWindow can resume after the loop.
+  ///
+  /// \returns the body block's arguments, in input order.
+  /// \throws std::runtime_error if any input is not ts-typed; in that case
+  ///         no op is created and the builder state is left unchanged.
+  std::vector<Value>
+  beginForEachBackWindow(std::vector<Value> inputs, int64_t window,
+                            std::vector<Type> resultTypes) {
+    Location loc = b_.getUnknownLoc();
+
+    // Validate every input and compute the body argument types BEFORE
+    // touching the IR.  llvm::cast on a non-ts type would assert (or be
+    // undefined behaviour without assertions); checking up front also means
+    // a bad input cannot leave a half-built loop op behind.
+    std::vector<Type> bodyArgTypes;
+    bodyArgTypes.reserve(inputs.size());
+    for (Value in : inputs) {
+      auto ts = llvm::dyn_cast<kunir::TsType>(in.getType());
+      if (!ts)
+        throw std::runtime_error(
+            "IRBuilder.begin_for_each_back_window: every input must be a "
+            "kunir.ts-typed value");
+      bodyArgTypes.push_back(
+          kunir::TsType::get(pm_->ctx.get(), ts.getElementType(), 1));
+    }
+
+    auto loopOp = kunir::ForEachBackWindowOp::create(b_, loc, resultTypes,
+                                                          inputs, window);
+    // Populate body block: one block arg per input, each ts<elemType, 1>.
+    // Nothing between the allocation and push_back can throw, so the block
+    // is always handed over to the region.
+    Block *body = new Block;
+    for (Type argTy : bodyArgTypes)
+      body->addArgument(argTy, loc);
+    loopOp.getBody().push_back(body);
+
+    // Descend into the body; remember where to resume.
+    ipStack_.push_back(b_.saveInsertionPoint());
+    loopStack_.push_back(loopOp);
+    b_.setInsertionPointToStart(body);
+
+    return std::vector<Value>(body->args_begin(), body->args_end());
+  }
+
+  /// Terminate the innermost open loop body with a kunir.yield of
+  /// `yieldValues`, restore the insertion point saved when the loop was
+  /// opened, and return the loop op's results.  Throws std::runtime_error
+  /// when no loop is open.
+  std::vector<Value>
+  endForEachBackWindow(std::vector<Value> yieldValues) {
+    const bool hasOpenLoop = !loopStack_.empty();
+    if (!hasOpenLoop)
+      throw std::runtime_error(
+          "IRBuilder.end_for_each_back_window: no open loop");
+
+    // The builder is still positioned inside the body, so the yield lands
+    // there.
+    kunir::YieldOp::create(b_, b_.getUnknownLoc(), ValueRange(yieldValues));
+
+    // Pop the loop and its matching saved insertion point.
+    kunir::ForEachBackWindowOp closedLoop = loopStack_.back();
+    loopStack_.pop_back();
+    OpBuilder::InsertPoint resumeAt = ipStack_.back();
+    ipStack_.pop_back();
+    b_.restoreInsertionPoint(resumeAt);
+
+    auto results = closedLoop.getResults();
+    return std::vector<Value>(results.begin(), results.end());
+  }
+
+  // ── Reductions (must be inside a loop body) ───────────────────────
+  // Single-operand reductions.  Each wrapper forwards to makeReduce<>, which
+  // creates the named op at an unknown location with x's own type as the
+  // result type.
+  Value reduceAddOp(Value x) { return makeReduce<kunir::ReduceAddOp>(x); }
+  Value reduceMulOp(Value x) { return makeReduce<kunir::ReduceMulOp>(x); }
+  Value reduceMaxOp(Value x) { return makeReduce<kunir::ReduceMaxOp>(x); }
+  Value reduceMinOp(Value x) { return makeReduce<kunir::ReduceMinOp>(x); }
+  Value reduceArgMinOp(Value x) { return makeReduce<kunir::ReduceArgMinOp>(x); }
+  Value reduceArgMaxOp(Value x) { return makeReduce<kunir::ReduceArgMaxOp>(x); }
+  Value reduceRankOp(Value x, Value cur) {
+    // The op is created with x's type as its result type
+    // (SameOperandsAndResultType).
+    Location loc = b_.getUnknownLoc();
+    Type sameTy = x.getType();
+    return kunir::ReduceRankOp::create(b_, loc, sameTy, x, cur);
+  }
+  Value windowLoopIndexOp(Type ts_ty) {
+    // Operand-less op; the caller chooses the result type.
+    Location loc = b_.getUnknownLoc();
+    return kunir::WindowLoopIndexOp::create(b_, loc, ts_ty);
+  }
+
+  // ── Finalize ──────────────────────────────────────────────────────
+  /// Hand ownership of the module under construction to the caller.  The
+  /// builder's `pm_` is moved from, so the builder is unusable afterwards.
+  ///
+  /// \throws std::runtime_error if the builder was already consumed by a
+  ///         previous finish(), if a function is still open, or if a
+  ///         for_each_back_window region is still open.
+  std::unique_ptr<PyModule> finish() {
+    // Without this guard a second finish() would quietly return a null
+    // module; fail loudly instead, the same way toString() does.
+    if (!pm_)
+      throw std::runtime_error(
+          "IRBuilder.finish: builder has already been consumed by finish()");
+    if (curFunc_)
+      throw std::runtime_error(
+          "IRBuilder.finish: a function is still open — call end_func() "
+          "first");
+    if (!loopStack_.empty())
+      throw std::runtime_error(
+          "IRBuilder.finish: for_each_back_window region(s) still open");
+    return std::move(pm_);
+  }
+
+  /// Textual form of the module under construction.  Throws
+  /// std::runtime_error once the module has been moved out of the builder.
+  std::string toString() const {
+    if (pm_)
+      return pm_->toString();
+    throw std::runtime_error(
+        "IRBuilder.to_string: builder has been consumed by finish()");
+  }
+
+private:
+  /// Create a two-operand `OpTy` at an unknown location.
+  template <typename OpTy> Value makeBin(Value a, Value b) {
+    Location loc = b_.getUnknownLoc();
+    return OpTy::create(b_, loc, a, b);
+  }
+  /// Create a one-operand `OpTy` at an unknown location.
+  template <typename OpTy> Value makeUn(Value x) {
+    Location loc = b_.getUnknownLoc();
+    return OpTy::create(b_, loc, x);
+  }
+  /// Create a one-operand `OpTy` whose result type is passed explicitly as
+  /// x's type (SameOperandsAndResultType).
+  template <typename OpTy> Value makeReduce(Value x) {
+    Location loc = b_.getUnknownLoc();
+    Type sameTy = x.getType();
+    return OpTy::create(b_, loc, sameTy, x);
+  }
+
+  std::unique_ptr<PyModule> pm_;
+  OpBuilder b_;
+  gpu::GPUModuleOp gpuMod_;
+  kunir::FuncOp curFunc_;
+  std::vector<OpBuilder::InsertPoint> ipStack_;
+  std::vector<kunir::ForEachBackWindowOp> loopStack_;
+};
+
+/// Print `v` into a string; a null Value yields "<null Value>".
+std::string valueRepr(Value v) {
+  std::string text;
+  llvm::raw_string_ostream stream(text);
+  if (!v) {
+    stream << "<null Value>";
+    return text;
+  }
+  v.print(stream);
+  return text;
+}
+
+/// Print `t` into a string; a null Type yields "<null Type>".
+std::string typeRepr(Type t) {
+  std::string text;
+  llvm::raw_string_ostream stream(text);
+  if (!t) {
+    stream << "<null Type>";
+    return text;
+  }
+  t.print(stream);
+  return text;
+}
+
+} // namespace
+
+/// Register the `Value`, `Type` and `IRBuilder` Python classes on module `m`.
+void registerIRBuilder(nb::module_ &m) {
+  // Opaque MLIR Value / Type wrappers.  No mutating methods — just an
+  // identity / repr.  They live as long as the IRBuilder + its resulting
+  // PyModule.
+  nb::class_<Value>(m, "Value")
+      .def("__repr__", [](Value v) { return "<KunMLIR.Value " + valueRepr(v) + ">"; })
+      .def("__str__",  [](Value v) { return valueRepr(v); });
+
+  nb::class_<Type>(m, "Type")
+      .def("__repr__", [](Type t) { return "<KunMLIR.Type " + typeRepr(t) + ">"; })
+      .def("__str__",  [](Type t) { return typeRepr(t); });
+
+  nb::class_<IRBuilder>(m, "IRBuilder",
+        "Stateful builder that constructs a kunir module programmatically.\n"
+        "Wrap your translator around this — it's the canonical alternative "
+        "to round-tripping through MLIR text via parse().")
+      .def(nb::init<>())
+
+      // Type
+      .def("ts_type", &IRBuilder::tsType,
+            nb::arg("elem_dtype"), nb::arg("lookback"),
+            "Build a !kunir.ts<elem_dtype, lookback>.  lookback==0 → 'inf'.")
+
+      // Function
+      .def("begin_func", &IRBuilder::beginFunc,
+            nb::arg("name"),
+            nb::arg("input_types"), nb::arg("input_names"),
+            nb::arg("output_names"),
+            nb::arg("occupancy"), nb::arg("warps_per_cta"),
+            nb::arg("smem_size"), nb::arg("vector_size"),
+            nb::arg("unreliable_count"),
+            nb::arg("result_types"),
+            "Open a new kunir.func.  Returns its argument Values.  "
+            "`unreliable_count` is the partition-local warmup depth "
+            "(max windowed-chain depth from any input to any output).")
+      .def("end_func", &IRBuilder::endFunc, nb::arg("return_values"),
+            "Close the current kunir.func with a kunir.return.")
+
+      // Elemwise
+      .def("add",    &IRBuilder::addOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("sub",    &IRBuilder::subOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("mul",    &IRBuilder::mulOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("div",    &IRBuilder::divOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("max",    &IRBuilder::maxOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("min",    &IRBuilder::minOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("abs",    &IRBuilder::absOp,    nb::arg("x"))
+      .def("log",    &IRBuilder::logOp,    nb::arg("x"))
+      .def("exp",    &IRBuilder::expOp,    nb::arg("x"))
+      .def("sqrt",   &IRBuilder::sqrtOp,   nb::arg("x"))
+      .def("sign",   &IRBuilder::signOp,   nb::arg("x"))
+
+      // Comparison + logical (binary). Cmp ops return ts<i1, 1>;
+      // and/or expect ts<i1, *> operands and also return ts<i1, 1>.
+      .def("gt",     &IRBuilder::gtOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("ge",     &IRBuilder::geOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("lt",     &IRBuilder::ltOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("le",     &IRBuilder::leOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("eq",     &IRBuilder::eqOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("and_",   &IRBuilder::andOp,    nb::arg("lhs"), nb::arg("rhs"))
+      .def("or_",    &IRBuilder::orOp,     nb::arg("lhs"), nb::arg("rhs"))
+      .def("not_",   &IRBuilder::notOp,    nb::arg("x"))
+
+      // Constant
+      .def("constant", &IRBuilder::constantOp,
+            nb::arg("value"), nb::arg("type"),
+            "Build a kunir.constant of element-type matching `type` (a "
+            "ts<T, 1>).  Pass float('nan') for NaN.")
+
+      // Accumulator (declare a slot / conditionally store into it)
+      .def("accumulator", &IRBuilder::accumulatorOp,
+            nb::arg("name"), nb::arg("type"), nb::arg("init_val") = 0.0,
+            "Build a kunir.accumulator with the given name and ts<T, 1> "
+            "result type.  `init_val` is the initial scalar stored in the "
+            "slot before the first time step (pass float('nan') for NaN).")
+      .def("set_accumulator", &IRBuilder::setAccumulatorOp,
+            nb::arg("acc"), nb::arg("mask"), nb::arg("value"),
+            "Conditionally store `value` into `acc` when `mask` is true. "
+            "Side-effecting; returns the slot's new value for the current "
+            "step (`mask ? value : prev_accumulator`).")
+
+      // Select: cond ? true_value : false_value
+      .def("select", &IRBuilder::selectOp,
+            nb::arg("cond"), nb::arg("true_value"), nb::arg("false_value"))
+
+      // Windowed materialization
+      .def("windowed_output", &IRBuilder::windowedOutputOp,
+            nb::arg("x"), nb::arg("length"))
+
+      // Graph-output buffer handle
+      .def("output_ref", &IRBuilder::outputRefOp,
+            nb::arg("name"), nb::arg("value"),
+            "ts handle to a graph-output buffer.  Downstream reads use "
+            "the same buffer the kernel writes `value` into.")
+
+      // Back-reference + Fast windowed sum
+      .def("back_ref",          &IRBuilder::backRefOp,
+            nb::arg("x"), nb::arg("window"))
+      .def("fast_windowed_sum", &IRBuilder::fastWindowedSumOp,
+            nb::arg("x"), nb::arg("window"))
+
+      // Loop
+      .def("begin_for_each_back_window", &IRBuilder::beginForEachBackWindow,
+            nb::arg("inputs"), nb::arg("window"), nb::arg("result_types"),
+            "Open a for_each_back_window region.  Returns block args (one "
+            "per loop input, type ts<elem,1>).")
+      .def("end_for_each_back_window", &IRBuilder::endForEachBackWindow,
+            nb::arg("yield_values"),
+            "Close the current for_each_back_window with a kunir.yield, "
+            "returning the loop op's results.")
+
+      // Reductions
+      .def("reduce_add", &IRBuilder::reduceAddOp, nb::arg("x"))
+      .def("reduce_mul", &IRBuilder::reduceMulOp, nb::arg("x"))
+      .def("reduce_max", &IRBuilder::reduceMaxOp, nb::arg("x"))
+      .def("reduce_min", &IRBuilder::reduceMinOp, nb::arg("x"))
+      .def("reduce_argmin", &IRBuilder::reduceArgMinOp, nb::arg("x"))
+      .def("reduce_argmax", &IRBuilder::reduceArgMaxOp, nb::arg("x"))
+      .def("reduce_rank",   &IRBuilder::reduceRankOp,
+            nb::arg("x"), nb::arg("current"))
+      .def("window_loop_index", &IRBuilder::windowLoopIndexOp,
+            nb::arg("ts_ty"))
+
+      // Finalize / debug
+      .def("to_string", &IRBuilder::toString,
+            "Print the module under construction (for debugging — does "
+            "not consume the builder).")
+      .def("finish", &IRBuilder::finish,
+            "Hand off the module to a KunMLIR.ModuleOp.  Builder is "
+            "consumed.");
+}
+
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/IRBuilder.h b/mlir/lib/Python/IRBuilder.h
new file mode 100644
index 0000000..edbeb74
--- /dev/null
+++ b/mlir/lib/Python/IRBuilder.h
@@ -0,0 +1,28 @@
+//===- IRBuilder.h - Programmatic kunir module construction from Python ---===//
+//
+// Exposes a stateful builder to Python so a translator (e.g. KunQuant's
+// codegen pass) can emit kunir ops without going through textual MLIR.
+//
+// Lifecycle:
+//   ir = KunMLIR.IRBuilder()
+//   args = ir.begin_func(name, input_types, input_names, output_names,
+//                        occupancy, warps_per_cta, smem_size, vector_size,
+//                        unreliable_count, result_types)
+//   v = ir.add(args[0], args[1])
+//   ir.end_func([v])
+//   ...
+//   mod = ir.finish()                # → KunMLIR.ModuleOp
+//
+// `Value` and `Type` are opaque wrappers around mlir::Value / mlir::Type.
+// They are valid only while the IRBuilder (and the resulting PyModule)
+// are alive.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace kun_mlir_py {
+/// Register the IRBuilder + Value + Type nanobind classes on `m`.
+void registerIRBuilder(::nanobind::module_ &m);
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/MlirBinding.cpp b/mlir/lib/Python/MlirBinding.cpp
new file mode 100644
index 0000000..7a4f279
--- /dev/null
+++ b/mlir/lib/Python/MlirBinding.cpp
@@ -0,0 +1,716 @@
+//===- MlirBinding.cpp - Python bindings for the kunir → cubin flow ----===//
+//
+// Exposes:
+//   KunMLIR.parse(text)            → ModuleOp     (loads MLIR text)
+//   ModuleOp.to_string() / __str__  → str          (dumps the module)
+//   KunMLIR.lower_to_ptx(mod, …)   → str          (kunir → PTX, debug only)
+//   KunMLIR.compile(mod, …)        → Executable   (kunir → loadable kernel)
+//   Executable.launch({name: cupy}) → None         (cuLaunchKernel + sync)
+//
+// `compile` is the main path; `lower_to_ptx` is for inspecting the
+// intermediate PTX text that the upstream `gpu-module-to-binary` pass
+// produces (with `format=isa`).  Both go through the same lowering
+// pipeline — see PtxBackend.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/unordered_map.h>
+#include <nanobind/stl/vector.h>
+#include <nanobind/stl/unique_ptr.h>
+
+#include "PyModule.h"     // shared MLIRContext + ModuleOp wrapper
+#include "IRBuilder.h"    // nanobind class for programmatic kunir construction
+#include "dlpack.h"       // vendored DLPack ABI (consumer-only)
+
+#include "KunCuda/Runtime.h"
+#include "KunGpu/PtxBackend.h"
+
+#include "llvm/ADT/StringRef.h"
+
+#include <cuda.h>
+
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace nb = nanobind;
+
+using kun_mlir_py::PyModule;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// PTX inspection (debug)
+//===----------------------------------------------------------------------===//
+
+/// Lower `pm`'s module to PTX text.  Empty `gpuArch` / `targetTriple` /
+/// `targetFeatures` leave the corresponding PtxCompileOptions field at
+/// whatever its default-constructed value is.  Throws std::runtime_error
+/// when the lowering reports failure.
+static std::string pyLowerToPtx(PyModule &pm, const std::string &gpuArch,
+                                const std::string &targetTriple,
+                                const std::string &targetFeatures,
+                                unsigned optLevel,
+                                const std::string &toolkitPath) {
+  kungpu::PtxCompileOptions options;
+  options.optLevel    = optLevel;
+  options.toolkitPath = toolkitPath;
+  if (!gpuArch.empty())
+    options.targetCpu = gpuArch;
+  if (!targetTriple.empty())
+    options.targetTriple = targetTriple;
+  if (!targetFeatures.empty())
+    options.targetFeatures = targetFeatures;
+
+  std::string ptxText;
+  const bool lowerFailed =
+      failed(kungpu::compileKunIrToPtx(pm.module.get(), options, ptxText));
+  if (lowerFailed)
+    throw std::runtime_error("KunMLIR.lower_to_ptx failed");
+  return ptxText;
+}
+
+//===----------------------------------------------------------------------===//
+// nanobind glue: read a Python GPU array via DLPack → device pointer + shape
+//===----------------------------------------------------------------------===//
+
+/// Result of reading one GPU array argument.  `ptr` is the device pointer
+/// the kernel will consume; `(timeLength, numStocks)` is the resolved
+/// 2-D shape used for cross-arg consistency checks.
+// Plain aggregate; readDLPack brace-initialises it positionally as
+// {ptr, shape[0], shape[1]}, so the field order below is significant.
+struct CudaArrayInfo {
+  uintptr_t ptr;          ///< data pointer with byte_offset already applied
+  int64_t   timeLength;   ///< shape[0]
+  int64_t   numStocks;    ///< shape[1]
+};
+
+/// DLPack-spec encoding of the executor's CUDA stream, ready to hand to
+/// `obj.__dlpack__(stream=…)`.  The protocol uses int sentinels for the
+/// two default streams and the actual `CUstream` pointer otherwise:
+///
+///   None  ⇒  producer chooses (no sync)
+///   1     ⇒  legacy default stream
+///   2     ⇒  per-thread default stream
+///   other ⇒  CUstream pointer cast to int
+///
+/// We're never "no sync" — every launch must serialise on the executor's
+/// stream — so `stream_ == nullptr` (default-stream executor) maps to 1.
+static nb::object dlpackStreamArg(CUstream stream) {
+  // A null CUstream is sent as the integer 1; any other stream is sent as
+  // its pointer value.
+  const bool isNullStream = (stream == nullptr);
+  if (!isNullStream)
+    return nb::int_(reinterpret_cast<uintptr_t>(stream));
+  return nb::int_(1);
+}
+
+/// Throws if `(shape, stridesBytes)` doesn't describe a C-contiguous
+/// 2-D buffer with `elemSize`-byte elements.  `stridesBytes == nullptr`
+/// is the "default row-major" case (always contiguous).
+// Stride rules applied below:
+//   * stridesBytes == nullptr  → accepted (implicit row-major layout).
+//   * any extent == 0          → accepted (no element is ever addressed).
+//   * an extent == 1           → that dimension's stride is ignored, because
+//                                it is never multiplied by a non-zero index.
+//                                Producers may report an arbitrary stride
+//                                there (e.g. a transposed (1, N) tensor).
+//   * otherwise                → strides must equal
+//                                (elemSize * shape[1], elemSize).
+// Throws std::runtime_error naming `paramName` on violation.
+static void requireRowMajorContiguous2D(const std::string &paramName,
+                                        const int64_t *shape,
+                                        const int64_t *stridesBytes,
+                                        int64_t elemSize) {
+  if (!stridesBytes)
+    return;
+  if (shape[0] == 0 || shape[1] == 0)
+    return;
+
+  const int64_t innerStride = elemSize;
+  const int64_t outerStride = elemSize * shape[1];
+  const bool outerOk = shape[0] == 1 || stridesBytes[0] == outerStride;
+  const bool innerOk = shape[1] == 1 || stridesBytes[1] == innerStride;
+  if (outerOk && innerOk)
+    return;
+
+  std::stringstream ss;
+  ss << "'" << paramName << "' is not C-contiguous: strides=("
+     << stridesBytes[0] << ", " << stridesBytes[1] << ") bytes, "
+     << "expected (" << outerStride << ", " << innerStride
+     << ") for shape (" << shape[0] << ", " << shape[1] << ")";
+  throw std::runtime_error(ss.str());
+}
+
+/// Read `__dlpack__(stream=…)` — the cross-framework GPU array protocol
+/// implemented by CuPy / PyTorch / JAX / TensorFlow.  Validates every
+/// field the kernel relies on and threads the executor's stream so the
+/// producer can insert the needed cross-stream sync.
+///
+/// Memory lifecycle: `__dlpack__()` returns a PyCapsule named "dltensor"
+/// owning a `DLManagedTensor`; when the capsule is GC'd, its destructor
+/// calls the producer's `deleter`.  We grab the fields we need and let
+/// the capsule fall out of scope at function exit — the underlying
+/// tensor stays alive because the user is still holding `obj`.
+///
+/// \param obj            Python object expected to expose `__dlpack__`.
+/// \param paramName      name used as the prefix of every error message.
+/// \param streamArg      passed as the `stream=` keyword to `__dlpack__`.
+/// \param expectedDtype  element type the tensor's dtype must match; Double
+///                       requires 64-bit floats, anything else 32-bit.
+/// \returns the data pointer (with `byte_offset` applied) together with
+///          `shape[0]` and `shape[1]`.
+/// \throws std::runtime_error when the object has no `__dlpack__`, the
+///         capsule is not a "dltensor" capsule, or any of the device /
+///         ndim / dtype / stride / null-pointer checks below fails.  The
+///         checks run in that order, so the first failing one is reported.
+static CudaArrayInfo readDLPack(nb::handle obj, const std::string &paramName,
+                                  const nb::object &streamArg,
+                                  kun_cuda::Datatype expectedDtype) {
+  if (!nb::hasattr(obj, "__dlpack__"))
+    throw std::runtime_error(
+        "'" + paramName + "' does not implement __dlpack__ — pass a CuPy "
+        "ndarray, a PyTorch CUDA tensor, a JAX device array, or any other "
+        "object exporting the DLPack protocol.");
+
+  nb::object capsule = obj.attr("__dlpack__")(nb::arg("stream") = streamArg);
+  void *raw = PyCapsule_GetPointer(capsule.ptr(), "dltensor");
+  if (!raw) {
+    // PyCapsule_GetPointer left a Python error set; drop it so the C++
+    // exception below is the only error in flight.
+    PyErr_Clear();
+    throw std::runtime_error(
+        "'" + paramName + "' __dlpack__() did not return a PyCapsule named "
+        "'dltensor' (consumed capsule?  wrong producer?)");
+  }
+  // `mt` / `t` point into memory owned by `capsule`; everything needed is
+  // copied out by value before the function returns.
+  const DLManagedTensor *mt = reinterpret_cast<const DLManagedTensor *>(raw);
+  const DLTensor &t = mt->dl_tensor;
+
+  // ── device: only CUDA (managed counts as CUDA-addressable) ──────────
+  if (t.device.device_type != kDLCUDA &&
+      t.device.device_type != kDLCUDAManaged)
+    throw std::runtime_error(
+        "'" + paramName + "' is on DLPack device type " +
+        std::to_string(static_cast<int>(t.device.device_type)) +
+        " — only CUDA (=2) and CUDAManaged (=13) are supported");
+
+  // ── ndim ────────────────────────────────────────────────────────────
+  if (t.ndim != 2)
+    throw std::runtime_error(
+        "'" + paramName + "' must be 2-D (got " +
+        std::to_string(t.ndim) + "-D)");
+
+  // ── dtype: kDLFloat, matches executable's element type ──────────────
+  const uint8_t expectedBits =
+      expectedDtype == kun_cuda::Datatype::Double ? 64 : 32;
+  if (t.dtype.code != kDLFloat || t.dtype.bits != expectedBits ||
+      t.dtype.lanes != 1)
+    throw std::runtime_error(
+        "'" + paramName + "' DLPack dtype is (code=" +
+        std::to_string(static_cast<int>(t.dtype.code)) +
+        ", bits=" + std::to_string(static_cast<int>(t.dtype.bits)) +
+        ", lanes=" + std::to_string(static_cast<int>(t.dtype.lanes)) +
+        ") — kernel expects float" + std::to_string(expectedBits) +
+        " (kDLFloat, " + std::to_string(expectedBits) + ", 1)");
+
+  // ── strides: NULL = row-major contiguous; else validate.  DLPack
+  //    strides are in *elements*, not bytes — convert before checking.
+  const int64_t elemBytes = static_cast<int64_t>(kun_cuda::bytesPerElem(expectedDtype));
+  if (t.strides) {
+    int64_t sb[2] = {t.strides[0] * elemBytes, t.strides[1] * elemBytes};
+    requireRowMajorContiguous2D(paramName, t.shape, sb, elemBytes);
+  }
+
+  // ── data pointer (apply byte_offset before handing to kernel) ───────
+  uintptr_t ptr = reinterpret_cast<uintptr_t>(t.data) + t.byte_offset;
+  if (ptr == 0)
+    throw std::runtime_error(
+        "'" + paramName + "' DLPack data pointer is null");
+
+  return CudaArrayInfo{ptr, t.shape[0], t.shape[1]};
+}
+
+/// Reject keys in `pyDict` that are not in `expectedNames`.  Used so the
+/// error message points at the offending name instead of complaining
+/// about a different missing key down the loop.
+///
+/// Fast path: when `pyDict.size() == expectedNames.size()` we skip the
+/// per-key scan.  Either every expected name is present (no unexpected
+/// key by definition) or one is missing — in the latter case the
+/// downstream missing-key check still raises with a correct (if less
+/// precise) error.
+// NOTE: the equal-size shortcut is only sound for callers that afterwards
+// require *every* expected name to be present.
+static void rejectUnexpectedKeys(const nb::dict &pyDict,
+                                 const std::vector<std::string> &expectedNames,
+                                 const char *kind) {
+  // Equal sizes: skip the scan entirely.
+  if (pyDict.size() == expectedNames.size())
+    return;
+
+  for (auto entry : pyDict) {
+    const std::string key = nb::cast<std::string>(entry.first);
+
+    bool recognised = false;
+    for (size_t i = 0; i < expectedNames.size() && !recognised; ++i)
+      recognised = (expectedNames[i] == key);
+    if (recognised)
+      continue;
+
+    // First unknown key wins: list every accepted name in the message.
+    std::string expectedList;
+    for (size_t i = 0; i < expectedNames.size(); ++i) {
+      if (i != 0)
+        expectedList += ", ";
+      expectedList += expectedNames[i];
+    }
+    throw std::runtime_error(std::string("runGraph: unexpected ") + kind +
+                             " '" + key + "' (expected: " + expectedList +
+                             ")");
+  }
+}
+
+/// Walk `pyInputs` in `exe.graphInputs()` order, validate that every name
+/// is present and that all arrays share the input shape (timeLength,
+/// numStocks).  Caller specifies the required `timeLength` via
+/// `requiredTimeLength` (== start + length); a value of -1 means "infer
+/// from the first input" and the binding will treat that as the locked
+/// shape.
+// Result bundle returned by collectInputs().
+struct CollectedInputs {
+  int64_t timeLength;  // required length if given (>= 0), else the first input's shape[0]
+  int64_t numStocks;   // shape[1] of the first input; later inputs must match
+  std::vector<std::pair<std::string, uintptr_t>> args;  // (input name, device pointer), in exe.graphInputs() order
+};
+
+// Reads every graph input named by `exe.graphInputs()` out of `pyInputs`
+// via DLPack, in that order.  A negative `requiredTimeLength` means the
+// first input fixes both dimensions; otherwise the time length is fixed up
+// front and the first input only fixes numStocks.  Throws
+// std::runtime_error on a missing name or a shape mismatch.
+static CollectedInputs collectInputs(const kun_cuda::Executable &exe,
+                                     const nb::dict &pyInputs,
+                                     const nb::object &streamArg,
+                                     int64_t requiredTimeLength) {
+  const auto &inputNames = exe.graphInputs();
+  rejectUnexpectedKeys(pyInputs, inputNames, "input");
+
+  CollectedInputs collected;
+  collected.timeLength = requiredTimeLength;
+  collected.numStocks  = -1;
+  collected.args.reserve(inputNames.size());
+
+  for (const std::string &name : inputNames) {
+    nb::object key = nb::str(name.c_str());
+    if (!pyInputs.contains(key))
+      throw std::runtime_error("runGraph: missing input '" + name + "'");
+    const CudaArrayInfo info =
+        readDLPack(pyInputs[key], name, streamArg, exe.dtype());
+
+    const bool timeKnown   = collected.timeLength >= 0;
+    const bool stocksKnown = collected.numStocks >= 0;
+    if (!timeKnown) {
+      // Nothing locked yet: this input defines the shape.
+      collected.timeLength = info.timeLength;
+      collected.numStocks  = info.numStocks;
+    } else {
+      const bool timeMismatch = info.timeLength != collected.timeLength;
+      const bool stocksMismatch =
+          stocksKnown && info.numStocks != collected.numStocks;
+      if (timeMismatch || stocksMismatch) {
+        const int64_t expectedStocks =
+            stocksKnown ? collected.numStocks : info.numStocks;
+        std::stringstream ss;
+        ss << "runGraph: input '" << name << "' has shape ("
+           << info.timeLength << ", " << info.numStocks
+           << "), expected (" << collected.timeLength << ", "
+           << expectedStocks << ")";
+        throw std::runtime_error(ss.str());
+      }
+      if (!stocksKnown)
+        collected.numStocks = info.numStocks;
+    }
+    collected.args.emplace_back(name, info.ptr);
+  }
+  return collected;
+}
+
+/// Allocate a CUDA device buffer of `T*S` elements (`sizeof(elem) =
+/// bytesPerElem(dt)`) and wrap it in an `nb::ndarray<>` (no framework
+/// annotation) owning the allocation via a capsule.  Lifetime is tied
+/// to the Python object: when the array's refcount drops to zero, the
+/// capsule destructor frees via `cuMemFree`.
+// Throws std::runtime_error when either extent is negative, when
+// T * S * bytesPerElem(dt) does not fit in size_t, or when cuMemAlloc
+// fails.
+static nb::ndarray<> allocOwnedCudaArray2D(int64_t T, int64_t S,
+                                           kun_cuda::Datatype dt) {
+  // A negative extent would wrap to a huge size_t below; in the worst case
+  // the wrapped product is tiny and the array would advertise far more
+  // elements than were allocated.
+  if (T < 0 || S < 0)
+    throw std::runtime_error(
+        "runGraph: cannot allocate output of shape (" + std::to_string(T) +
+        ", " + std::to_string(S) + ")");
+
+  const size_t elemBytes = kun_cuda::bytesPerElem(dt);
+  const size_t rows = static_cast<size_t>(T);
+  const size_t cols = static_cast<size_t>(S);
+
+  // Overflow checks by dividing back, so no extra headers are needed.
+  const size_t total = rows * cols;
+  const size_t totalBytes = total * elemBytes;
+  const bool countOverflow = cols != 0 && total / cols != rows;
+  const bool bytesOverflow = elemBytes != 0 && totalBytes / elemBytes != total;
+  if (countOverflow || bytesOverflow)
+    throw std::runtime_error(
+        "runGraph: output of shape (" + std::to_string(T) + ", " +
+        std::to_string(S) + ") is too large to allocate");
+
+  CUdeviceptr p = 0;
+  CUresult r = cuMemAlloc(&p, totalBytes);
+  if (r != CUDA_SUCCESS) {
+    const char *msg = nullptr;
+    cuGetErrorString(r, &msg);
+    throw std::runtime_error(std::string("runGraph: cuMemAlloc failed: ") +
+                             (msg ? msg : "(unknown)"));
+  }
+  // The capsule owns the allocation from here on; it is released with
+  // cuMemFree when the capsule is destroyed.
+  nb::capsule owner(reinterpret_cast<void *>(p), [](void *q) noexcept {
+    cuMemFree(reinterpret_cast<CUdeviceptr>(q));
+  });
+  CUdevice dev = 0;
+  cuCtxGetDevice(&dev);
+  size_t shape[2] = {rows, cols};
+  nb::dlpack::dtype npDtype =
+      dt == kun_cuda::Datatype::Double ? nb::dtype<double>()
+                                       : nb::dtype<float>();
+  return nb::ndarray<>(reinterpret_cast<void *>(p), /*ndim=*/2, shape, owner,
+                       /*strides=*/nullptr,
+                       /*dtype=*/npDtype,
+                       /*device_type=*/nb::device::cuda::value,
+                       /*device_id=*/static_cast<int>(dev));
+}
+
+/// Walk `exe.graphOutputs()` in order: for each name, either pick the
+/// caller-allocated buffer out of `pyOutputs` (validating shape) or
+/// allocate a fresh CUDA buffer.  Appends `(name, devicePtr)` to `args`
+/// and returns a `{name: ndarray}` dict of every output that Python
+/// will see.
+///
+/// When `pyOutputs.is_none()` we short-circuit:  no dict cast, no
+/// rejectUnexpectedKeys, no per-name `contains` probe — every output
+/// is auto-allocated.  This is the common case (caller doesn't pre-
+/// allocate outputs) and keeps it tight.
+// Throws std::runtime_error when a caller-supplied buffer has the wrong
+// shape, or when `pyOutputs` contains a key that is not a graph output.
+static nb::dict collectOutputs(
+    const kun_cuda::Executable &exe,
+    nb::object pyOutputs, int64_t length, int64_t numStocks,
+    const nb::object &streamArg,
+    std::vector<std::pair<std::string, uintptr_t>> &args) {
+  const auto &outputNames = exe.graphOutputs();
+  args.reserve(args.size() + outputNames.size());
+
+  // Start with a null-PyObject* `nb::dict` — `nb::handle::inc_ref()` /
+  // `dec_ref()` are `Py_XINCREF`/`Py_XDECREF` so it's safe to hold, and
+  // we skip the `PyDict_New()` that bare `nb::dict()` would do.  Only
+  // populate + extras-check when the caller passed a real dict; then
+  // the handle's `operator bool()` doubles as the "user gave us
+  // outputs" flag.
+  nb::dict userOutputs = nb::steal<nb::dict>(nb::handle());
+  if (!pyOutputs.is_none()) {
+    userOutputs = nb::cast<nb::dict>(pyOutputs);
+    rejectUnexpectedKeys(userOutputs, outputNames, "output");
+  }
+
+  nb::dict ret;
+  size_t matchedUserKeys = 0;  // how many output names the caller supplied
+  for (const std::string &name : outputNames) {
+    nb::object key = nb::str(name.c_str());
+    uintptr_t base;
+    if (userOutputs && userOutputs.contains(key)) {
+      ++matchedUserKeys;
+      CudaArrayInfo info =
+          readDLPack(userOutputs[key], name, streamArg, exe.dtype());
+      if (info.timeLength != length || info.numStocks != numStocks) {
+        std::stringstream ss;
+        ss << "runGraph: output '" << name << "' has shape ("
+           << info.timeLength << ", " << info.numStocks
+           << "), expected (" << length << ", " << numStocks << ")";
+        throw std::runtime_error(ss.str());
+      }
+      base = info.ptr;
+      ret[key] = userOutputs[key];
+    } else {
+      nb::ndarray<> arr =
+          allocOwnedCudaArray2D(length, numStocks, exe.dtype());
+      base = reinterpret_cast<uintptr_t>(arr.data());
+      ret[key] = nb::cast(std::move(arr));
+    }
+    args.emplace_back(name, base);
+  }
+
+  // rejectUnexpectedKeys() skips its scan when the dict has exactly as many
+  // entries as there are expected names.  Outputs are optional, so no
+  // missing-key error follows here and a misspelt key would otherwise be
+  // dropped silently while a fresh buffer is allocated in its place.
+  // Comparing the matched count with the dict size costs nothing on the
+  // happy path.
+  if (userOutputs && matchedUserKeys != userOutputs.size()) {
+    for (auto kv : userOutputs) {
+      const std::string key = nb::cast<std::string>(kv.first);
+      bool known = false;
+      for (const std::string &n : outputNames)
+        if (n == key) { known = true; break; }
+      if (known)
+        continue;
+      std::string expected;
+      for (size_t j = 0; j < outputNames.size(); ++j) {
+        if (j) expected += ", ";
+        expected += outputNames[j];
+      }
+      throw std::runtime_error("runGraph: unexpected output '" + key +
+                               "' (expected: " + expected + ")");
+    }
+  }
+  return ret;
+}
+
+/// Parse one Python `external_kernels=[...]` entry into a KernelMeta.
+/// Expected dict shape:
+///   {"name": str, "kind": str, "inputs": [str...], "outputs": [str...]}
+/// where `kind` is one of "cs_rank_f32", "cs_rank_f64",
+/// "cs_scale_f32", or "cs_scale_f64".
+static kun_cuda::KernelMeta parseExternalKernel(nb::handle obj) {
+  // Accepted `kind` strings and the enum value each one selects.
+  struct KindEntry {
+    const char *tag;
+    kun_cuda::KernelKind kind;
+  };
+  static const KindEntry kKnownKinds[] = {
+      {"cs_rank_f32", kun_cuda::KernelKind::ExtCsRankF32},
+      {"cs_rank_f64", kun_cuda::KernelKind::ExtCsRankF64},
+      {"cs_scale_f32", kun_cuda::KernelKind::ExtCsScaleF32},
+      {"cs_scale_f64", kun_cuda::KernelKind::ExtCsScaleF64},
+  };
+
+  nb::dict desc = nb::cast<nb::dict>(obj);
+  kun_cuda::KernelMeta meta;
+  meta.kernelName = nb::cast<std::string>(desc["name"]);
+
+  const std::string kindTag = nb::cast<std::string>(desc["kind"]);
+  bool recognised = false;
+  for (const KindEntry &entry : kKnownKinds) {
+    if (kindTag != entry.tag)
+      continue;
+    meta.kind = entry.kind;
+    recognised = true;
+    break;
+  }
+  if (!recognised)
+    throw std::runtime_error(
+        "KunMLIR.compile: unknown external kernel kind '" + kindTag +
+        "' (supported: cs_rank_f32, cs_rank_f64, "
+        "cs_scale_f32, cs_scale_f64)");
+
+  // Look up both lists before iterating either, then copy the names over.
+  nb::iterable inputNames  = nb::cast<nb::iterable>(desc["inputs"]);
+  nb::iterable outputNames = nb::cast<nb::iterable>(desc["outputs"]);
+  for (nb::handle item : inputNames)
+    meta.inputNames.push_back(nb::cast<std::string>(item));
+  for (nb::handle item : outputNames)
+    meta.outputNames.push_back(nb::cast<std::string>(item));
+  return meta;
+}
+
+/// Compile `pm`'s module into an Executable.
+///
+/// Steps, in order: reject empty input/output name lists, run
+/// compileKunIrToExecutable, append one KernelMeta per entry of
+/// `externalKernels`, refuse an empty kernel list, apply `warpsPerCta`
+/// only when no kernel is of kind Jit, then record the graph input/output
+/// names and the `outputUnreliable` map before constructing the
+/// Executable.  Every failure is reported as std::runtime_error.
+static std::unique_ptr<kun_cuda::Executable>
+pyCompile(PyModule &pm,
+          const std::vector<std::string> &graphInputs,
+          const std::vector<std::string> &graphOutputs,
+          const std::string &gpuArch,
+          const std::string &targetTriple,
+          const std::string &targetFeatures, unsigned optLevel,
+          const std::string &toolkitPath,
+          nb::list externalKernels,
+          int warpsPerCta,
+          nb::dict outputUnreliable) {
+  if (graphInputs.empty())
+    throw std::runtime_error(
+        "KunMLIR.compile: graph_inputs cannot be empty");
+  if (graphOutputs.empty())
+    throw std::runtime_error(
+        "KunMLIR.compile: graph_outputs cannot be empty");
+
+  // Empty strings leave the corresponding option at its default.
+  kungpu::PtxCompileOptions options;
+  options.optLevel    = optLevel;
+  options.toolkitPath = toolkitPath;
+  if (!gpuArch.empty())
+    options.targetCpu = gpuArch;
+  if (!targetTriple.empty())
+    options.targetTriple = targetTriple;
+  if (!targetFeatures.empty())
+    options.targetFeatures = targetFeatures;
+
+  auto exeData = std::make_shared<kun_cuda::ExecutableData>();
+  if (failed(kungpu::compileKunIrToExecutable(pm.module.get(), options,
+                                              *exeData)))
+    throw std::runtime_error("KunMLIR.compile failed");
+
+  // External kernels are appended after whatever the compile step
+  // produced, built from the descriptors supplied by the caller.
+  for (nb::handle descriptor : externalKernels)
+    exeData->kernels.push_back(parseExternalKernel(descriptor));
+
+  if (exeData->kernels.empty())
+    throw std::runtime_error(
+        "KunMLIR.compile: no kernels (neither MLIR-emitted nor "
+        "external) — refusing to build an empty Executable");
+
+  // The caller's warps_per_cta is only consulted when there is no Jit
+  // kernel at all; with at least one Jit kernel the parameter is ignored.
+  bool hasJitKernel = false;
+  for (const auto &kernel : exeData->kernels) {
+    if (kernel.kind != kun_cuda::KernelKind::Jit)
+      continue;
+    hasJitKernel = true;
+    break;
+  }
+  if (!hasJitKernel && warpsPerCta <= 0)
+    throw std::runtime_error(
+        "KunMLIR.compile: warps_per_cta must be positive when every "
+        "kernel is external; got " + std::to_string(warpsPerCta));
+  if (!hasJitKernel)
+    exeData->warpsPerCta = warpsPerCta;
+
+  // Graph topology is filled in last, right before the Executable is
+  // constructed from the data.
+  exeData->graphInputs  = graphInputs;
+  exeData->graphOutputs = graphOutputs;
+  for (auto entry : outputUnreliable) {
+    const std::string outName = nb::cast<std::string>(entry.first);
+    const int64_t depth = nb::cast<int64_t>(entry.second);
+    exeData->outputUnreliable[outName] = depth;
+  }
+  return std::make_unique<kun_cuda::Executable>(std::move(exeData));
+}
+
+/// Build an Executable from whatever ExecutableData::loadFromFiles(dir,
+/// name) returns.
+static std::unique_ptr<kun_cuda::Executable>
+pyLoadExecutable(const std::string &dir, const std::string &name) {
+  auto loaded = kun_cuda::ExecutableData::loadFromFiles(dir, name);
+  return std::make_unique<kun_cuda::Executable>(std::move(loaded));
+}
+
+} // namespace
+
+NB_MODULE(KunMLIR, m) {
+  m.doc() = "Bindings for the KunQuant MLIR compiler (kunir → PTX → CUBIN "
+             "→ launch).";
+
+  // Programmatic kunir construction (Value/Type opaque wrappers, IRBuilder).
+  kun_mlir_py::registerIRBuilder(m);
+
+  nb::class_<PyModule>(m, "ModuleOp")
+      .def("to_string", &PyModule::toString,
+            "Return the textual MLIR form of the module.")
+      .def("__str__",  &PyModule::toString)
+      .def("__repr__", [](const PyModule &m) {
+        return "<KunMLIR.ModuleOp>\n" + m.toString();
+      });
+
+  m.def("parse", &PyModule::parse, nb::arg("text"),
+         "Parse an MLIR text fragment into a ModuleOp.");
+
+  m.def("lower_to_ptx", &pyLowerToPtx,
+         nb::arg("module"),
+         nb::arg("gpu_arch")       = "sm_80",
+         nb::arg("target_triple")  = "nvptx64-nvidia-cuda",
+         nb::arg("target_features") = "",
+         nb::arg("opt_level")      = 3u,
+         nb::arg("toolkit_path")   = "",
+         "Lower kunir → PTX text via the upstream `gpu-module-to-binary` "
+         "pass with `format=isa`.  Debug / inspection only — the main "
+         "compile path goes straight to cubin.");
+
+  nb::class_<kun_cuda::Executable>(m, "Executable")
+      .def_static("load_from_files", &pyLoadExecutable,
+            nb::arg("dir"), nb::arg("name"),
+            "Load an Executable from `<dir>/<name>.json` and "
+            "`<dir>/<name>.cubin`.")
+      .def_prop_ro("input_names",   &kun_cuda::Executable::graphInputs,
+            "Graph-level input names — match this against the keys of the "
+            "args dict you pass to launch().")
+      .def_prop_ro("output_names",  &kun_cuda::Executable::graphOutputs,
+            "Graph-level output names — match this against the keys of the "
+            "args dict you pass to launch().")
+      .def("getOutputNames",
+            [](const kun_cuda::Executable &e) {
+              return e.graphOutputs();
+            },
+            "CPU-runtime-compatible alias for `output_names`.")
+      .def_prop_ro("warps_per_cta", &kun_cuda::Executable::warpsPerCta)
+      .def_prop_ro("vector_size",   &kun_cuda::Executable::vectorSize)
+      .def_prop_ro("num_kernels",
+            [](const kun_cuda::Executable &e) {
+              return e.numKernels();
+            })
+      .def_prop_ro("kernel_names",
+            [](const kun_cuda::Executable &e) {
+              std::vector<std::string> r;
+              r.reserve(e.data().kernels.size());
+              for (auto &km : e.data().kernels)
+                r.push_back(km.kernelName);
+              return r;
+            })
+      .def_prop_ro("launch_order",  &kun_cuda::Executable::launchOrder,
+            "Topo-sorted indices into kernel_names; the order kernels run "
+            "on the single CUDA stream.")
+      .def_prop_ro("peak_intermediate_slots",
+            &kun_cuda::Executable::peakIntermediateSlots,
+            "Number of intermediate buffers allocated by the runtime — "
+            "shape `(time_length, num_stocks)` each.")
+      .def_prop_ro("num_buffers",   &kun_cuda::Executable::numBuffers)
+      .def_prop_ro("cubin",
+            [](const kun_cuda::Executable &e) {
+              const auto &b = e.data().cubin;
+              return nb::bytes(b.data(), b.size());
+            })
+      .def("clone",
+            [](const kun_cuda::Executable &e) {
+              return e.clone();
+            },
+            "Return a new Executable with independent launch state while "
+            "sharing immutable compile data and loaded CUDA modules.")
+      .def("save_to_files",
+            [](const kun_cuda::Executable &e, const std::string &dir,
+               const std::string &name) {
+              e.data().saveToFiles(dir, name);
+            },
+            nb::arg("dir"), nb::arg("name"),
+            "Write `<dir>/<name>.json` and `<dir>/<name>.cubin`.")
+      .def("getOutputUnreliableCount",
+            &kun_cuda::Executable::outputUnreliable,
+            nb::rv_policy::reference_internal,
+            "Return {output_name: unreliable_count} — leading time steps "
+            "of each graph output to drop.");
+
+  // ── Executor ────────────────────────────────────────────────────────
+  // Mirrors the CPU `kun::Executor` shape: an opaque object that wraps a
+  // CUDA stream and exposes run_graph / synchronize.  Constructor accepts
+  // either a raw int (uintptr_t — e.g. the stream's `.ptr` from cupy) or
+  // a duck-typed object with a `.ptr` attribute (so passing a
+  // `cupy.cuda.Stream` directly Just Works).  None / no arg → default
+  // CUDA stream.
+  nb::class_<kun_cuda::Executor>(m, "Executor",
+        "Wraps a CUDA stream + provides `run_graph(exe, args)` (async) "
+        "and `synchronize()`.  Default constructor uses the CUDA default "
+        "stream; pass a cupy stream (or its `.ptr` integer) to share one "
+        "with caller-managed code.")
+      .def("__init__", [](kun_cuda::Executor *self, nb::object stream_arg) {
+            uintptr_t ptr = 0; // 0 == default CUDA stream
+            if (!stream_arg.is_none()) {
+              if (nb::hasattr(stream_arg, "ptr"))
+                ptr = nb::cast<uintptr_t>(stream_arg.attr("ptr")); // stream-like object
+              else
+                ptr = nb::cast<uintptr_t>(stream_arg); // raw integer handle
+            }
+            new (self) kun_cuda::Executor(reinterpret_cast<CUstream>(ptr));
+          },
+          nb::arg("stream") = nb::none(),
+          "Build an Executor.  `stream=None` → default CUDA stream; "
+          "otherwise expects either an int (uintptr_t handle) or a "
+          "cupy.cuda.Stream-like object exposing `.ptr`.")
+      .def_prop_ro("stream",
+          [](const kun_cuda::Executor &e) -> uintptr_t {
+            return reinterpret_cast<uintptr_t>(e.stream());
+          },
+          "Raw stream handle as an int (0 ↔ CUDA default stream).")
+      .def("runGraph",
+          [](kun_cuda::Executor &e, kun_cuda::Executable &exe,
+              nb::dict pyInputs, int64_t cur_time, int64_t length,
+              nb::object pyOutputs, int64_t mask,
+              int minChunkWarmupFactor, double smFillFactor,
+              bool useCudaGraph) -> nb::dict {
+            if (cur_time != 0)
+              throw std::runtime_error(
+                  "runGraph: cur_time != 0 not supported on GPU");
+            if (length < 0)
+              throw std::runtime_error("runGraph: length must be >= 0");
+
+            // `length == 0` (default) → auto-infer from the first
+            // input's row count; otherwise it's the engine's internal
+            // time dim (== input rows == output rows).
+            const bool inferLength = (length == 0);
+
+            // Thread the executor's stream into __dlpack__(stream=…)
+            // so producers (CuPy / PyTorch / JAX / TF) can insert the
+            // cross-stream sync needed for data-readiness on our
+            // launch stream.
+            nb::object streamArg = dlpackStreamArg(e.stream());
+            auto in = collectInputs(exe, pyInputs, streamArg,
+                                       inferLength ? -1 : length);
+            if (inferLength)
+              length = in.timeLength;
+            if (mask < 0 || mask >= length)
+              throw std::runtime_error(
+                  "runGraph: mask must be in [0, length)");
+
+            // Kernel writes `output[t]` directly for `t ∈ [mask, length)`
+            // (kungpu codegen no longer subtracts mask).  Rows `[0, mask)`
+            // are left as whatever the user / allocator put there.
+            const int64_t timeLength = length;
+
+            // Build the args vector (inputs first, then outputs in
+            // exe.graphOutputs order).  Auto-allocates any output the
+            // caller didn't pre-allocate; returns the dict that goes
+            // back to Python.
+            std::vector<std::pair<std::string, uintptr_t>> args =
+                std::move(in.args);
+            nb::dict ret = collectOutputs(exe, pyOutputs, length,
+                                            in.numStocks, streamArg, args);
+
+            e.runGraph(exe, timeLength, in.numStocks, args,
+                        mask, minChunkWarmupFactor, smFillFactor,
+                        useCudaGraph ? kun_cuda::LaunchMode::CudaGraph
+                                     : kun_cuda::LaunchMode::Normal);
+            return ret;
+          },
+          nb::arg("exe"), nb::arg("inputs"),
+          nb::arg("cur_time") = 0, nb::arg("length") = 0, // length 0 → inferred from the inputs
+          nb::arg("outputs") = nb::none(),
+          nb::arg("mask") = 0,
+          nb::arg("min_chunk_warmup_factor") = 4,
+          nb::arg("sm_fill_factor") = 1.5,
+          nb::arg("use_cuda_graph") = false,
+          "Queue every kernel in `exe` onto this executor's stream.\n"
+          "**Asynchronous** — call `.synchronize()` (or otherwise wait\n"
+          "on the stream) before reading results back to host.\n"
+          "\n"
+          "`inputs` is a {name → cuda_array} dict whose keys must equal\n"
+          "`exe.input_names`.  Arrays must be float32, 2-D, shape\n"
+          "`(length, num_stocks)` (TS layout), and reside on the GPU.\n"
+          "\n"
+          "`cur_time` mirrors CPU `kr.runGraph`; GPU only accepts 0.\n"
+          "\n"
+          "`length` is input/output time dim.  Default 0 ⇒ auto-infer\n"
+          "from the first input's row count.\n"
+          "\n"
+          "`outputs` is an optional {name → cuda_array} dict of\n"
+          "caller-allocated output buffers (subset of\n"
+          "`exe.output_names`).  Each must have shape `(length,\n"
+          "num_stocks)` (same as input).  Names missing from `outputs`\n"
+          "are auto-allocated by the binding (float32 CUDA buffers,\n"
+          "capsule-owned).  Returns a dict of every output name → its\n"
+          "buffer (user-supplied or freshly allocated).\n"
+          "\n"
+          "`mask` is the warmup-skip on graph outputs: the kernel only\n"
+          "writes to output rows `[mask, length)`; rows `[0, mask)` are\n"
+          "left untouched (whatever the user / allocator put there).\n"
+          "Default 0.\n"
+          "\n"
+          "`min_chunk_warmup_factor` is the lower bound on "
+          "`chunk_size / warmup` — keeps warmup-overlap overhead below "
+          "`1 / factor` of total compute.  Default 4 (≤ 25% overhead).\n"
+          "`sm_fill_factor` is the target `num_chunks * stock_tiles / "
+          "numSMs`.  1.0 just fills the GPU; > 1 leaves scheduler "
+          "slack.  Default 1.5.\n"
+          "`use_cuda_graph=True` launches through a CUDA Graph node DAG "
+          "with graph allocation/free nodes for intermediate buffers. "
+          "Default false keeps the existing sequential launch path.\n"
+          "\n"
+          "Named to match the CPU executor API "
+          "(`KunRunner.runGraph(executor, mod, inputs, cur_time, length)`).")
+      .def("synchronize", &kun_cuda::Executor::synchronize,
+          "Block until every kernel queued on this stream completes.");
+
+  m.def("compile", &pyCompile,
+         nb::arg("module"),
+         nb::arg("graph_inputs"),
+         nb::arg("graph_outputs"),
+         nb::arg("gpu_arch")       = "sm_80",
+         nb::arg("target_triple")  = "nvptx64-nvidia-cuda",
+         nb::arg("target_features") = "",
+         nb::arg("opt_level")      = 3u,
+         nb::arg("toolkit_path")   = "",
+         nb::arg("external_kernels") = nb::list(),
+         nb::arg("warps_per_cta")    = 0, // pyCompile requires > 0 when every kernel is external
+         nb::arg("output_unreliable") = nb::dict(),
+         "Compile a kunir module all the way to a loaded Executable.\n"
+         "\n"
+         "Pipeline: kunir → LLVM dialect → upstream `gpu-module-to-binary`\n"
+         "(format=bin) which handles libdevice linking + LLVM optimization\n"
+         "+ PTX emission + ptxas, → cuModuleLoad on the resulting cubin.\n"
+         "\n"
+         "graph_inputs / graph_outputs name the buffers that flow in/out\n"
+         "of the whole kernel graph; everything else produced by the\n"
+         "kernels is treated as an intermediate and gets a runtime-managed\n"
+         "slot.\n"
+         "\n"
+         "toolkit_path: optional path to the CUDA toolkit (where\n"
+         "libdevice.10.bc and ptxas live).  Empty → search CUDA_HOME /\n"
+         "CUDA_PATH / standard install locations.");
+}
diff --git a/mlir/lib/Python/PyModule.cpp b/mlir/lib/Python/PyModule.cpp
new file mode 100644
index 0000000..38ffe8b
--- /dev/null
+++ b/mlir/lib/Python/PyModule.cpp
@@ -0,0 +1,101 @@
+//===- PyModule.cpp - dialect / translation / target registration -----===//
+//
+// Everything that touches a specific dialect or translation lives here,
+// not in PyModule.h, so consumers of `class PyModule` only pay for the
+// MLIRContext + ModuleOp typedefs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PyModule.h"
+
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Parser/Parser.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+
+// Required for `gpu-module-to-binary` to dispatch to the NVVM target
+// implementation (libdevice link + LLVM opt + ptxas).
+#include "mlir/Target/LLVM/NVVM/Target.h"
+
+// MLIR → LLVM IR translation registrations consumed by the NVVM target
+// serializer.  Keep the list minimal — `registerAllToLLVMIRTranslations`
+// would force linking ArmSVE / SPIR-V / etc.
+#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
+
+#include "KunGpu/KunGpuDialect.h"
+#include "KunIr/KunIrDialect.h"
+
+#include "llvm/Support/raw_ostream.h"
+
+#include <stdexcept>
+
+namespace kun_mlir_py {
+
+namespace {
+
+mlir::DialectRegistry makeRegistry() {
+  mlir::DialectRegistry reg;
+
+  // Upstream dialects first, then the two project dialects.  The
+  // variadic insert registers them one by one in the order listed.
+  reg.insert<mlir::arith::ArithDialect, mlir::cf::ControlFlowDialect,
+             mlir::func::FuncDialect, mlir::gpu::GPUDialect,
+             mlir::index::IndexDialect, mlir::LLVM::LLVMDialect,
+             mlir::math::MathDialect, mlir::NVVM::NVVMDialect,
+             mlir::scf::SCFDialect, kunir::KunIrDialect,
+             kungpu::KunGpuDialect>();
+
+  // `gpu-module-to-binary` lowers gpu.module → cubin / PTX through the
+  // serializeToObject implementation attached to `#nvvm.target`.
+  mlir::NVVM::registerNVVMTargetInterfaceExternalModels(reg);
+
+  // The NVVM target then translates the gpu.module body to LLVM IR,
+  // which needs these per-dialect translation hooks.
+  mlir::registerBuiltinDialectTranslation(reg);
+  mlir::registerLLVMDialectTranslation(reg);
+  mlir::registerNVVMDialectTranslation(reg);
+  mlir::registerGPUDialectTranslation(reg);
+
+  return reg;
+}
+
+} // namespace
+
+PyModule::PyModule()
+    : ctx(std::make_unique<mlir::MLIRContext>(
+          makeRegistry(), mlir::MLIRContext::Threading::DISABLED)) { // context created with threading disabled
+  ctx->loadAllAvailableDialects(); // eagerly load everything makeRegistry() registered
+}
+
+PyModule::~PyModule() = default; // defined here because PyModule.h only forward-declares MLIRContext
+
+std::unique_ptr<PyModule> PyModule::parse(const std::string &text) {
+  // A fresh PyModule brings its own context; the parsed op is created
+  // inside that context and stored on the same object.
+  auto holder = std::make_unique<PyModule>();
+  holder->module =
+      mlir::parseSourceString<mlir::ModuleOp>(text, holder->ctx.get());
+  if (holder->module)
+    return holder;
+  throw std::runtime_error("KunMLIR.parse: failed to parse MLIR text");
+}
+
+/// Render the held module in MLIR's textual form.
+///
+/// Throws std::runtime_error when no module is held (for example a
+/// default-constructed PyModule whose `module` was never assigned)
+/// rather than printing through a null op.
+std::string PyModule::toString() const {
+  if (!module)
+    throw std::runtime_error("KunMLIR.ModuleOp: no module to print");
+  std::string text;
+  llvm::raw_string_ostream os(text);
+  module.get().print(os);
+  os.flush(); // make sure everything has reached `text` before returning it
+  return text;
+}
+
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/PyModule.h b/mlir/lib/Python/PyModule.h
new file mode 100644
index 0000000..5490c01
--- /dev/null
+++ b/mlir/lib/Python/PyModule.h
@@ -0,0 +1,42 @@
+//===- PyModule.h - PyModule (MLIR ctx + ModuleOp) shared by bindings --===//
+//
+// Used by both MlirBinding.cpp (parse / compile entry points) and
+// IRBuilder.cpp (programmatic construction of a kunir module from
+// Python).  The header is deliberately thin: dialect / translation /
+// target registrations all live in PyModule.cpp so nobody pays for them
+// transitively.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "mlir/IR/BuiltinOps.h"   // mlir::ModuleOp
+#include "mlir/IR/OwningOpRef.h"  // mlir::OwningOpRef
+
+#include <memory>
+#include <string>
+
+namespace mlir { class MLIRContext; }
+
+namespace kun_mlir_py {
+
+class PyModule {
+public:
+  PyModule();                        // sets up ctx + dialects + registrations
+  ~PyModule();                       // out-of-line so MLIRContext can stay
+                                      // forward-declared in this header
+  PyModule(const PyModule &)            = delete; // not copyable: owns both ctx and module
+  PyModule &operator=(const PyModule &) = delete;
+
+  /// Parse an MLIR text fragment into a fresh PyModule.  Throws
+  /// std::runtime_error on parse failure.
+  static std::unique_ptr<PyModule> parse(const std::string &text);
+
+  /// Pretty-print the held module.
+  std::string toString() const;
+
+  std::unique_ptr<mlir::MLIRContext> ctx; // declared first, so it is destroyed after `module`
+  mlir::OwningOpRef<mlir::ModuleOp>  module; // the held op; the constructor leaves it unset
+};
+
+} // namespace kun_mlir_py
diff --git a/mlir/lib/Python/dlpack.h b/mlir/lib/Python/dlpack.h
new file mode 100644
index 0000000..b133e18
--- /dev/null
+++ b/mlir/lib/Python/dlpack.h
@@ -0,0 +1,87 @@
+//===- dlpack.h - Minimal vendored DLPack ABI (consumer-only) ----------===//
+//
+// Trimmed subset of dmlc/dlpack v0.8.  We only need to *consume* a
+// `DLManagedTensor` produced by CuPy / PyTorch / JAX via the
+// `__dlpack__()` protocol, so this header omits the producer-side
+// helpers and the newer versioned form.  Vendored to keep the build
+// dependency-free; full spec lives at https://github.com/dmlc/dlpack.
+//
+// Original license: Apache-2.0.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// `DLDeviceType` — where the tensor data sits.  We accept kDLCUDA and
+/// kDLCUDAManaged; everything else (CPU, ROCm, Metal, …) is rejected.
+typedef enum {
+  kDLCPU = 1,
+  kDLCUDA = 2,          // accepted by this consumer
+  kDLCUDAHost = 3,
+  kDLOpenCL = 4,
+  kDLVulkan = 7,        // 5 and 6 are not defined in this header
+  kDLMetal = 8,
+  kDLVPI = 9,
+  kDLROCM = 10,
+  kDLROCMHost = 11,
+  kDLExtDev = 12,
+  kDLCUDAManaged = 13,  // accepted by this consumer
+  kDLOneAPI = 14,
+  kDLWebGPU = 15,
+  kDLHexagon = 16,
+} DLDeviceType;
+
+typedef struct {
+  DLDeviceType device_type;  // which kind of device holds the data
+  int32_t      device_id;    // presumably the ordinal within that kind — confirm against the DLPack spec
+} DLDevice;
+
+/// `DLDataTypeCode` — element-kind dimension of the dtype triple.
+typedef enum {
+  kDLInt = 0,
+  kDLUInt = 1,
+  kDLFloat = 2,          // the code used by the f32 / f64 examples on DLDataType
+  kDLOpaqueHandle = 3,
+  kDLBfloat = 4,
+  kDLComplex = 5,
+  kDLBool = 6,
+} DLDataTypeCode;
+
+/// (code, bits, lanes).  E.g. f32 = {kDLFloat, 32, 1}; f64 = {kDLFloat, 64, 1}.
+typedef struct {
+  uint8_t  code;   // element kind, e.g. kDLFloat
+  uint8_t  bits;   // e.g. 32 for f32, 64 for f64
+  uint16_t lanes;  // 1 in both the f32 and f64 examples
+} DLDataType;
+
+/// Plain tensor descriptor — pointer + shape + dtype + device.
+typedef struct {
+  void       *data;          // pointer to the tensor storage
+  DLDevice    device;        // device the data sits on
+  int32_t     ndim;          // presumably the number of entries in `shape` — confirm against the DLPack spec
+  DLDataType  dtype;         // (code, bits, lanes) element type
+  int64_t    *shape;
+  int64_t    *strides;       ///< NULL → row-major contiguous
+  uint64_t    byte_offset;   // presumably bytes from `data` to the first element — confirm against the DLPack spec
+} DLTensor;
+
+/// The wrapper exchanged via the unversioned PyCapsule named "dltensor".
+/// The capsule's PyCapsule_Destructor calls `deleter(self)` when it is
+/// GC'd — unless the consumer renamed the capsule to "used_dltensor",
+/// in which case the destructor is a no-op and the consumer must call
+/// `deleter` itself.
+typedef struct DLManagedTensor {
+  DLTensor dl_tensor;    // the tensor being exchanged
+  void    *manager_ctx;  // presumably opaque producer-side state — confirm against the DLPack spec
+  void   (*deleter)(struct DLManagedTensor *self);  // called with the wrapper itself as `self`
+} DLManagedTensor;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/mlir/llvm_commit.txt b/mlir/llvm_commit.txt
new file mode 100644
index 0000000..391f543
--- /dev/null
+++ b/mlir/llvm_commit.txt
@@ -0,0 +1 @@
+llvmorg-22.1.6
\ No newline at end of file
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
new file mode 100644
index 0000000..eff8412
--- /dev/null
+++ b/mlir/test/CMakeLists.txt
@@ -0,0 +1,31 @@
+# kun-opt is placed in LLVM_RUNTIME_OUTPUT_INTDIR (= ${CMAKE_BINARY_DIR}/bin)
+set(KUN_OPT_BINARY "${LLVM_RUNTIME_OUTPUT_INTDIR}/kun-opt")  # not read again in this file; presumably consumed by lit.site.cfg.py.in
+
+# MLIR_TOOLS_DIR may not be set when building out-of-tree; fall back to
+# the LLVM tools directory which contains FileCheck, mlir-opt, etc.
+if(NOT MLIR_TOOLS_DIR)
+  set(MLIR_TOOLS_DIR "${LLVM_TOOLS_BINARY_DIR}")
+endif()
+
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in  # template in the source tree
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py     # generated file in the build tree
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+)
+
+set(KUN_MLIR_TEST_DEPENDS  # targets passed as DEPENDS to both lit suites below
+  kun-opt
+  KunMLIR
+  FileCheck
+)
+
+add_lit_testsuite(check-kun-mlir "Running KunQuant MLIR regression tests"
+  ${CMAKE_CURRENT_BINARY_DIR}  # same directory the site config is generated into
+  DEPENDS ${KUN_MLIR_TEST_DEPENDS}
+)
+set_target_properties(check-kun-mlir PROPERTIES FOLDER "Tests")
+
+add_lit_testsuites(KUN_MLIR ${CMAKE_CURRENT_SOURCE_DIR}
+  DEPENDS ${KUN_MLIR_TEST_DEPENDS}
+)
diff --git a/mlir/test/kungpu/basic.mlir b/mlir/test/kungpu/basic.mlir
new file mode 100644
index 0000000..128fb3c
--- /dev/null
+++ b/mlir/test/kungpu/basic.mlir
@@ -0,0 +1,59 @@
+// RUN: %kun-opt %s | %FileCheck %s
+// RUN: %kun-opt %s | %kun-opt | %FileCheck %s
+
+// CHECK-LABEL: kunir.func @test_stock_id
+kunir.func @test_stock_id()
+    inputs {} outputs {"id"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> index {
+  // CHECK: kungpu.stock_id
+  %id = kungpu.stock_id  // takes no operands; the result is returned as an index
+  kunir.return %id : index
+}
+
+// CHECK-LABEL: kunir.func @test_block_stock_count
+kunir.func @test_block_stock_count()
+    inputs {} outputs {"n"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> index {
+  // CHECK: kungpu.block_stock_count
+  %n = kungpu.block_stock_count  // takes no operands; the result is returned as an index
+  kunir.return %n : index
+}
+
+// CHECK-LABEL: kunir.func @test_time_length
+kunir.func @test_time_length()
+    inputs {} outputs {"len"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> index {
+  // CHECK: kungpu.time_length
+  %len = kungpu.time_length  // takes no operands; the result is returned as an index
+  kunir.return %len : index
+}
+
+// CHECK-LABEL: kunir.func @test_ts_get_put
+kunir.func @test_ts_get_put(%ts_in: !kunir.ts<f32, inf>, %ts_out: !kunir.ts<f32, 1>)
+    inputs {%ts_in = "ts_in"}
+    outputs {%ts_out = "ts_out"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %off = arith.constant 0 : i32  // i32 offset operand for the ts.get below
+  // CHECK: kungpu.ts.get
+  // CHECK-SAME: <f32, inf> -> f32
+  %v = kungpu.ts.get %ts_in[%off] : !kunir.ts<f32, inf> -> f32  // reads one f32 from the input handle
+  // CHECK: kungpu.ts.put
+  kungpu.ts.put %ts_out, %v : !kunir.ts<f32, 1>, f32  // stores that f32 through the output handle
+  kunir.return
+}
+
+// CHECK-LABEL: kunir.func @test_windowed_temp
+kunir.func @test_windowed_temp()
+    inputs {} outputs {"v"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> f32 {
+  %off = arith.constant 0 : i32  // i32 offset operand for the ts.get below
+  // CHECK: %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
+  %wt = kungpu.windowed_temp : !kunir.ts<f32, 5>  // the expected output spells the type as `<f32, 5>`
+  // CHECK: kungpu.ts.get %[[WT]]
+  %v = kungpu.ts.get %wt[%off] : !kunir.ts<f32, 5> -> f32  // must read from the value captured as WT
+  kunir.return %v : f32
+}
diff --git a/mlir/test/kungpu/cse_ts_get.mlir b/mlir/test/kungpu/cse_ts_get.mlir
new file mode 100644
index 0000000..be4522a
--- /dev/null
+++ b/mlir/test/kungpu/cse_ts_get.mlir
@@ -0,0 +1,31 @@
+// RUN: %kun-opt --pass-pipeline='builtin.module(gpu.module(kunir.func(kunir-to-kungpu,cse)))' %s | %FileCheck %s
+//
+// Verify that the CSE pass placed between `kunir-to-kungpu` and
+// `windowed-temp-memory-planning` deduplicates identical kungpu.ts.get
+// loads.  kungpu.ts.get is marked Pure, so CSE collapses any pair of
+// reads with the same (handle, offset) operands.
+//
+// Two distinct kunir.back_ref ops on the same windowed_output at the
+// same window lower to two ts.get %wt[%c5_i32] inside the outer time
+// loop; after CSE only one survives.
+
+gpu.module @kungpu_kernels {
+  // CHECK-LABEL: kunir.func @two_back_refs
+  kunir.func @two_back_refs(%a: !kunir.ts<f32, inf>)
+      inputs {%a = "a"}
+      outputs {"out"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 5
+      -> !kunir.ts<f32, 1> {
+    %r1 = kunir.back_ref %a [window = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 1>
+    %r2 = kunir.back_ref %a [window = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 1>  // deliberate duplicate of %r1
+    %sum = kunir.add %r1, %r2 : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>  // keeps both back_refs live so neither is dropped as unused
+    kunir.return %sum : !kunir.ts<f32, 1>
+  }
+}
+
+// The two back_refs lower to identical loads that CSE collapses, so the
+// checks below accept exactly ONE kungpu.ts.get in the output: the
+// first match must not be followed by another.
+//
+// CHECK:       kungpu.ts.get
+// CHECK-NOT:   kungpu.ts.get
diff --git a/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir b/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
new file mode 100644
index 0000000..6837f69
--- /dev/null
+++ b/mlir/test/kungpu/kunir_to_llvm_pipeline.mlir
@@ -0,0 +1,63 @@
+// RUN: %kun-opt --kunir-to-llvm %s | %FileCheck %s
+//
+// End-to-end smoke test for the kunir-to-llvm pipeline:
+//   kunir-to-kungpu → memory-planning → convert-kungpu-to-llvm
+//   → LICM → canonicalize → cse → scf-to-cf
+//   → convert-gpu-to-nvvm (indexBitwidth=32)
+//   → index/arith/cf/func to-llvm → reconcile-unrealized-casts.
+//
+// We verify that no kunir/kungpu/scf/gpu *ops* survive in the function
+// body and that the original kunir.func becomes an llvm.func with
+// nvvm.kernel tagging, inside the same gpu.module.
+
+// CHECK-NOT: kunir.{{[a-z_.]+ }}
+// CHECK-NOT: kungpu.{{[a-z_.]+ }}
+// CHECK-NOT: scf.{{[a-z_]+}}
+// CHECK-NOT: gpu.{{[a-z_]+ }}
+
+// CHECK:       gpu.module @kungpu_kernels
+
+// llvm.func with the (i32 time_len, i32 num_stocks, i32 mask, i32 chunk_size,
+// i32 warmup, ptr...) signature, tagged as a kernel by convert-gpu-to-nvvm.
+// CHECK-LABEL: llvm.func @test_addsum
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    i32
+// CHECK-SAME:    !llvm.ptr
+// CHECK-SAME:    !llvm.ptr
+// CHECK-SAME:    !llvm.ptr
+//
+// kunir-func metadata preserved as discardable attributes:
+// CHECK-SAME:    kungpu.input_names = ["a", "b"]
+// CHECK-SAME:    kungpu.output_names = ["sum"]
+// CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
+// CHECK-SAME:    nvvm.kernel
+
+// gpu.thread_id / block_id / block_dim are now NVVM intrinsics.
+// CHECK:       nvvm.read.ptx.sreg.tid.x
+// CHECK:       nvvm.read.ptx.sreg.ctaid.x
+// CHECK:       nvvm.read.ptx.sreg.ntid.x
+
+// Branch-based control flow from scf-to-cf:
+// CHECK-DAG:   llvm.br
+// CHECK-DAG:   llvm.cond_br
+
+// Lowered arithmetic + load/store from gmem:
+// CHECK-DAG:   llvm.fadd
+// CHECK-DAG:   llvm.getelementptr
+// CHECK-DAG:   llvm.load
+// CHECK-DAG:   llvm.store
+// CHECK:       llvm.return
+
+gpu.module @kungpu_kernels {
+  kunir.func @test_addsum(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"sum"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>  // expected to surface as the llvm.fadd matched above
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+}
diff --git a/mlir/test/kungpu/lower_to_llvm.mlir b/mlir/test/kungpu/lower_to_llvm.mlir
new file mode 100644
index 0000000..69b3d60
--- /dev/null
+++ b/mlir/test/kungpu/lower_to_llvm.mlir
@@ -0,0 +1,332 @@
+// RUN: %kun-opt --convert-kungpu-to-llvm %s | %FileCheck %s
+//
+// All kernels live in a single gpu.module — convert-kungpu-to-llvm rewrites
+// each kunir.func to a gpu.func (kernel) inside that gpu.module, with the
+// signature prepended by (i32 time_len, i32 num_stocks, i32 mask, i32 chunk_size, i32 warmup).
+
+gpu.module @kungpu_kernels {
+
+// =====================================================================
+// Smem global emitted by `test_windowed_smem` lands inside gpu.module.
+// =====================================================================
+// CHECK:       gpu.module @kungpu_kernels {
+// CHECK:         llvm.mlir.global internal @[[SMEM:__smem_test_windowed_smem_[0-9]+]]()
+// CHECK-SAME:    {addr_space = 3 : i32}
+// CHECK-SAME:    !llvm.array<{{[0-9]+}} x f32>
+
+
+// =====================================================================
+// Case 0 — time_lb / time_ub lowering in isolation.
+//
+// Both ops do their arithmetic in i32 (64-bit ops are slow on GPU);
+// only the final scf.for bound is cast back to index.
+//
+//   time_lb = (cy_i32 == 0) ? 0 : cy_i32 * chunk_size - warmup
+//   time_ub = min((cy_i32 + 1) * chunk_size, time_length)
+//
+// where cy = gpu.block_id y.  This function lowers to nothing but the
+// signature, the two bound computations, and an empty scf.for (no body
+// ops survive the conversion — kungpu.time_length / time_lb / time_ub
+// are illegal in the output IR).
+// =====================================================================
+//
+// CHECK-LABEL: gpu.func @test_time_bounds(
+// CHECK-SAME:    %[[TL_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[NS:[^:]+]]: i32,
+// CHECK-SAME:    %[[MASK_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[CSZ_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[WUP_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[IN:[^:]+]]: !llvm.ptr,
+// CHECK-SAME:    %[[OUT:[^:]+]]: !llvm.ptr
+// CHECK-SAME:    kernel
+//
+// ── time_lb lowering ───────────────────────────────────────────────────
+// chunk_size / warmup are read from gpu.func args directly (op is
+// operandless at this level).
+//   lb_i32 = select(cy_i32 == 0, 0, cy_i32 * chunk_size - warmup)
+//   lb     = index_cast lb_i32
+// CHECK:         %[[CY_LB_IDX:.*]] = gpu.block_id y
+// CHECK:         %[[CY_LB:.*]] = arith.index_cast %[[CY_LB_IDX]] : index to i32
+// CHECK:         %[[LBC0:.*]] = arith.constant 0 : i32
+// CHECK:         %[[ISFST_LB:.*]] = arith.cmpi eq, %[[CY_LB]], %[[LBC0]] : i32
+// CHECK:         %[[OFF_LB:.*]] = arith.muli %[[CY_LB]], %[[CSZ_I32]] : i32
+// CHECK:         %[[OFFMW:.*]] = arith.subi %[[OFF_LB]], %[[WUP_I32]] : i32
+// CHECK:         %[[LB_I32:.*]] = arith.select %[[ISFST_LB]], %[[LBC0]], %[[OFFMW]] : i32
+// CHECK:         %[[LB:.*]] = arith.index_cast %[[LB_I32]] : i32 to index
+//
+// ── time_ub lowering ───────────────────────────────────────────────────
+// chunk_size / time_length are read from gpu.func args directly.
+//   ub_i32 = min((cy_i32 + 1) * chunk_size, time_length)
+//   ub     = index_cast ub_i32
+// CHECK:         %[[CY_UB_IDX:.*]] = gpu.block_id y
+// CHECK:         %[[CY_UB:.*]] = arith.index_cast %[[CY_UB_IDX]] : index to i32
+// CHECK:         %[[UBC1:.*]] = arith.constant 1 : i32
+// CHECK:         %[[CYP1:.*]] = arith.addi %[[CY_UB]], %[[UBC1]] : i32
+// CHECK:         %[[END:.*]] = arith.muli %[[CYP1]], %[[CSZ_I32]] : i32
+// CHECK:         %[[UB_I32:.*]] = arith.minui %[[END]], %[[TL_I32]] : i32
+// CHECK:         %[[UB:.*]] = arith.index_cast %[[UB_I32]] : i32 to index
+//
+// Resulting scf.for picks up the two bounds.
+// CHECK:         scf.for %{{.*}} = %[[LB]] to %[[UB]] step %{{.*}}
+// CHECK:         gpu.return
+//
+// The kungpu ops are illegal in the final IR; the two negative checks below only scan from gpu.return to the next label.
+// CHECK-NOT:     kungpu.time_lb
+// CHECK-NOT:     kungpu.time_ub
+kunir.func @test_time_bounds(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %lb = kungpu.time_lb  // operandless lower bound of the time loop
+  %ub = kungpu.time_ub  // operandless upper bound of the time loop
+  %c1 = arith.constant 1 : index
+  scf.for %t = %lb to %ub step %c1 {
+  }  // intentionally empty body: only the two loop bounds are under test
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 1 — gmem-only: signature change, time_length lowering, TxS GEPs.
+// =====================================================================
+//
+// CHECK-LABEL: gpu.func @test_copy(
+// CHECK-SAME:    %[[TL:[^:]+]]: i32,
+// CHECK-SAME:    %[[NS:[^:]+]]: i32,
+// CHECK-SAME:    %[[MASK_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[CSZ_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[WUP_I32:[^:]+]]: i32,
+// CHECK-SAME:    %[[IN:[^:]+]]: !llvm.ptr,
+// CHECK-SAME:    %[[OUT:[^:]+]]: !llvm.ptr
+// kernel attribute is set, kunir-func metadata preserved as discardables:
+// CHECK-SAME:    kernel
+// CHECK-SAME:    kungpu.input_names = ["in"]
+// CHECK-SAME:    kungpu.output_names = ["out"]
+// CHECK-SAME:    kungpu.target_spec = #kunir<target_spec{
+//
+// ── Per-function chunk write_start cache, lazily inserted at entry ────
+// All chunk arithmetic stays in i32 (64-bit ops are slow on GPU); only
+// the final write_start gets an index_cast for comparing against the
+// index-typed scf.for IV.
+// CHECK:       %[[CY_IDX:.*]] = gpu.block_id y
+// CHECK:       %[[CY:.*]] = arith.index_cast %[[CY_IDX]] : index to i32
+// CHECK:       %[[CYC0:.*]] = arith.constant 0 : i32
+// CHECK:       %[[ISFIRST:.*]] = arith.cmpi eq, %[[CY]], %[[CYC0]] : i32
+// CHECK:       %[[CYMUL:.*]] = arith.muli %[[CY]], %[[CSZ_I32]] : i32
+// CHECK:       %[[WSTART_I32:.*]] = arith.select %[[ISFIRST]], %[[MASK_I32]], %[[CYMUL]] : i32
+// CHECK:       %[[WSTART:.*]] = arith.index_cast %[[WSTART_I32]] : i32 to index
+//
+// ── Active-thread guard prologue ──────────────────────────────────────
+// Computes stock_id = bid*bdim + tid, compares with %num_stocks, then
+// wraps the original kernel body in scf.if so threads with
+// stock_id ≥ num_stocks fall straight through to gpu.return.
+// CHECK:       %[[TID:.*]]  = gpu.thread_id  x
+// CHECK:       %[[BID:.*]]  = gpu.block_id   x
+// CHECK:       %[[BDIM:.*]] = gpu.block_dim  x
+// CHECK:       %[[BTB:.*]]  = arith.muli %[[BID]], %[[BDIM]]
+// CHECK:       %[[SID:.*]]  = arith.addi %[[BTB]], %[[TID]]
+// CHECK:       %[[SIDI:.*]] = arith.index_cast %[[SID]] : index to i32
+// CHECK:       %[[ACTIVE:.*]] = arith.cmpi slt, %[[SIDI]], %[[NS]] : i32
+// CHECK:       scf.if %[[ACTIVE]] {
+//
+// time_lb / time_ub lowering is verified in detail by @test_time_bounds
+// above.  Here we only assert that the scf.for picks up index-typed
+// bounds (which can only be the index_cast results of time_lb / time_ub
+// since chunk_size / warmup are i32).
+// CHECK:         %[[LB:.*]] = arith.index_cast %{{.*}} : i32 to index
+// CHECK:         %[[UB:.*]] = arith.index_cast %{{.*}} : i32 to index
+// CHECK:         %[[OFFCST:.*]] = arith.constant 0 : i32
+//
+// CHECK:         scf.for %[[T:.*]] = %[[LB]] to %[[UB]] step %{{.*}}
+//
+// ── ts.get on global %in at offset 0 ───────────────────────────────────
+// effective time = t − 0; stock_id = bid*bdim + tid; lin = effT*ns + sid.
+// num_stocks (i32 arg[1]) is sign-extended to i64 for the linear index.
+// CHECK:         %[[OFFI:.*]] = arith.index_cast %[[OFFCST]] : i32 to index
+// CHECK:         %[[NS64:.*]] = arith.extsi %[[NS]] : i32 to i64
+// CHECK:         %[[EFFT:.*]] = arith.subi %[[T]], %[[OFFI]] : index
+// CHECK:         %[[EFFT64:.*]] = arith.index_cast %[[EFFT]] : index to i64
+// CHECK:         %[[TID:.*]] = gpu.thread_id  x
+// CHECK:         %[[BID:.*]] = gpu.block_id   x
+// CHECK:         %[[BDIM:.*]] = gpu.block_dim  x
+// CHECK:         %[[BTB:.*]] = arith.muli %[[BID]], %[[BDIM]]
+// CHECK:         %[[SID:.*]] = arith.addi %[[BTB]], %[[TID]]
+// CHECK:         %[[SIDI:.*]] = arith.index_cast %[[SID]] : index to i64
+// CHECK:         %[[ROW:.*]] = arith.muli %[[EFFT64]], %[[NS64]] : i64
+// CHECK:         %[[LIN:.*]] = arith.addi %[[ROW]], %[[SIDI]] : i64
+// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[IN]][%[[LIN]]] {{.*}} -> !llvm.ptr, f32
+// CHECK:         %[[V:.*]] = llvm.load %[[GEP]] : !llvm.ptr -> f32
+//
+// ── ts.put on global %out: gated by t ≥ write_start, output index t ──
+// CHECK:         %[[DOW:.*]] = arith.cmpi sge, %[[T]], %[[WSTART]] : index
+// CHECK:         scf.if %[[DOW]] {
+// CHECK:           %[[NS64B:.*]] = arith.extsi %[[NS]] : i32 to i64
+// CHECK:           %[[T64:.*]] = arith.index_cast %[[T]] : index to i64
+// CHECK:           %[[ROW2:.*]] = arith.muli %[[T64]], %[[NS64B]] : i64
+// CHECK:           %[[LIN2:.*]] = arith.addi %[[ROW2]],
+// CHECK:           %[[GEP2:.*]] = llvm.getelementptr %[[OUT]][%[[LIN2]]]
+// CHECK:           llvm.store %[[V]], %[[GEP2]]
+// scf.if + gpu.return: inactive threads (sid ≥ ns) skip the body and
+// arrive at gpu.return directly.
+// CHECK:       gpu.return
+kunir.func @test_copy(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
+  %c1 = arith.constant 1 : index
+  %off = arith.constant 0 : i32  // zero offset used for the read of %in
+  scf.for %t = %lb to %ub step %c1 {
+    %v = kungpu.ts.get %in[%off] : !kunir.ts<f32, inf> -> f32  // read %in at offset %off
+    kungpu.ts.put %out, %v : !kunir.ts<f32, 1>, f32  // forward the value unchanged to %out
+  }
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 2 — windowed_temp in local memory: alloca buffer + i32 pos cell,
+//          circular put/get (no modulo).
+// =====================================================================
+//
+// CHECK-LABEL: gpu.func @test_windowed_local
+// CHECK-SAME:  i32
+// CHECK-SAME:  i32
+// CHECK-SAME:  !llvm.ptr
+// CHECK-SAME:  !llvm.ptr
+//
+// ── windowed_temp lowering — buf alloca + 1×i32 pos cell init to 0 ────
+// CHECK:       %[[NCST:.*]] = llvm.mlir.constant(5 : i32) : i32
+// CHECK:       %[[BUF:.*]] = llvm.alloca %[[NCST]] x f32 : (i32) -> !llvm.ptr
+// CHECK:       %[[ONE32A:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       %[[POS:.*]] = llvm.alloca %[[ONE32A]] x i32 : (i32) -> !llvm.ptr
+// CHECK:       %[[ZERO32:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       llvm.store %[[ZERO32]], %[[POS]] : i32, !llvm.ptr
+//
+// CHECK:       scf.for %[[T:.*]] =
+//
+// ── ts.put %wt, %v (circular write):  buf[pos] = v; pos = (pos+1>=N)?0:pos+1
+// (GEP index is i32 — no sext, since LLVM accepts any int type for indices.)
+// CHECK:         %[[V:.*]] = llvm.load %{{.*}} : !llvm.ptr -> f32
+// CHECK:         %[[P:.*]] = llvm.load %[[POS]] : !llvm.ptr -> i32
+// CHECK:         %[[GEP:.*]] = llvm.getelementptr %[[BUF]][%[[P]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.store %[[V]], %[[GEP]] : f32, !llvm.ptr
+// CHECK:         %[[ONE32:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:         %[[N32:.*]] = llvm.mlir.constant(5 : i32) : i32
+// CHECK:         %[[Z32:.*]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:         %[[PP1:.*]] = llvm.add %[[P]], %[[ONE32]] : i32
+// CHECK:         %[[CMP:.*]] = llvm.icmp "uge" %[[PP1]], %[[N32]] : i32
+// CHECK:         %[[NEW:.*]] = llvm.select %[[CMP]], %[[Z32]], %[[PP1]] : i1, i32
+// CHECK:         llvm.store %[[NEW]], %[[POS]] : i32, !llvm.ptr
+//
+// ── ts.get %wt[off] (circular read):
+//      adj=off+1; idx = pos>=adj ? pos-adj : pos+N-adj; return buf[idx]
+// CHECK:         %[[OF:.*]] = arith.index_cast %{{.*}} : index to i32
+// CHECK:         %[[P2:.*]] = llvm.load %[[POS]] : !llvm.ptr -> i32
+// CHECK:         %[[ONE32B:.*]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:         %[[N32B:.*]] = llvm.mlir.constant(5 : i32) : i32
+// CHECK:         %[[ADJ:.*]] = llvm.add %[[OF]], %[[ONE32B]] : i32
+// CHECK:         %[[GE:.*]] = llvm.icmp "uge" %[[P2]], %[[ADJ]] : i32
+// CHECK:         %[[PMA:.*]] = llvm.sub %[[P2]], %[[ADJ]] : i32
+// CHECK:         %[[PPN:.*]] = llvm.add %[[P2]], %[[N32B]] : i32
+// CHECK:         %[[WR:.*]] = llvm.sub %[[PPN]], %[[ADJ]] : i32
+// CHECK:         %[[IDX:.*]] = llvm.select %[[GE]], %[[PMA]], %[[WR]] : i1, i32
+// CHECK:         %[[GGEP:.*]] = llvm.getelementptr %[[BUF]][%[[IDX]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.load %[[GGEP]] : !llvm.ptr -> f32
+kunir.func @test_windowed_local(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %wt = kungpu.windowed_temp : !kunir.ts<f32, 5> {kungpu.smem = false}  // window of 5, explicitly not in smem
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %off0 = arith.constant 0 : i32
+  scf.for %t = %lb to %ub step %c1 {
+    %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
+    kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32  // write the loaded value into the window
+    %off_idx = arith.subi %t, %c0 : index  // %t - 0: a loop-dependent (non-constant) offset
+    %off_i32 = arith.index_cast %off_idx : index to i32  // ts.get takes its offset as i32
+    %w  = kungpu.ts.get %wt[%off_i32] : !kunir.ts<f32, 5> -> f32  // read back from the window
+    kungpu.ts.put %out, %w : !kunir.ts<f32, 1>, f32
+  }
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 3 — windowed_temp in shared memory: slot-major layout.
+//
+//   layout:        smem[ slot*K + tid ]   (K = threads_per_block)
+//   global size:   N * K elements
+//   per-thread base:  bufPtr = smem + tid           (1 GEP at allocation)
+//   per-access stride: K  (smem[bufPtr + idx*K] in ts.put / ts.get)
+// =====================================================================
+//
+// Global has 5*128 = 640 elements (N=5, warps_per_cta=4 → K=128).
+//
+// CHECK-LABEL: gpu.func @test_windowed_smem
+// CHECK:       %[[RAW:.*]] = llvm.mlir.addressof @[[SMEM]] : !llvm.ptr<3>
+// CHECK:       %[[GEN:.*]] = llvm.addrspacecast %[[RAW]] : !llvm.ptr<3> to !llvm.ptr
+// CHECK:       %[[TID:.*]] = gpu.thread_id  x
+// CHECK:       %[[TIDI:.*]] = arith.index_cast %[[TID]] : index to i32
+// bufPtr = smem + tid  (no per-allocation N multiply)
+// CHECK:       %[[BUF3:.*]] = llvm.getelementptr %[[GEN]][%[[TIDI]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// pos cell still alloca'd (i32)
+// CHECK:       llvm.alloca {{.*}} x i32
+//
+// Inside the loop — ts.put: stride-K multiply before the GEP.
+// CHECK:         %[[POSV:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i32
+// CHECK:         %[[K:.*]] = llvm.mlir.constant(128 : i32) : i32
+// CHECK:         %[[OFFP:.*]] = llvm.mul %[[POSV]], %[[K]] : i32
+// CHECK:         %[[GEPP:.*]] = llvm.getelementptr %[[BUF3]][%[[OFFP]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.store %{{.*}}, %[[GEPP]]
+//
+// ts.get: same stride-K pattern.
+// CHECK:         %[[K2:.*]] = llvm.mlir.constant(128 : i32) : i32
+// CHECK:         %[[OFFG:.*]] = llvm.mul %{{.*}}, %[[K2]] : i32
+// CHECK:         %[[GEPG:.*]] = llvm.getelementptr %[[BUF3]][%[[OFFG]]] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+// CHECK:         llvm.load %[[GEPG]]
+kunir.func @test_windowed_smem(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
+  %wt = kungpu.windowed_temp : !kunir.ts<f32, 5> {kungpu.smem = true}  // window of 5, explicitly placed in smem
+  %lb = kungpu.time_lb
+  %ub = kungpu.time_ub
+  %c1 = arith.constant 1 : index
+  %off0 = arith.constant 0 : i32  // shared by the read of %in and the read of %wt
+  scf.for %t = %lb to %ub step %c1 {
+    %v  = kungpu.ts.get %in[%off0] : !kunir.ts<f32, inf> -> f32
+    kungpu.ts.put %wt, %v : !kunir.ts<f32, 5>, f32  // write into the window
+    %w  = kungpu.ts.get %wt[%off0] : !kunir.ts<f32, 5> -> f32  // read from the window at offset 0
+    kungpu.ts.put %out, %w : !kunir.ts<f32, 1>, f32
+  }
+  kunir.return
+}
+
+
+// =====================================================================
+// Case 4 — stock_id and block_stock_count lowering.
+// =====================================================================
+//
+// CHECK-LABEL: gpu.func @test_indexing
+// CHECK:       gpu.thread_id  x
+// CHECK-NEXT:  gpu.block_id   x
+// CHECK-NEXT:  gpu.block_dim  x
+// CHECK-NEXT:  arith.muli
+// CHECK-NEXT:  arith.addi
+// CHECK:       gpu.block_dim  x
+kunir.func @test_indexing(%in: !kunir.ts<f32, inf>, %out: !kunir.ts<f32, 1>)
+    inputs {%in = "in"}
+    outputs {%out = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  %sid = kungpu.stock_id
+  %bsc = kungpu.block_stock_count
+  %sum = arith.addi %sid, %bsc : index  // result is never read; it only consumes both values
+  kunir.return
+}
+
+}  // gpu.module
diff --git a/mlir/test/kungpu/memory_planning.mlir b/mlir/test/kungpu/memory_planning.mlir
new file mode 100644
index 0000000..be04dfe
--- /dev/null
+++ b/mlir/test/kungpu/memory_planning.mlir
@@ -0,0 +1,76 @@
+// RUN: %kun-opt --kungpu-memory-planning %s | %FileCheck %s
+//
+// All three functions share the same target_spec:
+//   smem_size = 49152 bytes (per-SM total), occupancy = 1
+//   → per-block budget = 49152 / 1 = 49152 bytes
+//   warps_per_cta = 1  →  num_threads = 32
+//   vector_size = 1
+//
+// Buffer cost (f32 = 4 bytes): bytes = N * 32 * 1 * 4 = N * 128
+//   N=3   →   384 bytes
+//   N=5   →   640 bytes
+//   N=10  →  1280 bytes
+//   N=400 → 51200 bytes  (> 49152)
+//   N=500 → 64000 bytes  (> 49152)
+//
+// Case 1 – all smem:   N=3 (384) + N=5 (640) + N=10 (1280) = 2304 ≤ 49152
+// Case 2 – mixed:      N=5 (640) → smem; N=400 (51200) → 640+51200 > 49152 → local
+// Case 3 – all local:  N=400 (51200) > 49152 → local; N=500 → local
+//
+// The pass sorts ops by ascending N before assigning, so declaration order
+// in the IR does not affect the assignment.
+
+// -----------------------------------------------------------------------
+// Case 1: all three buffers fit in shared memory
+// -----------------------------------------------------------------------
+
+// CHECK-LABEL: kunir.func @test_all_smem
+kunir.func @test_all_smem(%src: !kunir.ts<f32, inf>, %dst: !kunir.ts<f32, 1>)
+    inputs {%src = "in"}
+    outputs {%dst = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
+  // Declared out of ascending-N order (10, 3, 5) to exercise the sort-by-N step.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 10> {kungpu.smem = true}
+  %win10 = kungpu.windowed_temp : !kunir.ts<f32, 10>
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 3> {kungpu.smem = true}
+  %win3 = kungpu.windowed_temp : !kunir.ts<f32, 3>
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 5> {kungpu.smem = true}
+  %win5 = kungpu.windowed_temp : !kunir.ts<f32, 5>
+  kunir.return
+}
+
+// -----------------------------------------------------------------------
+// Case 2: small buffer goes to smem, large buffer spills to local memory
+// -----------------------------------------------------------------------
+
+// CHECK-LABEL: kunir.func @test_mixed
+kunir.func @test_mixed(%src: !kunir.ts<f32, inf>, %dst: !kunir.ts<f32, 1>)
+    inputs {%src = "in"}
+    outputs {%dst = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
+  // Although N=400 (51200 bytes) appears first, it is sorted behind N=5 (640 bytes).
+  // Once N=5 has claimed 640 bytes only 48512 remain, too few for N=400's 51200.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
+  %wide = kungpu.windowed_temp : !kunir.ts<f32, 400>
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 5> {kungpu.smem = true}
+  %narrow = kungpu.windowed_temp : !kunir.ts<f32, 5>
+  kunir.return
+}
+
+// -----------------------------------------------------------------------
+// Case 3: every buffer exceeds the budget on its own → all local memory
+// -----------------------------------------------------------------------
+
+// CHECK-LABEL: kunir.func @test_all_local
+kunir.func @test_all_local(%src: !kunir.ts<f32, inf>, %dst: !kunir.ts<f32, 1>)
+    inputs {%src = "in"}
+    outputs {%dst = "out"}
+    target {occupancy = 1, warps_per_cta = 1, smem_size = 49152, vector_size = 1} unreliable_count = 0 {
+  // 400 * 128 = 51200 bytes, above the 49152-byte budget: expect smem=false.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 400> {kungpu.smem = false}
+  %win400 = kungpu.windowed_temp : !kunir.ts<f32, 400>
+  // 500 * 128 = 64000 bytes, above the 49152-byte budget: expect smem=false.
+  // CHECK-DAG: kungpu.windowed_temp : <f32, 500> {kungpu.smem = false}
+  %win500 = kungpu.windowed_temp : !kunir.ts<f32, 500>
+  kunir.return
+}
diff --git a/mlir/test/kunir/basic.mlir b/mlir/test/kunir/basic.mlir
new file mode 100644
index 0000000..d19ffac
--- /dev/null
+++ b/mlir/test/kunir/basic.mlir
@@ -0,0 +1,182 @@
+// RUN: %kun-opt %s | %FileCheck %s
+// RUN: %kun-opt %s | %kun-opt | %FileCheck %s
+
+// Verify the kunir dialect types and ops parse and round-trip inside kunir.func.
+
+// CHECK-LABEL: kunir.func @test_ts_lookback_type
+// CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-SAME: !kunir.ts<f32, 1>
+// CHECK-SAME: !kunir.ts<f64, 10>
+kunir.func @test_ts_lookback_type(
+    %unbounded: !kunir.ts<f32, inf>,
+    %single: !kunir.ts<f32, 1>,
+    %windowed: !kunir.ts<f64, 10>)
+    inputs {%unbounded = "a", %single = "b", %windowed = "c"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  kunir.return %single : !kunir.ts<f32, 1>  // hand back the lookback-1 argument untouched
+}
+
+// CHECK-LABEL: kunir.func @test_binary_mismatched_lookbacks
+kunir.func @test_binary_mismatched_lookbacks(%short: !kunir.ts<f32, 5>, %long: !kunir.ts<f32, 10>)
+    inputs {%short = "a", %long = "b"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.add
+  // CHECK-SAME: <f32, 5>, <f32, 10>
+  %added = kunir.add %short, %long : !kunir.ts<f32, 5>, !kunir.ts<f32, 10>  // operands with lookback 5 and 10
+  // CHECK: kunir.sub
+  %subbed = kunir.sub %short, %long : !kunir.ts<f32, 5>, !kunir.ts<f32, 10>
+  // CHECK: kunir.mul
+  %mixed = kunir.mul %added, %subbed : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>  // both operands are lookback-1 results
+  kunir.return %mixed : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_unary
+kunir.func @test_unary(%src: !kunir.ts<f32, inf>)
+    inputs {%src = "x"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.abs
+  %mag = kunir.abs %src : !kunir.ts<f32, inf>
+  // CHECK: kunir.sign
+  %sgn = kunir.sign %mag : !kunir.ts<f32, 1>  // chained on the abs result
+  kunir.return %sgn : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_windowed_output
+kunir.func @test_windowed_output(%src: !kunir.ts<f32, inf>)
+    inputs {%src = "input"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 10> {
+  // CHECK: kunir.windowed_output
+  // CHECK-SAME: length = 10
+  %win = kunir.windowed_output %src [length = 10] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 10>
+  kunir.return %win : !kunir.ts<f32, 10>  // result keeps the lookback-10 type
+}
+
+// CHECK-LABEL: kunir.func @test_for_each_back_window_single
+kunir.func @test_for_each_back_window_single(%px: !kunir.ts<f32, 10>)
+    inputs {%px = "close"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // CHECK: kunir.for_each_back_window
+  // CHECK-SAME: [window = 5]
+  %rolling = kunir.for_each_back_window
+      (%px : !kunir.ts<f32, 10>) [window = 5]
+      (%px_cur : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>) {
+    // CHECK: kunir.reduce_add
+    %acc = kunir.reduce_add %px_cur : !kunir.ts<f32, 1>
+    kunir.yield %acc : !kunir.ts<f32, 1>
+  }  // one input, one reduction, one result
+  kunir.return %rolling : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_for_each_back_window_multi_input
+kunir.func @test_for_each_back_window_multi_input(
+    %px: !kunir.ts<f32, 20>,
+    %qty: !kunir.ts<f32, 20>)
+    inputs {%px = "close", %qty = "vol"}
+    outputs {"sum_close", "sum_vol"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+  // CHECK: kunir.for_each_back_window
+  %px_total, %qty_total = kunir.for_each_back_window
+      (%px : !kunir.ts<f32, 20>, %qty : !kunir.ts<f32, 20>) [window = 10]
+      (%px_cur : !kunir.ts<f32, 1>, %qty_cur : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+    // CHECK: kunir.reduce_add
+    %px_acc = kunir.reduce_add %px_cur : !kunir.ts<f32, 1>
+    // CHECK: kunir.reduce_add
+    %qty_acc = kunir.reduce_add %qty_cur : !kunir.ts<f32, 1>
+    kunir.yield %px_acc, %qty_acc : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  }  // two inputs, one reduction each
+  kunir.return %px_total, %qty_total : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_for_each_back_window_multi_reduce
+kunir.func @test_for_each_back_window_multi_reduce(%src: !kunir.ts<f32, 20>)
+    inputs {%src = "input"}
+    outputs {"sum", "max"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+  %total, %peak = kunir.for_each_back_window
+      (%src : !kunir.ts<f32, 20>) [window = 10]
+      (%cur : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+    // CHECK: kunir.reduce_add
+    %acc_sum = kunir.reduce_add %cur : !kunir.ts<f32, 1>
+    // CHECK: kunir.reduce_max
+    %acc_max = kunir.reduce_max %cur : !kunir.ts<f32, 1>
+    kunir.yield %acc_sum, %acc_max : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  }  // one input, two different reductions
+  kunir.return %total, %peak : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_for_each_back_window_inf
+kunir.func @test_for_each_back_window_inf(%src: !kunir.ts<f64, inf>)
+    inputs {%src = "input"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f64, 1> {
+  %rolling = kunir.for_each_back_window
+      (%src : !kunir.ts<f64, inf>) [window = 100]
+      (%cur : !kunir.ts<f64, 1>)
+      -> (!kunir.ts<f64, 1>) {
+    %acc = kunir.reduce_add %cur : !kunir.ts<f64, 1>
+    kunir.yield %acc : !kunir.ts<f64, 1>
+  }  // window of 100 over an inf-lookback f64 input
+  kunir.return %rolling : !kunir.ts<f64, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_f64_binary
+kunir.func @test_f64_binary(%a: !kunir.ts<f64, inf>, %b: !kunir.ts<f64, inf>)
+    inputs {%a = "a", %b = "b"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f64, 1> {
+  // CHECK: kunir.max{{.*}}f64
+  %result = kunir.max %a, %b : !kunir.ts<f64, inf>, !kunir.ts<f64, inf>  // pattern anchors on the op: the signature line alone already contains "!kunir.ts<f64"
+  kunir.return %result : !kunir.ts<f64, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_cmp_logical_select
+kunir.func @test_cmp_logical_select(%lhs: !kunir.ts<f32, inf>, %rhs: !kunir.ts<f32, inf>)
+    inputs {%lhs = "a", %rhs = "b"}
+    outputs {"gt_out", "lt_out", "eq_out", "and_out", "or_out", "not_out"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+    -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>,
+        !kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+  // CHECK: kunir.gt
+  %is_gt = kunir.gt %lhs, %rhs : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.lt
+  %is_lt = kunir.lt %lhs, %rhs : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.ge
+  %is_ge = kunir.ge %lhs, %rhs : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.le
+  %is_le = kunir.le %lhs, %rhs : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.eq
+  %is_eq = kunir.eq %lhs, %rhs : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  // CHECK: kunir.and
+  %both = kunir.and %is_gt, %is_lt : !kunir.ts<i1, 1>, !kunir.ts<i1, 1>
+  // CHECK: kunir.or
+  %either = kunir.or %is_ge, %is_le : !kunir.ts<i1, 1>, !kunir.ts<i1, 1>
+  // CHECK: kunir.not
+  %not_lt = kunir.not %is_lt : !kunir.ts<i1, 1>
+  // CHECK: kunir.select
+  %pick_gt = kunir.select %is_gt, %lhs, %rhs : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %pick_lt = kunir.select %is_lt, %lhs, %rhs : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %pick_eq = kunir.select %is_eq, %lhs, %rhs : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %pick_both = kunir.select %both, %lhs, %rhs : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %pick_either = kunir.select %either, %lhs, %rhs : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  %pick_not = kunir.select %not_lt, %lhs, %rhs : !kunir.ts<i1, 1>, !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  kunir.return %pick_gt, %pick_lt, %pick_eq, %pick_both, %pick_either, %pick_not
+    : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>,
+      !kunir.ts<f32, 1>, !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+}
diff --git a/mlir/test/kunir/func.mlir b/mlir/test/kunir/func.mlir
new file mode 100644
index 0000000..62fbce3
--- /dev/null
+++ b/mlir/test/kunir/func.mlir
@@ -0,0 +1,66 @@
+// RUN: %kun-opt %s | %FileCheck %s
+// RUN: %kun-opt %s | %kun-opt | %FileCheck %s
+
+// CHECK-LABEL: kunir.func @test_non_void
+// CHECK-SAME: (%[[A:.*]]: !kunir.ts<f32, inf>, %[[B:.*]]: !kunir.ts<f32, inf>)
+// CHECK:      inputs {%[[A]] = "close", %[[B]] = "vol"}
+// CHECK:      outputs {"alpha"}
+// CHECK:      target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+// CHECK:      -> !kunir.ts<f32, 1>
+kunir.func @test_non_void(%px: !kunir.ts<f32, inf>, %qty: !kunir.ts<f32, inf>)
+    inputs {%px = "close", %qty = "vol"}
+    outputs {"alpha"}
+    target {occupancy = 2, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  %combined = kunir.add %px, %qty : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+  kunir.return %combined : !kunir.ts<f32, 1>  // non-void form: the output is the returned value
+}
+
+// Void form with a single input and a single output, each passed as a function argument.
+// CHECK-LABEL: kunir.func @test_void
+// CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f32, inf>, %[[OUT:.*]]: !kunir.ts<f32, 1>)
+// CHECK:      inputs {%[[IN]] = "close"}
+// CHECK:      outputs {%[[OUT]] = "alpha"}
+// CHECK:      target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1} unreliable_count = 0
+// CHECK-NOT:  ->
+kunir.func @test_void(%px: !kunir.ts<f32, inf>, %dst: !kunir.ts<f32, 1>)
+    inputs {%px = "close"}
+    outputs {%dst = "alpha"}
+    target {occupancy = 1, warps_per_cta = 2, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  kunir.return  // no operands in the void form
+}
+
+// Void form with two inputs and two outputs; all four are passed as function arguments.
+// CHECK-LABEL: kunir.func @test_void_multi_output
+// CHECK-SAME: (%[[I0:.*]]: !kunir.ts<f32, inf>, %[[I1:.*]]: !kunir.ts<f32, inf>, %[[O0:.*]]: !kunir.ts<f32, 1>, %[[O1:.*]]: !kunir.ts<f32, 1>)
+// CHECK:      inputs {%[[I0]] = "close", %[[I1]] = "vol"}
+// CHECK:      outputs {%[[O0]] = "alpha1", %[[O1]] = "alpha2"}
+// CHECK:      target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0
+// CHECK-NOT:  ->
+kunir.func @test_void_multi_output(
+    %px: !kunir.ts<f32, inf>, %qty: !kunir.ts<f32, inf>,
+    %dst_a: !kunir.ts<f32, 1>, %dst_b: !kunir.ts<f32, 1>)
+    inputs {%px = "close", %qty = "vol"}
+    outputs {%dst_a = "alpha1", %dst_b = "alpha2"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 0, vector_size = 1} unreliable_count = 0 {
+  kunir.return  // no operands in the void form
+}
+
+// Non-void form returning two results.
+// CHECK-LABEL: kunir.func @test_multi_result
+kunir.func @test_multi_result(%src: !kunir.ts<f64, inf>)
+    inputs {%src = "input"}
+    outputs {"sum", "maxval"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 16384, vector_size = 1} unreliable_count = 0
+    -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+  %win = kunir.windowed_output %src [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
+  %total, %peak = kunir.for_each_back_window
+      (%win : !kunir.ts<f64, 10>) [window = 10]
+      (%cur : !kunir.ts<f64, 1>)
+      -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+    %acc_sum = kunir.reduce_add %cur : !kunir.ts<f64, 1>
+    %acc_max = kunir.reduce_max %cur : !kunir.ts<f64, 1>
+    kunir.yield %acc_sum, %acc_max : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+  }  // add- and max-reductions over the same window
+  kunir.return %total, %peak : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+}
diff --git a/mlir/test/kunir/lower_to_kungpu.mlir b/mlir/test/kunir/lower_to_kungpu.mlir
new file mode 100644
index 0000000..833de39
--- /dev/null
+++ b/mlir/test/kunir/lower_to_kungpu.mlir
@@ -0,0 +1,138 @@
+// RUN: %kun-opt --kunir-to-kungpu %s | %FileCheck %s
+
+// CHECK-LABEL: kunir.func @test_binary_lower
+// Pure ts args at this stage; the runtime scalars (time_length / num_stocks /
+// mask / chunk_size / warmup) are prepended later by convert-kungpu-to-llvm.
+// CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-SAME: !kunir.ts<f32, inf>
+// Graph output buffers are full TS arrays; the per-op result window has
+// already been materialized into the loop body.
+// CHECK-SAME: !kunir.ts<f32, inf>
+// CHECK-NOT: -> !kunir.ts
+kunir.func @test_binary_lower(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+    inputs {%a = "a", %b = "b"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // Outer for bounds come from the per-chunk lb/ub ops, not [0, T).
+  // Both are operandless — they pull chunk_size / warmup / time_length
+  // from gpu.func args at the kungpu-to-llvm stage.
+  // CHECK:      %[[LB:.*]] = kungpu.time_lb
+  // CHECK:      %[[UB:.*]] = kungpu.time_ub
+  // CHECK:      %[[C0:.*]] = arith.constant 0 : index
+  // CHECK:      %[[C1:.*]] = arith.constant 1 : index
+  // outer-loop offset = 0 (i32), matched on both gmem ts.get ops below; the ts.put is matched without an offset operand
+  // CHECK:      %[[OFF:.*]] = arith.constant 0 : i32
+  // CHECK:      scf.for %{{.*}} = %[[LB]] to %[[UB]] step %[[C1]]
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF]]]
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF]]]
+  // CHECK:        arith.addf
+  // CHECK:        kungpu.ts.put
+  // CHECK-NOT:    kungpu.ts.put %{{.*}}[
+  // CHECK-NOT:    kungpu.time_length
+  %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>  // NOTE(review): the NOT-check on an offset ts.put scans only the output after the matched put, not the matched put itself — confirm that is enough
+  kunir.return %sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_unary_lower
+kunir.func @test_unary_lower(%x: !kunir.ts<f32, inf>)
+    inputs {%x = "x"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // CHECK: math.absf
+  %a = kunir.abs %x : !kunir.ts<f32, inf>  // the only op under test: expected to show up as math.absf after lowering
+  kunir.return %a : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_windowed_sum
+kunir.func @test_windowed_sum(%close: !kunir.ts<f32, inf>)
+    inputs {%close = "close"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // CHECK:      %[[C0:.*]] = arith.constant 0 : index
+  // CHECK:      %[[C1:.*]] = arith.constant 1 : index
+  // CHECK:      %[[OFF0:.*]] = arith.constant 0 : i32
+  // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f32, 5>
+  // CHECK:      scf.for %[[T:.*]] =
+  // CHECK:        kungpu.ts.get %{{.*}}[%[[OFF0]]]
+  // outer-loop ts.put has no offset operand
+  // CHECK:        kungpu.ts.put %[[WT]], %{{[^[]+}} : <f32, 5>, f32
+  // CHECK:        %[[WIN:.*]] = arith.constant 5 : index
+  // window-loop offset = (window-1) - w  (oldest first)
+  // CHECK:        %[[WM1:.*]] = arith.constant 4 : i32
+  // CHECK:        scf.for %[[W:.*]] = %[[C0]] to %[[WIN]] step %[[C1]] iter_args
+  // CHECK:          %[[WI:.*]] = arith.index_cast %[[W]] : index to i32
+  // CHECK:          %[[OFFW:.*]] = arith.subi %[[WM1]], %[[WI]] : i32
+  // CHECK:          kungpu.ts.get %[[WT]][%[[OFFW]]]
+  // CHECK:          arith.addf
+  %w = kunir.windowed_output %close [length = 5] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 5>  // length 5 matches the <f32, 5> windowed_temp and the 5 / 4 constants expected above
+  %sum = kunir.for_each_back_window
+      (%w : !kunir.ts<f32, 5>) [window = 5]
+      (%cur : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>) {
+    %s = kunir.reduce_add %cur : !kunir.ts<f32, 1>
+    kunir.yield %s : !kunir.ts<f32, 1>
+  }
+  kunir.return %sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_computed_reduce
+kunir.func @test_computed_reduce(%x: !kunir.ts<f32, inf>, %y: !kunir.ts<f32, inf>)
+    inputs {%x = "x", %y = "y"}
+    outputs {"result"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+    -> !kunir.ts<f32, 1> {
+  // CHECK:      %[[WX:.*]] = kungpu.windowed_temp : <f32, 3>
+  // CHECK:      %[[WY:.*]] = kungpu.windowed_temp : <f32, 3>
+  // CHECK:      scf.for
+  // CHECK:        scf.for {{.*}} iter_args
+  // CHECK:          %[[A:.*]] = kungpu.ts.get %[[WX]][%{{.*}}]
+  // CHECK:          %[[B:.*]] = kungpu.ts.get %[[WY]][%{{.*}}]
+  // CHECK:          %[[P:.*]] = arith.mulf %[[A]], %[[B]]
+  // CHECK:          arith.addf {{.*}}, %[[P]]
+  %wx = kunir.windowed_output %x [length = 3] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 3>
+  %wy = kunir.windowed_output %y [length = 3] : !kunir.ts<f32, inf> -> !kunir.ts<f32, 3>
+  %sum = kunir.for_each_back_window
+      (%wx : !kunir.ts<f32, 3>, %wy : !kunir.ts<f32, 3>) [window = 3]
+      (%a : !kunir.ts<f32, 1>, %b : !kunir.ts<f32, 1>)
+      -> (!kunir.ts<f32, 1>) {
+    %prod = kunir.mul %a, %b : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>  // computed inside the window body: expected as arith.mulf in the inner loop
+    %s = kunir.reduce_add %prod : !kunir.ts<f32, 1>  // reduction consumes the product, not a raw window element
+    kunir.yield %s : !kunir.ts<f32, 1>
+  }
+  kunir.return %sum : !kunir.ts<f32, 1>
+}
+
+// CHECK-LABEL: kunir.func @test_multi_reduce
+// CHECK-SAME: (%[[IN:.*]]: !kunir.ts<f64, inf>, %[[OUT0:.*]]: !kunir.ts<f64, inf>, %[[OUT1:.*]]: !kunir.ts<f64, inf>)
+kunir.func @test_multi_reduce(%input: !kunir.ts<f64, inf>)
+    inputs {%input = "input"}
+    outputs {"sum", "maxval"}
+    target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+    -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+  // CHECK:      %[[WT:.*]] = kungpu.windowed_temp : <f64, 10>
+  // CHECK:      scf.for %[[T:.*]] =
+  // CHECK:        kungpu.ts.get %[[IN]][%{{.*}}]
+  // CHECK:        kungpu.ts.put %[[WT]], %{{[^[]+}} : <f64, 10>, f64
+  // CHECK:        %[[CST0:.*]] = arith.constant 0.0{{.*}} : f64
+  // CHECK:        %[[NEGINF:.*]] = arith.constant 0xFFF0000000000000 : f64
+  // CHECK:        %[[R:.*]]:2 = scf.for {{.*}} iter_args(%{{.*}} = %[[CST0]], %{{.*}} = %[[NEGINF]]) -> (f64, f64)
+  // CHECK:          kungpu.ts.get %[[WT]][%{{.*}}]
+  // CHECK:          arith.addf
+  // CHECK:          arith.maximumf
+  // CHECK:          scf.yield {{.*}}, {{.*}} : f64, f64
+  // CHECK:        kungpu.ts.put %[[OUT0]], %[[R]]#0 : <f64, inf>, f64
+  // CHECK:        kungpu.ts.put %[[OUT1]], %[[R]]#1 : <f64, inf>, f64
+  %w = kunir.windowed_output %input [length = 10] : !kunir.ts<f64, inf> -> !kunir.ts<f64, 10>
+  %sum, %max = kunir.for_each_back_window
+      (%w : !kunir.ts<f64, 10>) [window = 10]
+      (%val : !kunir.ts<f64, 1>)
+      -> (!kunir.ts<f64, 1>, !kunir.ts<f64, 1>) {
+    %s = kunir.reduce_add %val : !kunir.ts<f64, 1>  // first result: its loop-carried value is expected to start at 0.0
+    %m = kunir.reduce_max %val : !kunir.ts<f64, 1>  // second result: expected to start at 0xFFF0000000000000, the f64 bit pattern of -inf
+    kunir.yield %s, %m : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+  }
+  kunir.return %sum, %max : !kunir.ts<f64, 1>, !kunir.ts<f64, 1>
+}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
new file mode 100644
index 0000000..13295e0
--- /dev/null
+++ b/mlir/test/lit.cfg.py
@@ -0,0 +1,72 @@
+import os
+import subprocess
+import lit.formats
+
+config.name = "KunQuant MLIR Tests"
+config.test_format = lit.formats.ShTest(True)  # True is execute_external: RUN lines are handed to the system shell
+config.suffixes = [".mlir", ".py"]  # FileCheck-style .mlir tests plus RUN-annotated Python scripts
+
+config.test_source_root = os.path.dirname(__file__)  # directory containing this lit.cfg.py
+config.test_exec_root = config.obj_root  # obj_root is assigned in lit.site.cfg.py.in before this file is loaded
+
+def prepend_env(name, entries):
+    """Put the non-empty `entries` in front of the current value of test env var `name`."""
+    parts = list(filter(None, entries))
+    existing = config.environment.get(name, "")
+    parts += [existing] if existing else []  # keep the old value last, if there was one
+    config.environment[name] = os.pathsep.join(parts)
+
+# Python GPU tests import the in-tree KunQuant package and load the freshly
+# built KunQuant-MLIR extension module from KunQuantMLIR/.
+prepend_env("PYTHONPATH", [config.project_source_dir])
+
+# KunMLIR.abi3.so links against the downloaded LLVM/MLIR shared libraries.
+# The CUDA toolkit path is also made explicit so both CuPy and the MLIR
+# libdevice/ptxas discovery use the same installation as CMake.
+config.environment["CUDA_PATH"] = config.cuda_toolkit_root
+config.environment["CUDA_HOME"] = config.cuda_toolkit_root
+prepend_env("PATH", [os.path.join(config.cuda_toolkit_root, "bin")])
+prepend_env("LD_LIBRARY_PATH", [
+    config.llvm_lib_dir,
+    os.path.join(config.cuda_toolkit_root, "lib"),
+    os.path.join(config.cuda_toolkit_root, "lib64"),
+    os.path.join(config.cuda_toolkit_root, "lib64", "stubs"),  # NOTE(review): LD_LIBRARY_PATH is searched before the system default dirs, so a libcuda.so.1 placed in stubs/ could shadow a real driver — confirm
+])
+
+def detect_cuda_device():  # returns True when the capability probe subprocess exits cleanly, False on any exception
+    try:
+        result = subprocess.run(
+            [config.python_executable, "-c",
+             "from KunQuant.jit.env import get_cuda_compute_capability; "
+             "print(get_cuda_compute_capability())"],
+            env=config.environment,  # probe with the environment assembled above, not the caller's
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            timeout=20,  # seconds; expiry raises and lands in the except branch
+            check=True)  # a non-zero exit raises CalledProcessError
+        arch = result.stdout.strip()
+        if arch:
+            lit_config.note("CUDA device detected for Python tests: " + arch)
+        return True  # NOTE(review): also True (with no note) when the probe printed nothing — confirm intended
+    except Exception as exc:
+        lit_config.note("No CUDA device detected for Python tests: " + str(exc))
+        return False
+
+if detect_cuda_device():
+    config.available_features.add("cuda-device")  # feature name used by the `REQUIRES: cuda-device` lines in the Python tests
+
+# Tool substitutions
+config.substitutions.append(("%kun-opt", config.kun_opt))
+config.substitutions.append(("%python", config.python_executable))
+config.substitutions.append(
+    ("%FileCheck", os.path.join(config.llvm_tools_dir, "FileCheck"))
+)
+
+# Exclude non-test files from discovery (matched by file name)
+config.excludes = [
+    "CMakeLists.txt",
+    "lit.cfg.py",
+    "lit.site.cfg.py.in",
+    "utils.py",
+]
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
new file mode 100644
index 0000000..b43cd5f
--- /dev/null
+++ b/mlir/test/lit.site.cfg.py.in
@@ -0,0 +1,14 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+import sys
+
+config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_BINARY_DIR@")  # lit.cfg.py looks for FileCheck here
+config.mlir_tools_dir = lit_config.substitute("@MLIR_TOOLS_DIR@")  # NOTE(review): not read by lit.cfg.py as shown — confirm it is still needed
+config.kun_opt = lit_config.substitute("@KUN_OPT_BINARY@")  # becomes the %kun-opt substitution
+config.obj_root = "@CMAKE_CURRENT_BINARY_DIR@"  # plain string, not routed through lit_config.substitute; used as test_exec_root
+config.project_source_dir = lit_config.substitute("@PROJECT_SOURCE_DIR@")  # prepended to PYTHONPATH for the tests
+config.python_executable = lit_config.substitute("@PYTHON_EXECUTABLE@")  # becomes the %python substitution
+config.llvm_lib_dir = lit_config.substitute("@LLVM_LIBRARY_DIRS@")  # prepended to LD_LIBRARY_PATH; presumably a single directory — confirm
+config.cuda_toolkit_root = lit_config.substitute("@KUN_CUDA_TOOLKIT_ROOT@")  # exported as CUDA_PATH / CUDA_HOME
+
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")  # hand over to the in-source configuration
diff --git a/mlir/test/python/lit.local.cfg b/mlir/test/python/lit.local.cfg
new file mode 100644
index 0000000..ee28185
--- /dev/null
+++ b/mlir/test/python/lit.local.cfg
@@ -0,0 +1,2 @@
+lit_config.parallelism_groups["kun_cuda_python"] = 1  # at most one test of this group runs at a time
+config.parallelism_group = "kun_cuda_python"  # put every test in this directory into that group
diff --git a/mlir/test/python/test_cs_rank_cuda.py b/mlir/test/python/test_cs_rank_cuda.py
new file mode 100644
index 0000000..fa3a26c
--- /dev/null
+++ b/mlir/test/python/test_cs_rank_cuda.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
+"""End-to-end test for the cs_rank GPU dispatch path.
+
+Cross-sectional rank (`KunQuant.Op.Rank`) is special on the GPU: it
+has no kunir representation at all.  CodegenMLIR detects a cs_rank
+partition in the Python IR and routes it as an "external kernel"
+descriptor straight to the C++ binding, which fabricates a KernelMeta
+tagged with `KernelKind::ExtCsRankF*`.  At Executable construction the
+runtime loads a pre-compiled, sm_75-baseline PTX bundled inside
+libKunCudaRuntime as a second CUmodule and resolves
+`kun_cs_rank_f{32,64}` from it.  This test exercises the whole
+plumbing end-to-end:
+
+  * Python frontend skips the kunir pipeline for cs_rank partitions,
+  * the executor lazy-loads the bundled PTX and resolves the right
+    symbol,
+  * the launch uses a time-major grid + dynamic shared memory (one
+    CTA per timestep, smem = S * sizeof(T)) rather than the default
+    stock-major grid the JIT'd kernels use,
+  * the result matches the CPU `equal_range`-based reference exactly,
+    including NaN passthrough and tie averaging,
+  * a graph that mixes cs_rank with regular JIT kernels stitches up
+    correctly (cs_rank produces an intermediate consumed by an
+    elementwise kernel).
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+
+import numpy as np
+
+from KunQuant.Op import Builder, Input, Output, Rank
+from KunQuant.ops import Add
+from KunQuant.Driver import KunCompilerConfig
+from KunQuant.Stage import Function
+from KunQuant.jit import KunMLIR
+from KunQuant.jit.cuda import compile_func, CudaCompilerConfig, to_mlir
+
+
+# ── CPU reference (matches cpp/Kun/Rank.hpp's equal_range formula) ──
+
+def _ref_cs_rank(arr: np.ndarray) -> np.ndarray:
+    """NumPy reference: row-wise average-rank percentile in (0, 1].
+
+    NaN cells stay NaN and do not take part in their row's ranking.
+    With [start, end) the equal-range of a value inside the row's sorted
+    non-NaN values, cpp/Kun/Rank.hpp is described as computing
+      sum   = (start + end + 1) * (end - start) / 2
+      out   = sum / (end - start) / num_valid
+    which reduces to (start + end + 1) / (2 * num_valid), i.e.
+      (2 * less + equal + 1) / (2 * num_valid)
+    — the form attributed to the GPU kernel.
+    """
+    n_rows, n_cols = arr.shape
+    result = np.full((n_rows, n_cols), np.nan, dtype=arr.dtype)
+    for t, row in enumerate(arr):
+        keep = ~np.isnan(row)
+        vals = row[keep]
+        count = len(vals)
+        if count == 0:
+            continue  # all-NaN row: output stays NaN
+        ordered = np.sort(vals)
+        # Equal-range bounds [first, last) of every valid value,
+        # looked up in a single vectorised call per side.
+        first = np.searchsorted(ordered, vals, side='left')
+        last = np.searchsorted(ordered, vals, side='right')
+        # Mean 1-indexed rank within the tie group, divided by count.
+        # The float64 result is cast to arr.dtype on assignment.
+        result[t][keep] = (first + last + 1) / 2.0 / count
+    return result
+
+
+# ── Function builders ────────────────────────────────────────────────
+
+def _build_cs_rank_only() -> Function:
+    """Graph r = cs_rank(a): one Rank op between an Input and an Output, nothing else."""
+    builder = Builder()
+    with builder:
+        Output(Rank(Input('a')), 'r')
+    return Function(builder.ops, name='cs_rank_only')
+
+
+def _build_cs_rank_mixed() -> Function:
+    """Graph out = a + cs_rank(b).  Meant to make the partitioner emit
+    two kernels: the external cs_rank partition, followed by a JIT'd
+    elementwise Add that reads its result."""
+    builder = Builder()
+    with builder:
+        lhs, rank_src = Input('a'), Input('b')  # created in the order a, then b
+        ranked = Rank(rank_src)
+        Output(Add(lhs, ranked), 'out')
+    return Function(builder.ops, name='cs_rank_mixed')
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+def _dtype_pair(dtype_token: str):
+    """Translate the dtype token given to KunCompilerConfig into (numpy dtype, label)."""
+    for token, pair in (('float', (np.float32, 'float32')), ('double', (np.float64, 'float64'))):
+        if dtype_token == token: return pair
+    raise ValueError(dtype_token)
+
+
+def _run_cs_rank_only(target: str, dtype_token: str, T: int, S: int,
+                       *, with_nan: bool, with_ties: bool, seed: int) -> int:
+    """Compile r = cs_rank(a), launch it, and compare against the CPU
+    reference.  Returns 0 on success and 1 on a NaN-pattern or numeric
+    mismatch.  Nothing here inspects the kernel kind (it is not
+    introspectable from Python), so the external cs_rank path is only
+    covered indirectly, through the correctness check.  `with_nan` /
+    `with_ties` inject NaNs (incl. an all-NaN row 0) and tied values."""
+    import cupy as cp
+
+    np_dt, label = _dtype_pair(dtype_token)
+    print(f"=== cs_rank ({label}) T={T} S={S} "
+           f"nan={with_nan} ties={with_ties} ===")
+
+    f = _build_cs_rank_only()
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS",
+                              dtype=dtype_token)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    mod = to_mlir(_build_cs_rank_only(), kcfg, ccfg)  # separate Function just for the dump; NOTE(review): presumably because lowering mutates it — confirm
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compile_func(f, kcfg, ccfg)
+    print(f"  kernel_names={exe.kernel_names}  "
+          f"num_buffers={exe.num_buffers}  "
+          f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    rng = np.random.default_rng(seed)  # seeded, so inputs are reproducible per call
+    a_h = rng.standard_normal((T, S)).astype(np_dt)
+    if with_ties:
+        # Force a moderate tie population: quantize ~30% of cells.
+        tie_mask = rng.random((T, S)) < 0.3
+        a_h[tie_mask] = np.round(a_h[tie_mask] * 2) / 2  # snap to 0.5-grid
+    if with_nan:
+        # Sprinkle ~10% NaNs.  Also force a row to be all-NaN to test
+        # the valid==0 path.
+        nan_mask = rng.random((T, S)) < 0.1
+        a_h[nan_mask] = np.nan
+        a_h[0, :] = np.nan
+
+    a_d = cp.asarray(a_h)
+    out_d = cp.zeros((T, S), dtype=np_dt)
+    ex = KunMLIR.Executor()
+    ex.runGraph(exe, inputs={'a': a_d}, outputs={'r': out_d})
+    ex.synchronize()
+    out_h = cp.asnumpy(out_d)
+
+    ref = _ref_cs_rank(a_h)
+    # NaN cells in the reference must remain NaN on the GPU.
+    nan_ref = np.isnan(ref)
+    if not np.array_equal(np.isnan(out_h), nan_ref):
+        print("  FAIL — NaN pattern mismatch", file=sys.stderr)
+        return 1
+    # Numeric cells must match exactly modulo a few ulps (the formula
+    # is the same algebraic expression on both sides).
+    atol = 1e-6 if np_dt == np.float32 else 1e-12
+    diff = np.abs(out_h[~nan_ref] - ref[~nan_ref])
+    max_abs = float(diff.max()) if diff.size else 0.0  # guard: max() of an empty selection would raise
+    if max_abs > atol:
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e}",
+                file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e}) "
+          f"on {diff.size} numeric cells")
+    return 0
+
+
+def _run_cs_rank_mixed(target: str, T: int, S: int, *, seed: int) -> int:
+    """out = a + cs_rank(b) — checks that cs_rank's intermediate buffer
+    reaches a downstream JIT'd kernel.  partition_factor=1 is set so the
+    graph splits into >= 2 kernels (asserted).  Returns 0 on success, 1 on mismatch."""
+    import cupy as cp
+
+    print(f"=== cs_rank-mixed (float32) T={T} S={S} ===")
+    f = _build_cs_rank_mixed()
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS",
+                              dtype='float', partition_factor=1)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    mod = to_mlir(_build_cs_rank_mixed(), kcfg, ccfg)  # separate Function just for the dump; `f` is the one that gets compiled
+    print("--- mlir ---")
+    print(mod.to_string())
+
+    exe = compile_func(f, kcfg, ccfg)
+    print(f"  kernel_names={exe.kernel_names}  "
+          f"num_kernels={exe.num_kernels}  "
+          f"num_buffers={exe.num_buffers}  "
+          f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    # The whole point: at least 2 kernels (cs_rank + downstream Add),
+    # and at least one intermediate slot threading them together.
+    assert exe.num_kernels >= 2, exe.num_kernels
+    assert exe.peak_intermediate_slots >= 1, exe.peak_intermediate_slots
+
+    rng = np.random.default_rng(seed)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out_d = cp.zeros((T, S), dtype=cp.float32)
+
+    ex = KunMLIR.Executor()
+    ex.runGraph(exe,
+                inputs={'a': cp.asarray(a_h), 'b': cp.asarray(b_h)},
+                outputs={'out': out_d})
+    ex.synchronize()
+    out_h = cp.asnumpy(out_d)
+
+    ref = a_h + _ref_cs_rank(b_h)  # inputs contain no NaN here, so every cell is numeric
+    diff = np.abs(out_h - ref)
+    max_abs = float(diff.max())
+    if max_abs > 1e-5:  # absolute tolerance on the float32 sum
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} at {idx}", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} across {exe.num_kernels} kernels")
+    return 0
+
+
+# ── Entry ────────────────────────────────────────────────────────────
+
+def main() -> int:  # exit code: bitwise OR of the sub-test results, so 0 only if every case passed
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None)
+    ap.add_argument("-T", "--time-length", type=int, default=8)
+    ap.add_argument("-S", "--num-stocks", type=int, default=257)
+    args = ap.parse_args()
+
+    import cupy as cp
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()  # an explicit --target wins over auto-detection
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)  # NOTE(review): presumably forces CUDA context creation before compile/launch — confirm
+
+    rc = 0
+    # f32 — golden path
+    rc |= _run_cs_rank_only(args.target, 'float',
+                              args.time_length, args.num_stocks,
+                              with_nan=False, with_ties=False, seed=1)
+    print()
+    # f32 with ties — exercises the equal-range averaging
+    rc |= _run_cs_rank_only(args.target, 'float',
+                              args.time_length, args.num_stocks,
+                              with_nan=False, with_ties=True, seed=2)
+    print()
+    # f32 with NaN + ties — exercises every branch
+    rc |= _run_cs_rank_only(args.target, 'float',
+                              args.time_length, args.num_stocks,
+                              with_nan=True, with_ties=True, seed=3)
+    print()
+    # NOTE: the cs_rank kernel itself is templated and `kun_cs_rank_f64`
+    # is built into the embedded PTX, but the rest of the runtime
+    # (Runtime.cpp's slot pool, MlirBinding.cpp's CAI typestr check)
+    # is still float32-only.  Lifting that requires plumbing dtype
+    # through Executable / ExecutableData and is out of scope here.
+    # When that lands, this test can re-enable:
+    #
+    #   rc |= _run_cs_rank_only(args.target, 'double',
+    #                            args.time_length, args.num_stocks,
+    #                            with_nan=True, with_ties=True, seed=4)
+    print()  # second blank line in a row: the separator for the disabled f64 case above
+    # Mixed cs_rank + Add — proves intermediate buffer flow works
+    rc |= _run_cs_rank_mixed(args.target,
+                              args.time_length, args.num_stocks, seed=5)
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_kun_mlir.py b/mlir/test/python/test_kun_mlir.py
new file mode 100644
index 0000000..2486630
--- /dev/null
+++ b/mlir/test/python/test_kun_mlir.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+"""End-to-end test for the `KunMLIR` Python bindings.
+
+  parse → to_string → lower_to_ptx (debug only) → compile → launch
+
+Usage:
+    PATH=$CUDA_BIN:$PATH PYTHONPATH=<build>/mlir/lib/Python \
+        kun python test_kun_mlir.py [--target sm_xx]
+"""
+
+from __future__ import annotations
+import argparse
+import json
+import sys
+import tempfile
+import textwrap
+from pathlib import Path
+
+
+SAMPLE_KUNIR = textwrap.dedent("""
+gpu.module @kungpu_kernels {
+  kunir.func @test_addsum(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"sum"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+}
+""").strip()  # single elementwise-add kernel; main() parses this text twice (PTX dump, then compile)
+
+
+def main() -> int:  # exit code: 0 on pass or device-less skip, 1 if any launch result mismatches
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None,
+                     help="GPU compute capability (e.g. sm_120, sm_90, sm_80)")
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    args = ap.parse_args()
+
+    from KunQuant.jit import KunMLIR
+    import numpy as np
+    from KunQuant.jit.cuda import find_cuda_toolkit
+    from utils import resolve_cuda_compute_capability
+
+    args.target, has_cuda_device = resolve_cuda_compute_capability(args.target)  # the flag gates the device-only half below
+
+    print(f"=== parse + to_string ===")
+    mod = KunMLIR.parse(SAMPLE_KUNIR)
+    text = mod.to_string()
+    assert "kunir.func @test_addsum" in text, "module text missing kunir.func"
+    print("ok — module round-trips through parse/to_string")
+
+    toolkit = find_cuda_toolkit()
+
+    print()
+    print(f"=== lower_to_ptx (target={args.target}, O3, debug only) ===")
+    # Debug entry point — same lowering pipeline as compile() but stops
+    # at PTX text via gpu-module-to-binary{format=isa}.  Mutates `mod`
+    # (replaces the gpu.module with a gpu.binary), so we re-parse for
+    # the main compile step below.
+    ptx = KunMLIR.lower_to_ptx(mod, gpu_arch=args.target, opt_level=3,
+                                  toolkit_path=toolkit)
+    assert "test_addsum" in ptx
+    print(f"ok — produced {len(ptx)} bytes of PTX text")
+
+    if not has_cuda_device:
+        print()
+        print("skip — no CUDA device is visible; skipping Executable "
+              "construction and runGraph checks")
+        return 0
+
+    import cupy as cp
+    # Force-initialise the CUDA driver + create the primary context now,
+    # so subsequent KunMLIR.compile() / Executor.runGraph() find one.
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    print()
+    print(f"=== compile (all-in-one) ===")
+    mod2 = KunMLIR.parse(SAMPLE_KUNIR)
+    exe = KunMLIR.compile(mod2,
+                            graph_inputs=["a", "b"],
+                            graph_outputs=["sum"],
+                            gpu_arch=args.target, opt_level=3,
+                            toolkit_path=toolkit)
+    print(f"  kernel_names           = {exe.kernel_names}")
+    print(f"  num_kernels            = {exe.num_kernels}")
+    print(f"  launch_order           = {exe.launch_order}")
+    print(f"  num_buffers            = {exe.num_buffers}")
+    print(f"  peak_intermediate_slots= {exe.peak_intermediate_slots}")
+    print(f"  input_names            = {exe.input_names}")
+    print(f"  output_names           = {exe.output_names}")
+    print(f"  warps_per_cta          = {exe.warps_per_cta}")
+    print(f"  vector_size            = {exe.vector_size}")
+    print(f"  cubin bytes            = {len(exe.cubin)}")
+    assert exe.kernel_names == ["test_addsum"]
+    assert exe.num_kernels == 1
+    assert exe.launch_order == [0]
+    assert exe.num_buffers == 3      # a, b, sum
+    assert exe.peak_intermediate_slots == 0  # no intermediates
+    assert exe.input_names  == ["a", "b"]
+    assert exe.output_names == ["sum"]
+    assert exe.warps_per_cta == 4
+    assert exe.vector_size   == 1
+
+    clone = exe.clone()
+    assert clone.kernel_names == exe.kernel_names
+    assert clone.input_names  == exe.input_names
+    assert clone.output_names == exe.output_names
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        exe.save_to_files(tmpdir, "addsum")
+        metadata_path = Path(tmpdir) / "addsum.json"
+        cubin_path = Path(tmpdir) / "addsum.cubin"
+        assert metadata_path.exists()
+        assert cubin_path.exists()
+        with metadata_path.open("r", encoding="utf-8") as f:
+            metadata = json.load(f)
+        assert metadata["format"] == "kun_cuda_executable_data"
+        assert metadata["version"] == 1
+        assert metadata["cubin"] == "addsum.cubin"
+        loaded = KunMLIR.Executable.load_from_files(tmpdir, "addsum")  # NOTE(review): `loaded` is used after tmpdir is deleted — assumes an eager load; confirm
+    assert loaded.kernel_names == exe.kernel_names
+    assert loaded.input_names  == exe.input_names
+    assert loaded.output_names == exe.output_names
+
+    # Run original, cloned, and file-loaded executables over two num_stocks
+    # values:
+    #  - one that's a multiple of (warps_per_cta * 32 * vector_size) — no
+    #    tail block;
+    #  - one that isn't — exercises the active-thread guard inserted by
+    #    convert-kungpu-to-llvm phase 1.
+    block_x = exe.warps_per_cta * 32 * exe.vector_size
+    rng = np.random.default_rng(0)
+    rc = 0
+    for run_exe, label, S in [
+            (exe, "aligned", args.num_stocks),
+            (clone, "unaligned clone (tail block)",
+             args.num_stocks + (block_x // 2 + 7)),
+            (loaded, "loaded from files", args.num_stocks)]:
+        T = args.time_length
+        print()
+        is_aligned = (S % block_x == 0)  # printed for information only; labels assume the default -S is a multiple of block_x
+        print(f"=== launch ({T} × {S}) — {label}, "
+               f"S % {block_x} = {S % block_x}, "
+               f"aligned={is_aligned} ===")
+        a_h = rng.standard_normal((T, S), dtype=np.float32)
+        b_h = rng.standard_normal((T, S), dtype=np.float32)
+        a   = cp.asarray(a_h)
+        b   = cp.asarray(b_h)
+        out = cp.zeros((T, S), dtype=cp.float32)
+        executor = KunMLIR.Executor()
+        executor.runGraph(run_exe,
+                          inputs={"a": a, "b": b},
+                          outputs={"sum": out})
+        # No explicit synchronize: default-stream Executor + cupy's
+        # default stream → cp.asnumpy's D2H memcpy goes on the same
+        # stream and waits for our kernels.  See test_multi_kernel.py
+        # for the case where sync IS required (non-blocking user stream).
+        out_h = cp.asnumpy(out)
+        expected = a_h + b_h
+        if not np.allclose(out_h, expected, atol=1e-5):
+            diff = np.abs(out_h - expected)
+            print(f"  FAIL — max abs diff {diff.max()}, "
+                    f"argmax @ {np.unravel_index(diff.argmax(), diff.shape)}",
+                    file=sys.stderr)
+            rc = 1  # keep going so every variant is reported
+        else:
+            print(f"  ok — output matches a + b on every (t, s) cell "
+                   f"({T*S} cells)")
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_kun_to_cuda.py b/mlir/test/python/test_kun_to_cuda.py
new file mode 100644
index 0000000..bc31a87
--- /dev/null
+++ b/mlir/test/python/test_kun_to_cuda.py
@@ -0,0 +1,813 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
+"""End-to-end test for the KunQuant Python-IR → MLIR → CUDA path.
+
+Builds a KunQuant Function with the high-level Op API, runs the same
+Driver.optimize() pipeline the CPU compileit uses, then compiles to a
+CUDA Executable via KunMLIR and validates against numpy.
+
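+Three core factors are described here (`main` additionally runs BackRef,
+FastWindowedSum, accumulator, cmp/logical/select, multi-partition and
+Library cases):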
+  * elemwise:   out = (a + b) * a - b * b           (binary elemwise only;
+                  doesn't touch libdevice)
+  * libdevice:  out = log(abs(a)) * sign(b - a)     (Abs / Log / Sign all
+                  lower to math.* ops that emit __nv_* libdevice externs;
+                  the upstream `gpu-module-to-binary` pass links libdevice
+                  for us, so this works end-to-end)
+  * windowed:   ws  = WindowedSum(a + b, N)         (decomposes into
+                  ForeachBackWindow + ReduceAdd inside the optimizer pass)
+
+The runtime auto-discovers the CUDA toolkit (CUDA_HOME / CUDA_PATH /
+CUDA_TOOLKIT_PATH / CUDA_ROOT or standard install paths).  Override
+with `CudaCompilerConfig(toolkit_path=...)` if needed.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+
+import numpy as np
+
+from KunQuant.Op import (
+    Builder, Input, Output, ConstantOp,
+    WindowedTempOutput, ForeachBackWindow, IterValue,
+)
+from KunQuant.ops import Add, Sub, Mul, Abs, Log, Sign, WindowedSum, ReduceMax
+from KunQuant.ops.ElewiseOp import (
+    GreaterThan, GreaterEqual, LessThan, LessEqual, Equals,
+    And, Or, Not, Select,
+)
+from KunQuant.ops.MiscOp import (
+    BackRef, FastWindowedSum,
+    Accumulator, SetAccumulator, ReturnFirstValue,
+)
+from KunQuant.Stage import Function
+from KunQuant.Driver import KunCompilerConfig
+from KunQuant.jit import KunMLIR
+from KunQuant.jit.cuda import compile_func, compileit, CudaCompilerConfig
+
+
+# GPU backend only supports TS layout; share one KunCompilerConfig across
+# every test that doesn't need to customise other graph-rewrite knobs.
+_KCFG_TS = KunCompilerConfig(input_layout="TS", output_layout="TS")
+
+
+def build_func_elemwise() -> Function:
+    """out = (a + b) * a - b * b"""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        v = Sub(Mul(Add(a, bin_), a), Mul(bin_, bin_))
+        Output(v, "out")
+    return Function(builder.ops, name="elemwise_kernel")
+
+
+def build_func_libdevice() -> Function:
+    """out = log(abs(a)) * sign(b - a) — exercises Abs/Log/Sign, all of
+    which lower to math.* ops that need libdevice."""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        v = Mul(Log(Abs(a)), Sign(Sub(bin_, a)))
+        Output(v, "out")
+    return Function(builder.ops, name="libdevice_kernel")
+
+
+def build_func_windowed(N: int) -> Function:
+    """Two outputs over c = a + b:
+       ws        = WindowedSum(c, N)
+       ws_maxabs = max_{k in [0..N-1]} |c[t-k] - c[t]|
+
+    `ws_maxabs` is a hand-built ForeachBackWindow whose body reads BOTH:
+      - the block-arg  (= c[t-k], the iter value)
+      - the outer ts c (= c[t], current time step)
+    and reduces |·| via ReduceMax.  This exercises the kunir-to-kungpu
+    inner-scope inheritance of the outer scalarMap/tsMap: `c` is computed
+    outside the loop but used inside.
+    """
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        c = Add(a, bin_)
+        Output(WindowedSum(c, N), "ws")
+
+        wtemp = WindowedTempOutput(c, N)
+        loop  = ForeachBackWindow(wtemp, N)
+        builder.set_loop(loop)
+        diff = Sub(IterValue(loop, wtemp), c)
+        a_diff = Abs(diff)
+        builder.set_loop(None)
+        Output(ReduceMax(a_diff), "ws_maxabs")
+    return Function(builder.ops, name="windowed_kernel")
+
+
+def build_func_backref(N: int) -> Function:
+    """out = BackRef(a + b, N) — value of (a+b) at time t-N"""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        Output(BackRef(Add(a, bin_), N), "out")
+    return Function(builder.ops, name="backref_kernel")
+
+
+def build_func_output_backref(N: int, delay: int) -> Function:
+    """raw = WindowedSum(a, N); delayed = BackRef(raw, delay).
+
+    `raw` is also a graph output.  When the runtime slices time, the
+    optimizer must keep a local WindowedTempOutput for the BackRef source
+    rather than reusing the graph output buffer: warmup rows in each chunk
+    are computed but masked from graph-output stores.
+    """
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        raw = WindowedSum(a, N)
+        Output(raw, "raw")
+        Output(BackRef(raw, delay), "delayed")
+    return Function(builder.ops, name="output_backref_kernel")
+
+
+def build_func_fastwindowedsum(N: int) -> Function:
+    """ws = FastWindowedSum(a + b, N) — same windowed-sum semantics as
+    WindowedSum, but uses the stateful Kahan-corrected algorithm from
+    cpp/Kun/Ops.hpp."""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        Output(FastWindowedSum(Add(a, bin_), N), "ws")
+    return Function(builder.ops, name="fastwindowedsum_kernel")
+
+
+def build_func_accumulator() -> Function:
+    """Running count of timesteps where a > 0:
+
+       cnt[t] = cnt[t-1] + (a[t] > 0 ? 1 : 0)            (cnt[-1] = 0)
+
+    Built directly with Accumulator + SetAccumulator + ReturnFirstValue.
+    `is_whole_time_required=True` propagates `unreliable_count = -1` into
+    kunir.func; the runtime treats that as a hard "single chunk" signal.
+    """
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        cnt = Accumulator(a, "cnt", is_whole_time_required=True)
+        mask = GreaterThan(a, ConstantOp(0))
+        new_cnt = Select(mask, Add(cnt, ConstantOp(1)), cnt)
+        sa = SetAccumulator(cnt, mask, new_cnt)
+        Output(ReturnFirstValue([new_cnt, sa]), "cnt_out")
+    return Function(builder.ops, name="accumulator_kernel")
+
+
+def build_func_cmp_logical() -> Function:
+    """Single-graph multi-output factor that exercises every kunir cmp,
+    logical, and select op in one shot:
+
+      gt_out  = a > b  ? a : b              # element-wise max
+      lt_out  = a < b  ? a : b              # element-wise min
+      ge_out  = a >= b ? a : b              # max (tiebreaks to a)
+      le_out  = a <= b ? a : b              # min (tiebreaks to a)
+      eq_out  = a == b ? a : b              # always a where they match
+      and_out = (a > 0)  & (b > 0)  ? a : b # gt + and
+      or_out  = (a > 0)  | (b > 0)  ? a : b # gt + or
+      not_out = !(a > b) ? a : b            # = (a <= b) ? a : b
+    """
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        zero = ConstantOp(0)
+        Output(Select(GreaterThan(a, bin_), a, bin_), "gt_out")
+        Output(Select(LessThan(a, bin_),    a, bin_), "lt_out")
+        Output(Select(GreaterEqual(a, bin_), a, bin_), "ge_out")
+        Output(Select(LessEqual(a, bin_),    a, bin_), "le_out")
+        Output(Select(Equals(a, bin_),       a, bin_), "eq_out")
+        Output(Select(And(GreaterThan(a, zero), GreaterThan(bin_, zero)),
+                       a, bin_), "and_out")
+        Output(Select(Or(GreaterThan(a, zero), GreaterThan(bin_, zero)),
+                       a, bin_), "or_out")
+        Output(Select(Not(GreaterThan(a, bin_)), a, bin_), "not_out")
+    return Function(builder.ops, name="cmp_logical_kernel")
+
+
+def build_func_multipartition() -> Function:
+    """A graph with three independent outputs.  Combined with
+    `partition_factor=1` this drives `do_partition` to split into
+    multiple sub-Functions, each becoming its own kunir.func — the
+    primary thing this test exercises."""
+    builder = Builder()
+    with builder:
+        a = Input("a")
+        bin_ = Input("b")
+        Output(Add(a, bin_), "add_out")
+        Output(Mul(a, bin_), "mul_out")
+        Output(Sub(a, bin_), "sub_out")
+    return Function(builder.ops, name="multi")
+
+
+def _compare_post_warmup(out_h: np.ndarray, expected: np.ndarray,
+                            valid_start: int, atol: float) -> int:
+    """Validate kernel output against the reference on rows
+    `[valid_start:]`.  Fails loudly on **any** NaN in the kernel
+    output past the warmup region — the naive `np.abs(NaN-x).max() >
+    atol` form silently returns False because NaN comparisons are
+    False, which would let a multi-chunk regression slip through.
+    """
+    tail = out_h[valid_start:]
+    if np.isnan(tail).any():
+        nrows = int(np.unique(np.where(np.isnan(tail))[0]).size)
+        print(f"  FAIL — {nrows} of {tail.shape[0]} validated rows "
+               f"contain NaN past row {valid_start}", file=sys.stderr)
+        return 1
+    diff = np.abs(tail - expected[valid_start:])
+    max_abs = float(diff.max())
+    if max_abs > atol:
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max |Δ| = {max_abs:.3e} > {atol:.0e} at "
+               f"row {valid_start + idx[0]}, col {idx[1]}", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e})")
+    return 0
+
+
+def _run_one(label: str, build_fn, expected_fn, target: str, T: int, S: int,
+              atol: float = 1e-5) -> int:
+    """Compile a Function, launch it, validate against numpy."""
+    print(f"=== {label} ===")
+    f = build_fn()
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"out": out})
+    out_h = cp.asnumpy(out)
+
+    expected = expected_fn(a_h, b_h)
+    if not np.allclose(out_h, expected, atol=atol, equal_nan=True):
+        diff = np.abs(out_h - expected)
+        print(f"  FAIL — max abs diff {np.nanmax(diff)}", file=sys.stderr)
+        return 1
+    print(f"  ok — output matches reference on every cell ({T*S} cells)")
+    return 0
+
+
+def run_elemwise(target: str, T: int, S: int) -> int:
+    return _run_one("elemwise: out = (a+b)*a - b*b",
+                     build_func_elemwise,
+                     lambda a, b: (a + b) * a - b * b,
+                     target, T, S)
+
+
+def run_libdevice(target: str, T: int, S: int) -> int:
+    # `sign` differs between MLIR (math.copysign — keeps sign bit incl 0)
+    # and numpy.sign (returns 0 at 0).  With Gaussian inputs the
+    # sign-of-zero case has measure zero, so equality holds.
+    return _run_one("libdevice: out = log(abs(a)) * sign(b - a)",
+                     build_func_libdevice,
+                     lambda a, b: np.log(np.abs(a)) * np.sign(b - a),
+                     target, T, S, atol=1e-4)
+
+
+def run_backref(target: str, T: int, S: int, N: int) -> int:
+    print(f"=== backref: out = (a+b)[t - {N}] ===")
+    f = build_func_backref(N)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(2)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"out": out})
+    out_h = cp.asnumpy(out)
+
+    # Reference: out[t] = (a+b)[t-N] for t >= N; undefined for t < N.
+    c = a_h + b_h
+    diff = np.abs(out_h[N:] - c[: T - N])
+    max_abs = float(diff.max())
+    if max_abs > 1e-5:
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL — max abs diff {max_abs} at {idx}", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} on the {T - N} valid time steps")
+    return 0
+
+
+def run_fastwindowedsum(target: str, T: int, S: int, N: int) -> int:
+    print(f"=== fast_windowed_sum: ws = FastWindowedSum(a + b, N={N}) ===")
+    f = build_func_fastwindowedsum(N)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(3)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"ws": out})
+    out_h = cp.asnumpy(out)
+
+    # Reference matches WindowedSum (same window, no NaN inputs).
+    c = a_h + b_h
+    cumsum = np.cumsum(c, axis=0, dtype=np.float64)
+    expected = np.empty((T, S), dtype=np.float32)
+    expected[:N - 1] = np.nan
+    expected[N - 1] = cumsum[N - 1]
+    if T > N:
+        expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+
+    return _compare_post_warmup(out_h, expected, valid_start=N - 1,
+                                  atol=max(1e-3, 5e-7 * N))
+
+
+def run_multipartition(target: str, T: int, S: int) -> int:
+    """End-to-end test of the do_partition + post_optimize path:
+    three independent outputs forced into separate partitions by
+    `partition_factor=1`, each becoming a sibling kunir.func in the
+    generated gpu.module."""
+    print("=== multipartition: 3 outputs (add/mul/sub) split via "
+           "partition_factor=1 ===")
+    f = build_func_multipartition()
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    kcfg = KunCompilerConfig(input_layout="TS", output_layout="TS",
+                              partition_factor=1)
+
+    exe = compile_func(f, kcfg, ccfg)
+    print(f"  kernel_names           = {exe.kernel_names}")
+    print(f"  num_kernels            = {exe.num_kernels}")
+    print(f"  launch_order           = {exe.launch_order}")
+    print(f"  num_buffers            = {exe.num_buffers}")
+    print(f"  peak_intermediate_slots= {exe.peak_intermediate_slots}")
+
+    # The point of the test: the partitioner actually produced more
+    # than one kunir.func.  No intermediates because the three outputs
+    # are independent (each consumes only graph inputs).
+    assert exe.num_kernels >= 2, exe.num_kernels
+    assert exe.peak_intermediate_slots == 0, exe.peak_intermediate_slots
+    assert set(exe.input_names)  == {"a", "b"}
+    assert set(exe.output_names) == {"add_out", "mul_out", "sub_out"}
+
+    import cupy as cp
+    rng = np.random.default_rng(7)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    add_out = cp.zeros((T, S), dtype=cp.float32)
+    mul_out = cp.zeros((T, S), dtype=cp.float32)
+    sub_out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs={"add_out": add_out,
+                                 "mul_out": mul_out,
+                                 "sub_out": sub_out})
+
+    add_h = cp.asnumpy(add_out)
+    mul_h = cp.asnumpy(mul_out)
+    sub_h = cp.asnumpy(sub_out)
+    if not (np.allclose(add_h, a_h + b_h, atol=1e-5)
+            and np.allclose(mul_h, a_h * b_h, atol=1e-5)
+            and np.allclose(sub_h, a_h - b_h, atol=1e-5)):
+        print(f"  FAIL — at least one of add/mul/sub mismatch",
+                file=sys.stderr)
+        return 1
+    print(f"  ok — all 3 outputs match across {exe.num_kernels} kernels")
+    return 0
+
+
+def run_accumulator(target: str, T: int, S: int) -> int:
+    """End-to-end correctness of Accumulator + SetAccumulator +
+    ReturnFirstValue: cnt[t] = cnt[t-1] + (a[t] > 0 ? 1 : 0).
+
+    With default `sm_fill_factor` the runtime would normally split this
+    T-sized job into many chunks; the `is_whole_time_required=True` flag
+    on the Accumulator propagates `unreliable_count = -1` through the
+    kunir.func attr, and computeChunkPlan collapses to a single chunk.
+    A failure here means the sentinel path is broken — multi-chunk
+    accumulators silently reset across chunk boundaries."""
+    print(f"=== accumulator: cnt[t] = cnt[t-1] + (a[t] > 0)  "
+           f"(whole-time sentinel) ===")
+    f = build_func_accumulator()
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(13)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h)},
+                       outputs={"cnt_out": out})
+    out_h = cp.asnumpy(out)
+
+    expected = np.cumsum((a_h > 0).astype(np.float32), axis=0)
+    return _compare_post_warmup(out_h, expected, valid_start=0, atol=1e-5)
+
+
+def run_cmp_logical(target: str, T: int, S: int) -> int:
+    """End-to-end test for kunir.gt/ge/lt/le/eq + and/or/not + select.
+
+    Verifies a single graph with eight outputs against the obvious numpy
+    reference.  Exercises both bool-producing ops (cmp) and bool-consuming
+    ops (and/or/not/select) plus the i1 ts type round-tripping through the
+    kunir → kungpu lowering.
+    """
+    print("=== cmp/logical/select: 8 outputs exercising kunir bool ops ===")
+    f = build_func_cmp_logical()
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(11)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+
+    out_names = ["gt_out", "lt_out", "ge_out", "le_out",
+                  "eq_out", "and_out", "or_out", "not_out"]
+    outs = {n: cp.zeros((T, S), dtype=cp.float32) for n in out_names}
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       outputs=outs)
+
+    def ref(cond: np.ndarray) -> np.ndarray:
+        return np.where(cond, a_h, b_h)
+
+    zero = np.zeros_like(a_h)
+    expected = {
+        "gt_out":  ref(a_h >  b_h),
+        "lt_out":  ref(a_h <  b_h),
+        "ge_out":  ref(a_h >= b_h),
+        "le_out":  ref(a_h <= b_h),
+        "eq_out":  ref(a_h == b_h),
+        "and_out": ref((a_h > zero) & (b_h > zero)),
+        "or_out":  ref((a_h > zero) | (b_h > zero)),
+        "not_out": ref(~(a_h > b_h)),
+    }
+
+    rc = 0
+    for n in out_names:
+        out_h = cp.asnumpy(outs[n])
+        if not np.allclose(out_h, expected[n], atol=1e-5):
+            diff = np.abs(out_h - expected[n])
+            idx  = np.unravel_index(int(np.nanargmax(diff)), diff.shape)
+            print(f"  FAIL {n} — max |Δ|={float(diff.max()):.3e} at {idx}",
+                    file=sys.stderr)
+            rc = 1
+        else:
+            print(f"  ok {n}")
+    if rc == 0:
+        print(f"  ok — all 8 outputs match across {T*S} cells")
+    return rc
+
+
+def build_windowed(target: str, N: int):
+    """Compile `build_func_windowed(N)` once.  The returned executable
+    can be reused across multiple `test_windowed` invocations with
+    different T / S / mask (anything that doesn't change the graph
+    topology or window size N)."""
+    f = build_func_windowed(N)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  [build windowed N={N}] kernels={exe.kernel_names}  "
+           f"num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+    return exe
+
+
+def test_windowed(exe, T: int, S: int, N: int, mask: int = 0) -> int:
+    """Correctness check against numpy for the two outputs of
+    `build_func_windowed`:
+       ws        = WindowedSum(c, N)             — stateful fast_windowed_sum
+       ws_maxabs = max_k |c[t-k] - c[t]|         — hand-built ForeachBackWindow
+                                                    body that reads BOTH the
+                                                    block-arg (c[t-k]) AND the
+                                                    outer ts c (c[t]).
+       (c = a + b, k in [0..N-1])
+
+    With `mask > 0` the kernel only writes rows `[mask, T)` of every
+    output (rows `[0, mask)` are warmup and stay at the allocator's
+    initial value).  Exercises the multi-chunk + mask path (chunk-local
+    `t - loop_lb >= window` guard) for both outputs.
+
+    `exe` must have been compiled with the matching `N`.
+    """
+    assert 0 <= mask < T
+    mask_tag = f", mask={mask}" if mask else ""
+    print(f"=== windowed: ws = WindowedSum(a + b, N={N}){mask_tag}; "
+           f"ws_maxabs = max_k |c[t-k] - c[t]|  (c = a+b) ===")
+
+    import cupy as cp
+    rng = np.random.default_rng(1)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    # Output is the same shape as input; the binding leaves rows
+    # `[0, mask)` untouched.
+    ws_out     = cp.zeros((T, S), dtype=cp.float32)
+    maxabs_out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       mask=mask,
+                       outputs={"ws": ws_out, "ws_maxabs": maxabs_out})
+    ws_h     = cp.asnumpy(ws_out)
+    maxabs_h = cp.asnumpy(maxabs_out)
+
+    # Reference is the full-T factor: output[t] = factor at time t.
+    c = a_h + b_h
+    cumsum = np.cumsum(c, axis=0, dtype=np.float64)
+    ws_expected = np.empty((T, S), dtype=np.float32)
+    ws_expected[:N - 1] = np.nan
+    ws_expected[N - 1] = cumsum[N - 1]
+    if T > N:
+        ws_expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+
+    maxabs_expected = np.empty((T, S), dtype=np.float32)
+    maxabs_expected[:N - 1] = np.nan
+    for t in range(N - 1, T):
+        window = c[t - N + 1 : t + 1]                     # (N, S)
+        maxabs_expected[t] = np.max(np.abs(window - c[t]), axis=0)
+
+    # Valid-from-row: the later of the kernel-written region (mask) and
+    # the windowed-op warmup (N - 1).
+    valid_start = max(mask, N - 1)
+    rc = 0
+    rc |= _compare_post_warmup(ws_h, ws_expected,
+                                  valid_start=valid_start,
+                                  atol=max(1e-3, 5e-7 * N))
+    rc |= _compare_post_warmup(maxabs_h, maxabs_expected,
+                                  valid_start=valid_start, atol=1e-5)
+    return rc
+
+
+def run_backref_with_mask(target: str, T: int, S: int, N: int,
+                              mask: int) -> int:
+    """Same BackRef(a+b, N) graph as `run_backref`, but driven with a
+    non-zero `mask`.  Picked over `WindowedSum` for the mask test
+    BackRef is stateless along the time axis (each output is a gmem
+    load at offset -N), so this case isolates the mask/warmup
+    interaction from any rolling-state concerns.  The windowed sum
+    counterpart below covers the stateful path.
+    """
+    print(f"=== backref + mask: out = (a+b)[t - {N}], mask={mask} ===")
+    assert 0 < mask < T, "test requires 0 < mask < T"
+    f = build_func_backref(N)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(4)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    # Output same shape as input; the binding leaves rows `[0, mask)`
+    # untouched.
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h), "b": cp.asarray(b_h)},
+                       mask=mask,
+                       outputs={"out": out})
+    out_h = cp.asnumpy(out)
+
+    # Reference: expected[t] = (a+b)[t-N] for t ≥ N; NaN for t < N.
+    c = a_h + b_h
+    expected = np.empty((T, S), dtype=np.float32)
+    expected[:N] = np.nan
+    expected[N:] = c[:T - N]
+    # Valid-from-row: later of the kernel-written region (mask) and the
+    # BackRef warmup (N).
+    valid_start = max(mask, N)
+    return _compare_post_warmup(out_h, expected,
+                                  valid_start=valid_start, atol=1e-5)
+
+
+def run_output_backref_multichunk(target: str, T: int, S: int,
+                                    N: int) -> int:
+    """Regression test for TempWindowElim under time slicing.
+
+    The graph outputs `raw = WindowedSum(a, N)` and also consumes `raw`
+    through `BackRef(raw, 2)`.  With multi-chunk launches, replacing the
+    BackRef's local temp window with the graph output buffer is wrong:
+    chunk warmup rows are intentionally not stored to graph outputs, and
+    peer chunks are not globally synchronized inside one kernel launch.
+    """
+    delay = 2
+    print(f"=== output-backed BackRef regression: raw=WindowedSum(a, N={N}), "
+           f"delayed=raw[t-{delay}] ===")
+    f = build_func_output_backref(N, delay)
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+
+    exe = compile_func(f, _KCFG_TS, ccfg)
+    print(f"  kernels={exe.kernel_names}  num_buffers={exe.num_buffers}  "
+           f"peak_intermediate_slots={exe.peak_intermediate_slots}")
+
+    import cupy as cp
+    rng = np.random.default_rng(17)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    raw_out = cp.zeros((T, S), dtype=cp.float32)
+    delayed_out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                       inputs={"a": cp.asarray(a_h)},
+                       outputs={"raw": raw_out, "delayed": delayed_out})
+    raw_h = cp.asnumpy(raw_out)
+    delayed_h = cp.asnumpy(delayed_out)
+
+    cumsum = np.cumsum(a_h, axis=0, dtype=np.float64)
+    raw_expected = np.empty((T, S), dtype=np.float32)
+    raw_expected[:N - 1] = np.nan
+    raw_expected[N - 1] = cumsum[N - 1]
+    if T > N:
+        raw_expected[N:] = (cumsum[N:] - cumsum[:-N]).astype(np.float32)
+
+    delayed_expected = np.empty((T, S), dtype=np.float32)
+    delayed_expected[:N - 1 + delay] = np.nan
+    delayed_expected[N - 1 + delay:] = raw_expected[N - 1:T - delay]
+
+    rc = 0
+    rc |= _compare_post_warmup(raw_h, raw_expected,
+                                  valid_start=N - 1,
+                                  atol=max(1e-3, 5e-7 * N))
+    rc |= _compare_post_warmup(delayed_h, delayed_expected,
+                                  valid_start=N - 1 + delay,
+                                  atol=max(1e-3, 5e-7 * N))
+    return rc
+
+
+def run_library(target: str, T: int, S: int) -> int:
+    """Exercise the multi-Function `compileit` shape and `Library.getModule`,
+    plus the auto-allocated-output path on `Executor.runGraph` (omitting
+    `outputs=` so the binding allocates fresh nb::ndarrays for every
+    graph output).
+
+    Two independent functions are compiled into one `Library`:
+      * elemwise_kernel : out = (a+b)*a - b*b
+      * libdevice_kernel: out = log(abs(a)) * sign(b - a)
+    """
+    print("=== library: multi-Function compileit + Library.getModule + "
+           "auto-allocated outputs ===")
+    ccfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    funclist = [
+        ("elemwise_kernel",  build_func_elemwise(),  _KCFG_TS),
+        ("libdevice_kernel", build_func_libdevice(), _KCFG_TS),
+    ]
+    lib = compileit(funclist, "test_library", ccfg)
+    print(f"  library modules = {lib.names}")
+    assert set(lib.names) == {"elemwise_kernel", "libdevice_kernel"}, lib.names
+
+    import cupy as cp
+    rng = np.random.default_rng(31)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    inputs = {"a": cp.asarray(a_h), "b": cp.asarray(b_h)}
+
+    executor = KunMLIR.Executor()
+    rc = 0
+    expected_by_name = {
+        "elemwise_kernel":  (a_h + b_h) * a_h - b_h * b_h,
+        "libdevice_kernel": np.log(np.abs(a_h)) * np.sign(b_h - a_h),
+    }
+    tol_by_name = {"elemwise_kernel": 1e-5, "libdevice_kernel": 1e-4}
+    for mod_name, expected in expected_by_name.items():
+        exe = lib.getModule(mod_name)
+        # No `outputs=`: the binding auto-allocates a CUDA buffer for "out"
+        # and hands it back in the returned dict.  Re-wrap via DLPack so
+        # cupy treats it as a managed cupy array we can copy back to host.
+        ret = executor.runGraph(exe, inputs=inputs)
+        assert set(ret.keys()) == {"out"}, ret.keys()
+        out_h = cp.asnumpy(cp.from_dlpack(ret["out"]))
+        if not np.allclose(out_h, expected,
+                            atol=tol_by_name[mod_name], equal_nan=True):
+            diff = np.abs(out_h - expected)
+            print(f"  FAIL {mod_name} — max abs diff "
+                   f"{np.nanmax(diff):.3e}", file=sys.stderr)
+            rc = 1
+        else:
+            print(f"  ok {mod_name} — auto-allocated output matches reference")
+
+    # Library getModule on an unknown name must raise.
+    try:
+        lib.getModule("does_not_exist")
+        print("  FAIL — getModule('does_not_exist') should have raised",
+                file=sys.stderr)
+        rc = 1
+    except RuntimeError:
+        print("  ok — getModule on unknown name raised")
+    return rc
+
+
+def main() -> int:
+    """Parse the CLI options, run every end-to-end case in a fixed order,
+    and return the bitwise OR of their return codes (0 only when every
+    case returned 0).
+
+    Options: `--target` (falls back to `get_cuda_compute_capability()` when
+    omitted), `-T/--time-length`, `-S/--num-stocks`, `-N/--window`.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None)
+    # Defaults sized to comfortably trigger multi-chunk: T=128 with
+    # warmup=5 (N) gives `cap_warmup = 128/(4*5) = 6` chunks; S=1024
+    # gives `stock_tiles = 1024/(4*32) = 8`, so even on a small GPU
+    # the sm-fill target ≥ 2 — well inside the multi-chunk regime.
+    ap.add_argument("-T", "--time-length", type=int, default=128)
+    ap.add_argument("-S", "--num-stocks", type=int, default=1024)
+    ap.add_argument("-N", "--window", type=int, default=5)
+    args = ap.parse_args()
+
+    import cupy as cp
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
+    # Select device 0 and make one tiny allocation before any case runs.
+    # NOTE(review): presumably this forces CUDA context creation up front —
+    # confirm.
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    # Each case returns 0 or 1; OR-ing keeps going after a failure so all
+    # cases are reported in one run.
+    rc = 0
+    rc |= run_elemwise(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_libdevice(args.target, args.time_length, args.num_stocks)
+    print()
+    # Build once for N=args.window, reuse across the mask=0 and mask=3
+    # validations (graph topology + window size are the same; only T/S/mask
+    # differ at run time).
+    windowed_exe = build_windowed(args.target, args.window)
+    rc |= test_windowed(windowed_exe, args.time_length, args.num_stocks,
+                          args.window)
+    print()
+    rc |= run_backref(args.target, args.time_length, args.num_stocks, args.window)
+    print()
+    # Mask smaller than the window, so the post-mask output still
+    # contains unreliable rows — exercises both warmup overlap (chunks
+    # ≥ 1 prime by reading back `unreliable_count` steps) AND the
+    # mask-skip-vs-warmup-skip distinction on chunk 0.  Two graphs:
+    # stateless BackRef and stateful WindowedSum / fast_windowed_sum.
+    rc |= run_backref_with_mask(args.target, args.time_length, args.num_stocks,
+                                  args.window, mask=3)
+    print()
+    rc |= run_output_backref_multichunk(args.target, args.time_length,
+                                          args.num_stocks, args.window)
+    print()
+    rc |= test_windowed(windowed_exe, args.time_length, args.num_stocks,
+                          args.window, mask=3)
+    print()
+    rc |= run_fastwindowedsum(args.target, args.time_length, args.num_stocks,
+                                args.window)
+    print()
+    # Single-chunk fallback corner case: warmup so large relative to T
+    # that `cap_warmup = T/(K*N) = 64/(4*20) = 0` clamps num_chunks to 1.
+    # Exercises the multi-chunk kernel binary in its degenerate
+    # grid_y=1 launch configuration — guards against regressions in
+    # time_lb / time_ub / write-gating when `chunk_size = T`.  Different
+    # N → fresh build.
+    windowed_exe_n20 = build_windowed(args.target, N=20)
+    rc |= test_windowed(windowed_exe_n20, T=64, S=args.num_stocks,
+                          N=20, mask=1)
+    print()
+    rc |= run_multipartition(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_accumulator(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_cmp_logical(args.target, args.time_length, args.num_stocks)
+    print()
+    rc |= run_library(args.target, args.time_length, args.num_stocks)
+    return rc
+
+
+# Script entry point: main()'s OR-ed return code becomes the exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_multi_kernel.py b/mlir/test/python/test_multi_kernel.py
new file mode 100644
index 0000000..a4cea7a
--- /dev/null
+++ b/mlir/test/python/test_multi_kernel.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# RUN: %python %s --use-cuda-graph
+# REQUIRES: cuda-device
+"""End-to-end test for the v0 multi-kernel pipeline.
+
+Builds a graph with two kernels chained through one intermediate buffer:
+
+    add_kernel:    tmp = a + b
+    scale_kernel:  out = tmp * c
+
+graph_inputs  = ["a", "b", "c"]
+graph_outputs = ["out"]
+intermediate  = "tmp"  → 1 slot expected
+
+Verifies the compile-time topology / slot plan, then runs the kernels and
+checks the result against numpy.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+import textwrap
+
+import numpy as np
+
+
+SAMPLE_KUNIR = textwrap.dedent("""
+gpu.module @kungpu_kernels {
+  kunir.func @add_kernel(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"tmp"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+
+  kunir.func @scale_kernel(%t: !kunir.ts<f32, inf>, %c: !kunir.ts<f32, inf>)
+      inputs {%t = "tmp", %c = "c"}
+      outputs {"out"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+      -> !kunir.ts<f32, 1> {
+    %s = kunir.mul %t, %c : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %s : !kunir.ts<f32, 1>
+  }
+}
+""").strip()  # two kernels; buffer "tmp" links add_kernel to scale_kernel
+
+
+def main() -> int:
+    """Compile the two-kernel graph, check its plan, run it twice; 0 = pass."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None)
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    ap.add_argument("--use-cuda-graph", action="store_true")
+    args = ap.parse_args()
+
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import find_cuda_toolkit
+    from KunQuant.jit.env import get_cuda_compute_capability
+
+    args.target = args.target or get_cuda_compute_capability()
+    toolkit = find_cuda_toolkit()
+
+    import cupy as cp
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    print("=== compile two-kernel graph ===")
+    mod = KunMLIR.parse(SAMPLE_KUNIR)
+    exe = KunMLIR.compile(mod,
+                            graph_inputs=["a", "b", "c"],
+                            graph_outputs=["out"],
+                            gpu_arch=args.target, opt_level=3,
+                            toolkit_path=toolkit)
+
+    print(f"  kernel_names           = {exe.kernel_names}")
+    print(f"  num_kernels            = {exe.num_kernels}")
+    print(f"  launch_order           = {exe.launch_order}")
+    print(f"  num_buffers            = {exe.num_buffers}")
+    print(f"  peak_intermediate_slots= {exe.peak_intermediate_slots}")
+    print(f"  input_names            = {exe.input_names}")
+    print(f"  output_names           = {exe.output_names}")
+
+    # Topology checks.
+    assert exe.num_kernels == 2, exe.num_kernels
+    assert set(exe.kernel_names) == {"add_kernel", "scale_kernel"}, exe.kernel_names
+    # Producer (add) must come before consumer (scale).
+    add_pos = exe.launch_order.index(exe.kernel_names.index("add_kernel"))
+    scl_pos = exe.launch_order.index(exe.kernel_names.index("scale_kernel"))
+    assert add_pos < scl_pos, (exe.kernel_names, exe.launch_order)
+    # 3 graph inputs + 1 graph output + 1 intermediate.
+    assert exe.num_buffers == 5, exe.num_buffers
+    # One intermediate ("tmp") → exactly one slot.
+    assert exe.peak_intermediate_slots == 1, exe.peak_intermediate_slots
+    assert exe.input_names  == ["a", "b", "c"]
+    assert exe.output_names == ["out"]
+
+    # === launch ===
+    T, S = args.time_length, args.num_stocks
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    c_h = rng.standard_normal((T, S), dtype=np.float32)
+    a = cp.asarray(a_h)
+    b = cp.asarray(b_h)
+    c = cp.asarray(c_h)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    # Default-stream Executor — the simplest path.  No explicit sync needed:
+    # cp.asnumpy's D2H memcpy goes through cupy's default stream (= legacy
+    # stream 0), the same stream our kernels are queued on.
+    executor = KunMLIR.Executor()
+    print()
+    print(f"=== launch ({T} × {S}) — default stream ===")
+    print(f"  executor.stream = {executor.stream}  (0 ↔ CUDA default)")
+    executor.runGraph(exe, {"a": a, "b": b, "c": c},
+                      outputs={"out": out},
+                      use_cuda_graph=args.use_cuda_graph)
+    out_h = cp.asnumpy(out)
+
+    expected = (a_h + b_h) * c_h
+    if not np.allclose(out_h, expected, atol=1e-5):
+        diff = np.abs(out_h - expected)
+        print(f"  FAIL — max abs diff {diff.max()}, "
+                f"argmax @ {np.unravel_index(diff.argmax(), diff.shape)}",
+                file=sys.stderr)
+        return 1
+
+    print(f"  ok — output matches (a+b)*c on every (t, s) cell ({T*S} cells)")
+
+    # === second launch: new shape on a user-supplied cupy stream ===
+    # Exercises slot-pool re-alloc on shape change and CUstream injection.
+    T2, S2 = T // 2, S + 64
+    a2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
+    b2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
+    c2 = cp.asarray(rng.standard_normal((T2, S2), dtype=np.float32))
+    out2 = cp.zeros((T2, S2), dtype=cp.float32)
+    # a2/b2/c2/out2 are filled by work enqueued on cupy's current stream,
+    # which may still be in flight; a non-blocking stream does not wait
+    # for it implicitly, so drain it before the kernels touch the buffers.
+    cp.cuda.get_current_stream().synchronize()
+    cp_stream = cp.cuda.Stream(non_blocking=True)
+    executor2 = KunMLIR.Executor(stream=cp_stream)   # cupy stream injected
+    print()
+    print(f"=== launch ({T2} × {S2}) — cupy stream {hex(cp_stream.ptr)} ===")
+    print(f"  executor.stream = {hex(executor2.stream)}")
+    assert executor2.stream == cp_stream.ptr, (executor2.stream, cp_stream.ptr)
+    executor2.runGraph(exe, {"a": a2, "b": b2, "c": c2},
+                       outputs={"out": out2},
+                       use_cuda_graph=args.use_cuda_graph)
+    # Sync is REQUIRED: cp_stream is non-blocking, so cp.asnumpy's D2H copy
+    # on cupy's default stream would not otherwise wait for our kernels.
+    executor2.synchronize()
+    out2_h = cp.asnumpy(out2)
+    expected2 = (cp.asnumpy(a2) + cp.asnumpy(b2)) * cp.asnumpy(c2)
+    if not np.allclose(out2_h, expected2, atol=1e-5):
+        diff = np.abs(out2_h - expected2)
+        print(f"  FAIL — max abs diff {diff.max()}", file=sys.stderr)
+        return 1
+    print(f"  ok — re-launched on cupy stream, output matches")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_overlap_runner.py b/mlir/test/python/test_overlap_runner.py
new file mode 100644
index 0000000..3615bdc
--- /dev/null
+++ b/mlir/test/python/test_overlap_runner.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# RUN: %python %s --use-cuda-graph
+# REQUIRES: cuda-device
+"""Regression test for KunQuantMLIR.OverlapRunner.
+
+The test submits more runs than there are runner slots, while changing both
+the time length and the stock count. This exercises slot reuse, cached device
+output reallocation, CUDA graph state rebuild/update, and the host output block
+that is returned as per-output NumPy slices.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+import textwrap
+from dataclasses import dataclass
+
+import numpy as np
+
+
+SAMPLE_KUNIR = textwrap.dedent("""
+gpu.module @kungpu_kernels {
+  kunir.func @overlap_runner_kernel(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {%a = "a", %b = "b"}
+      outputs {"sum", "diff"}
+      target {occupancy = 1, warps_per_cta = 4, smem_size = 49152, vector_size = 1} unreliable_count = 0
+      -> (!kunir.ts<f32, 1>, !kunir.ts<f32, 1>) {
+    %sum = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    %diff = kunir.sub %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    kunir.return %sum, %diff : !kunir.ts<f32, 1>, !kunir.ts<f32, 1>
+  }
+}
+""").strip()  # one kernel with two outputs: "sum" = a + b, "diff" = a - b
+
+
+@dataclass
+class Case:
+    label: str          # name printed in the "submitted" / "ok" log lines
+    time_length: int    # first dimension of each generated input
+    num_stocks: int     # second dimension of each generated input
+    length_arg: int     # forwarded to runner.submit(length=...)
+
+
+def make_inputs(case: Case, seed: int) -> dict[str, np.ndarray]:
+    """Seeded float32 standard-normal inputs "a" and "b", each of shape
+    (case.time_length, case.num_stocks)."""
+    shape = (case.time_length, case.num_stocks)
+    gen = np.random.default_rng(seed)
+    # Iteration order fixes the draw order: "a" is sampled before "b".
+    return {key: gen.standard_normal(shape, dtype=np.float32)
+            for key in ("a", "b")}
+
+
+def expected_outputs(inputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
+    """NumPy reference for the kernel: "sum" = a + b and "diff" = a - b."""
+    lhs, rhs = inputs["a"], inputs["b"]
+    # Keys are inserted in the order "sum", then "diff".
+    return dict(sum=lhs + rhs, diff=lhs - rhs)
+
+
+def check_outputs(case: Case, actual: dict[str, np.ndarray],
+                  expected: dict[str, np.ndarray]) -> None:
+    """Check outputs are non-owning float32 (T, S) arrays matching expected."""
+    want_shape = (case.time_length, case.num_stocks)
+    assert {"sum", "diff"} == set(actual), actual.keys()
+    for key, got in ((k, actual[k]) for k in ("sum", "diff")):
+        assert got.shape == want_shape, (case, key, got.shape)
+        assert got.dtype == np.float32, (case, key, got.dtype)
+        assert got.flags.c_contiguous, (case, key, got.strides)
+        assert not got.flags.owndata, (case, key)
+        np.testing.assert_allclose(got, expected[key], rtol=1e-6, atol=1e-6)
+
+
+def build_cases(base_time: int, base_stocks: int) -> list[Case]:
+    rows = [  # (label, time_length, num_stocks, pass time_length as length_arg?)
+        ("infer-initial", base_time, base_stocks, False),
+        ("explicit-length-change", base_time + 7, base_stocks, True),
+        ("stock-count-change", max(8, base_time - 5), base_stocks + 37, True),
+        ("infer-both-change", base_time + 3, base_stocks + 79, False),
+        ("explicit-shorter-shape", max(8, base_time // 2),
+         max(8, base_stocks - 11), True),
+    ]
+    # length_arg repeats time_length when given explicitly, otherwise it is 0.
+    return [Case(tag, t, s, t if given else 0) for tag, t, s, given in rows]
+
+
+def main() -> int:
+    """Submit all cases to a 2-slot OverlapRunner, then verify each result."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None)
+    ap.add_argument("-T", "--time-length", type=int, default=32)
+    ap.add_argument("-S", "--num-stocks", type=int, default=257)
+    ap.add_argument("--use-cuda-graph", action="store_true")
+    args = ap.parse_args()
+
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import find_cuda_toolkit
+    from KunQuant.jit.env import get_cuda_compute_capability
+    from KunQuantMLIR.OverlapRunner import OverlapRunner
+    import cupy as cp
+
+    args.target = args.target or get_cuda_compute_capability()
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    mod = KunMLIR.parse(SAMPLE_KUNIR)
+    exe = KunMLIR.compile(mod,
+                          graph_inputs=["a", "b"],
+                          graph_outputs=["sum", "diff"],
+                          gpu_arch=args.target, opt_level=3,
+                          toolkit_path=find_cuda_toolkit())
+    assert exe.output_names == ["sum", "diff"], exe.output_names
+
+    compute_stream = cp.cuda.Stream(non_blocking=True)
+    executor = KunMLIR.Executor(stream=compute_stream)
+    runner = OverlapRunner(exe, executor, num_slots=2)
+
+    print("=== overlap runner ===")
+    print(f"  target={args.target}  use_cuda_graph={args.use_cuda_graph}")
+    print(f"  executor.stream={hex(executor.stream)}")
+    # Queue all five cases before waiting on any (the runner has two slots).
+    pending = []
+    for i, case in enumerate(build_cases(args.time_length, args.num_stocks)):
+        inputs = make_inputs(case, seed=100 + i)
+        result = runner.submit(inputs,
+                               length=case.length_arg,
+                               use_cuda_graph=args.use_cuda_graph)
+        pending.append((case, inputs, result))
+        print(f"  submitted {case.label}: T={case.time_length}, "
+              f"S={case.num_stocks}, length={case.length_arg}")
+    # Collect in submission order and compare each result with NumPy.
+    for case, inputs, result in pending:
+        actual = result.wait()
+        check_outputs(case, actual, expected_outputs(inputs))
+        print(f"  ok {case.label}")
+
+    runner.synchronize()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_validation_cuda.py b/mlir/test/python/test_validation_cuda.py
new file mode 100644
index 0000000..e516e4b
--- /dev/null
+++ b/mlir/test/python/test_validation_cuda.py
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
+"""Negative tests for the KunMLIR launch-time validation path.
+
+The runtime consumes every input/output via DLPack (the protocol
+implemented by CuPy / PyTorch / JAX / TensorFlow).  This file
+exercises:
+
+  * DLPack field validation — wrong dtype, wrong ndim, non-contiguous
+    strided view, host-only ndarray (DLPack CPU device), object that
+    implements neither protocol at all.
+  * Graph-arg checks — missing kwarg, unknown kwarg, cross-arg shape
+    mismatch.
+  * cs_rank dynamic-smem cap — pick `num_stocks` exceeding the device's
+    MAX_SHARED_MEMORY_PER_BLOCK_OPTIN and assert the runtime fails with
+    the GPU-aware message instead of letting cuLaunchKernel emit a
+    generic CUDA_ERROR_INVALID_VALUE; the at-cap case must still launch.
+  * DLPack-only producer — verify it works when the object hides CAI
+    behind a wrapper, since DLPack is the path we rely on for non-CuPy
+    frameworks.
+
+`_expect_fail` returns 0 if the right error fires, 1 otherwise.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+
+import numpy as np
+
+from KunQuant.Driver import KunCompilerConfig
+
+
+_KCFG_TS = KunCompilerConfig(input_layout="TS", output_layout="TS")  # shared by both builders
+
+
+# ── Fixture helpers ──────────────────────────────────────────────────
+
+def _build_elemwise_exe(cfg):
+    """Compile out = Add(a, b); the fixture for the arg-validation tests."""
+    from KunQuant.Op import Builder, Input, Output
+    from KunQuant.ops import Add
+    from KunQuant.Stage import Function
+    from KunQuant.jit.cuda import compile_func
+    builder = Builder()
+    with builder:
+        lhs = Input("a")
+        rhs = Input("b")
+        Output(Add(lhs, rhs), "out")
+    return compile_func(Function(builder.ops, name="addk"), _KCFG_TS, cfg)
+
+
+def _build_cs_rank_exe(cfg):
+    """Compile r = Rank(a); the fixture for the cs_rank smem-cap test."""
+    from KunQuant.Op import Builder, Input, Output, Rank
+    from KunQuant.Stage import Function
+    from KunQuant.jit.cuda import compile_func
+    builder = Builder()
+    with builder:
+        ranked = Rank(Input("a"))
+        Output(ranked, "r")
+    return compile_func(Function(builder.ops, name="csr"), _KCFG_TS, cfg)
+
+
+def _expect_fail(label, fn, needle):   # 0 = expected error seen, 1 = not
+    print(f"  {label} ...", end=" ", flush=True)   # verdict follows on this line
+    try:
+        fn()
+    except Exception as e:
+        msg = str(e)
+        if needle in msg:   # `or [""]` below: an empty message has no first line
+            print(f"ok (raised: {(msg.splitlines() or [''])[0][:100]})")
+            return 0
+        print(f"FAIL — wrong message: {msg!r}", file=sys.stderr)
+        return 1
+    print("FAIL — no exception raised", file=sys.stderr)
+    return 1
+
+
+# ── DLPack / arg-validation test set ────────────────────────────────
+
+def run_validation_tests(target):
+    """Run every launch-validation case; return 0 iff all of them pass."""
+    import cupy as cp
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import CudaCompilerConfig
+    print("=== DLPack + arg validation ===")
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = _build_elemwise_exe(cfg)
+    ex  = KunMLIR.Executor()
+    T, S = 4, 32
+
+    rc = 0
+    a   = cp.zeros((T, S), dtype=cp.float32)
+    b   = cp.zeros((T, S), dtype=cp.float32)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    # 1. Object implementing neither CAI nor DLPack (a plain int)
+    rc |= _expect_fail(
+        "object without __dlpack__ rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": 0xdeadbeef, "b": b},
+                            outputs={"out": out}),
+        "does not implement __dlpack__")
+
+    # 2. Host numpy array — numpy is a CPU-only producer; it refuses
+    #    `stream != None` (our binding always passes the executor's
+    #    CUDA stream).  The error comes from numpy, not from us, but
+    #    the effect is what we want: host arrays can't sneak into a
+    #    GPU launch.
+    rc |= _expect_fail(
+        "host numpy array rejected (CPU producer)",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": np.zeros((T, S), dtype=np.float32),
+                                    "b": b},
+                            outputs={"out": out}),
+        "stream")
+
+    # 3. Wrong dtype: float64
+    rc |= _expect_fail(
+        "f64 dtype rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": cp.zeros((T, S), dtype=cp.float64),
+                                    "b": b},
+                            outputs={"out": out}),
+        "kernel expects float32")
+
+    # 4. Wrong ndim: 1-D
+    rc |= _expect_fail(
+        "1-D array rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": cp.zeros((T*S,), dtype=cp.float32),
+                                    "b": b},
+                            outputs={"out": out}),
+        "must be 2-D")
+
+    # 5. Wrong ndim: 3-D
+    rc |= _expect_fail(
+        "3-D array rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": cp.zeros((T, S, 1), dtype=cp.float32),
+                                    "b": b},
+                            outputs={"out": out}),
+        "must be 2-D")
+
+    # 6. Non-contiguous strided view (transpose).  (T, S) and (S, T) are
+    #    different shapes, so build matching transposed b/out too.
+    a_t = a.T                                          # (S, T) view of (T, S)
+    b_t = cp.zeros((S, T), dtype=cp.float32)
+    out_t = cp.zeros((S, T), dtype=cp.float32)
+    rc |= _expect_fail(
+        "non-contiguous transposed view rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a_t, "b": b_t},
+                            outputs={"out": out_t}),
+        "not C-contiguous")
+
+    # 7. Missing graph input.  Outputs may be omitted by design: the
+    #    binding auto-allocates them and returns the buffer dict.
+    rc |= _expect_fail(
+        "missing graph_input rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a},
+                            outputs={"out": out}),
+        "missing input 'b'")
+
+    # 8. Shape mismatch between args
+    rc |= _expect_fail(
+        "shape mismatch rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a,
+                                    "b": cp.zeros((T, S+1), dtype=cp.float32)},
+                            outputs={"out": out}),
+        "expected")
+
+    # 9. Unknown kwarg (the hot-path skip kicks in for size == ordered,
+    #    so add a real extra to trip the strict check).
+    rc |= _expect_fail(
+        "unknown argument rejected",
+        lambda: ex.runGraph(exe,
+                            inputs={"a": a, "b": b, "bogus": a},
+                            outputs={"out": out}),
+        "unexpected input 'bogus'")
+
+    # 10. DLPack-only producer — wrap a cupy ndarray and hide every
+    #     attribute except __dlpack__ / __dlpack_device__.  Verifies the
+    #     binding works for objects that don't quack like CuPy (e.g.
+    #     JAX, TF, custom buffers).
+    class DLOnly:
+        def __init__(self, arr):
+            self._arr = arr
+        def __dlpack__(self, stream=None):
+            return self._arr.__dlpack__(stream=stream)
+        def __dlpack_device__(self):
+            return self._arr.__dlpack_device__()
+
+    print("  dlpack-only producer happy path ...", end=" ", flush=True)
+    try:
+        ex.runGraph(exe,
+                    inputs={"a": DLOnly(a), "b": DLOnly(b)},
+                    outputs={"out": DLOnly(out)})
+        ex.synchronize()
+        print("ok")
+    except Exception as e:
+        print(f"FAIL — DLPack-only happy path raised: {e}", file=sys.stderr)
+        rc |= 1
+
+    return rc
+
+
+# ── cs_rank smem-cap test ────────────────────────────────────────────
+
+def run_smem_cap_tests(target):
+    """cs_rank smem cap: one stock over must fail, exactly at cap must run."""
+    import cupy as cp
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import CudaCompilerConfig
+    print("=== cs_rank smem cap ===")
+    cfg = CudaCompilerConfig(gpu_arch=target, warps_per_cta=4)
+    exe = _build_cs_rank_exe(cfg)
+    ex  = KunMLIR.Executor()
+    rc = 0
+
+    # CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97
+    dev = cp.cuda.Device(0)
+    try:
+        max_smem = cp.cuda.runtime.deviceGetAttribute(97, dev.id)
+    except Exception:
+        max_smem = 49152    # conservative fallback
+    too_many = max_smem // 4 + 1   # one stock past the float32 cap
+    print(f"  device max_smem={max_smem} bytes; using num_stocks={too_many} "
+          f"(needs {too_many*4} bytes)")
+
+    T = 2
+    a   = cp.zeros((T, too_many), dtype=cp.float32)
+    out = cp.zeros((T, too_many), dtype=cp.float32)
+    rc |= _expect_fail(
+        "smem cap exceeded → clear error",
+        lambda: ex.runGraph(exe, inputs={"a": a}, outputs={"r": out}),
+        "MAX_SHARED_MEMORY_PER_BLOCK_OPTIN")
+
+    # At-cap case must still launch (off-by-one regression guard).
+    at_limit = max_smem // 4
+    a2   = cp.zeros((T, at_limit), dtype=cp.float32)
+    out2 = cp.zeros((T, at_limit), dtype=cp.float32)
+    print(f"  at-cap launch (num_stocks={at_limit}) ...", end=" ", flush=True)
+    try:
+        ex.runGraph(exe, inputs={"a": a2}, outputs={"r": out2})
+        ex.synchronize()
+        print("ok")
+    except Exception as e:
+        print(f"FAIL — at-cap should succeed but got: {e}", file=sys.stderr)
+        rc |= 1
+    return rc
+
+
+# ── main ─────────────────────────────────────────────────────────────
+
+def main() -> int:
+    """Run both validation suites; the return value ORs their result codes."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--target", default=None)
+    args = ap.parse_args()
+    import cupy as cp
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    rc = 0
+    rc |= run_validation_tests(args.target)
+    print()
+    rc |= run_smem_cap_tests(args.target)
+    print()
+    print("=== all tests done ===")
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/test_windowed_temp.py b/mlir/test/python/test_windowed_temp.py
new file mode 100644
index 0000000..8b83483
--- /dev/null
+++ b/mlir/test/python/test_windowed_temp.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# RUN: %python %s
+# REQUIRES: cuda-device
+"""End-to-end test for the windowed_temp lowering across both placements
+the memory-planning pass can choose:
+
+    sum_window(a, b, N)[t][s] = sum_{i=0}^{N-1} ( a[t-i][s] + b[t-i][s] )
+
+is compiled twice — once with a small N (fits in shared memory) and once
+with a large N (spills to local memory) — and each run is checked
+against a numpy reference.  Memory-planning's per-block budget for our
+target_spec is
+
+    bytes / windowed_temp = N * (warps_per_cta * 32) * vector_size * 4
+                          = N * 128 * 4         (warps_per_cta = 4)
+                          = 512 * N
+
+so with smem_size = 49152 the cutoff is N ≤ 96 → smem, N > 96 → local.
+We pick N = 5 and N = 200 to bracket that.
+"""
+
+from __future__ import annotations
+import argparse
+import sys
+import textwrap
+
+import numpy as np
+
+
+def build_ir(N: int, warps_per_cta: int = 4, smem_size: int = 49152) -> str:
+    """Return kunir text for a rolling sum of (a + b) over a window of N.
+
+    N sets the window length and `unreliable_count`; warps_per_cta and
+    smem_size are substituted into the kernel's `target {...}` attribute.
+    `unreliable_count = N` mirrors KunQuant's `infer_window` policy of
+    summing op windows along the chain.  The runtime uses it to back up
+    `warmup` time steps for chunks ≥ 1 so their rolling state is primed.
+    """
+    return textwrap.dedent(f"""
+gpu.module @kungpu_kernels {{
+  kunir.func @sum_window(%a: !kunir.ts<f32, inf>, %b: !kunir.ts<f32, inf>)
+      inputs {{%a = "a", %b = "b"}}
+      outputs {{"out"}}
+      target {{occupancy = 1, warps_per_cta = {warps_per_cta}, smem_size = {smem_size}, vector_size = 1}} unreliable_count = {N}
+      -> !kunir.ts<f32, 1> {{
+    %c = kunir.add %a, %b : !kunir.ts<f32, inf>, !kunir.ts<f32, inf>
+    %w = kunir.windowed_output %c [length = {N}] : !kunir.ts<f32, 1> -> !kunir.ts<f32, {N}>
+    %total = kunir.for_each_back_window
+        (%w : !kunir.ts<f32, {N}>) [window = {N}]
+        (%cur : !kunir.ts<f32, 1>)
+        -> (!kunir.ts<f32, 1>) {{
+      %s = kunir.reduce_add %cur : !kunir.ts<f32, 1>
+      kunir.yield %s : !kunir.ts<f32, 1>
+    }}
+    kunir.return %total : !kunir.ts<f32, 1>
+  }}
+}}
+""").strip()
+
+
+def reference_sum_window(a: np.ndarray, b: np.ndarray, N: int) -> np.ndarray:
+    """CPU rolling sum of (a + b) over the last N rows along axis 0.
+
+    Rows t < N-1 have no full window and are filled with NaN (every row
+    when T < N); callers skip them when comparing.  Returns float32 (T, S).
+    """
+    c = a + b
+    T, S = c.shape
+    out = np.full((T, S), np.nan, dtype=np.float32)
+    if T >= N:   # otherwise no row has a full window and out[N - 1] is invalid
+        cumsum = np.cumsum(c, axis=0, dtype=np.float64)  # higher-precision ref
+        out[N - 1] = cumsum[N - 1]
+        out[N:] = cumsum[N:] - cumsum[:-N]   # empty (no-op) when T == N
+    return out
+
+
+def assert_planning(N: int, warps_per_cta: int, smem_size: int,
+                     expected: str) -> None:
+    """Raise AssertionError unless a window of N lands in `expected`
+    ("smem" or "local") under the per-buffer byte formula used below."""
+    bytes_per_buf = N * (warps_per_cta * 32) * 1 * 4   # vector_size=1, f32
+    actual = "local" if bytes_per_buf > smem_size else "smem"
+    if actual == expected:
+        return
+    raise AssertionError(
+        f"N={N} ({bytes_per_buf} bytes) would land in '{actual}', "
+        f"but the test wanted '{expected}' (smem budget {smem_size}).")
+
+
+def run_one(N: int, expected_placement: str, target: str,
+              warps_per_cta: int = 4, smem_size: int = 49152,
+              T: int = 64, S: int = 2048) -> int:
+    """Build, run and check sum_window for one N; return 0 if ok else 1."""
+    from KunQuant.jit import KunMLIR
+    from KunQuant.jit.cuda import find_cuda_toolkit
+    print(f"=== N = {N}  ({expected_placement} temp buffer) ===")
+    assert_planning(N, warps_per_cta, smem_size, expected_placement)
+
+    ir = build_ir(N, warps_per_cta=warps_per_cta, smem_size=smem_size)
+    mod = KunMLIR.parse(ir)
+
+    exe = KunMLIR.compile(mod,
+                            graph_inputs=["a", "b"],
+                            graph_outputs=["out"],
+                            gpu_arch=target, opt_level=3,
+                            toolkit_path=find_cuda_toolkit())
+    print(f"  kernels={exe.kernel_names}  warps_per_cta={exe.warps_per_cta}  "
+           f"vector_size={exe.vector_size}  cubin={len(exe.cubin)} bytes")
+
+    import cupy as cp
+    # Random input.  T must be > N so we have at least one valid window.
+    if T <= N:
+        T = N + 32
+    rng = np.random.default_rng(0)
+    a_h = rng.standard_normal((T, S), dtype=np.float32)
+    b_h = rng.standard_normal((T, S), dtype=np.float32)
+    a   = cp.asarray(a_h)
+    b   = cp.asarray(b_h)
+    out = cp.zeros((T, S), dtype=cp.float32)
+
+    executor = KunMLIR.Executor()
+    executor.runGraph(exe,
+                      inputs={"a": a, "b": b},
+                      outputs={"out": out})
+    out_h = cp.asnumpy(out)            # implicitly waits via stream 0
+
+    expected = reference_sum_window(a_h, b_h, N)
+
+    # Only the t >= N-1 region is well-defined.
+    diff = np.abs(out_h[N - 1:] - expected[N - 1:])
+    max_abs = float(diff.max())
+    # Tolerance scales with N: each output is a sum of N IID N(0,1)
+    # samples, so its magnitude is ~sqrt(N), and float32 ULP-style error
+    # accumulates roughly like N * eps.
+    atol = max(1e-3, 5e-7 * N)
+    if max_abs > atol:
+        idx = np.unravel_index(diff.argmax(), diff.shape)
+        print(f"  FAIL: max |Δ| = {max_abs:.3e} > {atol:.0e} at "
+               f"{idx} (out_h={out_h[N-1:][idx]:.6g} vs "
+               f"expected={expected[N-1:][idx]:.6g})", file=sys.stderr)
+        return 1
+    print(f"  ok — max |Δ| = {max_abs:.3e} (atol={atol:.0e}, "
+           f"shape={(T - N + 1, S)} validated cells)")
+    return 0
+
+
+def main() -> int:
+    """Run the N=5 (smem) and N=200 (local) cases; return their OR-ed codes."""
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--target", default=None)
+    ap.add_argument("-T", "--time-length", type=int, default=64)
+    ap.add_argument("-S", "--num-stocks", type=int, default=2048)
+    args = ap.parse_args()
+    from KunQuant.jit.env import get_cuda_compute_capability
+    args.target = args.target or get_cuda_compute_capability()
+
+    import cupy as cp
+    cp.cuda.Device(0).use()
+    _ = cp.zeros((1,), dtype=cp.float32)
+
+    rc = 0
+    rc |= run_one(N=5,   expected_placement="smem",
+                   target=args.target, T=args.time_length, S=args.num_stocks)
+    print()
+    rc |= run_one(N=200, expected_placement="local",
+                   target=args.target,
+                   T=max(args.time_length, 256), S=args.num_stocks)
+    return rc
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/mlir/test/python/utils.py b/mlir/test/python/utils.py
new file mode 100644
index 0000000..603f82e
--- /dev/null
+++ b/mlir/test/python/utils.py
@@ -0,0 +1,21 @@
+from __future__ import annotations
+
+from typing import Optional, Tuple
+
+from KunQuant.jit.env import get_cuda_compute_capability
+
+
+def resolve_cuda_compute_capability(explicit_target: Optional[str] = None,
+                                    fallback: str = "sm_80"
+                                    ) -> Tuple[str, bool]:
+    """Pick the GPU architecture string for a test and report device presence.
+
+    Returns `(gpu_arch, has_device)`.  `has_device` is False when
+    `get_cuda_compute_capability()` raises RuntimeError; `gpu_arch` is then
+    `explicit_target or fallback`, otherwise `explicit_target or detected`.
+    """
+    try:
+        detected = get_cuda_compute_capability()
+    except RuntimeError:
+        return explicit_target or fallback, False
+    return explicit_target or detected, True
diff --git a/python/kunquant_mlir/pyproject.toml b/python/kunquant_mlir/pyproject.toml
new file mode 100644
index 0000000..4a26837
--- /dev/null
+++ b/python/kunquant_mlir/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=69", "wheel", "cmake>=3.18"]
+build-backend = "setuptools.build_meta"
diff --git a/python/kunquant_mlir/setup.py b/python/kunquant_mlir/setup.py
new file mode 100644
index 0000000..46d63c9
--- /dev/null
+++ b/python/kunquant_mlir/setup.py
@@ -0,0 +1,142 @@
+import datetime
+import os
+import platform
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+from setuptools import Extension, setup
+from setuptools.command.build_ext import build_ext
+
+
+# Lowest (major, minor) interpreter version for which the stable ABI is used.
+_STABLE_ABI_MIN = (3, 12)
+# The limited API is requested only on CPython at or above that version.
+_HAS_STABLE_ABI = (
+    platform.python_implementation() == "CPython"
+    and sys.version_info >= _STABLE_ABI_MIN
+)
+
+# Directory containing this setup.py, and the directory two levels above it.
+_PKG_ROOT = Path(__file__).resolve().parent
+_REPO_ROOT = _PKG_ROOT.parent.parent
+# Base version string; an optional suffix is appended to it further down.
+_VERSION_BASE = "0.1.10"
+
+
+class CMakeExtension(Extension):
+    """Extension with no source list; it only records a CMake source directory."""
+
+    def __init__(self, name: str, sourcedir: Path):
+        # An empty `sources` list: the custom build_ext command drives cmake
+        # instead of compiling files itself.
+        Extension.__init__(
+            self,
+            name,
+            sources=[],
+            py_limited_api=_HAS_STABLE_ABI,
+        )
+        # Stored as a string because it is placed on the cmake command line.
+        self.sourcedir = str(sourcedir)
+
+
+class CMakeBuildExtension(build_ext):
+    """`build_ext` command that configures and builds the `KunMLIR` target via cmake."""
+
+    def build_extension(self, ext):
+        """Run `cmake` configure followed by `cmake --build` for `ext`.
+
+        Environment variables read here: `KUN_BUILD_TYPE`, `KUN_SANITIZER`,
+        `KUN_NO_AVX2`, `LLVM_DIR`, `MLIR_DIR`, `CUDAToolkit_ROOT`,
+        `CMAKE_CUDA_COMPILER`, `LLVM_EXTERNAL_LIT` and `CMAKE_GENERATOR`.
+        `PLAT` is removed from the process environment before cmake runs.
+
+        Raises:
+            subprocess.CalledProcessError: if either cmake invocation fails.
+        """
+        output_dir = Path(self.get_ext_fullpath(ext.name)).resolve().parent
+        work_dir = Path(self.build_temp).resolve()
+        for directory in (work_dir, output_dir):
+            directory.mkdir(parents=True, exist_ok=True)
+
+        config = os.environ.get("KUN_BUILD_TYPE", "Release")
+        interpreter = sys.executable
+
+        def env_switch(name):
+            # Any value other than "0" (including an empty one) turns the flag ON.
+            return "ON" if os.environ.get(name, "0") != "0" else "OFF"
+
+        configure_cmd = [
+            "cmake", "-S", ext.sourcedir, "-B", str(work_dir),
+            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={output_dir}",
+            f"-DKUN_MLIR_PYTHON_PACKAGE_DIR={output_dir}",
+            "-DKUN_BUILD_CPU_RUNNER=OFF",
+            "-DKUN_BUILD_MLIR=ON",
+            # Both spellings are passed to the configure step.
+            f"-DPython_EXECUTABLE={interpreter}",
+            f"-DPYTHON_EXECUTABLE={interpreter}",
+            f"-DCMAKE_BUILD_TYPE={config}",
+            "-DKUN_SANITIZER=" + env_switch("KUN_SANITIZER"),
+            "-DKUN_NO_AVX2=" + env_switch("KUN_NO_AVX2"),
+        ]
+
+        # Forward toolchain locations verbatim when they are set and non-empty.
+        forwarded = (
+            "LLVM_DIR",
+            "MLIR_DIR",
+            "CUDAToolkit_ROOT",
+            "CMAKE_CUDA_COMPILER",
+            "LLVM_EXTERNAL_LIT",
+        )
+        configure_cmd += [
+            f"-D{name}={os.environ[name]}"
+            for name in forwarded
+            if os.environ.get(name)
+        ]
+
+        # Prefer Ninja unless the user already selected a generator.
+        if not os.environ.get("CMAKE_GENERATOR") and shutil.which("ninja"):
+            configure_cmd += ["-G", "Ninja"]
+
+        os.environ.pop("PLAT", None)
+
+        subprocess.check_call(configure_cmd)
+
+        build_cmd = ["cmake", "--build", str(work_dir), "--target", "KunMLIR"]
+        if platform.system() == "Windows":
+            build_cmd.extend(["--config", config])
+        else:
+            build_cmd.append("--parallel")
+        subprocess.check_call(build_cmd)
+
+
+try:
+    from setuptools.command.bdist_wheel import bdist_wheel
+except ImportError:
+    from wheel.bdist_wheel import bdist_wheel  # type: ignore
+
+
+class BdistWheelABI3(bdist_wheel):
+    """`bdist_wheel` variant that sets the limited-API tag when the stable ABI is in use."""
+
+    def finalize_options(self):
+        super().finalize_options()
+        if not _HAS_STABLE_ABI:
+            # Leave the options as finalized by the base command.
+            return
+        major, minor = _STABLE_ABI_MIN
+        self.py_limited_api = f"cp{major}{minor}"
+        self.root_is_pure = False
+
+
+# When KUN_USE_GIT_VERSION is set to anything but "0", append today's date
+# (".YYYYMMDD") to the base version; otherwise the suffix is empty.
+git_ver = (
+    "." + datetime.datetime.now().strftime("%Y%m%d")
+    if os.environ.get("KUN_USE_GIT_VERSION", "0") != "0"
+    else ""
+)
+
+version = f"{_VERSION_BASE}{git_ver}"
+# Location of the KunQuantMLIR package, expressed relative to this directory.
+package_dir = os.path.relpath(_REPO_ROOT / "KunQuantMLIR", _PKG_ROOT)
+
+
+# Package metadata and build wiring for the KunQuant-MLIR distribution.
+setup(
+    name="KunQuant-MLIR",
+    version=version,
+    description="Optional MLIR/CUDA backend for KunQuant",
+    # The long description is read from Readme.md under _REPO_ROOT.
+    long_description=(_REPO_ROOT / "Readme.md").read_text(encoding="utf-8"),
+    long_description_content_type="text/markdown",
+    author="Menooker",
+    author_email="menooker@live.com",
+    packages=["KunQuantMLIR"],
+    # The package sources live outside this directory; see `package_dir` above.
+    package_dir={"KunQuantMLIR": package_dir},
+    # Ship shared-library files found in the package directory.
+    package_data={"KunQuantMLIR": ["*.so", "*.pyd", "*.dll", "*.dylib"]},
+    include_package_data=True,
+    ext_modules=[
+        # The CMake source directory is _REPO_ROOT.
+        CMakeExtension("KunQuantMLIR.KunMLIR", _REPO_ROOT),
+    ],
+    cmdclass={
+        "build_ext": CMakeBuildExtension,
+        "bdist_wheel": BdistWheelABI3,
+    },
+    python_requires=">=3.9",
+    install_requires=[
+        # Pinned to exactly the same version string as this package.
+        f"KunQuant=={version}",
+    ],
+    zip_safe=False,
+)
diff --git a/tests/KunTestUtil/ref_alpha101.py b/tests/KunTestUtil/ref_alpha101.py
index 41f6970..965b337 100644
--- a/tests/KunTestUtil/ref_alpha101.py
+++ b/tests/KunTestUtil/ref_alpha101.py
@@ -193,7 +193,8 @@ def decay_linear(df, period=10):
     # The backtest engine should assure to be snooping bias free.
     for row in range(period - 1, df.shape[0]):
         x = na_series[row - period + 1: row + 1, :]
-        na_lwma[row, :] = (np.dot(x.T, y))
+        with np.errstate(invalid="ignore"):
+            na_lwma[row, :] = np.dot(x.T, y)
     return pd.DataFrame(na_lwma, index=df.index, columns=df.columns)  
 # endregion
 
@@ -834,5 +835,3 @@ def alpha099(self):
     # Alpha#101	 ((close - open) / ((high - low) + .001))
     def alpha101(self):
         return (self.close - self.open) /((self.high - self.low) + 0.001)
-     
-     
\ No newline at end of file
diff --git a/tests/test.py b/tests/test.py
index f545f47..7302551 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -166,7 +166,123 @@ def check_tempwindow_elim():
 v5 = ForeachBackWindow@{window:10}(v2)
 v6 = ReduceAdd@(v5)
 v7 = Output@{name:}(v4)
-v8 = Output@{name:}(v6)''')        
+v8 = Output@{name:}(v6)''')
+
+    # case 4, Output wraps a WindowedTempOutput directly.  The pre-pass
+    # peels it off; the resulting Output(Mul) then triggers the existing
+    # WindowedTempOutput → Output fold, leaving the loop reading the
+    # Output as a windowed source.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 10)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 10))
+        Output(wto, "xport")
+        Output(v1, "reduced")
+    f = Function(builder.ops)
+    temp_window_elim(f)
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = Output@{name:xport}(v1)
+v3 = ForeachBackWindow@{window:10}(v2)
+v4 = ReduceAdd@(v3)
+v5 = Output@{name:reduced}(v4)''')
+
+    # case 5, a WindowedTempOutput shared by both an Output and multiple
+    # windowed consumers in the same function.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 31)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 30))
+        v2 = ReduceAdd(ForeachBackWindow(wto, 20))
+        Output(wto, "xport")
+        Output(v1, "r30")
+        Output(v2, "r20")
+    f = Function(builder.ops)
+    temp_window_elim(f)
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = Output@{name:xport}(v1)
+v3 = ForeachBackWindow@{window:30}(v2)
+v4 = ReduceAdd@(v3)
+v5 = ForeachBackWindow@{window:20}(v2)
+v6 = ReduceAdd@(v5)
+v7 = Output@{name:r30}(v4)
+v8 = Output@{name:r20}(v6)''')
+
+    # case 6, when time slicing is allowed, keep a local temp window
+    # instead of reading history from the output buffer.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 10)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 10))
+        Output(sq, "xport")
+        Output(v1, "reduced")
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = WindowedTempOutput@{window:10}(v1)
+v3 = ForeachBackWindow@{window:10}(v2)
+v4 = ReduceAdd@(v3)
+v5 = Output@{name:xport}(v1)
+v6 = Output@{name:reduced}(v4)''')
+
+    # case 7, may_slice_time still allows Input and larger-temp-window
+    # replacement; only Output replacement is disabled.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        wto = WindowedTempOutput(inp, 10)
+        v1 = ReduceAdd(ForeachBackWindow(wto, 10))
+        Output(v1)
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = ForeachBackWindow@{window:10}(v0)
+v2 = ReduceAdd@(v1)
+v3 = Output@{name:}(v2)''')
+
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto10 = WindowedTempOutput(sq, 10)
+        wto15 = WindowedTempOutput(sq, 15)
+        v1 = ReduceAdd(ForeachBackWindow(wto10, 10))
+        v2 = ReduceAdd(ForeachBackWindow(wto15, 10))
+        Output(sq, "xport")
+        Output(v1, "r10")
+        Output(v2, "r15")
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    for op in f.ops:
+        if isinstance(op, ForeachBackWindow):
+            if isinstance(op.inputs[0], Output):
+                raise RuntimeError("may_slice_time replaced temp with Output")
+            if not isinstance(op.inputs[0], WindowedTempOutput):
+                raise RuntimeError("larger temp window replacement failed")
+
+    # case 8, if no windowed op consumes the temp window, it can be
+    # replaced by its input even with may_slice_time enabled.
+    builder = Builder()
+    with builder:
+        inp = Input("a")
+        sq = Mul(inp, inp)
+        wto = WindowedTempOutput(sq, 10)
+        v1 = AddConst(wto, 1)
+        Output(v1, "out")
+    f = Function(builder.ops)
+    temp_window_elim(f, {"may_slice_time": True})
+    expect_output(f, '''v0 = Input@{name:a}()
+v1 = Mul@(v0,v0)
+v2 = AddConst@{value:1}(v1)
+v3 = Output@{name:out}(v2)''')
 
 def check_window():
     # case 1, temp window on input
@@ -505,4 +621,4 @@ def check_pow():
     check_mergeLoop()
     check_toposort()
     check_duplicate_rank_out()
-    check_duplicate_rank_in()
\ No newline at end of file
+    check_duplicate_rank_in()
diff --git a/tests/test2.py b/tests/test2.py
index 24816da..599f1b3 100644
--- a/tests/test2.py
+++ b/tests/test2.py
@@ -3,6 +3,7 @@
 from KunQuant.ops import *
 import KunQuant.passes
 from KunQuant.passes import *
+from KunQuant.Driver import post_optimize
 
 def optimize(f: Function):
     decompose(f)
@@ -115,6 +116,45 @@ def test_partition_rank_out():
 v3 = Output@{name:out2}(v2)''']
     check_partition(f, exp1, exp2)
 
+def test_partition_wto_input_peel():
+    # Regression test for the WindowedTempOutput(Input) peel in the partitioner.
+    #
+    # The graph has one shared value `Mul(a, b)` with two groups of users:
+    #   * eight AddConst -> Output pairs, and
+    #   * one WindowedTempOutput feeding five FastWindowedSum -> Output pairs.
+    # The many AddConst/Output pairs are meant to split the producer of the
+    # shared value into a different partition than the FastWindowedSum users:
+    #   partition 1:
+    #     a = Input("xxx")            # partition temp input
+    #     b = WindowedTempOutput(a)
+    #     c = use(b)
+    #   partition 2:
+    #     d = use(b)                  # cross-partition use
+    # Without the peel, the consuming partition rewires the
+    # WindowedTempOutput's input to a local synthetic Input, and the
+    # post-partition `temp_window_elim` then folds WindowedTempOutput(Input)
+    # into Input, leaving a degenerate `Output(Input)` passthrough.  The
+    # WindowedTempOutput ends up wired to an Output op of partition 1, which
+    # is bad for performance.
+    builder = Builder()
+    with builder:
+        lhs = Input("a")
+        rhs = Input("b")
+        prod = Mul(lhs, rhs)
+        for idx in range(8):
+            Output(AddConst(prod, float(idx)), f"add_{idx}")
+        windowed = WindowedTempOutput(prod, 30)
+        for idx in range(5):
+            Output(FastWindowedSum(windowed, 5 + idx * 4), f"fbs_{idx}")
+    func = Function(builder.ops)
+    optimize(func)
+    _, partitions = do_partition(func, 1)
+    post_optimize(partitions, {})
+    # Find the first Output that reads directly from an Input, if any.
+    offender = next(
+        ((part, op)
+         for part in partitions
+         for op in part.ops
+         if isinstance(op, Output) and isinstance(op.inputs[0], Input)),
+        None,
+    )
+    if offender is not None:
+        part, op = offender
+        raise RuntimeError(
+            f"partitioner left Output(Input) passthrough in "
+            f"partition {part.name!r}: {op}")
+
+
 test_partition1()
 test_partition_cylic()
-test_partition_rank_out()
\ No newline at end of file
+test_partition_rank_out()
+test_partition_wto_input_peel()
\ No newline at end of file
diff --git a/tests/test_alpha101.py b/tests/test_alpha101.py
index c778672..0f5c8ff 100644
--- a/tests/test_alpha101.py
+++ b/tests/test_alpha101.py
@@ -1,5 +1,7 @@
 from KunQuant.Driver import KunCompilerConfig
 from KunTestUtil import ref_alpha101, gen_data
+import argparse
+import dataclasses
 import numpy as np
 import pandas as pd
 import sys
@@ -10,10 +12,43 @@
 from KunQuant.Stage import Function
 from KunQuant.predefined.Alpha101 import AllData, all_alpha
 from KunQuant.runner import KunRunner as kr
-from KunQuant.jit.env import cpu_arch
+from KunQuant.jit.env import cpu_arch, get_cuda_compute_capability
 
 isx86 = cpu_arch != "aarch64"
 
+# Module-level command-line handling.  `add_help=False` plus
+# `parse_known_args()` below means unrecognised arguments are ignored here
+# rather than rejected.
+_argp = argparse.ArgumentParser(add_help=False)
+# Optional positional action; when omitted a default is derived below.
+_argp.add_argument("action", nargs="?")
+# Empty string disables the GPU path; "auto" queries the device.
+_argp.add_argument("--gpu-arch", default="")
+_argp.add_argument("--benchmode", action="store_true")
+_argp.add_argument("--use-cuda-graph", action="store_true")
+_argp.add_argument("--time", type=int, default=260)
+_argp.add_argument("--num-stocks", type=int, default=64)
+_argp.add_argument("--num-threads", type=int, default=4)
+
+_args, _ = _argp.parse_known_args()
+# Without an explicit action: "run_gpu" if a GPU arch was given, else "avx2".
+action = _args.action or ("run_gpu" if _args.gpu_arch else "avx2")
+if _args.gpu_arch == "auto":
+    GPU_ARCH = get_cuda_compute_capability()
+else:
+    GPU_ARCH = _args.gpu_arch
+# GPU mode is on whenever a non-empty architecture string was resolved.
+GPU_MODE = bool(GPU_ARCH)
+BENCHMODE = _args.benchmode
+USE_CUDA_GRAPH = _args.use_cuda_graph
+TIME = _args.time
+NUM_STOCKS = _args.num_stocks
+NUM_THREADS = _args.num_threads
+if GPU_MODE:
+    # GPU-only dependencies are imported lazily so CPU runs do not need them.
+    import cupy as cp
+    from KunQuant.jit import KunMLIR as _kr_mlir
+    from KunQuant.jit import cuda as _cuda_jit
+    from KunQuantMLIR.OverlapRunner import OverlapRunner
+    # A dedicated non-blocking stream is created only for benchmark runs;
+    # otherwise `None` is handed to the executor.
+    if BENCHMODE:
+        cuda_stream = cp.cuda.Stream(non_blocking=True)
+    else:
+        cuda_stream = None
+    cp.cuda.Device(0).use()
+    # NOTE(review): the result is discarded; presumably this one-element
+    # allocation only serves to initialise CUDA/cupy up front -- confirm.
+    cp.zeros((1,), dtype=cp.float32)
+
 def get_simd_len(avx: str, dtype: str = "float"):
     element_width = 32 if dtype == "float" else 64
     if avx == "avx512":
@@ -130,6 +165,65 @@ def TS_ST(data: np.ndarray) -> np.ndarray:
 def ST_TS(data: np.ndarray) -> np.ndarray:
     return np.ascontiguousarray(data.transpose())
 
+
+def get_output_layout(modu):
+    """Return "TS" in GPU mode, otherwise the module's own `output_layout`."""
+    if GPU_MODE:
+        return "TS"
+    return modu.output_layout
+
+
+def create_single_thread_executor():
+    """Build the executor for single-threaded runs.
+
+    In GPU mode this is an MLIR executor bound to `cuda_stream`; otherwise it
+    is the CPU runner's single-thread executor.
+    """
+    if not GPU_MODE:
+        return kr.createSingleThreadExecutor()
+    return _kr_mlir.Executor(cuda_stream)
+
+
+def create_multi_thread_executor(n):
+    """Build the executor for multi-threaded runs.
+
+    `n` is passed to the CPU runner's multi-thread executor; it is ignored in
+    GPU mode, where an MLIR executor bound to `cuda_stream` is returned.
+    """
+    if not GPU_MODE:
+        return kr.createMultiThreadExecutor(n)
+    return _kr_mlir.Executor(cuda_stream)
+
+GPU_OVERLAP_SLOTS = 3
+
+def warmup_gpu_overlap_runner(overlap_runner, modu, inputs,
+                              cur_time, length, **kwargs):
+    """Submit `GPU_OVERLAP_SLOTS` benchmark-mode runs and wait on the last one.
+
+    Every submission goes through `run_graph` with `benchmode=True`, using
+    the overlap runner's own executor.  Returns the result of `.wait()` on
+    the final submission, or `None` if nothing was submitted.
+    """
+    pending = None
+    for _slot in range(GPU_OVERLAP_SLOTS):
+        pending = run_graph(
+            overlap_runner.executor, True, modu, inputs, cur_time, length,
+            None, overlap_runner=overlap_runner, **kwargs)
+    if pending is None:
+        return None
+    return pending.wait()
+
+def run_graph(executor, benchmode, modu, inputs, cur_time, length, outputs=None,
+              overlap_runner=None, **kwargs):
+    """Run `modu` on the active backend (CPU runner or GPU executor).
+
+    Args:
+        executor: Backend executor used for the CPU path and the
+            non-benchmark GPU path.
+        benchmode: True for timed benchmark iterations.
+        modu: The module to execute.
+        inputs: Mapping from input name to host array.
+        cur_time: Start time index; the GPU path only accepts 0.
+        length: Number of time steps to compute.
+        outputs: Optional mapping of pre-allocated host buffers.
+        overlap_runner: Required for GPU benchmark mode; unused otherwise.
+        **kwargs: Forwarded to the backend run call.  `skip_check` and
+            `num_stocks` are dropped on the GPU path.
+
+    Returns:
+        CPU: whatever `kr.runGraph` returns.  GPU benchmark mode: whatever
+        `overlap_runner.submit` returns.  GPU non-benchmark mode: a dict of
+        host numpy arrays keyed by output name.
+
+    Raises:
+        RuntimeError: on the GPU path when `cur_time != 0`, or when
+            `benchmode` is set without an `overlap_runner`.
+    """
+    if not GPU_MODE:
+        if benchmode:
+            # The timed CPU loops passed skip_check=True before this wrapper
+            # existed; keep doing so unless the caller says otherwise, so the
+            # benchmark does not time the runner's checks.
+            kwargs.setdefault("skip_check", True)
+        return kr.runGraph(executor, modu, inputs, cur_time, length,
+                           outputs if outputs is not None else {}, **kwargs)
+    if cur_time != 0:
+        raise RuntimeError("GPU alpha101 test only supports cur_time=0")
+    # These options are not passed on to the GPU run calls.
+    kwargs.pop("skip_check", None)
+    kwargs.pop("num_stocks", None)
+    if benchmode:
+        if overlap_runner is None:
+            raise RuntimeError("GPU benchmark mode requires overlap_runner")
+        return overlap_runner.submit(
+            inputs,
+            cur_time=cur_time,
+            length=length,
+            use_cuda_graph=USE_CUDA_GRAPH,
+            **kwargs,
+        )
+    # Non-benchmark GPU run: convert inputs with cupy, run, copy results back.
+    gpu_inputs = {k: cp.asarray(v) for k, v in inputs.items()}
+    ret = executor.runGraph(modu, gpu_inputs, cur_time=cur_time,
+                            length=length,
+                            use_cuda_graph=USE_CUDA_GRAPH,
+                            **kwargs)
+    out_np = {}
+    for k, v in ret.items():
+        # Results that are not cupy arrays are converted through DLPack.
+        arr = v if isinstance(v, cp.ndarray) else cp.from_dlpack(v)
+        host = cp.asnumpy(arr)
+        if outputs is not None and k in outputs:
+            # Fill the caller's buffer in place and return that same buffer.
+            outputs[k][...] = host
+            out_np[k] = outputs[k]
+        else:
+            out_np[k] = host
+    return out_np
+
+
 def make_data_and_ref(num_stock, num_time, ischeck, input_ST8t, dtype="float32"):
     rng = np.random.get_state()
     start = time.time()
@@ -263,7 +357,9 @@ def check_result(out, ref, outnames, start_window, num_stock, start_time, num_ti
         cur_rtol = tolerance["rtol"].get(k, rtol)
         cur_atol = tolerance["atol"].get(k, atol)
         check_start = 0
-        if start_time or k in tolerance["skip_head"]:
+        # GPU kernels do not match the CPU/pandas partial-window warmup rows even
+        # when start_time == 0, so the nonzero start_time skip is not enough.
+        if GPU_MODE or start_time or k in tolerance["skip_head"]:
             check_start = start_window[k] + start_time
         v = out[k][:,check_start-start_time:]
         refv = ref[k][check_start:].to_numpy().transpose()
@@ -295,9 +391,12 @@ def check_result(out, ref, outnames, start_window, num_stock, start_time, num_ti
     return done
 
 def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ischeck, start_time):
+    if GPU_MODE and start_time != 0:
+        print(f"[skip on GPU] start_time={start_time}")
+        return True
     # prepare outputs
     outnames = modu.getOutputNames()
-    layout = modu.output_layout
+    layout = get_output_layout(modu)
     outbuffers = dict()
     print(layout)
     if layout == "TS":
@@ -310,15 +409,30 @@ def test(modu, executor, start_window, num_stock, num_time, my_input, ref, ische
     # blocked = TS_STs(inp)
     
     if not ischeck:
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
+        out = run_graph(executor, False, modu, my_input, start_time,
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None)
+        overlap_runner = None
+        if GPU_MODE:
+            overlap_runner = OverlapRunner(
+                modu, executor, num_slots=GPU_OVERLAP_SLOTS)
+            warmup_gpu_overlap_runner(overlap_runner, modu, my_input, start_time,
+                                      num_time-start_time)
         start = time.time()
         for _ in range(20):
-            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
+            out = run_graph(executor, True, modu, my_input, start_time,
+                            num_time-start_time, outbuffers,
+                            overlap_runner=overlap_runner)
+        if GPU_MODE:
+            overlap_runner.synchronize()
+            out = out.wait()
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, num_stocks = num_stock)
+        out = run_graph(executor, False, modu, my_input, start_time,
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None, num_stocks=num_stock)
         end = time.time()
         tdiff = end-start
     print(f"Exec takes: {tdiff:.6f} seconds")
@@ -372,9 +486,12 @@ def streammain(num_stock):
 
 
 def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, ischeck, start_time):
+    if GPU_MODE and start_time != 0:
+        print(f"[skip on GPU] start_time={start_time}")
+        return True
     # prepare outputs
     outnames = modu.getOutputNames()
-    layout = modu.output_layout
+    layout = get_output_layout(modu)
     outbuffers = dict()
     print(layout)
     if layout == "TS":
@@ -386,15 +503,30 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
     # print(ref.alpha001())
     # blocked = TS_STs(inp)
     if not ischeck:
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
+        out = run_graph(executor, False, modu, my_input, start_time,
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None)
+        overlap_runner = None
+        if GPU_MODE:
+            overlap_runner = OverlapRunner(
+                modu, executor, num_slots=GPU_OVERLAP_SLOTS)
+            warmup_gpu_overlap_runner(overlap_runner, modu, my_input, start_time,
+                                      num_time-start_time)
         start = time.time()
         for _ in range(20):
-            out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers, skip_check = True)
+            out = run_graph(executor, True, modu, my_input, start_time,
+                            num_time-start_time, outbuffers,
+                            overlap_runner=overlap_runner)
+        if GPU_MODE:
+            overlap_runner.synchronize()
+            out = out.wait()
         end = time.time()
         tdiff = (end-start)/20
     else:
         start = time.time()
-        out = kr.runGraph(executor, modu, my_input, start_time, num_time-start_time, outbuffers)
+        out = run_graph(executor, False, modu, my_input, start_time,
+                        num_time-start_time, outbuffers,
+                        overlap_runner=None)
         end = time.time()
         tdiff = end-start
     print(f"Exec takes: {tdiff:.6f} seconds")
@@ -411,22 +543,23 @@ def test64(modu, executor, start_window, num_stock, num_time, my_input, ref, isc
 def main(is64: bool, is_check: bool):
     modu = lib.getModule("alpha_101" if not is64 else "alpha_101_double")
     start_window = modu.getOutputUnreliableCount()
-    num_stock = 64
+    num_stock = NUM_STOCKS
     done = True
     testfunc = test64 if is64 else test
-    blocking_num = modu.blocking_len
+    blocking_num = 1 if GPU_MODE else modu.blocking_len
     # fp64 version is compiled with TS format
-    blocking = 0 if is64 else blocking_num
+    blocking = 0 if is64 or GPU_MODE else blocking_num
     def compute():
         nonlocal done
-        num_time = 260
+        num_time = TIME
         my_input, pd_ref = make_data_and_ref(num_stock, num_time, is_check, blocking, "float64" if is64 else "float32")
-        executor = kr.createSingleThreadExecutor()
-        done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
-        done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
-        executor = kr.createMultiThreadExecutor(4)
+        if not GPU_MODE:
+            executor = create_single_thread_executor()
+            done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
+            if not BENCHMODE:
+                done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 50)
+        executor = create_multi_thread_executor(NUM_THREADS)
         done = done & testfunc(modu, executor, start_window, num_stock, num_time, my_input, pd_ref, is_check, 0)
-    num_stock = 64
     compute()
     # skip benchmarking on unaligned mode
     if not is_check:
@@ -439,13 +572,24 @@ def compute():
     if not done:
         exit(1)
 
-action = sys.argv[1]
 def do_compile(avx, keep, tempdir):
     funclist = [
         check_alpha101(avx),
         check_alpha101_stream(avx),
         check_alpha101_double(avx)
         ]
+    if GPU_MODE:
+        gpu_funclist = []
+        for name, f, kcfg in funclist:
+            if name == "alpha_101_stream":
+                continue
+            kcfg = dataclasses.replace(kcfg, input_layout="TS",
+                                       output_layout="TS",
+                                       partition_factor=2,
+                                       blocking_len=1)
+            gpu_funclist.append((name, f, kcfg))
+        ccfg = _cuda_jit.CudaCompilerConfig(gpu_arch=GPU_ARCH)
+        return _cuda_jit.compileit(gpu_funclist, "test", ccfg)
     if avx == "avx512":
         machine = cfake.X64CPUFlags(avx512=True, avx512dq=True, avx512vl=True)
     else:
@@ -460,14 +604,15 @@ def do_compile(avx, keep, tempdir):
     lib = do_compile(action, False, None)
 
 print("Check f64 batch")
-main(True, True)
+main(True, not BENCHMODE)
 print("======================================")
 print("Check f32 batch")
-main(False, True)
-print("======================================")
-print("Check f32 stream")
-streammain(64)
-if action != "run_avx512" and isx86:
+main(False, not BENCHMODE)
+if not GPU_MODE and not BENCHMODE:
+    print("======================================")
+    print("Check f32 stream")
+    streammain(64)
+if not GPU_MODE and action != "run_avx512" and isx86 and not BENCHMODE:
     print("======================================")
     print("Check f32 stream unaligned")
     streammain(63)
diff --git a/tests/test_alpha158.py b/tests/test_alpha158.py
index 943df2a..5b6b11f 100644
--- a/tests/test_alpha158.py
+++ b/tests/test_alpha158.py
@@ -10,11 +10,39 @@
 from KunQuant.Op import Builder, Input, Output
 from KunQuant.Stage import Function
 from KunQuant.predefined.Alpha158 import AllData
-from KunQuant.jit.env import cpu_arch
+from KunQuant.jit.env import cpu_arch, get_cuda_compute_capability
 
 isx86 = cpu_arch != "aarch64"
 
-def check_alpha158(avx512, keep, tempdir):
+
+# Factor families the GPU backend can't compile yet (the underlying op
+# has no kunir lowering).  We filter their Output ops out of the Function
+# before compileit on the GPU path — the rest of alpha158 compiles fine.
+#   QTLU / QTLD → WindowedQuantile → SkipList (CPU-only).
+_GPU_SKIP_FACTOR_PREFIXES = ("QTLU", "QTLD")
+
+
+def _filter_outputs_for_gpu(f: Function) -> None:
+    """Drop every Output op whose name begins with one of
+    `_GPU_SKIP_FACTOR_PREFIXES`, updating `f` in place through `set_ops`.
+
+    Only the Output ops themselves are removed here.  The ops that fed them
+    stay in the list; presumably the later optimization pipeline removes
+    them once they have no remaining user.
+    """
+    kept, dropped = [], []
+    for op in f.ops:
+        if (isinstance(op, Output)
+                and op.attrs.get("name", "").startswith(_GPU_SKIP_FACTOR_PREFIXES)):
+            dropped.append(op.attrs.get("name", ""))
+        else:
+            kept.append(op)
+    if dropped:
+        # Strip trailing digits so each factor family is reported once.
+        families = sorted({n.rstrip('0123456789') for n in dropped})
+        print(f"[gpu] dropping {len(dropped)} unsupported outputs: {families}")
+    f.set_ops(kept)
+
+
+def check_alpha158(avx512, keep, tempdir, gpu_arch=""):
     builder = Builder()
     with builder:
         pack_158 = AllData(low=Input("low"), high=Input("high"), close=Input(
@@ -38,6 +66,17 @@ def check_alpha158(avx512, keep, tempdir):
             Output(v, k)
     print("Total names: ", len(names))
     f = Function(builder.ops)
+    if gpu_arch:
+        _filter_outputs_for_gpu(f)
+        from KunQuant.jit import cuda as _cuda_jit
+        target = [("alpha158", f, KunCompilerConfig(
+            dtype='double', blocking_len=1, partition_factor=2,
+            output_layout="TS", input_layout="TS",
+            options={"opt_reduce": True, "fast_log": True,
+                     'no_fast_stat': 'no_warn'}))]
+        ccfg = _cuda_jit.CudaCompilerConfig(gpu_arch=gpu_arch)
+        return _cuda_jit.compileit(target, "testalpha158", ccfg)
+
     if avx512:
         simd_len = 8
     elif isx86:
@@ -45,7 +84,8 @@ def check_alpha158(avx512, keep, tempdir):
     else:
         simd_len = 2
     target = [("alpha158", f, KunCompilerConfig(dtype='double', blocking_len=simd_len, partition_factor=4,
-               output_layout="TS", input_layout="TS", options={"opt_reduce": True, "fast_log": True}))]
+               output_layout="TS", input_layout="TS", options={"opt_reduce": True, "fast_log": True,
+                                                                'no_fast_stat': 'no_warn'}))]
     if avx512:
         machine = cfake.X64CPUFlags(avx512=True, avx512dq=True, avx512vl=True)
     else:
@@ -65,28 +105,68 @@ def ST_TS(data: np.ndarray) -> np.ndarray:
     return np.ascontiguousarray(data.transpose()).astype('float64')
 
 
-def test(lib: kr.Library, inputs: Dict[str, np.ndarray], ref: Dict[str, np.ndarray]):
-    rtol = 1e-4
-    atol = 1e-5
-    modu = lib.getModule("alpha158")
-    start_window = modu.getOutputUnreliableCount()
-    num_stock = 8
-    num_time = 260
-    outnames = modu.getOutputNames()
-    print("Total num alphas", len(outnames))
-    executor = kr.createMultiThreadExecutor(8)
-    my_input = {"high": ST_TS(inputs['dhigh']), "low": ST_TS(inputs['dlow']), "close": ST_TS(inputs['dclose']),
-                "open": ST_TS(inputs['dopen']), "volume": ST_TS(inputs['dvol']), "amount": ST_TS(inputs['damount'])}
-    outbuffers = dict()
-    # Factors, Time, Stock
-    sharedbuf = np.empty((len(outnames), num_time, num_stock), dtype="float64")
-    sharedbuf[:] = np.nan
-    for idx, name in enumerate(outnames):
-        outbuffers[name] = sharedbuf[idx]
-    start = time.time()
-    out = kr.runGraph(executor, modu, my_input, 0, num_time, outbuffers)
-    end = time.time()
-    print(f"Exec takes: {end-start:.6f} seconds")
+# ── Backend shims ───────────────────────────────────────────────────
+#
+# CPU and GPU have the same conceptual flow (prepare → execute → fetch),
+# they differ only in the runtime calls and where the buffers live.
+# Wrap each backend in a tiny object exposing the three methods so the
+# `test()` body stays single-source.
+
+class _CpuBackend:
+    """CPU backend shim exposing `prepare_input` / `execute` / `fetch_output`.
+
+    The time and stock extents are constructor parameters stored on the
+    instance, so the class does not depend on module-level `num_time` /
+    `num_stock` names being defined.  The defaults (260 time steps,
+    8 stocks) are the values the test used before the backend shims were
+    introduced.
+    """
+
+    def __init__(self, lib: kr.Library, modname: str,
+                 num_time: int = 260, num_stock: int = 8):
+        self.modu = lib.getModule(modname)
+        self.start_window = self.modu.getOutputUnreliableCount()
+        self.outnames = self.modu.getOutputNames()
+        self.executor = kr.createMultiThreadExecutor(8)
+        self.num_time = num_time
+        self.num_stock = num_stock
+        # Pre-allocated NaN-filled output buffers; the CPU runtime writes
+        # in place and we hand the same dict back to `_compare`.
+        # One shared (factor, time, stock) array; each output is a view of it.
+        self._outbuffers = {}
+        sharedbuf = np.empty((len(self.outnames), num_time, num_stock),
+                              dtype="float64")
+        sharedbuf[:] = np.nan
+        for idx, name in enumerate(self.outnames):
+            self._outbuffers[name] = sharedbuf[idx]
+
+    def prepare_input(self, host_input):
+        """Return `host_input` unchanged; the CPU runner takes host arrays."""
+        return host_input
+
+    def execute(self, inputs):
+        """Run the module from time 0 for `self.num_time` steps into the
+        pre-allocated output buffers."""
+        kr.runGraph(self.executor, self.modu, inputs, 0, self.num_time,
+                     self._outbuffers)
+
+    def fetch_output(self):
+        """Return the dict of output buffers keyed by output name."""
+        return self._outbuffers
+
+
+class _GpuBackend:
+    """GPU backend shim exposing `prepare_input` / `execute` / `fetch_output`."""
+
+    def __init__(self, lib, modname: str):
+        # GPU-only dependencies are imported here so that importing this
+        # module does not require them.
+        import cupy as cp
+        from KunQuant.jit import KunMLIR as _kr_mlir
+        self._cp = cp
+        module = lib.getModule(modname)
+        self.modu = module
+        self.start_window = module.getOutputUnreliableCount()
+        self.outnames = module.output_names
+        self.executor = _kr_mlir.Executor()
+        # Result of the most recent `execute` call; None until then.
+        self._raw = None
+
+    def prepare_input(self, host_input):
+        """Convert every host input array with `cupy.asarray`."""
+        to_device = self._cp.asarray
+        return {name: to_device(arr) for name, arr in host_input.items()}
+
+    def execute(self, inputs):
+        """Run the module, keep the raw result, then synchronize the executor."""
+        raw = self.executor.runGraph(self.modu, inputs)
+        self._raw = raw
+        self.executor.synchronize()
+
+    def fetch_output(self):
+        """Return the last run's outputs as host numpy arrays keyed by name."""
+        cp = self._cp
+
+        def to_host(value):
+            # Values that are not cupy arrays are converted through DLPack.
+            device_arr = value if isinstance(value, cp.ndarray) else cp.from_dlpack(value)
+            return cp.asnumpy(device_arr)
+
+        return {name: to_host(self._raw[name]) for name in self.outnames}
+
+
+def _compare(outbuffers, ref, start_window, rtol, atol):
     for k, v in outbuffers.items():
         s = start_window[k]
         if not np.allclose(v[s:], ref[k][s:], rtol=rtol, atol=atol, equal_nan=True):
@@ -102,6 +182,22 @@ def test(lib: kr.Library, inputs: Dict[str, np.ndarray], ref: Dict[str, np.ndarr
                     exit(1)
 
 
+def test(backend, inputs: Dict[str, np.ndarray],
+          ref: Dict[str, np.ndarray]) -> None:
+    """Run alpha158 on `backend`, time `execute`, then check against `ref`."""
+    print("Total num alphas", len(backend.outnames))
+    # graph input name -> key of the raw array in `inputs`
+    sources = {"high": 'dhigh', "low": 'dlow', "close": 'dclose',
+               "open": 'dopen', "volume": 'dvol', "amount": 'damount'}
+    host_input = {name: ST_TS(inputs[key]) for name, key in sources.items()}
+    be_input = backend.prepare_input(host_input)
+    # Only `execute` is timed; input preparation and output fetch are not.
+    t0 = time.time()
+    backend.execute(be_input)
+    print(f"Exec takes: {time.time()-t0:.6f} seconds")
+    _compare(backend.fetch_output(), ref, backend.start_window, 1e-4, 1e-5)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         prog="Run and check alpha158 again pre-computed result")
@@ -110,15 +206,30 @@ def test(lib: kr.Library, inputs: Dict[str, np.ndarray], ref: Dict[str, np.ndarr
     parser.add_argument("--ref", required=True, type=str,
                         help="The path to the reference output npz file")
     parser.add_argument("--action", required=True, type=str,
-                        help="The path to the reference output npz file")
+                        help="One of: compile_avx512, run_avx512, run_native, run_gpu")
+    parser.add_argument("--gpu-arch", default="sm_80", type=str,
+                        help="GPU compute capability for --action=run_gpu (e.g. sm_80, or auto)")
     args = parser.parse_args()
     if args.action == "compile_avx512":
         check_alpha158(True, True, "./build")
         exit(0)
     elif args.action == "run_avx512":
         lib = kr.Library.load(os.path.join("./build/testalpha158", "testalpha158.so"))
+        inp, ref = load(args.inputs, args.ref)
+        test(_CpuBackend(lib, "alpha158"), inp, ref)
+    elif args.action == "run_gpu":
+        # Touch the cupy allocator before compileit so the primary CUDA
+        # context exists when KunMLIR.compile inherits it.
+        import cupy as cp
+        cp.cuda.Device(0).use()
+        cp.zeros((1,), dtype=cp.float64)
+        if args.gpu_arch == "auto":
+            args.gpu_arch = get_cuda_compute_capability()
+        lib = check_alpha158(False, False, None, gpu_arch=args.gpu_arch)
+        inp, ref = load(args.inputs, args.ref)
+        test(_GpuBackend(lib, "alpha158"), inp, ref)
     else:
         lib = check_alpha158(False, False, None)
-    inp, ref = load(args.inputs, args.ref)
-    test(lib, inp, ref)
+        inp, ref = load(args.inputs, args.ref)
+        test(_CpuBackend(lib, "alpha158"), inp, ref)
     print("done")
diff --git a/tests/test_runtime.py b/tests/test_runtime.py
index de7c9d9..f1f5f99 100644
--- a/tests/test_runtime.py
+++ b/tests/test_runtime.py
@@ -1,4 +1,6 @@
 from KunQuant.Driver import KunCompilerConfig
+import argparse
+import dataclasses
 import numpy as np
 import pandas as pd
 import sys
@@ -12,7 +14,193 @@
 from KunQuant.predefined.Alpha101 import *
 from KunQuant.runner import KunRunner as kr
 import sys
-from KunQuant.jit.env import cpu_arch
+from KunQuant.jit.env import cpu_arch, get_cuda_compute_capability
+
+
+# ── Backend dispatch (CPU vs GPU) ────────────────────────────────────
+#
+# `--gpu-arch sm_XX` (or `auto`) flips us into GPU mode: the compile /
+# executor / runGraph wrappers below then go through KunMLIR /
+# KunQuant.jit.cuda instead of cfake / KunRunner.  Tests that the GPU
+# backend can't run yet (streaming, custom cross-sectional C++,
+# aggregrate/corrwith helpers, Library.load) are skipped — see
+# `_GPU_SKIP_TESTS` below.
+_argp = argparse.ArgumentParser()
+_argp.add_argument("--gpu-arch", default="",
+                    help="GPU compute capability (e.g. sm_80) or 'auto'.  Empty = CPU.")
+_args, _ = _argp.parse_known_args()  # unrecognized argv entries are returned, not rejected
+if _args.gpu_arch == "auto":
+    GPU_ARCH = get_cuda_compute_capability()
+else:
+    GPU_ARCH = _args.gpu_arch
+GPU_MODE = bool(GPU_ARCH)  # a truthy GPU_ARCH selects the GPU backend
+
+if GPU_MODE:
+    import cupy as cp
+    from KunQuant.jit import KunMLIR as _kr_mlir
+    from KunQuant.jit import cuda as _cuda_jit
+    # KunMLIR.compile + cuMemAlloc inherit the calling thread's primary
+    # CUDA context — touch a cupy allocator early so it exists.
+    cp.cuda.Device(0).use()
+    cp.zeros((1,), dtype=cp.float32)
+
+
+def compileit(funclist, libname):
+    """Compile `funclist` into library `libname` with the active backend.
+
+    CPU mode hands the list unchanged to `cfake.compileit` with a
+    `CppCompilerConfig` built from `get_compiler_flags()`.  GPU codegen
+    only supports `TS`, so in GPU mode any entry with an `STs` input or
+    output layout gets BOTH layouts set to `TS` before `cuda.compileit`
+    runs (this file's `runGraph` wrapper un-/re-blocks the arrays)."""
+    if not GPU_MODE:
+        cpu_cfg = cfake.CppCompilerConfig(machine=get_compiler_flags())
+        return cfake.compileit(funclist, libname, cpu_cfg)
+
+    def _as_ts(cfg):
+        if "STs" not in (cfg.input_layout, cfg.output_layout):
+            return cfg
+        return dataclasses.replace(cfg, input_layout="TS", output_layout="TS")
+
+    ts_funcs = [(name, fn, _as_ts(cfg)) for name, fn, cfg in funclist]
+    cuda_cfg = _cuda_jit.CudaCompilerConfig(gpu_arch=GPU_ARCH)
+    return _cuda_jit.compileit(ts_funcs, libname, cuda_cfg)
+
+
+def createSingleThreadExecutor():
+    # GPU mode builds a KunMLIR executor; CPU mode defers to KunRunner.
+    make = _kr_mlir.Executor if GPU_MODE else kr.createSingleThreadExecutor
+    return make()
+
+
+def createMultiThreadExecutor(n):
+    if not GPU_MODE:
+        return kr.createMultiThreadExecutor(n)
+    return _kr_mlir.Executor()  # GPU mode ignores `n`
+
+
+def _sts_unblock(blocked: np.ndarray) -> np.ndarray:
+    """Convert an STs-blocked array `(S/blocking, T, blocking)` into a
+    C-contiguous TS array `(T, S)`.  The time axis is moved to the
+    front, then the stock-block and lane axes are merged into a single
+    stock axis of length `(S/blocking) * blocking`."""
+    n_blocks, n_time, lanes = blocked.shape
+    time_major = np.moveaxis(blocked, 1, 0)
+    return np.ascontiguousarray(time_major.reshape((n_time, n_blocks * lanes)))
+
+
+def _sts_reblock(flat: np.ndarray, blocking: int) -> np.ndarray:
+    """Convert a TS array `(T, S)` into a C-contiguous STs-blocked array
+    `(S/blocking, T, blocking)` — the inverse of `_sts_unblock`.  The
+    caller supplies `blocking` (the lane count used when the input was
+    blocked); the reshape requires `S` to be a multiple of it."""
+    n_time, n_stock = flat.shape
+    split = flat.reshape((n_time, n_stock // blocking, blocking))
+    return np.ascontiguousarray(np.moveaxis(split, 0, 1))
+
+
+def runGraph(executor, modu, inputs, cur_time, length, outputs=None,
+              gpu_sm_fill_factor=None):
+    """Run `modu` on `executor` through whichever backend is active.
+
+    CPU mode forwards to `kr.runGraph`, substituting `{}` when `outputs`
+    is None.  GPU mode uploads the numpy inputs with cupy, launches,
+    synchronizes and returns a `{name: numpy array}` dict; a result
+    whose name is in `outputs` is copied into that caller-owned array,
+    which is then returned in place of the fresh host copy.
+
+    3-D inputs are treated as STs-blocked and unblocked to TS before
+    upload; inputs of any other rank are uploaded untouched.  If any
+    input was 3-D, every output is re-blocked using that input's
+    last-axis size (the last such input wins if there are several).
+
+    `gpu_sm_fill_factor` (GPU only) is forwarded as `sm_fill_factor`
+    when not None.  Tests pass `0.0` to force a single time chunk when
+    they need bit-exact results (the multi-chunk Kahan restart can
+    drift by ≤1 ulp).
+    """
+    if not GPU_MODE:
+        cpu_outputs = {} if outputs is None else outputs
+        return kr.runGraph(executor, modu, inputs, cur_time, length, cpu_outputs)
+
+    # Flatten STs blocks to TS and note the lane count for the outputs.
+    ts_inputs = {name: _sts_unblock(arr) if arr.ndim == 3 else arr
+                 for name, arr in inputs.items()}
+    lane_sizes = [arr.shape[-1] for arr in inputs.values() if arr.ndim == 3]
+    blocking = lane_sizes[-1] if lane_sizes else None
+
+    gpu_inputs = {name: cp.asarray(arr) for name, arr in ts_inputs.items()}
+    launch_opts = {"cur_time": cur_time, "length": length}
+    if gpu_sm_fill_factor is not None:
+        launch_opts["sm_fill_factor"] = gpu_sm_fill_factor
+    raw = executor.runGraph(modu, gpu_inputs, **launch_opts)
+    executor.synchronize()
+
+    results = {}
+    for name, dev in raw.items():
+        if not isinstance(dev, cp.ndarray):
+            dev = cp.from_dlpack(dev)
+        host = cp.asnumpy(dev)
+        if blocking is not None:
+            host = _sts_reblock(host, blocking)
+        if outputs is None or name not in outputs:
+            results[name] = host
+            continue
+        # Fill the caller's buffer in place and hand that buffer back.
+        dest = outputs[name]
+        dest[...] = host
+        results[name] = dest
+    return results
+
+
+# Tests that `_run` skips in GPU mode (stream / unsupported ops /
+# aggregrate / corrwith / Library.load / CPU-only checks).  Anything
+# else is attempted in GPU mode.
+_GPU_SKIP_TESTS = {
+    "test_stream_lifetime_gh_issue_41",
+    "test_corrwith",
+    "test_aggregrate",
+    "test_runtime",
+    "test_ema_init",           # __init Input not supported yet
+    "test_aligned",            # CPU-only shape-error check
+    "test_quantile",           # SkipList
+    "test_stream_double",
+    "test_repro_crash_gh_issue_71",
+    "test_generic_cross_sectional",
+}
+
+# Names from `check_xxx()` factory tuples that GPU can actually compile.
+# In GPU mode the lib funclist is filtered down to these names (first
+# tuple element) before `compileit` runs — keeps the build green even
+# though most check_xxx entries still produce unsupported kunir.
+_GPU_LIB_NAMES = {
+    "avg_and_stddev",       # WindowedAvg + WindowedStddev (Sqrt + FBW)
+    "avg_and_stddev_TS",    # same, double dtype, TS layout
+    "test_rank",            # cross-sectional Rank (external cs_rank_f32)
+    "test_rank2",           # Add + Rank, double dtype (cs_rank_f64)
+    "test_rank_alpha029",   # Rank chain + WindowedSum, double
+    "test_log",             # float32
+    "test_log64",           # float64
+    "test_pow",             # Pow → Exp(Log(x) * expo) + Sqrt special-case
+    "test_covar",           # WindowedCovariance + WindowedCorrelation, double
+    "test_skew",            # WindowedSkew/Kurt (both fast & slow paths)
+    "test_large_rank",      # TsRank/TsArgMin/Max via naive FBW (no_skip_list)
+    "test_argmin",          # TsArgMin/TsRank/WindowedMin small-window
+    "test_max_drawdown",    # WindowedMaxDrawdown (uses WindowLoopIndex)
+    "test_ema",             # ExpMovingAvg (expanded by experimental_expand)
+}
+
+
+def _run(fn, *args, **kwargs):
+    """Invoke `fn(*args, **kwargs)`, except in GPU mode when `fn.__name__`
+    is listed in `_GPU_SKIP_TESTS`: then only a skip notice is printed.
+    Always returns None; the result of `fn` is discarded."""
+    # Short-circuit keeps `fn.__name__` untouched in CPU mode.
+    skip = GPU_MODE and fn.__name__ in _GPU_SKIP_TESTS
+    if skip:
+        print(f"[skip on GPU] {fn.__name__}")
+    else:
+        fn(*args, **kwargs)
 
 def test_aggregrate(dtype):
     a = np.random.rand(240, 16).astype(dtype)
@@ -128,13 +316,14 @@ def test_cfake():
         inp2 = Input("b")
         Output(inp1 * inp2 + 10, "out")
     f = Function(builder.ops)
-    lib = cfake.compileit([("test1", f, cfake.KunCompilerConfig(input_layout="TS", output_layout="TS"))],
-        "cfaketest", cfake.CppCompilerConfig(machine=get_compiler_flags()))
+    lib = compileit(
+        [("test1", f, KunCompilerConfig(input_layout="TS", output_layout="TS"))],
+        "cfaketest")
     mod = lib.getModule("test1")
     inp = np.random.rand(10, 24).astype("float32")
     inp2 = np.random.rand(10, 24).astype("float32")
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, mod, {"a": inp, "b": inp2}, 0, 10)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, mod, {"a": inp, "b": inp2}, 0, 10)
     np.testing.assert_allclose(inp * inp2 + 10, out["out"])
 
 def test_runtime(libpath):
@@ -192,8 +381,8 @@ def test_avg_stddev(lib):
     expected_mean = df.rolling(10).mean().to_numpy().transpose()
     expected_stddev = df.rolling(10).std().to_numpy().transpose()
     blocked = ST_ST8t(inp)
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": blocked}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": blocked}, 0, 20)
     outmean = ST8t_ST(out["ou1"])
     outstd = ST8t_ST(out["ou2"])
     np.testing.assert_allclose(outmean, expected_mean, rtol=1e-6, equal_nan=True)
@@ -212,8 +401,8 @@ def test_avg_stddev_TS(lib):
     expected_mean = df.rolling(10).mean().to_numpy().transpose()
     expected_stddev = df.rolling(10).std().to_numpy().transpose()
     blocked = np.ascontiguousarray(inp.transpose())
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": blocked}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": blocked}, 0, 20)
     outmean = out["ou1"].transpose()
     outstd = out["ou2"].transpose()
     np.testing.assert_allclose(outmean, expected_mean, rtol=1e-6, equal_nan=True)
@@ -239,8 +428,8 @@ def test_covar(lib):
     df2 = pd.DataFrame(inp2)
     expected_covar = df.rolling(10).cov(df2).to_numpy()
     expected_corr = df.rolling(10).corr(df2).to_numpy()
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp, "b": inp2}, 0, 200)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp, "b": inp2}, 0, 200)
     outcovar = out["ou1"]
     outcorr = out["ou2"]
     np.testing.assert_allclose(outcovar, expected_covar, rtol=1e-6, equal_nan=True)
@@ -288,8 +477,8 @@ def test_large_rank(lib):
     # test with duplicates
     inp[400:410,:] = -1
     # inp[1400:1410,:] = 10
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 2000)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 2000)
     outrank = out["ou1"]
     df = pd.DataFrame(inp)
     expected_rank = df.rolling(200).rank().to_numpy()
@@ -318,8 +507,8 @@ def test_ema(lib):
     assert(modu)
     inp = np.random.rand(20, 24).astype("float32")
     inp[5,:] = np.nan
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 20)
     output = out["ou2"]
     df = pd.DataFrame(inp)
     expected = RefExpMovingAvg(df)
@@ -371,8 +560,8 @@ def test_argmin_issue19(lib):
     data = [ 0.6898481863442985, 0.6992020600574415, 0.6992020600574417, 0.6968635916291558, 0.6968635916291558, 0.6968635916291558 ]
     for i in range(6):
         inp[i, :] = data[i]
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 6)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 6)
     df = pd.DataFrame(inp)
     expected =df.rolling(5, min_periods=1).apply(lambda x: x.argmin() + 1, raw=True)
     output = out["ou2"][4:]
@@ -398,8 +587,8 @@ def check(inp, timelen):
         # print(df)
         expected = df.rank(pct=True, axis = 1).to_numpy().transpose()
         blocked = ST_ST8t(inp)
-        executor = kr.createSingleThreadExecutor()
-        out = kr.runGraph(executor, modu, {"a": blocked}, 0, timelen)
+        executor = createSingleThreadExecutor()
+        out = runGraph(executor, modu, {"a": blocked}, 0, timelen)
         output = ST8t_ST(out["ou2"])
         # print(expected[:,0])
         # print(output[:,0])
@@ -435,8 +624,8 @@ def compute(stocks):
         df = df + df
         expected = (df.rank(pct=True, axis = 1) + df).to_numpy().transpose()
         blocked = np.ascontiguousarray(inp.transpose())
-        executor = kr.createSingleThreadExecutor()
-        out = kr.runGraph(executor, modu, {"a": blocked}, 0, 200)
+        executor = createSingleThreadExecutor()
+        out = runGraph(executor, modu, {"a": blocked}, 0, 200)
         output = out["out"].transpose()
         # print(expected[:,0])
         # print(output[:,0])
@@ -478,8 +667,15 @@ def compute(stocks):
         inner = inner.to_numpy().transpose()
         expected = expected.to_numpy().transpose()
         blocked = np.ascontiguousarray(inp.transpose())
-        executor = kr.createSingleThreadExecutor()
-        out = kr.runGraph(executor, modu, {"a": blocked}, 0, 300)
+        executor = createSingleThreadExecutor()
+        # The outer Rank(WindowedSum(...)) is sensitive to near-ties:
+        # GPU multi-chunk Kahan drifts ≤1 ulp in the inner sum, which
+        # can flip cross-sectional tie-breaking and shift rank buckets
+        # by 0.025-0.05.  Force single-chunk so the Kahan state runs
+        # uninterrupted — perf is irrelevant for a correctness test
+        # and this restores bit-exactness with pandas.
+        out = runGraph(executor, modu, {"a": blocked}, 0, 300,
+                        gpu_sm_fill_factor=0.0 if GPU_MODE else None)
         output1 = out["ou1"].transpose()
         output2 = out["ou2"].transpose()
         np.set_printoptions(precision=60)
@@ -508,8 +704,8 @@ def test_log(lib, dtype, name):
     inp[1,:] = np.nan
     # print(inp)
     blocked = ST_ST8t(inp, is_double=(dtype=="float64"))
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": blocked}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": blocked}, 0, 20)
     output = ST8t_ST(out["outlog"])
     # print(expected[:,0])
     # print(output[:,0])
@@ -546,8 +742,8 @@ def test_pow(lib):
         expo[i,:] = pow(10, i/8-1)
     expo[-1,:] = 0
     expo[1,:] = np.nan
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": ST_ST8t(base), "b": ST_ST8t(expo)}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": ST_ST8t(base), "b": ST_ST8t(expo)}, 0, 20)
     # print(out.keys())
     # print(expected[:,0])
     # print(output[:,0])
@@ -615,8 +811,8 @@ def test_skew_kurt():
     modu = lib.getModule("test_skew")
     assert(modu)
     inp = np.random.rand(20, 24)
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 20)
     output = out["ou2"]
     df = pd.DataFrame(inp)
     expected = df.rolling(5).skew()
@@ -707,8 +903,8 @@ def test_loop_index():
     modu = lib.getModule("test_max_drawdown")
     assert(modu)
     inp = np.random.rand(20, 24).astype("float32")
-    executor = kr.createSingleThreadExecutor()
-    out = kr.runGraph(executor, modu, {"a": inp}, 0, 20)
+    executor = createSingleThreadExecutor()
+    out = runGraph(executor, modu, {"a": inp}, 0, 20)
     output = out["out"]
     
     # reference implementation, from https://stackoverflow.com/a/21059308. Modified for our version of maxdd
@@ -736,10 +932,16 @@ def rolling_max_dd(x, window_size, min_periods=1):
         expected[:,i] = rolling_max_dd(inp[:,i], 5, min_periods=1)
     np.testing.assert_allclose(output[5:], expected[5:], equal_nan=True, atol=1e-7, rtol=1e-7)
 
-test_stream_lifetime_gh_issue_41()
-test_corrwith()
-test_aggregrate("float32")
-test_aggregrate("float64")
+_run(test_stream_lifetime_gh_issue_41)
+_run(test_corrwith)
+_run(test_aggregrate, "float32")
+_run(test_aggregrate, "float64")
+# The shared library bundles all the lib-based tests.  CPU compiles
+# the whole thing; GPU keeps only entries in `_GPU_LIB_NAMES` since
+# the rest is stream / ops the kunir codegen doesn't support yet (STs
+# layouts are rewritten to TS by `compileit`).  Lib-consuming tests
+# whose module isn't kept are in `_GPU_SKIP_TESTS`, so `_run`
+# short-circuits before they ever try to `lib.getModule(...)`.
 funclist = [
     check_1(),
     check_TS(),
@@ -762,30 +964,34 @@ def rolling_max_dd(x, window_size, min_periods=1):
     check_large_rank(),
     repro_crash_gh_issue_71(),
     ]
-lib = cfake.compileit(funclist, "test", cfake.CppCompilerConfig(machine=get_compiler_flags()))
-
-test_cfake()
-test_avg_stddev_TS(lib)
-kun_test_dll = os.path.join(cfake.get_runtime_path(), "KunTest.dll" if cfake.is_windows() else "libKunTest.so")
-if os.path.exists(kun_test_dll):
-    test_runtime(kun_test_dll)
-test_avg_stddev(lib)
-test_rank(lib)
-test_log(lib, "float32", "")
-test_pow(lib)
-test_ema(lib)
-test_ema_init(lib)
-test_argmin_issue19(lib)
-test_generic_cross_sectional()
-test_stream_double()
-test_log(lib, "float64", "64")
-test_rank2(lib)
-test_rank029(lib)
-test_skew_kurt()
-test_aligned(lib)
-test_loop_index()
-test_covar(lib)
-test_quantile(lib)
-test_large_rank(lib)
-test_repro_crash_gh_issue_71(lib)
+if GPU_MODE:  # drop the entries the GPU backend is not listed as able to compile
+    funclist = [t for t in funclist if t[0] in _GPU_LIB_NAMES]  # t[0] is the module name
+lib = compileit(funclist, "test")
+
+_run(test_cfake)
+_run(test_avg_stddev_TS, lib)
+if not GPU_MODE:  # the prebuilt KunTest library check is only attempted in CPU mode
+    kun_test_dll = os.path.join(cfake.get_runtime_path(),
+                                  "KunTest.dll" if cfake.is_windows() else "libKunTest.so")
+    if os.path.exists(kun_test_dll):
+        _run(test_runtime, kun_test_dll)
+_run(test_avg_stddev, lib)
+_run(test_rank, lib)
+_run(test_log, lib, "float32", "")
+_run(test_pow, lib)
+_run(test_ema, lib)
+_run(test_ema_init, lib)
+_run(test_argmin_issue19, lib)
+_run(test_generic_cross_sectional)
+_run(test_stream_double)
+_run(test_log, lib, "float64", "64")
+_run(test_rank2, lib)
+_run(test_rank029, lib)
+_run(test_skew_kurt)
+_run(test_aligned, lib)
+_run(test_loop_index)
+_run(test_covar, lib)
+_run(test_quantile, lib)
+_run(test_large_rank, lib)
+_run(test_repro_crash_gh_issue_71, lib)
 print("done")
diff --git a/tests/tests_gpu.sh b/tests/tests_gpu.sh
new file mode 100644
index 0000000..4420b4f
--- /dev/null
+++ b/tests/tests_gpu.sh
@@ -0,0 +1,7 @@
+set -e  # abort the script as soon as one test command fails
+echo "KunQuant runtime tests"
+python tests/test_runtime.py --gpu-arch auto  # `auto`: arch comes from get_cuda_compute_capability()
+echo "KunQuant alpha158 tests"
+python ./tests/test_alpha158.py --inputs ./build/input.npz --ref ./build/alpha158.npz --action run_gpu --gpu-arch auto
+echo "KunQuant alpha101 tests"
+python ./tests/test_alpha101.py --gpu-arch auto
\ No newline at end of file