From fac3949302ae032e07b2609aacbd5b6645782adc Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Thu, 2 Apr 2026 04:27:45 +0000 Subject: [PATCH] Add challenge 90: Causal Depthwise Conv1d (Medium) Implement the causal depthwise 1D convolution used in Mamba/SSM models, where each channel is convolved independently with only past and present input positions (zero-padded left boundary). Teaches GPU memory access patterns for channels-last layout and sliding-window boundary handling. Co-Authored-By: Claude Sonnet 4.6 --- .../90_causal_depthwise_conv1d/challenge.html | 144 ++++++++++++++ .../90_causal_depthwise_conv1d/challenge.py | 187 ++++++++++++++++++ .../starter/starter.cu | 5 + .../starter/starter.cute.py | 17 ++ .../starter/starter.jax.py | 11 ++ .../starter/starter.mojo | 16 ++ .../starter/starter.pytorch.py | 15 ++ .../starter/starter.triton.py | 17 ++ 8 files changed, 412 insertions(+) create mode 100644 challenges/medium/90_causal_depthwise_conv1d/challenge.html create mode 100644 challenges/medium/90_causal_depthwise_conv1d/challenge.py create mode 100644 challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu create mode 100644 challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py create mode 100644 challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py create mode 100644 challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo create mode 100644 challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py create mode 100644 challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py diff --git a/challenges/medium/90_causal_depthwise_conv1d/challenge.html b/challenges/medium/90_causal_depthwise_conv1d/challenge.html new file mode 100644 index 0000000..628f5bf --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/challenge.html @@ -0,0 +1,144 @@ +

+ Implement a causal depthwise 1D convolution over a batched sequence tensor + x of shape (B, L, D), producing an output of the same shape. + In a depthwise convolution, each channel d is convolved independently using its + own kernel weight[d, :] — there is no mixing across channels. + The convolution is causal: output position l may only depend on + input positions 0, 1, …, l (past and present), never future positions. + This operation is a key component of state-space models such as Mamba, where it is applied + before the selective scan to mix local context within each feature channel. +

+ + + + + + + + + + + + + Causal Depthwise Conv1d (K=3, one channel shown) + + + x[d] + + + + x₀ + + + x₁ + + + x₂ + + + x₃ + + + x₄ + + + x₅ + + + w[d] + + w₀ + + w₁ + + w₂ + + + kernel at l=4: reads x₂,x₃,x₄ + + + + + + y[d] + + + + y₀ + + + y₁ + + + y₂ + + + y₃ + + + y₄ + + + y₅ + + + + y[d,l] = bias[d] + Σ w[d,k] · x[d, l−k] (x[d,l−k] = 0 if l−k < 0) + + + +

+ Formally, for each batch element b, sequence position l, and channel d: +

+ +\[ +\text{output}[b,\, l,\, d] += \text{bias}[d] ++ \sum_{k=0}^{K-1} \text{weight}[d,\, k] \cdot x[b,\, l - k,\, d] +\] + +

+ where positions l − k < 0 are treated as zero (zero-pad the left boundary). + The tensor layout is channels-last: x[b, l, d] is stored at offset + b × L × D + l × D + d. +

+ +

Implementation Requirements

+ + +

Example

+ +

With B = 1, L = 4, D = 2, K = 3:

+ +
+x      = [[[1.0, 2.0],    # l=0
+           [3.0, 4.0],    # l=1
+           [5.0, 6.0],    # l=2
+           [7.0, 8.0]]]   # l=3   shape (1, 4, 2)
+
+weight = [[ 1.0,  0.0, -1.0],   # channel d=0
+          [ 1.0,  1.0,  1.0]]   # channel d=1   shape (2, 3)
+
+bias   = [0.0, 0.0]
+
+output = [[[1.0,  2.0],   # l=0: d0: 1*1=1          d1: 1*2=2
+           [3.0,  6.0],   # l=1: d0: 3*1+1*0=3      d1: 4*1+2*1=6
+           [4.0, 12.0],   # l=2: d0: 5*1+3*0+1*(-1)=4  d1: 6+4+2=12
+           [4.0, 18.0]]]  # l=3: d0: 7*1+5*0+3*(-1)=4  d1: 8+6+4=18
+
+ +

Constraints

+ diff --git a/challenges/medium/90_causal_depthwise_conv1d/challenge.py b/challenges/medium/90_causal_depthwise_conv1d/challenge.py new file mode 100644 index 0000000..db0c8fb --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/challenge.py @@ -0,0 +1,187 @@ +import ctypes +from typing import Any, Dict, List + +import torch +import torch.nn.functional as F +from core.challenge_base import ChallengeBase + + +class Challenge(ChallengeBase): + def __init__(self): + super().__init__( + name="Causal Depthwise Conv1d", + atol=1e-04, + rtol=1e-04, + num_gpus=1, + access_tier="free", + ) + + def reference_impl( + self, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + output: torch.Tensor, + B: int, + L: int, + D: int, + K: int, + ): + assert x.shape == (B, L, D) + assert weight.shape == (D, K) + assert bias.shape == (D,) + assert output.shape == (B, L, D) + assert x.dtype == weight.dtype == bias.dtype == output.dtype == torch.float32 + assert x.device.type == "cuda" + assert weight.device.type == "cuda" + assert bias.device.type == "cuda" + assert output.device.type == "cuda" + + # Reshape to (B, D, L) for conv1d + x_t = x.permute(0, 2, 1).contiguous() # (B, D, L) + + # Causal padding: pad K-1 zeros on the left so each output position + # only sees current and past input positions + x_padded = F.pad(x_t, (K - 1, 0)) # (B, D, L + K - 1) + + # Depthwise conv: weight (D, K) -> (D, 1, K), groups=D + # Flip the kernel so weight[d, 0] applies to the current position (l-0) + # and weight[d, K-1] applies to the oldest position (l-(K-1)). + # F.conv1d uses cross-correlation (no implicit flip), so we flip explicitly. + w = weight.flip(1).unsqueeze(1) # (D, 1, K) + result = F.conv1d(x_padded, w, bias=bias, groups=D) # (B, D, L) + + output.copy_(result.permute(0, 2, 1)) # (B, L, D) + + def get_solve_signature(self) -> Dict[str, tuple]: + return { + "x": (ctypes.POINTER(ctypes.c_float), "in"), + "weight": (ctypes.POINTER(ctypes.c_float), "in"), + "bias": (ctypes.POINTER(ctypes.c_float), "in"), + "output": (ctypes.POINTER(ctypes.c_float), "out"), + "B": (ctypes.c_int, "in"), + "L": (ctypes.c_int, "in"), + "D": (ctypes.c_int, "in"), + "K": (ctypes.c_int, "in"), + } + + def generate_example_test(self) -> Dict[str, Any]: + B, L, D, K = 1, 4, 2, 3 + x = torch.tensor( + [[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]], + device="cuda", + dtype=torch.float32, + ) + weight = torch.tensor( + [[1.0, 0.0, -1.0], [1.0, 1.0, 1.0]], device="cuda", dtype=torch.float32 + ) + bias = torch.zeros(D, device="cuda", dtype=torch.float32) + output = torch.empty(B, L, D, device="cuda", dtype=torch.float32) + return { + "x": x, + "weight": weight, + "bias": bias, + "output": output, + "B": B, + "L": L, + "D": D, + "K": K, + } + + def generate_functional_test(self) -> List[Dict[str, Any]]: + dtype = torch.float32 + test_cases = [] + + def make_case(B, L, D, K, x_vals=None, w_vals=None, b_vals=None): + if x_vals is not None: + x = torch.tensor(x_vals, device="cuda", dtype=dtype) + else: + x = torch.randn(B, L, D, device="cuda", dtype=dtype) + if w_vals is not None: + weight = torch.tensor(w_vals, device="cuda", dtype=dtype) + else: + weight = torch.randn(D, K, device="cuda", dtype=dtype) + if b_vals is not None: + bias = torch.tensor(b_vals, device="cuda", dtype=dtype) + else: + bias = torch.randn(D, device="cuda", dtype=dtype) + output = torch.empty(B, L, D, device="cuda", dtype=dtype) + return { + "x": x, + "weight": weight, + "bias": bias, + "output": output, + "B": B, + "L": L, + "D": D, + "K": K, + } + + # Example test (matches generate_example_test) + test_cases.append( + make_case( + 1, + 4, + 2, + 3, + x_vals=[[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]], + w_vals=[[1.0, 0.0, -1.0], [1.0, 1.0, 1.0]], + b_vals=[0.0, 0.0], + ) + ) + + # Edge cases: minimal sizes + test_cases.append(make_case(1, 1, 1, 1)) # single element, kernel=1 + test_cases.append(make_case(1, 2, 1, 2)) # L < K, so first output is partial + test_cases.append(make_case(2, 3, 4, 3)) # small batch, B=2 + + # Zero inputs + x_zero = torch.zeros(1, 8, 4, device="cuda", dtype=dtype) + w_zero = torch.randn(4, 3, device="cuda", dtype=dtype) + b_zero = torch.randn(4, device="cuda", dtype=dtype) + test_cases.append( + { + "x": x_zero, + "weight": w_zero, + "bias": b_zero, + "output": torch.empty(1, 8, 4, device="cuda", dtype=dtype), + "B": 1, + "L": 8, + "D": 4, + "K": 3, + } + ) + + # Negative values + test_cases.append(make_case(1, 16, 8, 4)) + + # Power-of-2 sizes + test_cases.append(make_case(2, 32, 16, 4)) + test_cases.append(make_case(4, 64, 32, 4)) + + # Non-power-of-2 sizes + test_cases.append(make_case(3, 30, 12, 3)) + test_cases.append(make_case(2, 100, 24, 4)) + + # Realistic inference size (Mamba-like small) + test_cases.append(make_case(2, 256, 128, 4)) + + return test_cases + + def generate_performance_test(self) -> Dict[str, Any]: + B, L, D, K = 8, 2048, 4096, 4 + dtype = torch.float32 + x = torch.randn(B, L, D, device="cuda", dtype=dtype) + weight = torch.randn(D, K, device="cuda", dtype=dtype) + bias = torch.randn(D, device="cuda", dtype=dtype) + output = torch.empty(B, L, D, device="cuda", dtype=dtype) + return { + "x": x, + "weight": weight, + "bias": bias, + "output": output, + "B": B, + "L": L, + "D": D, + "K": K, + } diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu new file mode 100644 index 0000000..b6fd040 --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cu @@ -0,0 +1,5 @@ +#include + +// x, weight, bias, output are device pointers +extern "C" void solve(const float* x, const float* weight, const float* bias, float* output, int B, + int L, int D, int K) {} diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py new file mode 100644 index 0000000..177b7fc --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.cute.py @@ -0,0 +1,17 @@ +import cutlass +import cutlass.cute as cute + + +# x, weight, bias, output are tensors on the GPU +@cute.jit +def solve( + x: cute.Tensor, + weight: cute.Tensor, + bias: cute.Tensor, + output: cute.Tensor, + B: cute.Int32, + L: cute.Int32, + D: cute.Int32, + K: cute.Int32, +): + pass diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py new file mode 100644 index 0000000..4f61597 --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.jax.py @@ -0,0 +1,11 @@ +import jax +import jax.numpy as jnp + + +# x, weight, bias are tensors on GPU +@jax.jit +def solve( + x: jax.Array, weight: jax.Array, bias: jax.Array, B: int, L: int, D: int, K: int +) -> jax.Array: + # return output tensor directly + pass diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo new file mode 100644 index 0000000..315e67f --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.mojo @@ -0,0 +1,16 @@ +from gpu.host import DeviceContext +from memory import UnsafePointer + +# x, weight, bias, output are device pointers +@export +def solve( + x: UnsafePointer[Float32], + weight: UnsafePointer[Float32], + bias: UnsafePointer[Float32], + output: UnsafePointer[Float32], + B: Int32, + L: Int32, + D: Int32, + K: Int32, +): + pass diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py new file mode 100644 index 0000000..c812b9a --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.pytorch.py @@ -0,0 +1,15 @@ +import torch + + +# x, weight, bias, output are tensors on the GPU +def solve( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + output: torch.Tensor, + B: int, + L: int, + D: int, + K: int, +): + pass diff --git a/challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py new file mode 100644 index 0000000..c13da18 --- /dev/null +++ b/challenges/medium/90_causal_depthwise_conv1d/starter/starter.triton.py @@ -0,0 +1,17 @@ +import torch +import triton +import triton.language as tl + + +# x, weight, bias, output are tensors on the GPU +def solve( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + output: torch.Tensor, + B: int, + L: int, + D: int, + K: int, +): + pass