Commit 5b062db

Add challenge 74: Layer Normalization (Medium)
Layer normalization normalizes each row of an N×D input independently and is a core operation in transformer/LLM architectures. Unlike batch normalization (which normalizes column-wise, across the batch), it requires per-row reductions that cannot be trivially parallelized: solvers must think carefully about shared-memory reductions, work distribution (one block per row), and the two-pass algorithm (mean, then variance).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 0578315 commit 5b062db
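
A minimal sketch of the strategy the commit message describes: one block per row, per-thread strided partial sums, a shared-memory tree reduction, and two passes (mean, then variance). This is not the repository's reference solution; the block size of 256 and the name layer_norm_kernel are illustrative assumptions, while the solve signature matches the CUDA starter below.

#include <cuda_runtime.h>

#define THREADS 256  // assumed block size; any power of two up to 1024 works here

// One block per row. Each pass accumulates per-thread partial sums over a
// strided slice of the row, then combines them with a shared-memory tree reduction.
__global__ void layer_norm_kernel(const float* input, const float* gamma,
                                  const float* beta, float* output,
                                  int D, float eps) {
    __shared__ float partial[THREADS];
    const float* row = input + (size_t)blockIdx.x * D;
    float* out = output + (size_t)blockIdx.x * D;

    // Pass 1: row mean.
    float sum = 0.0f;
    for (int j = threadIdx.x; j < D; j += blockDim.x)
        sum += row[j];
    partial[threadIdx.x] = sum;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) partial[threadIdx.x] += partial[threadIdx.x + s];
        __syncthreads();
    }
    float mean = partial[0] / D;
    __syncthreads();  // every thread has read partial[0]; safe to reuse the buffer

    // Pass 2: biased variance, matching torch.var(unbiased=False).
    float sq = 0.0f;
    for (int j = threadIdx.x; j < D; j += blockDim.x) {
        float d = row[j] - mean;
        sq += d * d;
    }
    partial[threadIdx.x] = sq;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) partial[threadIdx.x] += partial[threadIdx.x + s];
        __syncthreads();
    }
    float inv_std = rsqrtf(partial[0] / D + eps);

    // Normalize, then apply the per-feature scale and shift.
    for (int j = threadIdx.x; j < D; j += blockDim.x)
        out[j] = gamma[j] * (row[j] - mean) * inv_std + beta[j];
}

extern "C" void solve(const float* input, const float* gamma, const float* beta,
                      float* output, int N, int D, float eps) {
    layer_norm_kernel<<<N, THREADS>>>(input, gamma, beta, output, D, eps);
}

Reading partial[0] on every thread keeps mean and inv_std uniform across the block without an extra broadcast; the __syncthreads() after the first read protects the shared buffer before pass 2 overwrites it.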

8 files changed

Lines changed: 414 additions & 0 deletions


Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
<p>
Implement the layer normalization forward pass for a 2D input tensor of shape [N, D], where N is
the number of samples and D is the feature dimension. Unlike batch normalization (which normalizes
across the batch), layer normalization computes independent statistics for each sample and normalizes
across its feature dimension, then applies per-feature learnable scale (<code>gamma</code>) and
shift (<code>beta</code>) parameters.
</p>

<p>
For each sample i, layer normalization computes:
\[
\begin{align}
\mu_i &= \frac{1}{D} \sum_{j=0}^{D-1} x_{i,j} \\
\sigma_i^2 &= \frac{1}{D} \sum_{j=0}^{D-1} (x_{i,j} - \mu_i)^2 \\
\hat{x}_{i,j} &= \frac{x_{i,j} - \mu_i}{\sqrt{\sigma_i^2 + \epsilon}} \\
y_{i,j} &= \gamma_j \hat{x}_{i,j} + \beta_j
\end{align}
\]
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the <code>output</code> tensor</li>
</ul>

<h2>Example 1:</h2>
<p>
Input:<br>
\( \text{input} \) (\(2 \times 4\)):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 & 4.0 \\
5.0 & 6.0 & 7.0 & 8.0
\end{bmatrix}
\]
\( \text{gamma} \) (\(4\)):
\[
\begin{bmatrix}
1.0 & 1.0 & 1.0 & 1.0
\end{bmatrix}
\]
\( \text{beta} \) (\(4\)):
\[
\begin{bmatrix}
0.0 & 0.0 & 0.0 & 0.0
\end{bmatrix}
\]
\( \epsilon \) = 1e-5<br><br>
Output:<br>
\( \text{output} \) (\(2 \times 4\)):
\[
\begin{bmatrix}
-1.3416 & -0.4472 & 0.4472 & 1.3416 \\
-1.3416 & -0.4472 & 0.4472 & 1.3416
\end{bmatrix}
\]
</p>

<h2>Example 2:</h2>
<p>
Input:<br>
\( \text{input} \) (\(2 \times 4\)):
\[
\begin{bmatrix}
2.0 & 2.0 & 4.0 & 4.0 \\
1.0 & 3.0 & 3.0 & 5.0
\end{bmatrix}
\]
\( \text{gamma} \) (\(4\)):
\[
\begin{bmatrix}
1.0 & 1.0 & 1.0 & 1.0
\end{bmatrix}
\]
\( \text{beta} \) (\(4\)):
\[
\begin{bmatrix}
0.0 & 0.0 & 0.0 & 0.0
\end{bmatrix}
\]
\( \epsilon \) = 1e-5<br><br>
Output:<br>
\( \text{output} \) (\(2 \times 4\)):
\[
\begin{bmatrix}
-1.0 & -1.0 & 1.0 & 1.0 \\
-1.4142 & 0.0 & 0.0 & 1.4142
\end{bmatrix}
\]
</p>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>N</code> &le; 65,536</li>
<li>1 &le; <code>D</code> &le; 8,192</li>
<li><code>eps</code> = 1e-5</li>
<li>-100.0 &le; input values &le; 100.0</li>
<li>0.1 &le; gamma values &le; 10.0</li>
<li>-10.0 &le; beta values &le; 10.0</li>
<li>Performance is measured with <code>N</code> = 8,192, <code>D</code> = 4,096</li>
</ul>
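
As a sanity check of the formulas against Example 1, the first row works out by direct arithmetic:

\[
\begin{align}
\mu_0 &= \tfrac{1}{4}(1.0 + 2.0 + 3.0 + 4.0) = 2.5 \\
\sigma_0^2 &= \tfrac{1}{4}\left((-1.5)^2 + (-0.5)^2 + (0.5)^2 + (1.5)^2\right) = 1.25 \\
\hat{x}_{0,0} &= \frac{1.0 - 2.5}{\sqrt{1.25 + 10^{-5}}} \approx -1.3416
\end{align}
\]

With \( \gamma_j = 1.0 \) and \( \beta_j = 0.0 \), \( y_{0,j} = \hat{x}_{0,j} \), reproducing the first output row; the second row has the same deviations from its mean of 6.5, hence identical normalized values.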
Lines changed: 238 additions & 0 deletions
@@ -0,0 +1,238 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    def __init__(self):
        super().__init__(
            name="Layer Normalization",
            atol=1e-04,
            rtol=1e-04,
            num_gpus=1,
            access_tier="free",
        )

    def reference_impl(
        self,
        input: torch.Tensor,
        gamma: torch.Tensor,
        beta: torch.Tensor,
        output: torch.Tensor,
        N: int,
        D: int,
        eps: float,
    ):
        assert input.shape == (N, D), f"Expected input.shape=({N}, {D}), got {input.shape}"
        assert output.shape == (N, D), f"Expected output.shape=({N}, {D}), got {output.shape}"
        assert gamma.shape == (D,), f"Expected gamma.shape=({D},), got {gamma.shape}"
        assert beta.shape == (D,), f"Expected beta.shape=({D},), got {beta.shape}"
        assert input.dtype == gamma.dtype == beta.dtype == output.dtype == torch.float32
        assert input.device.type == "cuda"
        assert gamma.device.type == "cuda"
        assert beta.device.type == "cuda"
        assert output.device.type == "cuda"

        mean = input.mean(dim=1, keepdim=True)
        var = input.var(dim=1, keepdim=True, unbiased=False)
        normalized = (input - mean) / torch.sqrt(var + eps)
        output.copy_(gamma * normalized + beta)

    def get_solve_signature(self) -> Dict[str, tuple]:
        return {
            "input": (ctypes.POINTER(ctypes.c_float), "in"),
            "gamma": (ctypes.POINTER(ctypes.c_float), "in"),
            "beta": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "N": (ctypes.c_int, "in"),
            "D": (ctypes.c_int, "in"),
            "eps": (ctypes.c_float, "in"),
        }

    def generate_example_test(self) -> Dict[str, Any]:
        dtype = torch.float32
        N, D = 2, 4
        input = torch.tensor(
            [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], device="cuda", dtype=dtype
        )
        gamma = torch.tensor([1.0, 1.0, 1.0, 1.0], device="cuda", dtype=dtype)
        beta = torch.tensor([0.0, 0.0, 0.0, 0.0], device="cuda", dtype=dtype)
        output = torch.empty((N, D), device="cuda", dtype=dtype)
        return {
            "input": input,
            "gamma": gamma,
            "beta": beta,
            "output": output,
            "N": N,
            "D": D,
            "eps": 1e-5,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        dtype = torch.float32
        tests = []

        # single_sample_small_d
        N, D = 1, 4
        tests.append(
            {
                "input": torch.tensor([[1.0, 2.0, 3.0, 4.0]], device="cuda", dtype=dtype),
                "gamma": torch.ones(D, device="cuda", dtype=dtype),
                "beta": torch.zeros(D, device="cuda", dtype=dtype),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # single_sample_single_feature — all same value; var=0, norm output = beta
        N, D = 1, 1
        tests.append(
            {
                "input": torch.tensor([[3.0]], device="cuda", dtype=dtype),
                "gamma": torch.tensor([2.0], device="cuda", dtype=dtype),
                "beta": torch.tensor([1.0], device="cuda", dtype=dtype),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # all_zeros_input
        N, D = 4, 8
        tests.append(
            {
                "input": torch.zeros((N, D), device="cuda", dtype=dtype),
                "gamma": torch.ones(D, device="cuda", dtype=dtype),
                "beta": torch.zeros(D, device="cuda", dtype=dtype),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # negative_numbers
        N, D = 2, 4
        tests.append(
            {
                "input": torch.tensor(
                    [[-1.0, -2.0, -3.0, -4.0], [-5.0, -6.0, -7.0, -8.0]],
                    device="cuda",
                    dtype=dtype,
                ),
                "gamma": torch.ones(D, device="cuda", dtype=dtype),
                "beta": torch.zeros(D, device="cuda", dtype=dtype),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # different_gamma_beta
        N, D = 2, 4
        tests.append(
            {
                "input": torch.tensor(
                    [[0.0, 1.0, 2.0, 3.0], [-3.0, -1.0, 1.0, 3.0]],
                    device="cuda",
                    dtype=dtype,
                ),
                "gamma": torch.tensor([2.0, 0.5, 1.0, 3.0], device="cuda", dtype=dtype),
                "beta": torch.tensor([1.0, -1.0, 0.0, 0.5], device="cuda", dtype=dtype),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # power_of_2_medium
        N, D = 16, 64
        tests.append(
            {
                "input": torch.empty((N, D), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
                "gamma": torch.empty(D, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
                "beta": torch.empty(D, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # power_of_2_large_d
        N, D = 32, 512
        tests.append(
            {
                "input": torch.empty((N, D), device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
                "gamma": torch.empty(D, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
                "beta": torch.empty(D, device="cuda", dtype=dtype).uniform_(-2.0, 2.0),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # non_power_of_2
        N, D = 30, 100
        tests.append(
            {
                "input": torch.empty((N, D), device="cuda", dtype=dtype).uniform_(-3.0, 3.0),
                "gamma": torch.empty(D, device="cuda", dtype=dtype).uniform_(0.1, 3.0),
                "beta": torch.empty(D, device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # non_power_of_2_large
        N, D = 255, 300
        tests.append(
            {
                "input": torch.empty((N, D), device="cuda", dtype=dtype).uniform_(-50.0, 50.0),
                "gamma": torch.empty(D, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
                "beta": torch.empty(D, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        # realistic_transformer_size
        N, D = 1024, 768
        tests.append(
            {
                "input": torch.empty((N, D), device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
                "gamma": torch.empty(D, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
                "beta": torch.empty(D, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
                "output": torch.empty((N, D), device="cuda", dtype=dtype),
                "N": N,
                "D": D,
                "eps": 1e-5,
            }
        )

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        dtype = torch.float32
        N, D = 8192, 4096
        return {
            "input": torch.empty((N, D), device="cuda", dtype=dtype).uniform_(-10.0, 10.0),
            "gamma": torch.empty(D, device="cuda", dtype=dtype).uniform_(0.5, 2.0),
            "beta": torch.empty(D, device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
            "output": torch.empty((N, D), device="cuda", dtype=dtype),
            "N": N,
            "D": D,
            "eps": 1e-5,
        }
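
The reference implementation above computes the variance from the already-centered values, the numerically safe two-pass form. A single-pass kernel can instead fuse both reductions via the identity

\[
\sigma_i^2 = \frac{1}{D} \sum_{j=0}^{D-1} x_{i,j}^2 - \mu_i^2,
\]

accumulating \( \sum_j x_{i,j} \) and \( \sum_j x_{i,j}^2 \) in the same sweep. The subtraction loses precision when \( \mu_i^2 \) is large relative to \( \sigma_i^2 \), which is worth weighing against the 1e-4 tolerance given inputs up to \( \pm 100.0 \).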
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
#include <cuda_runtime.h>

// input, gamma, beta, output are device pointers
extern "C" void solve(const float* input, const float* gamma, const float* beta, float* output,
                      int N, int D, float eps) {}
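
At the measured size (N = 8,192, D = 4,096), the block-wide sums can be tightened beyond a plain shared-memory tree by reducing within each warp through registers, so shared memory holds one partial per warp instead of one per thread. A sketch of such a helper, not part of this commit; the name block_reduce_sum is an assumption:

// Block-wide sum: warp-level shuffles plus one shared slot per warp.
// Assumes blockDim.x is a multiple of 32 and at most 1024.
__device__ float block_reduce_sum(float v) {
    __shared__ float warp_sums[32];
    int lane = threadIdx.x % 32;
    int warp = threadIdx.x / 32;

    for (int offset = 16; offset > 0; offset >>= 1)   // intra-warp reduction
        v += __shfl_down_sync(0xffffffff, v, offset);
    if (lane == 0) warp_sums[warp] = v;               // one partial per warp
    __syncthreads();

    // The first warp reduces the per-warp partials; result is valid in thread 0.
    v = (threadIdx.x < blockDim.x / 32) ? warp_sums[lane] : 0.0f;
    if (warp == 0)
        for (int offset = 16; offset > 0; offset >>= 1)
            v += __shfl_down_sync(0xffffffff, v, offset);
    return v;
}

Each reduction in a two-pass kernel then becomes a single call to this helper, with thread 0 writing the result to one shared float that the block reads back after a __syncthreads().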
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
import cutlass
import cutlass.cute as cute


# input, gamma, beta, output are tensors on the GPU
@cute.jit
def solve(
    input: cute.Tensor,
    gamma: cute.Tensor,
    beta: cute.Tensor,
    output: cute.Tensor,
    N: cute.Int32,
    D: cute.Int32,
    eps: cute.Float32,
):
    pass
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
import jax
import jax.numpy as jnp


# input, gamma, beta are tensors on GPU
@jax.jit
def solve(
    input: jax.Array, gamma: jax.Array, beta: jax.Array, N: int, D: int, eps: float
) -> jax.Array:
    # return output tensor directly
    pass
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# input, gamma, beta, output are device pointers
@export
def solve(input: UnsafePointer[Float32], gamma: UnsafePointer[Float32],
          beta: UnsafePointer[Float32], output: UnsafePointer[Float32],
          N: Int32, D: Int32, eps: Float32):
    pass
