Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions challenges/medium/76_bilinear_image_scaling/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<p>
Given an input image of dimensions <code>H</code> &times; <code>W</code> (stored as 32-bit floats
in row-major order), produce a scaled output image of dimensions <code>H_out</code> &times;
<code>W_out</code> using bilinear interpolation with align-corners convention. For each output
pixel <code>(i, j)</code>, compute its corresponding source coordinate in the input image and
blend the four surrounding pixels with bilinearly-weighted coefficients.
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in <code>output</code></li>
<li>Read exclusively from <code>image</code> and write exclusively to <code>output</code></li>
</ul>

<h2>Example</h2>
<p>
Input <code>image</code> (<code>H</code> = 2, <code>W</code> = 2):
\[
\begin{bmatrix}
1.0 & 3.0 \\
7.0 & 9.0
\end{bmatrix}
\]
<code>H_out</code> = 3, <code>W_out</code> = 3<br><br>
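Source coordinates (align-corners): \(\text{src\_y} = i \cdot \frac{H-1}{H_\text{out}-1}\), \(\text{src\_x} = j \cdot \frac{W-1}{W_\text{out}-1}\)
(when <code>H_out</code> = 1 or <code>W_out</code> = 1, the corresponding scale is taken to be 0, so that source coordinate is always 0)<br>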
For output size 3 and input size 2 this gives a scale of 0.5, so each output step maps to 0.5 input steps.<br><br>
Output (<code>H_out</code> = 3, <code>W_out</code> = 3):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 \\
4.0 & 5.0 & 6.0 \\
7.0 & 8.0 & 9.0
\end{bmatrix}
\]
For example, output pixel \((1, 1)\): \(\text{src\_y} = 0.5\), \(\text{src\_x} = 0.5\), so
\(0.25 \times (1.0 + 3.0 + 7.0 + 9.0) = 5.0\).
</p>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>H</code>, <code>W</code> &le; 8,192</li>
<li>1 &le; <code>H_out</code>, <code>W_out</code> &le; 16,384</li>
<li>Input values are in the range [-10, 10]</li>
<li>All values are 32-bit floats</li>
<li>Performance is measured with <code>H</code> = 4,096, <code>W</code> = 4,096, <code>H_out</code> = 8,192, <code>W_out</code> = 8,192</li>
</ul>
208 changes: 208 additions & 0 deletions challenges/medium/76_bilinear_image_scaling/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    """Challenge definition for bilinear image scaling with align-corners.

    Provides the reference implementation (backed by
    ``torch.nn.functional.interpolate``), the ctypes signature of the
    user-supplied ``solve`` entry point, and the example, functional and
    performance test inputs.
    """

    def __init__(self):
        super().__init__(
            name="Bilinear Image Scaling",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=1,
            access_tier="free",
        )

    def reference_impl(
        self,
        image: torch.Tensor,
        output: torch.Tensor,
        H: int,
        W: int,
        H_out: int,
        W_out: int,
    ):
        """Write the bilinearly scaled ``image`` into ``output`` in place.

        Args:
            image: ``(H, W)`` float32 CUDA tensor holding the input image.
            output: ``(H_out, W_out)`` float32 tensor that receives the result.
            H: Input height.
            W: Input width.
            H_out: Output height.
            W_out: Output width.

        Raises:
            AssertionError: If a shape, dtype or the image device does not
                match the expectations above.
        """
        assert image.shape == (H, W), f"Expected image.shape=({H},{W}), got {image.shape}"
        assert output.shape == (
            H_out,
            W_out,
        ), f"Expected output.shape=({H_out},{W_out}), got {output.shape}"
        assert image.dtype == torch.float32
        assert output.dtype == torch.float32
        assert image.device.type == "cuda"

        # interpolate() expects a batched, channelled input: 1x1xHxW.
        img = image.unsqueeze(0).unsqueeze(0)
        result = torch.nn.functional.interpolate(
            img, size=(H_out, W_out), mode="bilinear", align_corners=True
        )
        # Drop the batch and channel dimensions again before copying out.
        output.copy_(result.squeeze(0).squeeze(0))

    def get_solve_signature(self) -> Dict[str, tuple]:
        """Return the ctypes type and direction of every ``solve`` argument."""
        return {
            "image": (ctypes.POINTER(ctypes.c_float), "in"),
            "output": (ctypes.POINTER(ctypes.c_float), "out"),
            "H": (ctypes.c_int, "in"),
            "W": (ctypes.c_int, "in"),
            "H_out": (ctypes.c_int, "in"),
            "W_out": (ctypes.c_int, "in"),
        }

    @staticmethod
    def _uniform_image(H: int, W: int, low: float, high: float) -> torch.Tensor:
        """Return an ``(H, W)`` float32 CUDA image drawn uniformly from [low, high)."""
        return torch.empty((H, W), device="cuda", dtype=torch.float32).uniform_(low, high)

    @staticmethod
    def _make_case(image: torch.Tensor, H_out: int, W_out: int) -> Dict[str, Any]:
        """Bundle ``image`` and a fresh output buffer into a test-case dict.

        ``H`` and ``W`` are read from ``image`` so the sizes passed to
        ``solve`` can never disagree with the tensor that is actually used.
        """
        H, W = image.shape
        return {
            "image": image,
            "output": torch.empty((H_out, W_out), device="cuda", dtype=image.dtype),
            "H": H,
            "W": W,
            "H_out": H_out,
            "W_out": W_out,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Return the 2x2 -> 3x3 example shown in the challenge description."""
        image = torch.tensor(
            [[1.0, 3.0], [7.0, 9.0]],
            device="cuda",
            dtype=torch.float32,
        )
        return self._make_case(image, 3, 3)

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Return the correctness test cases, from edge cases to realistic sizes."""
        dtype = torch.float32
        return [
            # Edge case: 1x1 -> 1x1 (single pixel, no interpolation)
            self._make_case(torch.tensor([[5.0]], device="cuda", dtype=dtype), 1, 1),
            # Edge case: 2x2 -> 2x2 (identity, no scaling)
            self._make_case(
                torch.tensor([[-1.0, 2.0], [3.0, -4.0]], device="cuda", dtype=dtype), 2, 2
            ),
            # Edge case: 3x3 -> 5x5 (small upsampling, all-zero image)
            self._make_case(torch.zeros((3, 3), device="cuda", dtype=dtype), 5, 5),
            # Edge case: 4x4 -> 4x12 (width-only scaling, 3x)
            self._make_case(self._uniform_image(4, 4, -5.0, 5.0), 4, 12),
            # Power-of-2: 16x16 -> 32x32 (all zeros)
            self._make_case(torch.zeros((16, 16), device="cuda", dtype=dtype), 32, 32),
            # Power-of-2: 64x64 -> 128x128 (negative values)
            self._make_case(self._uniform_image(64, 64, -10.0, 0.0), 128, 128),
            # Power-of-2: 256x256 -> 512x512 (mixed values)
            self._make_case(self._uniform_image(256, 256, -1.0, 1.0), 512, 512),
            # Non-power-of-2: 30x40 -> 60x80
            self._make_case(self._uniform_image(30, 40, -3.0, 3.0), 60, 80),
            # Non-power-of-2: 100x150 -> 255x400 (non-integer scale factors)
            self._make_case(self._uniform_image(100, 150, -2.0, 2.0), 255, 400),
            # Downscaling: 48x64 -> 12x20 (the constraints allow H_out < H and
            # W_out < W, so solutions must not assume upsampling)
            self._make_case(self._uniform_image(48, 64, -4.0, 4.0), 12, 20),
            # Realistic: 1024x1024 -> 2048x2048
            self._make_case(self._uniform_image(1024, 1024, -5.0, 5.0), 2048, 2048),
        ]

    def generate_performance_test(self) -> Dict[str, Any]:
        """Return the 4096x4096 -> 8192x8192 case used for timing."""
        return self._make_case(self._uniform_image(4096, 4096, -1.0, 1.0), 8192, 8192)
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#include <cuda_runtime.h>

// Starter stub: scale `image` into `output` using bilinear interpolation
// with the align-corners convention.
// image (H x W) and output (H_out x W_out) are device pointers to 32-bit
// floats stored in row-major order; the result must be written to `output`.
extern "C" void solve(const float* image, float* output, int H, int W, int H_out, int W_out) {}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import cutlass
import cutlass.cute as cute


# Starter stub: scale `image` (H x W) into `output` (H_out x W_out) using
# bilinear interpolation with the align-corners convention.
# image, output are tensors on the GPU
@cute.jit
def solve(
    image: cute.Tensor,
    output: cute.Tensor,
    H: cute.Int32,
    W: cute.Int32,
    H_out: cute.Int32,
    W_out: cute.Int32,
):
    # The final result must be stored in `output`.
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import jax
import jax.numpy as jnp


# Starter stub: return `image` (H x W) scaled to H_out x W_out using bilinear
# interpolation with the align-corners convention.
# image is a tensor on the GPU
# NOTE(review): the returned array's shape depends on H_out/W_out, and the
# arguments of a plain @jax.jit function are traced rather than concrete;
# confirm whether the harness needs them marked static (e.g. static_argnames).
@jax.jit
def solve(image: jax.Array, H: int, W: int, H_out: int, W_out: int) -> jax.Array:
    # return output tensor directly
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# Starter stub: scale `image` (H x W) into `output` (H_out x W_out) using
# bilinear interpolation with the align-corners convention.
# image, output are device pointers
@export
def solve(
    image: UnsafePointer[Float32],
    output: UnsafePointer[Float32],
    H: Int32,
    W: Int32,
    H_out: Int32,
    W_out: Int32,
):
    # The final result must be stored in `output`.
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import torch


# Starter stub: scale `image` (H x W) into `output` (H_out x W_out) using
# bilinear interpolation with the align-corners convention.
# image, output are tensors on the GPU
def solve(image: torch.Tensor, output: torch.Tensor, H: int, W: int, H_out: int, W_out: int):
    # The final result must be stored in `output`.
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import torch
import triton
import triton.language as tl


# Starter stub: scale `image` (H x W) into `output` (H_out x W_out) using
# bilinear interpolation with the align-corners convention.
# image, output are tensors on the GPU
def solve(image: torch.Tensor, output: torch.Tensor, H: int, W: int, H_out: int, W_out: int):
    # The final result must be stored in `output`.
    pass
Loading