Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 187 additions & 49 deletions src/gasbench/processing/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,20 @@ def apply_robustness_augmentations(
seed=None,
jpeg_quality=55,
scale_factor=0.5,
webp_quality=75,
):
"""Fixed augmentation suite for image augmentation robustness evaluation.

Simulates the dominant real-world internet distribution pipeline:
1. Downscale + upscale — thumbnail/CDN resize chain
2. JPEG roundtrip at jpeg_quality — first platform upload (e.g. WhatsApp ~55)
3. Second JPEG roundtrip at 80 — re-share / re-host recompression
3. WebP roundtrip at webp_quality — CDN/platform re-host (Facebook, Google)
4. Second JPEG roundtrip at 80 — re-share / re-host recompression

Step 3 exercises the cross-codec re-hosting case: many platforms serve
WebP, whose VP8 intra coding leaves a different artifact family than JPEG
DCT, so a detector that survives repeated JPEG can still collapse on it.
Pass webp_quality=None to skip it and recover the JPEG-only chain.

Returns the same 4-tuple as apply_random_augmentations for drop-in use
in PrefetchPipeline when robustness_pass=True. Deterministic given seed
Expand All @@ -113,92 +120,184 @@ def apply_robustness_augmentations(
# First JPEG pass — heavy platform compression (WhatsApp/Telegram ~q55)
img = compress_image_jpeg_pil(img, quality=jpeg_quality)

# Cross-codec re-host — CDN/platform WebP transcode (Facebook, Google)
if webp_quality is not None:
img = compress_image_webp_pil(img, quality=webp_quality)

# Second JPEG pass — lighter re-share recompression (Twitter/Instagram ~q80)
img = compress_image_jpeg_pil(img, quality=80)

# Resize to model input size (same crop+resize as base pipeline)
tforms = get_base_transforms(target_size, (1.0, 1.0))
aug_hwc, _ = tforms(img, None, reuse_params=False)

params = {"jpeg_quality": jpeg_quality, "scale_factor": scale_factor, "jpeg_quality_2": 80}
params = {
"jpeg_quality": jpeg_quality,
"scale_factor": scale_factor,
"webp_quality": webp_quality,
"jpeg_quality_2": 80,
}
return aug_hwc, None, "robustness", params


def apply_video_robustness_augmentations(
video_array,
target_size,
seed=None,
crf=23,
fps=25,
):
"""H.264 compression roundtrip for video augmentation robustness evaluation.
def _decode_video_rgb(tmp_path, num_frames):
    """Decode up to ``num_frames`` RGB frames from a video file.

    Frames are read sequentially through ``cv2.VideoCapture`` and converted
    from OpenCV's BGR channel order to RGB. When the decoder yields fewer
    frames than requested, the last decoded frame is repeated so the result
    always holds exactly ``num_frames`` frames.

    Args:
        tmp_path: Path of the video file to decode.
        num_frames: Number of frames the returned array must contain.

    Returns:
        The decoded frames stacked along a new leading axis, i.e. a
        (T, H, W, 3) array with T == ``num_frames``, or ``None`` when no
        frame could be decoded at all.
    """
    cap = cv2.VideoCapture(tmp_path)
    decoded = []
    try:
        while len(decoded) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            decoded.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture handle even if reading or colour conversion
        # raises, so a bad frame cannot leak the underlying decoder.
        cap.release()

    if not decoded:
        return None

    # Pad with copies of the last frame if the decoder returned fewer frames.
    # The read loop never collects more than num_frames, so no trimming is
    # needed afterwards.
    decoded.extend([decoded[-1]] * (num_frames - len(decoded)))
    return np.stack(decoded, axis=0)

Mirrors the FaceForensics++ c23/c40 evaluation protocol — encode to H.264
at a given CRF then decode back, simulating platform re-encoding pipelines.
CRF 23 = light (FF++ c23, YouTube-tier), CRF 40 = heavy (FF++ c40,
WhatsApp/Messenger-tier).

cv2's VideoWriter with avc1 (libx264) does not expose CRF directly; the
quality parameter is mapped linearly from the CRF range [18, 51].
def _h264_roundtrip_ffmpeg(video_array, crf, fps):
"""Faithful H.264 roundtrip via the ffmpeg CLI using a real ``-crf`` value.

Returns the same 4-tuple as apply_random_augmentations for drop-in use in
VideoPrefetchPipeline when robustness_pass=True.
This is the only path that reproduces the FaceForensics++ CRF protocol
exactly; cv2's VideoWriter quality knob does not map to CRF and is ignored
on many OpenCV builds. Returns the decoded (T, H, W, 3) uint8 array, or
None if ffmpeg is unavailable or the roundtrip fails (caller falls back).
"""
import shutil
import subprocess
import tempfile
import os

if seed is not None:
np.random.seed(seed)

if video_array.dtype != np.uint8:
video_array = np.clip(video_array, 0, 255).astype(np.uint8)
if shutil.which("ffmpeg") is None:
return None

T, H, W, C = video_array.shape
tmp_path = None
try:
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
tmp_path = f.name
cmd = [
"ffmpeg", "-y", "-loglevel", "error",
"-f", "rawvideo", "-pix_fmt", "rgb24",
"-s", f"{W}x{H}", "-r", str(int(fps)), "-i", "-",
"-c:v", "libx264", "-crf", str(int(crf)),
"-pix_fmt", "yuv420p", tmp_path,
]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Odd sizes break ffmpeg CRF

Medium Severity

_h264_roundtrip_ffmpeg encodes with libx264 and -pix_fmt yuv420p but never forces an even width or height. libx264 rejects odd dimensions, so the subprocess often exits non-zero and the helper returns None. Robustness runs then fall back to cv2 or per-frame JPEG even though ffmpeg is on PATH, with method sometimes implying a CRF roundtrip that did not happen. Making the frame width and height even before encoding (for example by padding or cropping a single pixel) would avoid the failure.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit d9148fb. Configure here.

proc = subprocess.run(
cmd, input=np.ascontiguousarray(video_array).tobytes(),
capture_output=True,
)
if proc.returncode != 0 or not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
return None
return _decode_video_rgb(tmp_path, T)
except Exception:
return None
finally:
if tmp_path:
try:
os.unlink(tmp_path)
except Exception:
pass

# Map CRF [18, 51] → cv2 quality [100, 0] linearly

def _h264_roundtrip_cv2(video_array, crf, fps):
"""Best-effort H.264 roundtrip via cv2's avc1 writer. CRF cannot be set
directly, so it is approximated through VIDEOWRITER_PROP_QUALITY (a perceptual
0-100 knob that some builds ignore). Returns (T, H, W, 3) uint8 or None."""
import tempfile
import os

T, H, W, C = video_array.shape
# Map CRF [18, 51] → cv2 quality [100, 0] linearly (approximate only)
cv2_quality = max(0, min(100, round((51 - crf) / 33.0 * 100)))

compressed = None
tmp_path = None
try:
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
tmp_path = f.name

fourcc = cv2.VideoWriter_fourcc(*"avc1")
writer = cv2.VideoWriter(tmp_path, fourcc, float(fps), (W, H))
if writer.isOpened():
writer.set(cv2.VIDEOWRITER_PROP_QUALITY, cv2_quality)
for t in range(T):
writer.write(cv2.cvtColor(video_array[t], cv2.COLOR_RGB2BGR))
if not writer.isOpened():
writer.release()
return None

cap = cv2.VideoCapture(tmp_path)
decoded = []
while len(decoded) < T:
ret, frame = cap.read()
if not ret:
break
decoded.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()

if decoded:
# Pad to T frames if decoder returned fewer
while len(decoded) < T:
decoded.append(decoded[-1])
compressed = np.stack(decoded[:T], axis=0)
else:
writer.release()
writer.set(cv2.VIDEOWRITER_PROP_QUALITY, cv2_quality)
for t in range(T):
writer.write(cv2.cvtColor(video_array[t], cv2.COLOR_RGB2BGR))
writer.release()
return _decode_video_rgb(tmp_path, T)
except Exception:
pass
return None
finally:
if tmp_path:
try:
os.unlink(tmp_path)
except Exception:
pass

# Fallback: per-frame JPEG at quality approximating the requested CRF severity

def apply_video_robustness_augmentations(
video_array,
target_size,
seed=None,
crf=23,
fps=25,
scale_factor=1.0,
):
"""H.264 compression roundtrip for video augmentation robustness evaluation.

Mirrors the FaceForensics++ c23/c40 evaluation protocol — encode to H.264
at a given CRF then decode back, simulating platform re-encoding pipelines.
CRF 23 = light (FF++ c23, YouTube-tier), CRF 40 = heavy (FF++ c40,
WhatsApp/Messenger-tier).

Encoding tries, in order:
1. ffmpeg CLI with a real ``-crf`` value — faithful FF++ reproduction.
2. cv2 avc1 writer with an approximate quality mapping — used only if
ffmpeg is not on PATH.
3. per-frame JPEG at an equivalent severity — last-resort fallback that
still preserves chroma-subsampling artifacts.

scale_factor < 1.0 first downscales every frame (resolution ladder) before
encoding, mirroring platform transcodes that drop 1080p → 720p → 480p. Left
at 1.0 by default so the CRF-only pass stays faithful to the FF++ protocol;
set it (e.g. 0.5) to additionally exercise resolution degradation.

Returns the same 4-tuple as apply_random_augmentations for drop-in use in
VideoPrefetchPipeline when robustness_pass=True.
"""
if seed is not None:
np.random.seed(seed)

if video_array.dtype != np.uint8:
video_array = np.clip(video_array, 0, 255).astype(np.uint8)

# Resolution ladder — downscale frames before encoding (platform transcode)
if scale_factor < 1.0:
T, H, W, C = video_array.shape
sh = max(2, int(round(H * scale_factor)))
sw = max(2, int(round(W * scale_factor)))
video_array = np.stack(
[cv2.resize(video_array[t], (sw, sh), interpolation=cv2.INTER_AREA)
for t in range(T)],
axis=0,
)

T, H, W, C = video_array.shape

method = "ffmpeg_crf"
compressed = _h264_roundtrip_ffmpeg(video_array, crf, fps)
if compressed is None:
method = "cv2_avc1"
compressed = _h264_roundtrip_cv2(video_array, crf, fps)
if compressed is None:
# Fallback: per-frame JPEG at quality approximating the requested CRF severity
method = "jpeg_fallback"
fallback_q = max(20, min(95, round(100 - (crf - 18) * 2.3)))
compressed = compress_video_frames_jpeg_torchvision(video_array, quality=fallback_q)

Expand All @@ -207,7 +306,7 @@ def apply_video_robustness_augmentations(
compressed, target_size, seed=seed, level=0, crop_prob=0.0
)

params = {"crf": crf, "fps": fps, "cv2_quality": cv2_quality}
params = {"crf": crf, "fps": fps, "scale_factor": scale_factor, "method": method}
return aug_thwc, None, "robustness_video", params


Expand Down Expand Up @@ -844,12 +943,51 @@ def compress_image_jpeg_pil(image_hwc: np.ndarray, quality: int = 75) -> np.ndar

pil_img = Image.fromarray(image_hwc, mode="RGB")
buffer = BytesIO()
pil_img.save(buffer, format="JPEG", quality=int(quality))
# subsampling=2 forces 4:2:0 chroma subsampling regardless of quality/Pillow
# version. This is the operation the social-media compression literature
# identifies as destroying high-frequency DCT fingerprints, so we pin it
# rather than letting Pillow pick subsampling per quality level.
pil_img.save(buffer, format="JPEG", quality=int(quality), subsampling=2)
buffer.seek(0)
decoded_pil = Image.open(buffer).convert("RGB")
return np.array(decoded_pil)


def compress_image_webp_pil(image_hwc: np.ndarray, quality: int = 75) -> np.ndarray:
    """
    Compress a single image using a PIL WebP (lossy) round-trip at fixed quality.

    Facebook, Google, and many CDNs re-host uploads as WebP, whose VP8 intra
    coding leaves a different artifact family than JPEG's DCT blocks. Including
    a WebP pass alongside the JPEG passes exercises detectors against the
    cross-codec re-hosting that real distribution chains produce.

    The round-trip is best-effort: if encoding or decoding fails (for example
    because the installed Pillow build lacks WebP support) the input image is
    returned uncompressed and a RuntimeWarning is emitted, so a skipped WebP
    stage does not go unnoticed.

    Args:
        image_hwc: numpy array (H, W, C), dtype uint8, RGB
        quality: WebP quality (default 75)

    Returns:
        numpy array (H, W, C), dtype uint8, RGB. ``None`` and arrays that are
        not 3-channel HWC are returned without compression (non-uint8 input is
        still clipped to [0, 255] and cast to uint8 first).
    """
    import warnings

    if image_hwc is None:
        return image_hwc
    if image_hwc.dtype != np.uint8:
        image_hwc = np.clip(image_hwc, 0, 255).astype(np.uint8)
    if image_hwc.ndim != 3 or image_hwc.shape[2] != 3:
        return image_hwc

    try:
        pil_img = Image.fromarray(image_hwc, mode="RGB")
        with BytesIO() as buffer:
            pil_img.save(buffer, format="WEBP", quality=int(quality), method=4)
            buffer.seek(0)
            # convert() fully decodes the image, so the array is materialised
            # before the buffer is closed.
            return np.array(Image.open(buffer).convert("RGB"))
    except Exception as exc:
        # WebP support is missing in some Pillow builds; fall back to the
        # original image, but surface the skipped stage instead of hiding it.
        warnings.warn(
            f"WebP round-trip failed ({exc!r}); returning the image uncompressed",
            RuntimeWarning,
            stacklevel=2,
        )
        return image_hwc


def compress_video_frames_jpeg_torchvision(video_thwc: np.ndarray, quality: int = 75) -> np.ndarray:
"""
Compress each frame of a video using torchvision's encode_jpeg/decode_jpeg at fixed quality.
Expand Down