From 939614fb802ed8f792ff923de91692cbafa4e6f7 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Thu, 30 Apr 2026 05:51:19 -0400
Subject: [PATCH 01/20] rtx: Decouple BLAS/volume/light invalidation timestamps

Split the global `lastBLASChange` into per-domain stamps so a geometry
edit no longer invalidates volume BVHs or per-group light index lists,
and vice versa.

  lastSurfaceBLASChange -> bumped by Geometry, Surface
  lastVolumeBLASChange  -> bumped by Volume, SpatialField
  lastLightSetChange    -> bumped on Group commits (light array rebind)
  lastTLASChange        -> unchanged (Instance)

Group::markFinalized bumps all three since any of its arrays may have
been rebound. Each Group::rebuild* gates on its own stamp; World fans
the trigger across the three for the BLAS rebuild path.
---
 devices/rtx/device/geometry/Geometry.cpp      |  2 +-
 devices/rtx/device/optix_visrtx.h             |  4 +++-
 .../rtx/device/spatial_field/SpatialField.cpp |  2 +-
 devices/rtx/device/surface/Surface.cpp        |  2 +-
 devices/rtx/device/volume/Volume.cpp          |  2 +-
 devices/rtx/device/world/Group.cpp            | 19 +++++++++++++++++--
 devices/rtx/device/world/World.cpp            |  6 +++++-
 7 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/devices/rtx/device/geometry/Geometry.cpp b/devices/rtx/device/geometry/Geometry.cpp
index 127eb865e..e39552c72 100644
--- a/devices/rtx/device/geometry/Geometry.cpp
+++ b/devices/rtx/device/geometry/Geometry.cpp
@@ -129,7 +129,7 @@ void Geometry::commitParameters()
 void Geometry::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp();
 }
 
 GeometryGPUData Geometry::gpuData() const
diff --git a/devices/rtx/device/optix_visrtx.h b/devices/rtx/device/optix_visrtx.h
index 60c330dab..2f6082ad7 100644
--- a/devices/rtx/device/optix_visrtx.h
+++ b/devices/rtx/device/optix_visrtx.h
@@ -205,7 +205,9 @@ struct DeviceGlobalState : public helium::BaseGlobalDeviceState
 
   struct ObjectUpdates
   {
-    helium::TimeStamp lastBLASChange{0};
+    helium::TimeStamp lastSurfaceBLASChange{0};
+    helium::TimeStamp lastVolumeBLASChange{0};
+    helium::TimeStamp lastLightSetChange{0};
     helium::TimeStamp lastTLASChange{0};
   } objectUpdates;
 
diff --git a/devices/rtx/device/spatial_field/SpatialField.cpp b/devices/rtx/device/spatial_field/SpatialField.cpp
index b0404b473..2aba5be38 100644
--- a/devices/rtx/device/spatial_field/SpatialField.cpp
+++ b/devices/rtx/device/spatial_field/SpatialField.cpp
@@ -49,7 +49,7 @@ SpatialField::SpatialField(DeviceGlobalState *s)
 void SpatialField::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp();
 }
 
 SpatialField *SpatialField::createInstance(
diff --git a/devices/rtx/device/surface/Surface.cpp b/devices/rtx/device/surface/Surface.cpp
index 5458b2d79..13d9f865f 100644
--- a/devices/rtx/device/surface/Surface.cpp
+++ b/devices/rtx/device/surface/Surface.cpp
@@ -63,7 +63,7 @@ void Surface::finalize()
 void Surface::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp();
 }
 
 bool Surface::isValid() const
diff --git a/devices/rtx/device/volume/Volume.cpp b/devices/rtx/device/volume/Volume.cpp
index 10ce94912..57d10933d 100644
--- a/devices/rtx/device/volume/Volume.cpp
+++ b/devices/rtx/device/volume/Volume.cpp
@@ -51,7 +51,7 @@ void Volume::commitParameters()
 void Volume::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp();
 }
 
 bool Volume::isVisible() const
diff --git a/devices/rtx/device/world/Group.cpp b/devices/rtx/device/world/Group.cpp
index ed2a54526..7a5ef1584 100644
--- a/devices/rtx/device/world/Group.cpp
+++ b/devices/rtx/device/world/Group.cpp
@@ -102,7 +102,11 @@ void Group::finalize()
 void Group::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  auto &updates = deviceState()->objectUpdates;
+  const auto now = helium::newTimeStamp();
+  updates.lastSurfaceBLASChange = now;
+  updates.lastVolumeBLASChange = now;
+  updates.lastLightSetChange = now;
 }
 
 OptixTraversableHandle Group::optixTraversableTriangle() const
@@ -194,7 +198,8 @@ const std::vector<Light *> &Group::lights() const
 void Group::rebuildSurfaceBVHs()
 {
   const auto &state = *deviceState();
-  if (state.objectUpdates.lastBLASChange < m_objectUpdates.lastSurfaceBVHBuilt)
+  if (state.objectUpdates.lastSurfaceBLASChange
+      < m_objectUpdates.lastSurfaceBVHBuilt)
     return;
 
   partitionValidGeometriesByType();
@@ -249,10 +254,16 @@ void Group::rebuildSurfaceBVHs()
 
 void Group::rebuildVolumeBVH()
 {
+  const auto &state = *deviceState();
+  if (state.objectUpdates.lastVolumeBLASChange
+      < m_objectUpdates.lastVolumeBVHBuilt)
+    return;
+
   partitionValidVolumes();
   if (m_volumes.empty()) {
     m_volumeBounds = box3();
     m_traversableVolume = {};
+    m_objectUpdates.lastVolumeBVHBuilt = helium::newTimeStamp();
     reportMessage(
         ANARI_SEVERITY_DEBUG, "visrtx::Group skipping volume BVH build");
     return;
@@ -272,6 +283,10 @@ void Group::rebuildVolumeBVH()
 
 void Group::rebuildLights()
 {
+  const auto &state = *deviceState();
+  if (state.objectUpdates.lastLightSetChange < m_objectUpdates.lastLightRebuild)
+    return;
+
   partitionValidLights();
   buildLightGPUData();
   m_objectUpdates.lastLightRebuild = helium::newTimeStamp();
diff --git a/devices/rtx/device/world/World.cpp b/devices/rtx/device/world/World.cpp
index 90c70ef89..93d05c961 100644
--- a/devices/rtx/device/world/World.cpp
+++ b/devices/rtx/device/world/World.cpp
@@ -203,7 +203,11 @@ void World::rebuildWorld()
 {
   const auto &state = *deviceState();
 
-  if (state.objectUpdates.lastBLASChange >= m_objectUpdates.lastBLASCheck) {
+  const auto &updates = state.objectUpdates;
+  const auto lastCheck = m_objectUpdates.lastBLASCheck;
+  if (updates.lastSurfaceBLASChange >= lastCheck
+      || updates.lastVolumeBLASChange >= lastCheck
+      || updates.lastLightSetChange >= lastCheck) {
     m_objectUpdates.lastTLASBuild = 0; // BLAS changed, so need to build TLAS
     rebuildBLASs();
   }

From 5168cd6d74643a1889db2692ad9189680dcf25a7 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:35:45 -0400
Subject: [PATCH 02/20] rtx: Fix null-deref in face-varying triangle validation

The faceVarying.normal/tangent count check dereferenced m_index
unconditionally, crashing on triangle-soup geometry (no index buffer).
---
 devices/rtx/device/geometry/Triangle.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/devices/rtx/device/geometry/Triangle.cpp b/devices/rtx/device/geometry/Triangle.cpp
index 4369243f3..52aa41216 100644
--- a/devices/rtx/device/geometry/Triangle.cpp
+++ b/devices/rtx/device/geometry/Triangle.cpp
@@ -78,12 +78,14 @@ void Triangle::finalize()
         m_vertex->size());
   }
 
-  if (m_vertexNormalFV && 3 * m_index->size() != m_vertexNormalFV->size()) {
+  const size_t numTriangles = m_index ? m_index->size() : m_vertex->size() / 3;
+
+  if (m_vertexNormalFV && 3 * numTriangles != m_vertexNormalFV->size()) {
     reportMessage(ANARI_SEVERITY_WARNING,
         "'faceVarying.normal' on triangle geometry is not matching "
         "the number of triangles in 'primitive.index' (%zu) vs. (%zu)",
         m_vertexNormalFV->size(),
-        m_index->size());
+        numTriangles);
   }
 
   if (m_vertexTangent && m_vertex->size() != m_vertexTangent->size()) {
@@ -94,12 +96,12 @@ void Triangle::finalize()
         m_vertex->size());
   }
 
-  if (m_vertexTangentFV && 3 * m_index->size() != m_vertexTangentFV->size()) {
+  if (m_vertexTangentFV && 3 * numTriangles != m_vertexTangentFV->size()) {
     reportMessage(ANARI_SEVERITY_WARNING,
         "'faceVarying.Tangent' on triangle geometry is not matching "
         "the number of triangles in 'primitive.index' (%zu) vs. (%zu)",
         m_vertexTangentFV->size(),
-        m_index->size());
+        numTriangles);
   }
 
   if (!m_vertexTangent && !m_vertexTangentFV) {

From 6955236538972d2e5b2b25bb0f910dd83b32ffcc Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:39:12 -0400
Subject: [PATCH 03/20] rtx: Own generated tangent buffer

The computed-tangent Array1D was built without a deleter, so the
cudaMalloc'd memory leaked when the array was released.
Also remove the unused bitangent parameter on computeTangents calls.
---
 devices/rtx/device/geometry/ComputeTangent.cu | 70 +++++++++++++------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu
index bffeb4bb0..1560ffff5 100644
--- a/devices/rtx/device/geometry/ComputeTangent.cu
+++ b/devices/rtx/device/geometry/ComputeTangent.cu
@@ -52,6 +52,25 @@ namespace {
 
 constexpr const auto eps = 1e-8f;
 
+void cudaFreeMemoryDeleter(const void *, const void *memory)
+{
+  cudaFree(const_cast<void *>(memory));
+}
+
+bool reportCudaError(
+    visrtx::Triangle *triangle, cudaError_t error, const char *operation)
+{
+  if (error == cudaSuccess)
+    return false;
+
+  triangle->reportMessage(ANARI_SEVERITY_ERROR,
+      "CUDA error while computing tangents for Triangle %p during %s: %s",
+      triangle,
+      operation,
+      cudaGetErrorString(error));
+  return true;
+}
+
 } // namespace
 
 namespace visrtx {
@@ -108,7 +127,6 @@ template <bool VerticesIndexed,
     typename TexCoord>
 __global__ void __doComputeTangents(
     glm::vec4 *tangents, // Output tangent vectors with handedness (w component)
-    glm::vec3 *bitangents, // Output bitangent vectors
     const glm::uvec3 *indices, // Input triangle indices
     const glm::vec3 *positions, // Input vertex positions
     const glm::vec3 *normals, // Input vertex normals
@@ -197,7 +215,6 @@ template <bool VerticesIndexed,
     typename TexCoord>
 void __computeTangents(
     glm::vec4 *tangents, // Output tangent vectors with handedness (w component)
-    glm::vec3 *bitangents, // Output bitangent vectors
     const glm::uvec3 *indices, // Input triangle indices
     const glm::vec3 *positions, // Input vertex positions
     const glm::vec3 *normals, // Input vertex normals
@@ -207,7 +224,7 @@ void __computeTangents(
 {
   __doComputeTangents<VerticesIndexed, NormalsIndexed, UVsIndexed, TexCoord>
       <<<(numTriangles + 63) / 64, 64>>>(
-          tangents, bitangents, indices, positions, normals, uvs, numTriangles);
+          tangents, indices, positions, normals, uvs, numTriangles);
 }
 
 void updateGeometryTangent(Triangle *triangle)
@@ -257,12 +274,24 @@ void updateGeometryTangent(Triangle *triangle)
 
   auto tangentsCount = indices ? (indices->size() * 3) : positions->size();
   auto trianglesCount = indices ? indices->size() : positions->size() / 3;
+  if (trianglesCount == 0 || tangentsCount == 0) {
+    triangle->reportMessage(ANARI_SEVERITY_INFO,
+        "Triangle %p has no triangles, cannot compute tangents",
+        triangle);
+    return;
+  }
+
   glm::vec4 *tangents = {};
-  cudaMalloc(&tangents, sizeof(glm::vec4) * tangentsCount);
-  cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount);
-  glm::vec3 *bitangents = {};
-  cudaMalloc(&bitangents, sizeof(glm::vec3) * tangentsCount);
-  cudaMemset(bitangents, 0, sizeof(glm::vec3) * tangentsCount);
+  auto status = cudaMalloc(
+      reinterpret_cast<void **>(&tangents), sizeof(glm::vec4) * tangentsCount);
+  if (reportCudaError(triangle, status, "allocating tangent buffer"))
+    return;
+
+  status = cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount);
+  if (reportCudaError(triangle, status, "clearing tangent buffer")) {
+    cudaFree(tangents);
+    return;
+  }
 
   auto positionsPtr = positions->dataAs<const glm::vec3>(AddressSpace::GPU);
   if (indices) {
@@ -274,7 +303,6 @@ void updateGeometryTangent(Triangle *triangle)
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
           // Vertex indexed, face varying normals and face varyings vec2 UVs.
           __computeTangents<true, false, false>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -284,7 +312,6 @@ void updateGeometryTangent(Triangle *triangle)
           auto uvsPtr = uvsFV->dataAs<const glm::vec3>(AddressSpace::GPU);
           // Vertex indexed, face varying normals and face varyings vec3 UVs.
           __computeTangents<true, false, false>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -296,7 +323,6 @@ void updateGeometryTangent(Triangle *triangle)
           // Vertex indexed,  face varying normals and indexed vec2 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
           __computeTangents<true, false, true>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -306,7 +332,6 @@ void updateGeometryTangent(Triangle *triangle)
           // Vertex indexed,  face varying normals and indexed vec3 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
           __computeTangents<true, false, true>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -321,7 +346,6 @@ void updateGeometryTangent(Triangle *triangle)
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
           // Vertex indexed, index normals and face varyings vec2 UVs.
           __computeTangents<true, true, false>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -331,7 +355,6 @@ void updateGeometryTangent(Triangle *triangle)
           auto uvsPtr = uvsFV->dataAs<const glm::vec3>(AddressSpace::GPU);
           // Vertex indexed, indexed normals and face varyings vec3 UVs.
           __computeTangents<true, true, false>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -343,7 +366,6 @@ void updateGeometryTangent(Triangle *triangle)
           // Vertex indexed, indexed normals and indexed vec2 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
           __computeTangents<true, true, true>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -353,7 +375,6 @@ void updateGeometryTangent(Triangle *triangle)
           // Vertex indexed, indexed normals and indexed vec3 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
           __computeTangents<true, true, true>(tangents,
-              bitangents,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -373,7 +394,6 @@ void updateGeometryTangent(Triangle *triangle)
       // Non indexed vertices, face varying normals and face varyings vec2 UVs.
       auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
       __computeTangents<false, false, false>(tangents,
-          bitangents,
           indicesPtr,
           positionsPtr,
           normalsPtr,
@@ -383,7 +403,6 @@ void updateGeometryTangent(Triangle *triangle)
       // Non indexed vertices, face varying normals and face varyings vec3 UVs.
       auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
       __computeTangents<false, false, false>(tangents,
-          bitangents,
           indicesPtr,
           positionsPtr,
           normalsPtr,
@@ -392,13 +411,22 @@ void updateGeometryTangent(Triangle *triangle)
     }
   }
 
-  // Release transient bitangent store
-  cudaFree(bitangents);
+  status = cudaGetLastError();
+  if (reportCudaError(triangle, status, "launching tangent kernel")) {
+    cudaFree(tangents);
+    return;
+  }
+
+  status = cudaDeviceSynchronize();
+  if (reportCudaError(triangle, status, "computing tangents")) {
+    cudaFree(tangents);
+    return;
+  }
 
   auto desc = Array1DMemoryDescriptor{
       {
           tangents,
-          {}, // deleter
+          cudaFreeMemoryDeleter, // deleter
           {}, // deleterPtr
           ANARI_FLOAT32_VEC4,
       },

From e7bcdabde7bafad61d2dff40579b292ef87da2ee Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:31:12 -0400
Subject: [PATCH 04/20] rtx: Negate generated bitangent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bitangent formula was (t.x*e1 - s.x*e2)/det — sign-reversed vs.
the standard (s.x*e2 - t.x*e1)/det used by MikkTSpace. Normal-mapped
lighting on generated tangents was mirrored along the bitangent axis.
---
 devices/rtx/device/geometry/ComputeTangent.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu
index 1560ffff5..9105f335b 100644
--- a/devices/rtx/device/geometry/ComputeTangent.cu
+++ b/devices/rtx/device/geometry/ComputeTangent.cu
@@ -83,8 +83,7 @@ __device__ void __computeTangentAndBitangent(
     glm::vec3 p2,
     glm::vec2 uv0, // Input texture coordinates
     glm::vec2 uv1,
-    glm::vec2 uv2
-)
+    glm::vec2 uv2)
 {
   // Compute edges of the triangle
   glm::vec3 e1 = p1 - p0;
@@ -116,7 +115,7 @@ __device__ void __computeTangentAndBitangent(
       // Compute the determinant
       float invdet = 1.0f / cross;
       *tangent = (t.y * e1 - s.y * e2) * invdet;
-      *bitangent = (t.x * e1 - s.x * e2) * invdet;
+      *bitangent = (s.x * e2 - t.x * e1) * invdet;
     }
   }
 }

From 2ff810f8cd4b0393f25c896843f83e9e4219e0ef Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:35:45 -0400
Subject: [PATCH 05/20] rtx: Harden tangent generation against degenerate
 inputs

Degenerate triangles or NaN/zero input normals produced fixed
(1,0,0)/(0,1,0) tangents that conflicted with the actual surface
normal. Add safeNormalize, fall back to a Pixar orthonormal basis
built from the geometric normal, and factor the per-corner
Gram-Schmidt + handedness into a shared orthogonalizeTangent helper.
---
 devices/rtx/device/geometry/ComputeTangent.cu | 128 ++++++++++++------
 1 file changed, 86 insertions(+), 42 deletions(-)

diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu
index 9105f335b..dbad934b5 100644
--- a/devices/rtx/device/geometry/ComputeTangent.cu
+++ b/devices/rtx/device/geometry/ComputeTangent.cu
@@ -52,6 +52,49 @@ namespace {
 
 constexpr const auto eps = 1e-8f;
 
+__device__ glm::vec3 safeNormalize(
+    const glm::vec3 &v, const glm::vec3 &fallback)
+{
+  const float l2 = glm::dot(v, v);
+  return l2 > eps ? v * rsqrtf(l2) : fallback;
+}
+
+__device__ void makeTangentFrame(
+    const glm::vec3 &normal, glm::vec3 *tangent, glm::vec3 *bitangent)
+{
+  // https://graphics.pixar.com/library/OrthonormalB/paper.pdf
+  const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f));
+  const float sign = n.z >= 0.0f ? 1.0f : -1.0f;
+  const float a = -1.0f / (sign + n.z);
+  const float b = n.x * n.y * a;
+  *tangent = glm::vec3(1.0f + sign * n.x * n.x * a, sign * b, -sign * n.x);
+  *bitangent = glm::vec3(b, sign + n.y * n.y * a, -n.y);
+}
+
+__device__ glm::vec3 computeGeometricNormal(
+    const glm::vec3 &e1, const glm::vec3 &e2)
+{
+  return safeNormalize(glm::cross(e1, e2), glm::vec3(0.f, 0.f, 1.f));
+}
+
+__device__ void orthogonalizeTangent(const glm::vec3 &tangent,
+    const glm::vec3 &bitangent,
+    const glm::vec3 &normal,
+    glm::vec3 *outTangent,
+    float *outHandedness)
+{
+  glm::vec3 fallbackTangent;
+  glm::vec3 fallbackBitangent;
+  makeTangentFrame(normal, &fallbackTangent, &fallbackBitangent);
+
+  const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f));
+  *outTangent =
+      safeNormalize(tangent - n * glm::dot(n, tangent), fallbackTangent);
+
+  const float bitangentSign = glm::dot(glm::cross(n, *outTangent), bitangent);
+  *outHandedness = bitangentSign < 0.0f ? -1.0f : 1.0f;
+}
+
 void cudaFreeMemoryDeleter(const void *, const void *memory)
 {
   cudaFree(const_cast<void *>(memory));
@@ -88,32 +131,22 @@ __device__ void __computeTangentAndBitangent(
   // Compute edges of the triangle
   glm::vec3 e1 = p1 - p0;
   glm::vec3 e2 = p2 - p0;
+  const auto normal = computeGeometricNormal(e1, e2);
 
-  if (dot(e1, e1) < eps || dot(e2, e2) < eps) {
-    // Degenerate triangle, use a default tangent and bitangent
-    *tangent = glm::vec3(1.0f, 0.0f, 0.0f);
-    *bitangent = glm::vec3(0.0f, 1.0f, 0.0f);
+  if (glm::dot(e1, e1) < eps || glm::dot(e2, e2) < eps) {
+    makeTangentFrame(normal, tangent, bitangent);
   } else {
-    auto normal = normalize(cross(e1, e2));
-
     // Compute differences in texture coordinates
     auto s = uv1 - uv0;
     auto t = uv2 - uv0;
 
-    auto cross = s.x * t.y - s.y * t.x;
-
-    if (abs(cross) < eps) { // degenerate triangle (null vectors or collinears)
-      // Create a default orthonormal basis:
-      // https://graphics.pixar.com/library/OrthonormalB/paper.pdf
-      float sign = normal.z >= 0.0f ? 1.0f : -1.0f;
-      float a = -1.0f / (sign + normal.z);
-      float b = normal.x * normal.y * a;
-      *tangent = glm::vec3(
-          1.0f + sign * normal.x * normal.x * a, sign * b, -sign * normal.x);
-      *bitangent = glm::vec3(b, sign + normal.y * normal.y * a, -normal.y);
+    auto det = s.x * t.y - s.y * t.x;
+
+    if (glm::abs(det) < eps) {
+      makeTangentFrame(normal, tangent, bitangent);
     } else {
       // Compute the determinant
-      float invdet = 1.0f / cross;
+      float invdet = 1.0f / det;
       *tangent = (t.y * e1 - s.y * e2) * invdet;
       *bitangent = (s.x * e2 - t.x * e1) * invdet;
     }
@@ -180,28 +213,36 @@ __global__ void __doComputeTangents(
       uv2 // Input texture coordinates
   );
 
-  vec3 n0, n1, n2;
-  if constexpr (NormalsIndexed) {
-    // Use indexed normals
-    n0 = normals[indexedIdx.x];
-    n1 = normals[indexedIdx.y];
-    n2 = normals[indexedIdx.z];
-  } else {
-    // Use per-face normals
-    n0 = normals[perFaceBaseIdx.x];
-    n1 = normals[perFaceBaseIdx.y];
-    n2 = normals[perFaceBaseIdx.z];
+  const vec3 geometricNormal = computeGeometricNormal(p1 - p0, p2 - p0);
+  vec3 n0 = geometricNormal;
+  vec3 n1 = geometricNormal;
+  vec3 n2 = geometricNormal;
+  if (normals) {
+    if constexpr (NormalsIndexed) {
+      // Use indexed normals
+      n0 = normals[indexedIdx.x];
+      n1 = normals[indexedIdx.y];
+      n2 = normals[indexedIdx.z];
+    } else {
+      // Use per-face normals
+      n0 = normals[perFaceBaseIdx.x];
+      n1 = normals[perFaceBaseIdx.y];
+      n2 = normals[perFaceBaseIdx.z];
+    }
+    n0 = safeNormalize(n0, geometricNormal);
+    n1 = safeNormalize(n1, geometricNormal);
+    n2 = safeNormalize(n2, geometricNormal);
   }
 
-  // Gram-Schmidt orthogonalize and compute handedness
-  vec3 t0 = normalize(tangent - n0 * dot(n0, tangent));
-  float h0 = copysign(1.0f, dot(cross(n0, t0), bitangent));
-
-  vec3 t1 = normalize(tangent - n1 * dot(n1, tangent));
-  float h1 = copysign(1.0f, dot(cross(n1, t1), bitangent));
-
-  vec3 t2 = normalize(tangent - n2 * dot(n2, tangent));
-  float h2 = copysign(1.0f, dot(cross(n2, t2), bitangent));
+  vec3 t0;
+  vec3 t1;
+  vec3 t2;
+  float h0;
+  float h1;
+  float h2;
+  orthogonalizeTangent(tangent, bitangent, n0, &t0, &h0);
+  orthogonalizeTangent(tangent, bitangent, n1, &t1, &h1);
+  orthogonalizeTangent(tangent, bitangent, n2, &t2, &h2);
 
   tangents[perFaceBaseIdx.x] = glm::vec4(t0, h0);
   tangents[perFaceBaseIdx.y] = glm::vec4(t1, h1);
@@ -235,9 +276,9 @@ void updateGeometryTangent(Triangle *triangle)
   auto normalsFV = triangle->getParamObject<Array1D>("faceVarying.normal");
   auto uvsFV = triangle->getParamObject<Array1D>("faceVarying.attribute0");
 
-  if (!positions || (!normals && !normalsFV)) {
+  if (!positions) {
     triangle->reportMessage(ANARI_SEVERITY_INFO,
-        "Triangle %p has no position or normals, cannot compute tangents",
+        "Triangle %p has no positions, cannot compute tangents",
         triangle);
     return;
   }
@@ -339,7 +380,9 @@ void updateGeometryTangent(Triangle *triangle)
         }
       }
     } else {
-      auto normalsPtr = normals->dataAs<const glm::vec3>(AddressSpace::GPU);
+      const auto *normalsPtr = normals
+          ? normals->dataAs<const glm::vec3>(AddressSpace::GPU)
+          : nullptr;
       if (uvsFV) {
         if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) {
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
@@ -387,7 +430,8 @@ void updateGeometryTangent(Triangle *triangle)
     normals = normalsFV ? normalsFV : normals;
     uvs = uvsFV ? uvsFV : uvs;
 
-    auto normalsPtr = normals->dataAs<const glm::vec3>(AddressSpace::GPU);
+    const auto *normalsPtr =
+        normals ? normals->dataAs<const glm::vec3>(AddressSpace::GPU) : nullptr;
 
     if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
       // Non indexed vertices, face varying normals and face varyings vec2 UVs.

From e9c2b2b86682c83de083a5d55ec3e561946d3a5a Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 6 May 2026 19:36:21 +0000
Subject: [PATCH 06/20] rtx: Compute per-vertex tangents via angle-weighted
 accumulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-triangle T at a shared corner depended on each face's own UV
gradient, so orthogonalizing against the same vertex normal still
produced different tangents per face — the shading frame jumped at
every triangle edge.

Replace with two passes: pass 1 atomically accumulates angle-weighted
T/B/N into per-vertex slots (MikkTSpace's averaging scheme); pass 2
normalizes and orthogonalizes. Output moves from face-varying
(3*numTri) to per-vertex 'vertex.tangent'. UV mirror seams must be
vertex-split — same constraint as MikkTSpace defaults.
---
 devices/rtx/device/geometry/ComputeTangent.cu | 355 ++++++++++++------
 1 file changed, 241 insertions(+), 114 deletions(-)

diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu
index dbad934b5..f6c1f1f56 100644
--- a/devices/rtx/device/geometry/ComputeTangent.cu
+++ b/devices/rtx/device/geometry/ComputeTangent.cu
@@ -77,22 +77,30 @@ __device__ glm::vec3 computeGeometricNormal(
   return safeNormalize(glm::cross(e1, e2), glm::vec3(0.f, 0.f, 1.f));
 }
 
-__device__ void orthogonalizeTangent(const glm::vec3 &tangent,
-    const glm::vec3 &bitangent,
-    const glm::vec3 &normal,
-    glm::vec3 *outTangent,
-    float *outHandedness)
+// Each face-vertex's contribution to its vertex's accumulated tangent frame
+// is weighted by the triangle's interior angle at that corner — same scheme
+// MikkTSpace uses to average across incident faces. Angle weighting (over
+// uniform or area) keeps thin sliver triangles from dominating shared
+// vertices.
+__device__ float cornerAngle(
+    const glm::vec3 &a, const glm::vec3 &b, const glm::vec3 &c)
 {
-  glm::vec3 fallbackTangent;
-  glm::vec3 fallbackBitangent;
-  makeTangentFrame(normal, &fallbackTangent, &fallbackBitangent);
-
-  const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f));
-  *outTangent =
-      safeNormalize(tangent - n * glm::dot(n, tangent), fallbackTangent);
+  const glm::vec3 ab = b - a;
+  const glm::vec3 ac = c - a;
+  const float lab = sqrtf(glm::dot(ab, ab));
+  const float lac = sqrtf(glm::dot(ac, ac));
+  if (lab < eps || lac < eps)
+    return 0.0f;
+  const float cosT =
+      glm::clamp(glm::dot(ab, ac) / (lab * lac), -1.0f, 1.0f);
+  return acosf(cosT);
+}
 
-  const float bitangentSign = glm::dot(glm::cross(n, *outTangent), bitangent);
-  *outHandedness = bitangentSign < 0.0f ? -1.0f : 1.0f;
+__device__ void atomicAddVec3(glm::vec3 &dst, const glm::vec3 &v)
+{
+  atomicAdd(&dst.x, v.x);
+  atomicAdd(&dst.y, v.y);
+  atomicAdd(&dst.z, v.z);
 }
 
 void cudaFreeMemoryDeleter(const void *, const void *memory)
@@ -135,36 +143,44 @@ __device__ void __computeTangentAndBitangent(
 
   if (glm::dot(e1, e1) < eps || glm::dot(e2, e2) < eps) {
     makeTangentFrame(normal, tangent, bitangent);
-  } else {
-    // Compute differences in texture coordinates
-    auto s = uv1 - uv0;
-    auto t = uv2 - uv0;
+    return;
+  }
 
-    auto det = s.x * t.y - s.y * t.x;
+  // Compute differences in texture coordinates
+  auto s = uv1 - uv0;
+  auto t = uv2 - uv0;
 
-    if (glm::abs(det) < eps) {
-      makeTangentFrame(normal, tangent, bitangent);
-    } else {
-      // Compute the determinant
-      float invdet = 1.0f / det;
-      *tangent = (t.y * e1 - s.y * e2) * invdet;
-      *bitangent = (s.x * e2 - t.x * e1) * invdet;
-    }
+  auto det = s.x * t.y - s.y * t.x;
+
+  if (glm::abs(det) < eps) {
+    makeTangentFrame(normal, tangent, bitangent);
+    return;
   }
+
+  float invdet = 1.0f / det;
+  *tangent = (t.y * e1 - s.y * e2) * invdet;
+  *bitangent = (s.x * e2 - t.x * e1) * invdet;
 }
 
+// Pass 1 (one thread per triangle): compute the per-triangle T/B from the UV
+// gradient, then atomicAdd those vectors into per-vertex accumulators —
+// weighted by the triangle's interior angle at each corner. Per-vertex
+// normals are accumulated the same way so Pass 2 has a coordinate frame to
+// orthogonalize against, regardless of whether input normals are vertex,
+// face-varying, or absent.
 template <bool VerticesIndexed,
     bool NormalsIndexed,
     bool UVsIndexed,
     typename TexCoord>
-__global__ void __doComputeTangents(
-    glm::vec4 *tangents, // Output tangent vectors with handedness (w component)
-    const glm::uvec3 *indices, // Input triangle indices
-    const glm::vec3 *positions, // Input vertex positions
-    const glm::vec3 *normals, // Input vertex normals
-    const TexCoord *uvs, // Input texture coordinates
-    unsigned int numTriangles // Number of triangles
-)
+__global__ void __doAccumulateTangents(
+    glm::vec3 *tangentAccum,
+    glm::vec3 *bitangentAccum,
+    glm::vec3 *normalAccum,
+    const glm::uvec3 *indices,
+    const glm::vec3 *positions,
+    const glm::vec3 *normals,
+    const TexCoord *uvs,
+    unsigned int numTriangles)
 {
   unsigned int tri = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -189,29 +205,18 @@ __global__ void __doComputeTangents(
 
   vec2 uv0, uv1, uv2;
   if constexpr (UVsIndexed) {
-    // Use indexed UVs
     uv0 = uvs[indexedIdx.x];
     uv1 = uvs[indexedIdx.y];
     uv2 = uvs[indexedIdx.z];
   } else {
-    // Use per-face UVs
     uv0 = uvs[perFaceBaseIdx.x];
     uv1 = uvs[perFaceBaseIdx.y];
     uv2 = uvs[perFaceBaseIdx.z];
   }
 
   vec3 tangent, bitangent;
-
   __computeTangentAndBitangent(
-      &tangent, // Output tangent vectors with handedness (w component)
-      &bitangent, // Output bitangent vectors
-      p0,
-      p1,
-      p2, // Input vertex positions
-      uv0,
-      uv1,
-      uv2 // Input texture coordinates
-  );
+      &tangent, &bitangent, p0, p1, p2, uv0, uv1, uv2);
 
   const vec3 geometricNormal = computeGeometricNormal(p1 - p0, p2 - p0);
   vec3 n0 = geometricNormal;
@@ -219,12 +224,10 @@ __global__ void __doComputeTangents(
   vec3 n2 = geometricNormal;
   if (normals) {
     if constexpr (NormalsIndexed) {
-      // Use indexed normals
       n0 = normals[indexedIdx.x];
       n1 = normals[indexedIdx.y];
       n2 = normals[indexedIdx.z];
     } else {
-      // Use per-face normals
       n0 = normals[perFaceBaseIdx.x];
       n1 = normals[perFaceBaseIdx.y];
       n2 = normals[perFaceBaseIdx.z];
@@ -234,19 +237,26 @@ __global__ void __doComputeTangents(
     n2 = safeNormalize(n2, geometricNormal);
   }
 
-  vec3 t0;
-  vec3 t1;
-  vec3 t2;
-  float h0;
-  float h1;
-  float h2;
-  orthogonalizeTangent(tangent, bitangent, n0, &t0, &h0);
-  orthogonalizeTangent(tangent, bitangent, n1, &t1, &h1);
-  orthogonalizeTangent(tangent, bitangent, n2, &t2, &h2);
-
-  tangents[perFaceBaseIdx.x] = glm::vec4(t0, h0);
-  tangents[perFaceBaseIdx.y] = glm::vec4(t1, h1);
-  tangents[perFaceBaseIdx.z] = glm::vec4(t2, h2);
+  // For indexed meshes, accumulate at the shared vertex slot so adjacent
+  // triangles average their contributions. For triangle-soup each face-vertex
+  // already has a unique slot.
+  const glm::uvec3 outIdx = VerticesIndexed ? indexedIdx : perFaceBaseIdx;
+
+  const float w0 = cornerAngle(p0, p1, p2);
+  const float w1 = cornerAngle(p1, p0, p2);
+  const float w2 = cornerAngle(p2, p0, p1);
+
+  atomicAddVec3(tangentAccum[outIdx.x], tangent * w0);
+  atomicAddVec3(tangentAccum[outIdx.y], tangent * w1);
+  atomicAddVec3(tangentAccum[outIdx.z], tangent * w2);
+
+  atomicAddVec3(bitangentAccum[outIdx.x], bitangent * w0);
+  atomicAddVec3(bitangentAccum[outIdx.y], bitangent * w1);
+  atomicAddVec3(bitangentAccum[outIdx.z], bitangent * w2);
+
+  atomicAddVec3(normalAccum[outIdx.x], n0 * w0);
+  atomicAddVec3(normalAccum[outIdx.y], n1 * w1);
+  atomicAddVec3(normalAccum[outIdx.z], n2 * w2);
 }
 
 template <bool VerticesIndexed,
@@ -254,17 +264,60 @@ template <bool VerticesIndexed,
     bool UVsIndexed,
     typename TexCoord>
 void __computeTangents(
-    glm::vec4 *tangents, // Output tangent vectors with handedness (w component)
-    const glm::uvec3 *indices, // Input triangle indices
-    const glm::vec3 *positions, // Input vertex positions
-    const glm::vec3 *normals, // Input vertex normals
-    const TexCoord *uvs, // Input texture coordinates
-    unsigned int numTriangles // Number of triangles
-)
+    glm::vec3 *tangentAccum,
+    glm::vec3 *bitangentAccum,
+    glm::vec3 *normalAccum,
+    const glm::uvec3 *indices,
+    const glm::vec3 *positions,
+    const glm::vec3 *normals,
+    const TexCoord *uvs,
+    unsigned int numTriangles)
 {
-  __doComputeTangents<VerticesIndexed, NormalsIndexed, UVsIndexed, TexCoord>
-      <<<(numTriangles + 63) / 64, 64>>>(
-          tangents, indices, positions, normals, uvs, numTriangles);
+  __doAccumulateTangents<VerticesIndexed,
+      NormalsIndexed,
+      UVsIndexed,
+      TexCoord><<<(numTriangles + 63) / 64, 64>>>(tangentAccum,
+      bitangentAccum,
+      normalAccum,
+      indices,
+      positions,
+      normals,
+      uvs,
+      numTriangles);
+}
+
+// Pass 2 (one thread per vertex): normalize the accumulated frame and write
+// vec4(T_orthog, sign). The accumulated normal is used as the orthogonalization
+// basis; for vertex-indexed input it averages back to each vertex's authored
+// normal, and for face-varying or missing normals it gives the angle-weighted
+// average across incident faces.
+__global__ void __doFinalizeTangents(glm::vec4 *tangents,
+    const glm::vec3 *tangentAccum,
+    const glm::vec3 *bitangentAccum,
+    const glm::vec3 *normalAccum,
+    unsigned int numVertices)
+{
+  unsigned int v = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (v >= numVertices)
+    return;
+
+  const vec3 T_in = tangentAccum[v];
+  const vec3 B_in = bitangentAccum[v];
+  const vec3 N_in = normalAccum[v];
+
+  const vec3 n = safeNormalize(N_in, vec3(0.0f, 0.0f, 1.0f));
+
+  vec3 fallbackT, fallbackB;
+  makeTangentFrame(n, &fallbackT, &fallbackB);
+
+  const vec3 T_orth =
+      safeNormalize(T_in - n * glm::dot(n, T_in), fallbackT);
+
+  const float bitangentSign = glm::dot(glm::cross(n, T_orth), B_in);
+  const float sign = bitangentSign < 0.0f ? -1.0f : 1.0f;
+
+  tangents[v] = glm::vec4(T_orth, sign);
 }
 
 void updateGeometryTangent(Triangle *triangle)
@@ -306,30 +359,71 @@ void updateGeometryTangent(Triangle *triangle)
     return;
   }
 
-  // Always go with faceVarying tangents. Rational is the following:
-  // - Correct UV and normal sharing is achieve through indexing
-  // - If faceVarying UVs/normals are used then, it should already imply correct
-  // sharing
-  //   on common vertices.
-
-  auto tangentsCount = indices ? (indices->size() * 3) : positions->size();
-  auto trianglesCount = indices ? indices->size() : positions->size() / 3;
-  if (trianglesCount == 0 || tangentsCount == 0) {
+  // Output is per-vertex (vertex.tangent). For indexed meshes the per-vertex
+  // buffer is what lets adjacent triangles share tangent data at common
+  // vertices — that sharing is what eliminates the per-triangle facets a
+  // face-varying buffer would produce. For triangle-soup input each face-vertex
+  // is its own slot, so the same layout works without changes.
+  const auto numVertices = static_cast<unsigned int>(positions->size());
+  const auto trianglesCount = static_cast<unsigned int>(
+      indices ? indices->size() : positions->size() / 3);
+  if (trianglesCount == 0 || numVertices == 0) {
     triangle->reportMessage(ANARI_SEVERITY_INFO,
         "Triangle %p has no triangles, cannot compute tangents",
         triangle);
     return;
   }
 
-  glm::vec4 *tangents = {};
-  auto status = cudaMalloc(
-      reinterpret_cast<void **>(&tangents), sizeof(glm::vec4) * tangentsCount);
-  if (reportCudaError(triangle, status, "allocating tangent buffer"))
-    return;
+  glm::vec3 *tangentAccum = nullptr;
+  glm::vec3 *bitangentAccum = nullptr;
+  glm::vec3 *normalAccum = nullptr;
+  glm::vec4 *tangents = nullptr;
 
-  status = cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount);
-  if (reportCudaError(triangle, status, "clearing tangent buffer")) {
+  auto cleanup = [&] {
+    cudaFree(tangentAccum);
+    cudaFree(bitangentAccum);
+    cudaFree(normalAccum);
     cudaFree(tangents);
+  };
+
+  auto status = cudaMalloc(reinterpret_cast<void **>(&tangentAccum),
+      sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "allocating tangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMalloc(reinterpret_cast<void **>(&bitangentAccum),
+      sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "allocating bitangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMalloc(reinterpret_cast<void **>(&normalAccum),
+      sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "allocating normal accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMalloc(
+      reinterpret_cast<void **>(&tangents), sizeof(glm::vec4) * numVertices);
+  if (reportCudaError(triangle, status, "allocating tangent output buffer")) {
+    cleanup();
+    return;
+  }
+
+  status = cudaMemset(tangentAccum, 0, sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "clearing tangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMemset(bitangentAccum, 0, sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "clearing bitangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMemset(normalAccum, 0, sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "clearing normal accumulator")) {
+    cleanup();
     return;
   }
 
@@ -342,7 +436,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) {
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
           // Vertex indexed, face varying normals and face varyings vec2 UVs.
-          __computeTangents<true, false, false>(tangents,
+          __computeTangents<true, false, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -351,7 +447,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           auto uvsPtr = uvsFV->dataAs<const glm::vec3>(AddressSpace::GPU);
           // Vertex indexed, face varying normals and face varyings vec3 UVs.
-          __computeTangents<true, false, false>(tangents,
+          __computeTangents<true, false, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -362,7 +460,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
           // Vertex indexed,  face varying normals and indexed vec2 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
-          __computeTangents<true, false, true>(tangents,
+          __computeTangents<true, false, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -371,7 +471,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           // Vertex indexed,  face varying normals and indexed vec3 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
-          __computeTangents<true, false, true>(tangents,
+          __computeTangents<true, false, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -387,7 +489,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) {
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
           // Vertex indexed, index normals and face varyings vec2 UVs.
-          __computeTangents<true, true, false>(tangents,
+          __computeTangents<true, true, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -396,7 +500,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           auto uvsPtr = uvsFV->dataAs<const glm::vec3>(AddressSpace::GPU);
           // Vertex indexed, indexed normals and face varyings vec3 UVs.
-          __computeTangents<true, true, false>(tangents,
+          __computeTangents<true, true, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -407,7 +513,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
           // Vertex indexed, indexed normals and indexed vec2 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
-          __computeTangents<true, true, true>(tangents,
+          __computeTangents<true, true, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -416,7 +524,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           // Vertex indexed, indexed normals and indexed vec3 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
-          __computeTangents<true, true, true>(tangents,
+          __computeTangents<true, true, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -426,17 +536,20 @@ void updateGeometryTangent(Triangle *triangle)
       }
     }
   } else {
-    auto indicesPtr = nullptr;
-    normals = normalsFV ? normalsFV : normals;
-    uvs = uvsFV ? uvsFV : uvs;
+    const glm::uvec3 *indicesPtr = nullptr;
+    auto effectiveNormals = normalsFV ? normalsFV : normals;
+    auto effectiveUvs = uvsFV ? uvsFV : uvs;
 
-    const auto *normalsPtr =
-        normals ? normals->dataAs<const glm::vec3>(AddressSpace::GPU) : nullptr;
+    const auto *normalsPtr = effectiveNormals
+        ? effectiveNormals->dataAs<const glm::vec3>(AddressSpace::GPU)
+        : nullptr;
 
-    if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
+    if (effectiveUvs->elementType() == ANARI_FLOAT32_VEC2) {
       // Non indexed vertices, face varying normals and face varyings vec2 UVs.
-      auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
-      __computeTangents<false, false, false>(tangents,
+      auto uvsPtr = effectiveUvs->dataAs<const glm::vec2>(AddressSpace::GPU);
+      __computeTangents<false, false, false>(tangentAccum,
+          bitangentAccum,
+          normalAccum,
           indicesPtr,
           positionsPtr,
           normalsPtr,
@@ -444,8 +557,10 @@ void updateGeometryTangent(Triangle *triangle)
           trianglesCount);
     } else {
       // Non indexed vertices, face varying normals and face varyings vec3 UVs.
-      auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
-      __computeTangents<false, false, false>(tangents,
+      auto uvsPtr = effectiveUvs->dataAs<const glm::vec3>(AddressSpace::GPU);
+      __computeTangents<false, false, false>(tangentAccum,
+          bitangentAccum,
+          normalAccum,
           indicesPtr,
           positionsPtr,
           normalsPtr,
@@ -455,14 +570,29 @@ void updateGeometryTangent(Triangle *triangle)
   }
 
   status = cudaGetLastError();
-  if (reportCudaError(triangle, status, "launching tangent kernel")) {
-    cudaFree(tangents);
+  if (reportCudaError(triangle, status, "launching accumulate kernel")) {
+    cleanup();
+    return;
+  }
+
+  __doFinalizeTangents<<<(numVertices + 63) / 64, 64>>>(
+      tangents, tangentAccum, bitangentAccum, normalAccum, numVertices);
+
+  status = cudaGetLastError();
+  if (reportCudaError(triangle, status, "launching finalize kernel")) {
+    cleanup();
     return;
   }
 
   status = cudaDeviceSynchronize();
+  cudaFree(tangentAccum);
+  cudaFree(bitangentAccum);
+  cudaFree(normalAccum);
+  tangentAccum = nullptr;
+  bitangentAccum = nullptr;
+  normalAccum = nullptr;
   if (reportCudaError(triangle, status, "computing tangents")) {
-    cudaFree(tangents);
+    cleanup();
     return;
   }
 
@@ -473,16 +603,13 @@ void updateGeometryTangent(Triangle *triangle)
           {}, // deleterPtr
           ANARI_FLOAT32_VEC4,
       },
-      tangentsCount,
+      numVertices,
   };
   auto tangentsArray = new Array1D(triangle->deviceState(), desc);
   tangentsArray->commitParameters();
   tangentsArray->finalize();
 
-  if (indices)
-    triangle->setParam("faceVarying.tangent", tangentsArray);
-  else
-    triangle->setParam("vertex.tangent", tangentsArray);
+  triangle->setParam("vertex.tangent", tangentsArray);
   triangle->commitParameters();
   triangle->finalize();
 

From 8d2595d47c51f13a25c534670fc92f6732b73013 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 11:40:03 -0400
Subject: [PATCH 07/20] rtx: Read full vec4 tangent and interpolate handedness

Tangent fetch was discarding .w (vec3 cast on a vec4 array) and reading
handedness from vertex 0 alone. Read the vec4 properly and barycentric-
interpolate the sign across the corners, then quantize to +-1 for the
basis flip.
---
 devices/rtx/device/gpu/populateHit.h | 30 ++++++++++++++++------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h
index 4930c05e5..c2e2f652e 100644
--- a/devices/rtx/device/gpu/populateHit.h
+++ b/devices/rtx/device/gpu/populateHit.h
@@ -197,9 +197,11 @@ VISRTX_DEVICE const VolumeGPUData &volumeData(const FrameGPUData &frameData)
   return frameData.registry.volumes[idx];
 }
 
-VISRTX_DEVICE const SpatialFieldGPUData &fieldData(const FrameGPUData &frameData, const VolumeGPUData &volumeData)
+VISRTX_DEVICE const SpatialFieldGPUData &fieldData(
+    const FrameGPUData &frameData, const VolumeGPUData &volumeData)
 {
-  // Currently only TF1D volume type is supported, so assume this is what we have
+  // Currently only TF1D volume type is supported, so assume this is what we
+  // have
   return frameData.registry.fields[volumeData.data.tf1d.field];
 }
 
@@ -243,20 +245,22 @@ VISRTX_DEVICE void computeTangentSpace(
     if (ggd.tri.vertexTangentsFV != nullptr) {
       const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3);
 
-      const vec3 t0 = ggd.tri.vertexTangentsFV[idx.x];
-      const vec3 t1 = ggd.tri.vertexTangentsFV[idx.y];
-      const vec3 t2 = ggd.tri.vertexTangentsFV[idx.z];
-      const float handedness = ggd.tri.vertexTangentsFV[idx.x].w;
+      const vec4 t0 = ggd.tri.vertexTangentsFV[idx.x];
+      const vec4 t1 = ggd.tri.vertexTangentsFV[idx.y];
+      const vec4 t2 = ggd.tri.vertexTangentsFV[idx.z];
+      const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w;
+      const float sign = handedness < 0.f ? -1.f : 1.f;
       hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2));
-      hit.tV = handedness * normalize(cross(hit.Ns, hit.tU));
+      hit.tV = sign * normalize(cross(hit.Ns, hit.tU));
     } else if (ggd.tri.vertexTangents != nullptr) {
-      const vec3 t0 = ggd.tri.vertexTangents[idx.x];
-      const vec3 t1 = ggd.tri.vertexTangents[idx.y];
-      const vec3 t2 = ggd.tri.vertexTangents[idx.z];
-      const float handedness = ggd.tri.vertexTangents[idx.x].w;
+      const vec4 t0 = ggd.tri.vertexTangents[idx.x];
+      const vec4 t1 = ggd.tri.vertexTangents[idx.y];
+      const vec4 t2 = ggd.tri.vertexTangents[idx.z];
+      const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w;
+      const float sign = handedness < 0.f ? -1.f : 1.f;
 
-      hit.tU = normalize(b.x * t0 + b.y * t1 + b.z * t2);
-      hit.tV = handedness * normalize(cross(hit.Ns, hit.tU));
+      hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2));
+      hit.tV = sign * normalize(cross(hit.Ns, hit.tU));
     } else {
       auto tangentSpace = computeOrthonormalBasis(hit.Ng);
       hit.tU = tangentSpace[0];

From a7d098806e040552e23a690673e735663bc839e0 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 6 May 2026 19:36:08 +0000
Subject: [PATCH 08/20] rtx: NaN-safe shading and per-vertex bitangent blending

Numerical fixes:
- Rewrite ggxD denom as alpha2*x + (1-x) instead of x*(alpha2-1) + 1.
  The textbook form cancels catastrophically when alpha2 < eps(1) and
  collapses to zero at x=1, producing NaN throughput.
- Negate NdotL early-outs so a NaN takes the rejection path instead of
  slipping through (NaN compares false to <=, > etc.).
- Fall back from Ns to Ng in Matte and PBR shading state when the
  normal length squared is non-positive (catches both NaN and zero).

Mirror-seam tangents:
- Compute per-vertex bitangents B_i = t_i.w * cross(N_i, T_i) and
  barycentric-blend B and T independently, instead of blending t.w
  signs and applying once at the hit. Matches glTF Sample Renderer /
  PBRT / Filament; avoids carving seam edges into the tangent frame.
---
 devices/rtx/device/gpu/populateHit.h          | 60 +++++++++++--------
 .../material/shaders/MatteShader_ptx.cu       |  6 ++
 .../shaders/PhysicallyBasedShader_ptx.cu      | 21 ++++++-
 3 files changed, 61 insertions(+), 26 deletions(-)

diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h
index c2e2f652e..c43a0b17f 100644
--- a/devices/rtx/device/gpu/populateHit.h
+++ b/devices/rtx/device/gpu/populateHit.h
@@ -225,42 +225,54 @@ VISRTX_DEVICE void computeTangentSpace(
     if (!optixIsFrontFaceHit())
       hit.Ng = -hit.Ng;
 
+    const bool hasVertexNormals = ggd.tri.vertexNormalsFV != nullptr
+        || ggd.tri.vertexNormals != nullptr;
+    vec3 n0, n1, n2;
     if (ggd.tri.vertexNormalsFV != nullptr) {
-      const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3);
+      const uvec3 nIdx = uvec3(0, 1, 2) + (hit.primID * 3);
 
-      const vec3 n0 = ggd.tri.vertexNormalsFV[idx.x];
-      const vec3 n1 = ggd.tri.vertexNormalsFV[idx.y];
-      const vec3 n2 = ggd.tri.vertexNormalsFV[idx.z];
+      n0 = ggd.tri.vertexNormalsFV[nIdx.x];
+      n1 = ggd.tri.vertexNormalsFV[nIdx.y];
+      n2 = ggd.tri.vertexNormalsFV[nIdx.z];
       hit.Ns = b.x * n0 + b.y * n1 + b.z * n2;
     } else if (ggd.tri.vertexNormals != nullptr) {
-      const vec3 n0 = ggd.tri.vertexNormals[idx.x];
-      const vec3 n1 = ggd.tri.vertexNormals[idx.y];
-      const vec3 n2 = ggd.tri.vertexNormals[idx.z];
+      n0 = ggd.tri.vertexNormals[idx.x];
+      n1 = ggd.tri.vertexNormals[idx.y];
+      n2 = ggd.tri.vertexNormals[idx.z];
       hit.Ns = b.x * n0 + b.y * n1 + b.z * n2;
     } else
       hit.Ns = hit.Ng;
 
     hit.Ns = normalize(hit.Ns);
 
-    if (ggd.tri.vertexTangentsFV != nullptr) {
-      const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3);
+    const bool hasTangentsFV = ggd.tri.vertexTangentsFV != nullptr;
+    const bool hasTangentsV = ggd.tri.vertexTangents != nullptr;
+    if (hasTangentsFV || hasTangentsV) {
+      const uvec3 tIdx = hasTangentsFV
+          ? uvec3(0, 1, 2) + (hit.primID * 3)
+          : idx;
+      const vec4 *tArr = hasTangentsFV
+          ? ggd.tri.vertexTangentsFV
+          : ggd.tri.vertexTangents;
+      const vec4 t0 = tArr[tIdx.x];
+      const vec4 t1 = tArr[tIdx.y];
+      const vec4 t2 = tArr[tIdx.z];
+
+      // At UV mirror seams the sign flips between adjacent vertices;
+      // barycentric-summing the signs and applying a single sign at the
+      // hit point would carve seam edges into the tangent frame.
+      // Build each vertex's bitangent with its own sign and normal,
+      // then blend B and T independently — same convention
+      // as glTF Sample Renderer, PBRT, Filament.
+      const vec3 N0 = hasVertexNormals ? n0 : hit.Ng;
+      const vec3 N1 = hasVertexNormals ? n1 : hit.Ng;
+      const vec3 N2 = hasVertexNormals ? n2 : hit.Ng;
+      const vec3 B0 = t0.w * cross(N0, vec3(t0));
+      const vec3 B1 = t1.w * cross(N1, vec3(t1));
+      const vec3 B2 = t2.w * cross(N2, vec3(t2));
 
-      const vec4 t0 = ggd.tri.vertexTangentsFV[idx.x];
-      const vec4 t1 = ggd.tri.vertexTangentsFV[idx.y];
-      const vec4 t2 = ggd.tri.vertexTangentsFV[idx.z];
-      const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w;
-      const float sign = handedness < 0.f ? -1.f : 1.f;
       hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2));
-      hit.tV = sign * normalize(cross(hit.Ns, hit.tU));
-    } else if (ggd.tri.vertexTangents != nullptr) {
-      const vec4 t0 = ggd.tri.vertexTangents[idx.x];
-      const vec4 t1 = ggd.tri.vertexTangents[idx.y];
-      const vec4 t2 = ggd.tri.vertexTangents[idx.z];
-      const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w;
-      const float sign = handedness < 0.f ? -1.f : 1.f;
-
-      hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2));
-      hit.tV = sign * normalize(cross(hit.Ns, hit.tU));
+      hit.tV = normalize(b.x * B0 + b.y * B1 + b.z * B2);
     } else {
       auto tangentSpace = computeOrthonormalBasis(hit.Ng);
       hit.tU = tangentSpace[0];
diff --git a/devices/rtx/device/material/shaders/MatteShader_ptx.cu b/devices/rtx/device/material/shaders/MatteShader_ptx.cu
index 06429e5fa..371a4cb1c 100644
--- a/devices/rtx/device/material/shaders/MatteShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/MatteShader_ptx.cu
@@ -48,6 +48,12 @@ VISRTX_CALLABLE void __direct_callable__init(MatteShadingState *shadingState,
   shadingState->normal = hit->Ns;
   shadingState->opacity =
       adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff);
+
+  // Fall back to the geometric normal if hit->Ns is NaN (e.g. coincident
+  // curve control points) or zero-length. Negated comparison catches both
+  // since NaN compares false to anything.
+  if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f))
+    shadingState->normal = hit->Ng;
 }
 
 VISRTX_CALLABLE NextRay __direct_callable__nextRay(
diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index 5e710b729..b3d834e01 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -116,7 +116,13 @@ VISRTX_DEVICE float smithG1GGX(float NdotV, float alpha2)
 
 VISRTX_DEVICE float ggxD(float NdotH, float alpha2)
 {
-  const float denom = NdotH * NdotH * (alpha2 - 1.0f) + 1.0f;
+  // The textbook denom `x·(α²−1) + 1` cancels catastrophically in fp32 once
+  // α² is below eps(1) ≈ 1.19e-7 (our α² floor is 1e-8): `α²−1` rounds to
+  // exactly −1, and at x=1 the whole denom collapses to 0. The algebraically
+  // equivalent `α²·x + (1−x)` has no near-1 subtraction so it stays exact.
+  // The fminf clamp keeps `1−x ≥ 0` against dot-product rounding above 1.
+  const float NdotH2 = fminf(NdotH * NdotH, 1.0f);
+  const float denom = alpha2 * NdotH2 + (1.0f - NdotH2);
   return alpha2 / (float(M_PI) * denom * denom);
 }
 
@@ -313,6 +319,15 @@ VISRTX_CALLABLE void __direct_callable__init(
   shadingState->iridescenceIor = md->iridescenceIor;
   shadingState->iridescenceThickness =
       getMaterialParameter(*fd, md->iridescenceThickness, *hit).x;
+
+  // Fall back to the geometric normal if sampleNormalMap produced NaN
+  // (texel decodes to zero, or zero-summed tangents) or zero-length. The
+  // negated comparison catches both since NaN compares false to anything.
+  if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f))
+    shadingState->normal = hit->Ng;
+  if (!(glm::dot(shadingState->clearcoatNormal, shadingState->clearcoatNormal)
+          > 1e-12f))
+    shadingState->clearcoatNormal = hit->Ng;
 }
 
 //-----------------------------------------------------------------------------
@@ -390,7 +405,9 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
   const vec3 L = lightSample->dir;
 
   const float NdotL = dot(N, L);
-  if (NdotL <= 0.0f)
+  // Negated form so a NaN NdotL takes this early-out — NaN compares false
+  // to everything, so `NdotL <= 0.0f` would let it pass through.
+  if (!(NdotL > 0.0f))
     return vec3(0.0f);
 
   const vec3 H = normalize(L + V);

From cc40202017ee867e5e84ca26aabf60926004becb Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 6 May 2026 19:35:40 +0000
Subject: [PATCH 09/20] rtx: Gate Quality renderer direct light on Ns instead
 of Ng

The lightDotNg gate and the ambient hemisphere normal both used Ng,
which carved per-triangle facet shapes into the lit/unlit boundary at
grazing angles on smooth-normal meshes. Switch both to Ns so the
terminator follows the shading surface; the material's own NdotL guard
still rejects light from below.
---
 devices/rtx/device/renderer/Quality_ptx.cu | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu
index fe319255c..8e3b84ce1 100644
--- a/devices/rtx/device/renderer/Quality_ptx.cu
+++ b/devices/rtx/device/renderer/Quality_ptx.cu
@@ -360,10 +360,13 @@ VISRTX_GLOBAL void __raygen__()
 
         sample.color += sampleContribution * materialEmission * materialOpacity;
         LightSample lightSample =
-            sampleLights(ss, frameData, surfaceHit.hitpoint, surfaceHit.Ng);
+            sampleLights(ss, frameData, shadowOrigin, surfaceHit.Ns);
         if (lightSample.pdf >= ATTENUATION_EPSILON && lightSample.dist > 0.0f) {
-          const float lightDotNg = dot(lightSample.dir, surfaceHit.Ng);
-          if (lightDotNg > 0.0f) {
+          // Gate on the shading normal so the terminator follows the smooth
+          // surface; gating on Ng would carve the per-triangle facet shape
+          // into the lit/unlit boundary at grazing light angles.
+          const float lightDotNs = dot(lightSample.dir, surfaceHit.Ns);
+          if (lightDotNs > 0.0f) {
             const Ray shadowRay = {
                 surfaceHit.hitpoint + surfaceHit.Ng * surfaceHit.epsilon,
                 lightSample.dir,

From f3d7c064868dfa05e8b1697561c2e4c093e64d1d Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 14:55:58 -0400
Subject: [PATCH 10/20] rtx: Fix biased uniform-sphere and cosine-hemisphere
 samplers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

randomDir was normalizing a cube-uniform vector — that clusters samples
toward cube corners, not uniform on the sphere. Replace with the
analytic cosTheta = 1 - 2u mapping. sampleHemisphere had Malley's
method inverted (z = u, r = sqrt(1-sqrt(u))) instead of (r = sqrt(u),
z = sqrt(1-r^2)), biasing AO and diffuse estimates toward grazing.
Fold cos(theta)/pi into the ambient LightSample pdf so it matches the
corrected density.
---
 devices/rtx/device/gpu/gpu_util.h          | 29 ++++++++++------------
 devices/rtx/device/renderer/Quality_ptx.cu | 13 ++++++++--
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h
index 9e17c44ba..88b3af0fb 100644
--- a/devices/rtx/device/gpu/gpu_util.h
+++ b/devices/rtx/device/gpu/gpu_util.h
@@ -195,18 +195,14 @@ VISRTX_DEVICE vec3 boolColor(bool pred)
   return pred ? vec3(0.f, 1.f, 0.f) : vec3(1.f, 0.f, 0.f);
 }
 
+// Uniform on the unit sphere via Marsaglia (1972); pdf = 1/(4*pi).
+// Downstream uses: isotropic volume scatter, AO/bounce hemisphere base.
 VISRTX_DEVICE vec3 randomDir(RandState &rs)
 {
-#if 0
-  const float r1 = curand_uniform(&rs);
-  const float r2 = curand_uniform(&rs);
-  return normalize(vec3(cos(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)),
-      sin(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)),
-      r2 * r2));
-#else
-  const auto r = curand_uniform4(&rs);
-  return normalize((2.f * vec3(r.x, r.y, r.z)) - vec3(1.f));
-#endif
+  const float cosTheta = 1.f - 2.f * curand_uniform(&rs);
+  const float sinTheta = sqrtf(fmaxf(0.f, 1.f - cosTheta * cosTheta));
+  const float phi = 2.f * float(M_PI) * curand_uniform(&rs);
+  return vec3(sinTheta * cosf(phi), sinTheta * sinf(phi), cosTheta);
 }
 
 VISRTX_DEVICE vec3 randomDir(RandState &rs, const vec3 &normal)
@@ -228,14 +224,15 @@ VISRTX_DEVICE mat3 computeOrthonormalBasis(const vec3 &normal)
   return mat3(u, v, normal);
 }
 
+// Cosine-weighted hemisphere sample (Malley's method); pdf = cos(theta)/pi.
 VISRTX_DEVICE vec3 sampleHemisphere(RandState &rs, const vec3 &normal)
 {
-  auto z = curand_uniform(&rs);
-  auto r = sqrtf(1.f - sqrt(z));
-  auto phi = 2.0f * float(M_PI) * curand_uniform(&rs);
-
-  auto sample = vec3(r * cos(phi), r * sin(phi), z);
-
+  const float u1 = curand_uniform(&rs);
+  const float u2 = curand_uniform(&rs);
+  const float r = sqrtf(u1);
+  const float z = sqrtf(fmaxf(0.f, 1.f - r * r));
+  const float phi = 2.f * float(M_PI) * u2;
+  const vec3 sample(r * cosf(phi), r * sinf(phi), z);
   return computeOrthonormalBasis(normal) * sample;
 }
 
diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu
index 8e3b84ce1..dede6347d 100644
--- a/devices/rtx/device/renderer/Quality_ptx.cu
+++ b/devices/rtx/device/renderer/Quality_ptx.cu
@@ -153,11 +153,14 @@ VISRTX_DEVICE LightSample sampleLights(ScreenSample &ss,
   // last index is reserved for ambient light if it exists
   if (selectedIdx == world.numLightInstances) {
     const auto &rendererParams = frameData.renderer;
+    // Fold the hemisphere-sample pdf cos(theta)/pi with the uniform light pick.
+    const vec3 dir = sampleHemisphere(ss.rs, normal);
+    const float cosNs = fmaxf(0.f, dot(dir, normal));
     return LightSample{
         rendererParams.ambientColor * rendererParams.ambientIntensity,
-        sampleHemisphere(ss.rs, normal),
+        dir,
         std::numeric_limits<float>::max(),
-        lightPickPdf / (2.0f * float(M_PI)),
+        lightPickPdf * cosNs * float(M_1_PI),
     };
   } else {
     const auto &lightInstance = world.lightInstances[selectedIdx];
@@ -359,6 +362,12 @@ VISRTX_GLOBAL void __raygen__()
         }
 
         sample.color += sampleContribution * materialEmission * materialOpacity;
+        // Sample around the shading normal so the cosine-weighted hemisphere's
+        // pdf matches the BRDF's NdotL (which uses Ns). Sampling around Ng
+        // would bias the Lambertian estimator by cos_Ns/cos_Ng on smooth or
+        // bump-mapped surfaces.
+        const vec3 shadowOrigin =
+            shadingHitpoint(surfaceHit) + surfaceHit.Ng * surfaceHit.epsilon;
         LightSample lightSample =
             sampleLights(ss, frameData, shadowOrigin, surfaceHit.Ns);
         if (lightSample.pdf >= ATTENUATION_EPSILON && lightSample.dist > 0.0f) {

From c3e34bacf8d65a25c31f8966f5eb3741565b0fe1 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 29 Apr 2026 16:01:21 -0400
Subject: [PATCH 11/20] rtx: Add diffuse lobe to PBR nextRay path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For opaque non-metals reflectProb=1 always, so the indirect bounce was
purely glossy GGX and multi-bounce Lambertian light was missing entirely
— rough dielectrics rendered black on indirect bounces. Add a third
cosine-weighted diffuse lobe with importance proxy
(1-F)*(1-metallic)*(1-transmission)*luminance(baseColor); existing
reflect/transmit weights now divide by their (now <1) lobe-pick
probability. Sampled around Ns to match shadeSurface's diffuseBRDF axis.
---
 .../shaders/PhysicallyBasedShader_ptx.cu      | 51 ++++++++++++++-----
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index b3d834e01..f216565d4 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -531,36 +531,59 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
       ? vec3(0.0f)
       : glm::max(vec3(1.0f) - F, vec3(0.0f)) * transmissionFilter;
 
+  // Diffuse importance: the Lambertian throughput collapses to
+  //   (1-F) * baseColor * (1-metallic) * (1-transmission) * occlusion
+  // when sampled cosine-weighted (cos / pdf cancels with 1/pi). Mirror the
+  // factors used by shadeSurface's diffuseBRDF so the lobe split tracks the
+  // BRDF being estimated. TIR has no diffuse share (all energy is reflected).
+  const vec3 diffuseEnergy = totalInternalReflection
+      ? vec3(0.0f)
+      : glm::max(vec3(1.0f) - F, vec3(0.0f)) * state->baseColor
+          * (1.0f - state->metallic) * (1.0f - state->transmission)
+          * state->occlusion;
+
   const float reflectStrength =
       fmaxf(luminance(glm::max(reflectEnergy, vec3(0.0f))), 0.0f);
   const float transmitStrength =
       fmaxf(luminance(glm::max(transmitEnergy, vec3(0.0f))), 0.0f);
-  const float combinedStrength = reflectStrength + transmitStrength;
+  const float diffuseStrength =
+      fmaxf(luminance(glm::max(diffuseEnergy, vec3(0.0f))), 0.0f);
+  const float combinedStrength =
+      reflectStrength + transmitStrength + diffuseStrength;
   if (combinedStrength <= 0.0f)
     return NextRay{N, vec3(0.0f)};
 
   const float reflectProb = reflectStrength / combinedStrength;
-  const bool sampleTransmission = curand_uniform(rs) > reflectProb;
+  const float transmitProb = transmitStrength / combinedStrength;
+  const float diffuseProb = diffuseStrength / combinedStrength;
+
+  const float u = curand_uniform(rs);
+  if (u < reflectProb) {
+    if (Lrefl.z <= 0.0f)
+      return NextRay{N, vec3(0.0f)};
+    const float NdotL = Lrefl.z;
+    const float G1 = smithG1GGX(NdotV, alpha2);
+    const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
+    const vec3 weight =
+        reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f);
+    return NextRay{normalize(toWorld * Lrefl), weight};
+  }
 
-  if (sampleTransmission) {
+  if (u < reflectProb + transmitProb) {
     const float NdotL = -Ltrans.z; // L points through the surface.
     const float G1 = smithG1GGX(NdotV, alpha2);
     const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
     const vec3 weight = transmitEnergy * (G2 / fmaxf(G1, 1e-8f))
-        / fmaxf(1.0f - reflectProb, 1e-8f);
+        / fmaxf(transmitProb, 1e-8f);
     return NextRay{normalize(toWorld * Ltrans),
         weight,
         NEXT_RAY_CONTINUES_THROUGH_SURFACE};
   }
 
-  // Reflection.
-  if (Lrefl.z <= 0.0f)
-    return NextRay{N, vec3(0.0f)};
-
-  const float NdotL = Lrefl.z;
-  const float G1 = smithG1GGX(NdotV, alpha2);
-  const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
-  const vec3 weight =
-      reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f);
-  return NextRay{normalize(toWorld * Lrefl), weight};
+  // Diffuse: sample around the shading normal so pdf=cos/pi matches the BRDF's
+  // NdotL (same axis as shadeSurface's diffuse term). Cos and pdf cancel,
+  // leaving only the energy term and the lobe-pick divisor.
+  const vec3 wi = sampleHemisphere(*rs, N);
+  const vec3 weight = diffuseEnergy / fmaxf(diffuseProb, 1e-8f);
+  return NextRay{wi, weight};
 }

From 4f6ae3de7a24f877cbdc0705350e72a1dedba049 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Thu, 30 Apr 2026 07:10:07 -0400
Subject: [PATCH 12/20] rtx: Fuse prepareDenoiseInput and prepareDenoiseGuides

Two sequential kernels did identical resolveSample work for color and
for the albedo/normal guides. Merge into a single kernel; null-pointer
checks gate the guide writes when the guides are absent. Halves the
prepare-denoise dispatch overhead per frame.
---
 devices/rtx/device/frame/Frame.cu | 79 ++++++++++---------------------
 1 file changed, 24 insertions(+), 55 deletions(-)

diff --git a/devices/rtx/device/frame/Frame.cu b/devices/rtx/device/frame/Frame.cu
index e2aa825c6..b89db4074 100644
--- a/devices/rtx/device/frame/Frame.cu
+++ b/devices/rtx/device/frame/Frame.cu
@@ -73,8 +73,12 @@ __device__ bool resolveSample(uint32_t idx,
   return divisor > 0;
 }
 
-__global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor,
+__global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor,
+    const vec3 *__restrict__ accumAlbedo,
+    const vec3 *__restrict__ accumNormal,
     vec4 *__restrict__ denoiseInput,
+    vec3 *__restrict__ denoiseAlbedo,
+    vec3 *__restrict__ denoiseNormal,
     uvec2 size,
     int frameID,
     int checkerboardID,
@@ -88,45 +92,6 @@ __global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor,
   int divisor;
   if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) {
     denoiseInput[idx] = vec4(0.f);
-    return;
-  }
-
-  vec4 c = accumColor[srcIdx] / float(divisor);
-  if (fireflyFilter)
-    c = detail::inverseTonemap(c);
-  denoiseInput[idx] = c;
-}
-
-void launchPrepareDenoiseInput(const vec4 *accumColor,
-    vec4 *denoiseInput,
-    uvec2 size,
-    int frameID,
-    int checkerboardID,
-    bool fireflyFilter,
-    cudaStream_t stream)
-{
-  const uint32_t nPixels = size.x * size.y;
-  const uint32_t blockSize = 256;
-  const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize;
-  prepareDenoiseInput<<<gridSize, blockSize, 0, stream>>>(
-      accumColor, denoiseInput, size, frameID, checkerboardID, fireflyFilter);
-}
-
-__global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo,
-    const vec3 *__restrict__ accumNormal,
-    vec3 *__restrict__ denoiseAlbedo,
-    vec3 *__restrict__ denoiseNormal,
-    uvec2 size,
-    int frameID,
-    int checkerboardID)
-{
-  const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= size.x * size.y)
-    return;
-
-  uint32_t srcIdx;
-  int divisor;
-  if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) {
     if (denoiseAlbedo)
       denoiseAlbedo[idx] = vec3(0.f);
     if (denoiseNormal)
@@ -135,6 +100,11 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo,
   }
 
   const float invDivisor = 1.0f / float(divisor);
+  vec4 c = accumColor[srcIdx] * invDivisor;
+  if (fireflyFilter)
+    c = detail::inverseTonemap(c);
+  denoiseInput[idx] = c;
+
   if (denoiseAlbedo)
     denoiseAlbedo[idx] = accumAlbedo[srcIdx] * invDivisor;
 
@@ -146,25 +116,31 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo,
   }
 }
 
-void launchPrepareDenoiseGuides(const vec3 *accumAlbedo,
+void launchPrepareDenoiseInputs(const vec4 *accumColor,
+    const vec3 *accumAlbedo,
     const vec3 *accumNormal,
+    vec4 *denoiseInput,
     vec3 *denoiseAlbedo,
     vec3 *denoiseNormal,
     uvec2 size,
     int frameID,
     int checkerboardID,
+    bool fireflyFilter,
     cudaStream_t stream)
 {
   const uint32_t nPixels = size.x * size.y;
   const uint32_t blockSize = 256;
   const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize;
-  prepareDenoiseGuides<<<gridSize, blockSize, 0, stream>>>(accumAlbedo,
+  prepareDenoiseInputs<<<gridSize, blockSize, 0, stream>>>(accumColor,
+      accumAlbedo,
       accumNormal,
+      denoiseInput,
       denoiseAlbedo,
       denoiseNormal,
       size,
       frameID,
-      checkerboardID);
+      checkerboardID,
+      fireflyFilter);
 }
 
 __global__ void compositeBackground(vec4 *__restrict__ accumColor,
@@ -561,25 +537,18 @@ void Frame::renderFrame()
   const bool useFloatOutput = m_denoise || m_colorType == ANARI_FLOAT32_VEC4;
 
   if (m_denoise) {
-    launchPrepareDenoiseInput(m_accumColor.ptrAs<vec4>(),
+    launchPrepareDenoiseInputs(m_accumColor.ptrAs<vec4>(),
+        m_accumAlbedo.ptrAs<vec3>(),
+        m_accumNormal.ptrAs<vec3>(),
         m_denoiseInput.ptrAs<vec4>(),
+        m_denoiseAlbedo.ptrAs<vec3>(),
+        m_denoiseNormal.ptrAs<vec3>(),
         hd.fb.size,
         hd.fb.frameID,
         hd.fb.checkerboardID,
         hd.renderer.fireflyFilter,
         state.stream);
 
-    if (m_denoiseUsingAlbedo || m_denoiseUsingNormal) {
-      launchPrepareDenoiseGuides(m_accumAlbedo.ptrAs<vec3>(),
-          m_accumNormal.ptrAs<vec3>(),
-          m_denoiseAlbedo.ptrAs<vec3>(),
-          m_denoiseNormal.ptrAs<vec3>(),
-          hd.fb.size,
-          hd.fb.frameID,
-          hd.fb.checkerboardID,
-          state.stream);
-    }
-
     m_denoiser.launch();
 
     launchCompositeBackground(m_accumColor.ptrAs<vec4>(),

From 17efc098267f43e696574d1cdf4bb15a74afdfab Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Thu, 30 Apr 2026 06:56:39 -0400
Subject: [PATCH 13/20] rtx: Drop redundant dir/t term from epsilonFrom
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original `compMax(vec4(abs(P), compMax(abs(dir))*t))` was redundant:
for normalized rays `hitpoint = origin + dir*t`, so `|hitpoint|_inf`
already bounds the magnitude that drives ULP-scale self-intersection
offsets at the surface. Drop the dir/t inputs and use `|P|_inf` alone.

The sole caller (`populateSurfaceHit` in populateHit.h) only runs from
__closesthit__/__anyhit__ programs, where `optixGetRayTmax()` is the
finite hit distance — there was no functional bug, this is a cleanup.
---
 devices/rtx/device/gpu/gpu_util.h    | 4 ++--
 devices/rtx/device/gpu/populateHit.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h
index 88b3af0fb..42dba9e8f 100644
--- a/devices/rtx/device/gpu/gpu_util.h
+++ b/devices/rtx/device/gpu/gpu_util.h
@@ -249,9 +249,9 @@ VISRTX_DEVICE vec3 sampleUnitSphere(RandState &rs, const vec3 &normal)
 
 #define ulpEpsilon 0x1.fp-21
 
-VISRTX_DEVICE float epsilonFrom(const vec3 &P, const vec3 &dir, float t)
+VISRTX_DEVICE float epsilonFrom(const vec3 &P)
 {
-  return glm::compMax(vec4(abs(P), glm::compMax(abs(dir)) * t)) * ulpEpsilon;
+  return glm::compMax(abs(P)) * ulpEpsilon;
 }
 
 VISRTX_DEVICE bool pixelOutOfFrame(
diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h
index c43a0b17f..19088f325 100644
--- a/devices/rtx/device/gpu/populateHit.h
+++ b/devices/rtx/device/gpu/populateHit.h
@@ -399,7 +399,7 @@ VISRTX_DEVICE void populateSurfaceHit(SurfaceHit &hit)
   hit.primID = ray::primID();
   hit.objID = sd.id;
   hit.instID = isd.id;
-  hit.epsilon = epsilonFrom(ray::hitpoint(), ray::direction(), ray::t());
+  hit.epsilon = epsilonFrom(ray::hitpoint());
   ray::computeTangentSpace(gd, ray::primID(), hit);
 
   const auto &handle = optixGetTransformListHandle(0);

From 8728a4b983cc0814314bcd122b5f47e8843d4938 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Wed, 6 May 2026 19:35:11 +0000
Subject: [PATCH 14/20] rtx: Lift shadow-ray origins onto smooth surface
 (Hanika fix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Smooth-normal triangles produce dark, triangle-shaped bands at grazing
light: the planar hit point lies below the smooth surface implied by
per-vertex normals, so direct-light shadow rays start inside that
implied curvature and self-occlude on the tessellation.

Add a shadingHitpoint() helper (Hanika RTGII ch. 4: signed distance from
each vertex tangent plane, projected back along the vertex normal,
barycentric-blended) and call it at shadow/AO ray origins only.
Continuation rays keep the unmodified facet hitpoint — transmission
needs it: the smoothed point can sit far enough above the facet that
the -Ng*epsilon offset still leaves the origin outside the volume,
blocking the refracted path from reaching the back wall.
---
 devices/rtx/device/gpu/computeAO.h            |  2 +-
 devices/rtx/device/gpu/gpu_math.h             | 10 +++
 devices/rtx/device/gpu/gpu_util.h             | 81 +++++++++++++++++++
 devices/rtx/device/gpu/populateHit.h          | 22 ++---
 .../rtx/device/renderer/Interactive_ptx.cu    |  5 +-
 devices/rtx/device/renderer/Quality_ptx.cu    |  2 +-
 6 files changed, 108 insertions(+), 14 deletions(-)

diff --git a/devices/rtx/device/gpu/computeAO.h b/devices/rtx/device/gpu/computeAO.h
index 77861164b..3d55c3c57 100644
--- a/devices/rtx/device/gpu/computeAO.h
+++ b/devices/rtx/device/gpu/computeAO.h
@@ -45,7 +45,7 @@ VISRTX_DEVICE float computeAO(ScreenSample &ss,
   float weights = 0.0f;
   float hits = 0.0f;
   Ray aoRay;
-  aoRay.org = currentHit.hitpoint + currentHit.Ns * currentHit.epsilon;
+  aoRay.org = shadingHitpoint(currentHit) + currentHit.Ng * currentHit.epsilon;
   aoRay.t.lower = currentHit.epsilon;
   aoRay.t.upper = dist;
 
diff --git a/devices/rtx/device/gpu/gpu_math.h b/devices/rtx/device/gpu/gpu_math.h
index 0b9692792..83dddd432 100644
--- a/devices/rtx/device/gpu/gpu_math.h
+++ b/devices/rtx/device/gpu/gpu_math.h
@@ -326,4 +326,14 @@ VISRTX_HOST_DEVICE vec3 xfmPoint(const mat4 &m, const vec3 &p)
   return m * vec4(p, 1.0f);
 }
 
+// mat3x4 stores OptiX-style affine rows: glm column i = OptiX row i =
+// (m_{i,0}, m_{i,1}, m_{i,2}, m_{i,3}). The transform is
+//   out_i = sum_j m_{i,j} * p_j + m_{i,3}.
+VISRTX_HOST_DEVICE vec3 xfmPoint(const mat3x4 &m, const vec3 &p)
+{
+  return vec3(glm::dot(vec3(m[0]), p) + m[0].w,
+      glm::dot(vec3(m[1]), p) + m[1].w,
+      glm::dot(vec3(m[2]), p) + m[2].w);
+}
+
 } // namespace visrtx
diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h
index 42dba9e8f..e1b8523f5 100644
--- a/devices/rtx/device/gpu/gpu_util.h
+++ b/devices/rtx/device/gpu/gpu_util.h
@@ -254,6 +254,87 @@ VISRTX_DEVICE float epsilonFrom(const vec3 &P)
   return glm::compMax(abs(P)) * ulpEpsilon;
 }
 
+// Hanika's shadow-terminator fix (Ray Tracing Gems II, ch. 4): lifts a
+// triangle hit point onto the smooth surface implied by per-vertex normals.
+// Without this, grazing-angle shadow rays self-occlude on the planar facet
+// and produce dark bands shaped like the underlying tessellation. All inputs
+// must share a coordinate space.
+VISRTX_DEVICE vec3 shadowTerminatorOffset(const vec3 &P,
+    const vec3 &v0,
+    const vec3 &v1,
+    const vec3 &v2,
+    const vec3 &n0,
+    const vec3 &n1,
+    const vec3 &n2,
+    const vec3 &bary)
+{
+  const float du = glm::dot(P - v0, n0);
+  const float dv = glm::dot(P - v1, n1);
+  const float dw = glm::dot(P - v2, n2);
+  const vec3 lu = du < 0.f ? -du * n0 : vec3(0.f);
+  const vec3 lv = dv < 0.f ? -dv * n1 : vec3(0.f);
+  const vec3 lw = dw < 0.f ? -dw * n2 : vec3(0.f);
+  return P + bary.x * lu + bary.y * lv + bary.z * lw;
+}
+
+// World-space hit position lifted onto the smooth surface implied by
+// per-vertex normals (Hanika shadow-terminator fix). Use this as the origin
+// for direct-light/AO shadow rays so grazing-angle queries do not self-shadow
+// the planar facet. Do NOT use it for path-continuation rays — transmission
+// especially needs the original facet point, since the smoothed point can sit
+// far enough above the facet that an "into-the-surface" offset still ends up
+// outside the volume.
+VISRTX_DEVICE vec3 shadingHitpoint(const SurfaceHit &hit)
+{
+  if (hit.geometry == nullptr || hit.geometry->type != GeometryType::TRIANGLE)
+    return hit.hitpoint;
+
+  const auto &tri = hit.geometry->tri;
+  if (tri.vertexNormalsFV == nullptr && tri.vertexNormals == nullptr)
+    return hit.hitpoint;
+
+  const uvec3 idx = tri.indices ? tri.indices[hit.primID]
+                                : uvec3(0, 1, 2) + hit.primID * 3;
+  const vec3 v0 = tri.vertices[idx.x];
+  const vec3 v1 = tri.vertices[idx.y];
+  const vec3 v2 = tri.vertices[idx.z];
+
+  vec3 n0, n1, n2;
+  if (tri.vertexNormalsFV != nullptr) {
+    const uvec3 nidx = uvec3(0, 1, 2) + hit.primID * 3;
+    n0 = tri.vertexNormalsFV[nidx.x];
+    n1 = tri.vertexNormalsFV[nidx.y];
+    n2 = tri.vertexNormalsFV[nidx.z];
+  } else {
+    n0 = tri.vertexNormals[idx.x];
+    n1 = tri.vertexNormals[idx.y];
+    n2 = tri.vertexNormals[idx.z];
+  }
+
+  // Hanika's tangent-plane projection assumes unit normals; user data is
+  // not guaranteed to be normalized.
+  n0 = normalize(n0);
+  n1 = normalize(n1);
+  n2 = normalize(n2);
+
+  // populateHit.h flips hit.Ng/Ns for back-face hits so they point toward
+  // the ray origin. The per-vertex normals here are still in the original
+  // outward orientation; flip them too so the smooth surface bulges onto
+  // the ray-origin side of the facet (otherwise Hanika lifts P away from
+  // the ray origin and the trailing `+ Ng * epsilon` can land below the
+  // facet).
+  if (!hit.isFrontFace) {
+    n0 = -n0;
+    n1 = -n1;
+    n2 = -n2;
+  }
+
+  const vec3 Plocal = xfmPoint(hit.worldToObject, hit.hitpoint);
+  const vec3 Psmooth =
+      shadowTerminatorOffset(Plocal, v0, v1, v2, n0, n1, n2, hit.uvw);
+  return xfmPoint(hit.objectToWorld, Psmooth);
+}
+
 VISRTX_DEVICE bool pixelOutOfFrame(
     const uvec2 &pixel, const FramebufferGPUData &fb)
 {
diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h
index 19088f325..c5c649f5f 100644
--- a/devices/rtx/device/gpu/populateHit.h
+++ b/devices/rtx/device/gpu/populateHit.h
@@ -225,22 +225,24 @@ VISRTX_DEVICE void computeTangentSpace(
     if (!optixIsFrontFaceHit())
       hit.Ng = -hit.Ng;
 
-    const bool hasVertexNormals = ggd.tri.vertexNormalsFV != nullptr
-        || ggd.tri.vertexNormals != nullptr;
     vec3 n0, n1, n2;
+    bool hasVertexNormals = true;
     if (ggd.tri.vertexNormalsFV != nullptr) {
-      const uvec3 nIdx = uvec3(0, 1, 2) + (hit.primID * 3);
-
-      n0 = ggd.tri.vertexNormalsFV[nIdx.x];
-      n1 = ggd.tri.vertexNormalsFV[nIdx.y];
-      n2 = ggd.tri.vertexNormalsFV[nIdx.z];
-      hit.Ns = b.x * n0 + b.y * n1 + b.z * n2;
+      const uvec3 nidx = uvec3(0, 1, 2) + (hit.primID * 3);
+      n0 = ggd.tri.vertexNormalsFV[nidx.x];
+      n1 = ggd.tri.vertexNormalsFV[nidx.y];
+      n2 = ggd.tri.vertexNormalsFV[nidx.z];
     } else if (ggd.tri.vertexNormals != nullptr) {
       n0 = ggd.tri.vertexNormals[idx.x];
       n1 = ggd.tri.vertexNormals[idx.y];
       n2 = ggd.tri.vertexNormals[idx.z];
+    } else {
+      hasVertexNormals = false;
+    }
+
+    if (hasVertexNormals)
       hit.Ns = b.x * n0 + b.y * n1 + b.z * n2;
-    } else
+    else
       hit.Ns = hit.Ng;
 
     hit.Ns = normalize(hit.Ns);
@@ -399,7 +401,7 @@ VISRTX_DEVICE void populateSurfaceHit(SurfaceHit &hit)
   hit.primID = ray::primID();
   hit.objID = sd.id;
   hit.instID = isd.id;
-  hit.epsilon = epsilonFrom(ray::hitpoint());
+  hit.epsilon = epsilonFrom(hit.hitpoint);
   ray::computeTangentSpace(gd, ray::primID(), hit);
 
   const auto &handle = optixGetTransformListHandle(0);
diff --git a/devices/rtx/device/renderer/Interactive_ptx.cu b/devices/rtx/device/renderer/Interactive_ptx.cu
index c75d0a0ad..1a05ed5e6 100644
--- a/devices/rtx/device/renderer/Interactive_ptx.cu
+++ b/devices/rtx/device/renderer/Interactive_ptx.cu
@@ -87,17 +87,18 @@ struct InteractiveShadingPolicy
         * materialEvaluateTint(shadingState);
 
     // Handle all lights contributions
+    const vec3 shadowOrigin = shadingHitpoint(hit) + hit.Ng * hit.epsilon;
     for (size_t i = 0; i < world.numLightInstances; i++) {
       const auto &light = world.lightInstances[i];
       const auto lightSample =
-          sampleLight(ss, hit.hitpoint, light.lightIndex, light.xfm);
+          sampleLight(ss, shadowOrigin, light.lightIndex, light.xfm);
 
       if (lightSample.pdf == 0.0f)
         continue;
 
       // Shadowing
       const Ray shadowRay = {
-          hit.hitpoint + hit.Ng * hit.epsilon,
+          shadowOrigin,
           lightSample.dir,
           {hit.epsilon, lightSample.dist},
       };
diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu
index dede6347d..652b0e90c 100644
--- a/devices/rtx/device/renderer/Quality_ptx.cu
+++ b/devices/rtx/device/renderer/Quality_ptx.cu
@@ -377,7 +377,7 @@ VISRTX_GLOBAL void __raygen__()
           const float lightDotNs = dot(lightSample.dir, surfaceHit.Ns);
           if (lightDotNs > 0.0f) {
             const Ray shadowRay = {
-                surfaceHit.hitpoint + surfaceHit.Ng * surfaceHit.epsilon,
+                shadowOrigin,
                 lightSample.dir,
                 {surfaceHit.epsilon, lightSample.dist},
             };

From 18d2346a7647da8a7caae71a73366c481afad5a7 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Tue, 12 May 2026 13:17:29 -0400
Subject: [PATCH 15/20] rtx: Don't mix transmission into Interactive's surface
 alpha

This breaks cutouts. To be revisited.
---
 devices/rtx/device/gpu/renderer/raygen_helpers.h | 9 ---------
 devices/rtx/device/renderer/Interactive_ptx.cu   | 4 ++--
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/devices/rtx/device/gpu/renderer/raygen_helpers.h b/devices/rtx/device/gpu/renderer/raygen_helpers.h
index 5f65d6405..81bfc7bd8 100644
--- a/devices/rtx/device/gpu/renderer/raygen_helpers.h
+++ b/devices/rtx/device/gpu/renderer/raygen_helpers.h
@@ -61,15 +61,6 @@ VISRTX_DEVICE float volumeAttenuation(ScreenSample &ss, const Ray &r)
   return attenuation;
 }
 
-// Evaluate opacity including transmission
-VISRTX_DEVICE float evaluateOpacity(const MaterialShadingState &shadingState)
-{
-  return materialEvaluateOpacity(shadingState)
-      * (1.0f
-          - glm::luminosity(materialEvaluateTransmission(shadingState))
-              * 0.85f);
-}
-
 // Templated rendering loop
 // ShadingPolicy must implement:
 //   static VISRTX_DEVICE vec4 shadeSurface(
diff --git a/devices/rtx/device/renderer/Interactive_ptx.cu b/devices/rtx/device/renderer/Interactive_ptx.cu
index 1a05ed5e6..2320e3a23 100644
--- a/devices/rtx/device/renderer/Interactive_ptx.cu
+++ b/devices/rtx/device/renderer/Interactive_ptx.cu
@@ -155,7 +155,7 @@ struct InteractiveShadingPolicy
       }
     }
 
-    return vec4(contrib, evaluateOpacity(shadingState));
+    return vec4(contrib, materialEvaluateOpacity(shadingState));
   }
 };
 
@@ -182,7 +182,7 @@ VISRTX_GLOBAL void __anyhit__shadow()
 
     MaterialShadingState shadingState;
     materialInitShading(&shadingState, frameData, *hit.material, hit);
-    auto opacity = evaluateOpacity(shadingState);
+    auto opacity = materialEvaluateOpacity(shadingState);
 
     auto &o = ray::rayData<float>();
 

From b2f3733a1a570b06ea5bf7c9e792e0e84211c5c4 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Tue, 12 May 2026 16:10:56 -0400
Subject: [PATCH 16/20] rtx: Use NdotV Fresnel for PBR diffuse weight

Both shadeSurface and nextRay evaluated (1-F) at the GGX half-vector,
but the diffuse direction is independent of H so the weight didn't
match between NEE and the bounce. Evaluate the diffuse Fresnel at NdotV
(Frostbite/Disney) so the two estimators agree at any roughness.

Also factor the Fresnel+iridescence block into evalFresnelWithIridescence
to remove the duplicated block.
---
 .../shaders/PhysicallyBasedShader_ptx.cu      | 46 +++++++++++--------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index f216565d4..506a384cb 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -394,6 +394,21 @@ VISRTX_DEVICE vec3 schlickFresnel(vec3 F0, vec3 F90, float VdotH)
   return F0 + (F90 - F0) * pow5(1.0f - fabsf(VdotH));
 }
 
+VISRTX_DEVICE vec3 evalFresnelWithIridescence(
+    const PhysicallyBasedShadingState *state,
+    const vec3 &F0,
+    const vec3 &F90,
+    float cosTheta)
+{
+  vec3 F = schlickFresnel(F0, F90, cosTheta);
+  if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) {
+    const vec3 iridescent = evalIridescence(
+        1.0f, state->iridescenceIor, cosTheta, state->iridescenceThickness, F0);
+    F = glm::mix(F, iridescent, state->iridescence);
+  }
+  return F;
+}
+
 VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
     const PhysicallyBasedShadingState *state,
     const SurfaceHit *hit,
@@ -415,15 +430,13 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
   const float NdotV = fmaxf(dot(N, V), 1e-6f);
   const float VdotH = fmaxf(dot(V, H), 0.0f);
 
-  // Base F0 / F90, optionally overridden by iridescence.
-  vec3 F0 = computeF0(state);
-  vec3 F90 = computeF90(state);
-  vec3 F = schlickFresnel(F0, F90, VdotH);
-  if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) {
-    const vec3 iridescent = evalIridescence(
-        1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0);
-    F = glm::mix(F, iridescent, state->iridescence);
-  }
+  // Base F0 / F90. Specular uses Fresnel at the microfacet (VdotH); the
+  // diffuse weight uses Fresnel at NdotV (Frostbite/Disney convention) so
+  // shadeSurface and nextRay's diffuse split agree regardless of light dir.
+  const vec3 F0 = computeF0(state);
+  const vec3 F90 = computeF90(state);
+  const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH);
+  const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV);
 
   // Base GGX specular lobe.
   const float alpha = fmaxf(pow2(state->roughness), 1e-4f);
@@ -436,7 +449,7 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
   // and transmission; metals have no diffuse).
   const vec3 diffuseColor =
       glm::mix(state->baseColor, vec3(0.0f), state->metallic);
-  const vec3 diffuseBRDF = (vec3(1.0f) - F) * float(M_1_PI) * diffuseColor
+  const vec3 diffuseBRDF = (vec3(1.0f) - Fdiff) * float(M_1_PI) * diffuseColor
       * state->occlusion * (1.0f - state->transmission);
 
   vec3 base = diffuseBRDF + specularBRDF;
@@ -508,15 +521,12 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
   const float NdotV = Vlocal.z;
   const float VdotH = fmaxf(dot(Vlocal, Hlocal), 0.0f);
 
-  // Fresnel at the sampled microfacet, with optional iridescence.
+  // Fresnel at the sampled microfacet (specular/transmission split) and at
+  // NdotV (diffuse weight) — matches the convention in shadeSurface.
   const vec3 F0 = computeF0(state);
   const vec3 F90 = computeF90(state);
-  vec3 F = schlickFresnel(F0, F90, VdotH);
-  if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) {
-    const vec3 iridescent = evalIridescence(
-        1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0);
-    F = glm::mix(F, iridescent, state->iridescence);
-  }
+  const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH);
+  const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV);
 
   const vec3 Lrefl = glm::reflect(-Vlocal, Hlocal);
   const float eta = state->ior; // init() pre-inverted for front-facing hits
@@ -538,7 +548,7 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
   // BRDF being estimated. TIR has no diffuse share (all energy is reflected).
   const vec3 diffuseEnergy = totalInternalReflection
       ? vec3(0.0f)
-      : glm::max(vec3(1.0f) - F, vec3(0.0f)) * state->baseColor
+      : glm::max(vec3(1.0f) - Fdiff, vec3(0.0f)) * state->baseColor
           * (1.0f - state->metallic) * (1.0f - state->transmission)
           * state->occlusion;
 

From a2df2399dfbd5bd179e2fe01cd19c317a3370775 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Tue, 12 May 2026 16:18:24 -0400
Subject: [PATCH 17/20] rtx: Fold normal-degeneracy fallback into
 sampleNormalMap

The init() function had two post-hoc dot-product guards to swap a
NaN/zero shading normal for the geometric one. Doing it inside
sampleNormalMap means every caller gets a usable normal back without
the cleanup pass downstream.
---
 .../material/shaders/PhysicallyBasedShader_ptx.cu | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index 506a384cb..39cd8d5d8 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -65,7 +65,11 @@ VISRTX_DEVICE vec3 sampleNormalMap(const FrameGPUData &fd,
   if (samplerIdx == ~visrtx::DeviceObjectIndex{0})
     return fallback;
   const vec3 ts = normalize(evaluateSampler(fd, samplerIdx, hit) * 2.0f - 1.0f);
-  return applyNormalMap(ts, hit, hit.Ns);
+  const vec3 N = applyNormalMap(ts, hit, hit.Ns);
+  // Negated comparison catches NaN (zero-decoded texels, zero-summed tangents)
+  // as well as zero-length results — fall back to the geometric normal so the
+  // shading frame is always usable.
+  return (dot(N, N) > 1e-12f) ? N : hit.Ng;
 }
 
 VISRTX_DEVICE float luminance(const vec3 &c)
@@ -319,15 +323,6 @@ VISRTX_CALLABLE void __direct_callable__init(
   shadingState->iridescenceIor = md->iridescenceIor;
   shadingState->iridescenceThickness =
       getMaterialParameter(*fd, md->iridescenceThickness, *hit).x;
-
-  // Fall back to the geometric normal if sampleNormalMap produced NaN
-  // (texel decodes to zero, or zero-summed tangents) or zero-length. The
-  // negated comparison catches both since NaN compares false to anything.
-  if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f))
-    shadingState->normal = hit->Ng;
-  if (!(glm::dot(shadingState->clearcoatNormal, shadingState->clearcoatNormal)
-          > 1e-12f))
-    shadingState->clearcoatNormal = hit->Ng;
 }
 
 //-----------------------------------------------------------------------------

From 65d35740a93aac01f5ba1492a0851e275c2e399c Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Tue, 12 May 2026 16:36:28 -0400
Subject: [PATCH 18/20] rtx: Rename PBR shading state's ior field to eta

The field stores the refraction ratio (n1/n2 from the incident side),
pre-inverted on front-facing hits so it can feed glm::refract directly.
Calling it 'ior' invited reading it as the material's IOR. Move the
'pre-inverted' note onto the struct field so the contract lives with
the data.
---
 devices/rtx/device/gpu/shadingState.h                      | 5 ++++-
 .../device/material/shaders/PhysicallyBasedShader_ptx.cu   | 7 +++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/devices/rtx/device/gpu/shadingState.h b/devices/rtx/device/gpu/shadingState.h
index 1831d5436..e85ec911d 100644
--- a/devices/rtx/device/gpu/shadingState.h
+++ b/devices/rtx/device/gpu/shadingState.h
@@ -80,7 +80,10 @@ struct PhysicallyBasedShadingState
   float metallic;
   float roughness;
   float transmission;
-  float ior;
+  // Refraction ratio n1/n2 from the incident side: init() stores 1/md->ior
+  // for front-facing hits and md->ior for back-facing hits, so this can be
+  // plugged directly into glm::refract and Schlick's F0 formula.
+  float eta;
   vec3 emission;
 
   float occlusion;
diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index 39cd8d5d8..e21bd5da8 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -286,7 +286,7 @@ VISRTX_CALLABLE void __direct_callable__init(
 
   shadingState->opacity =
       adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff);
-  shadingState->ior = hit->isFrontFace ? 1.0f / md->ior : md->ior;
+  shadingState->eta = hit->isFrontFace ? 1.0f / md->ior : md->ior;
   shadingState->metallic = getMaterialParameter(*fd, md->metallic, *hit).x;
   shadingState->roughness = getMaterialParameter(*fd, md->roughness, *hit).x;
   shadingState->emission = vec3(getMaterialParameter(*fd, md->emissive, *hit));
@@ -365,7 +365,7 @@ VISRTX_CALLABLE vec3 __direct_callable__evaluateNormal(
 
 VISRTX_DEVICE vec3 computeDielectricF0(const PhysicallyBasedShadingState *state)
 {
-  const float iorF0 = pow2((1.0f - state->ior) / (1.0f + state->ior));
+  const float iorF0 = pow2((1.0f - state->eta) / (1.0f + state->eta));
   if (state->useSpecular == 0)
     return vec3(iorF0);
   return glm::min(vec3(iorF0) * state->specularColor, vec3(1.0f))
@@ -524,8 +524,7 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
   const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV);
 
   const vec3 Lrefl = glm::reflect(-Vlocal, Hlocal);
-  const float eta = state->ior; // init() pre-inverted for front-facing hits
-  const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, eta);
+  const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, state->eta);
   const vec3 transmissionFilter = computeTransmissionFilter(state);
   const bool hasTransmission = luminance(transmissionFilter) > 0.0f;
   const bool totalInternalReflection =

From e18c2a4cae4519e4a78b69991757d9ad1606eb57 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Tue, 12 May 2026 16:36:28 -0400
Subject: [PATCH 19/20] rtx: Use the `color` light parameter when sampling an
 HDRI

---
 devices/rtx/device/gpu/sampleLight.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/devices/rtx/device/gpu/sampleLight.h b/devices/rtx/device/gpu/sampleLight.h
index 14bbe64cf..2fe5f5013 100644
--- a/devices/rtx/device/gpu/sampleLight.h
+++ b/devices/rtx/device/gpu/sampleLight.h
@@ -339,7 +339,7 @@ VISRTX_DEVICE LightSample sampleHDRILight(
   ls.dir = xfmVec(xfm, dir);
   ls.dist =
       std::numeric_limits<float>::infinity(); // Environment is at infinity
-  ls.radiance = radiance * ld.hdri.scale;
+  ls.radiance = radiance * ld.hdri.scale * ld.color;
   ls.pdf = pdf;
 
   return ls;
@@ -384,7 +384,7 @@ VISRTX_DEVICE LightSample sampleHDRILight(
   // instead of explicitly transposing/inverting the matrix
   ls.dir = xfmVec(xfm, sphericalCoordsToDirection(thetaPhi) * ld.hdri.xfm);
   ls.dist = 1e20f; // Environment is effectively at infinity
-  ls.radiance = radiance * ld.hdri.scale;
+  ls.radiance = radiance * ld.hdri.scale * ld.color;
   ls.pdf = pdf;
 
   return ls;

From f436ca802d52f157c1432e04580e4eee631ea8b2 Mon Sep 17 00:00:00 2001
From: Thomas Arcila <134677+tarcila@users.noreply.github.com>
Date: Tue, 12 May 2026 17:00:00 -0400
Subject: [PATCH 20/20] rtx: Sample the PBR clearcoat lobe in nextRay

Clearcoat was only evaluated in NEE, so smooth clearcoats over matte
bases never picked up HDRI/sky reflections on the bounce path.
---
 .../shaders/PhysicallyBasedShader_ptx.cu      | 70 ++++++++++++++++---
 1 file changed, 59 insertions(+), 11 deletions(-)

diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index e21bd5da8..593457ed3 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -489,8 +489,8 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
 
 //-----------------------------------------------------------------------------
 // Next-ray importance sampling: stochastic alpha, Fresnel-aware lobe pick,
-// GGX VNDF reflection/refraction. Clearcoat/sheen are NEE-only (no separate
-// lobe sampling), which matches what the base renderer is set up to consume.
+// GGX VNDF reflection/refraction, plus a clearcoat lobe sampled with
+// probability equal to its view-angle Fresnel weight. Sheen is NEE-only.
 //-----------------------------------------------------------------------------
 
 VISRTX_CALLABLE NextRay __direct_callable__nextRay(
@@ -500,8 +500,55 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
   if (curand_uniform(rs) > state->opacity)
     return NextRay{ray->dir, vec3(1.0f), NEXT_RAY_CONTINUES_THROUGH_SURFACE};
 
-  const vec3 N = state->normal;
   const vec3 V = -ray->dir;
+
+  // Clearcoat lobe: pick it with probability `clearcoat·FcV(NcDotV)`. This
+  // exact weight makes the entry-side attenuation `1 - clearcoat·FcV` cancel
+  // the `1/(1-pick)` lobe-pick divisor in the base path below, so the base
+  // returns only need the exit-side `1 - clearcoat·FcL` multiplier.
+  const vec3 Nc = state->clearcoatNormal;
+  const float NcDotV_world = fmaxf(dot(Nc, V), 0.0f);
+  const float FcV_world =
+      CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotV_world);
+  const float clearcoatPick =
+      glm::clamp(state->clearcoat * FcV_world, 0.0f, 1.0f);
+
+  if (clearcoatPick > 0.0f && curand_uniform(rs) < clearcoatPick) {
+    const mat3 toWorldC = computeOrthonormalBasis(Nc);
+    const vec3 VlocalC = glm::transpose(toWorldC) * V;
+    if (VlocalC.z <= 0.0f)
+      return NextRay{Nc, vec3(0.0f)};
+    const float alphaC = fmaxf(pow2(state->clearcoatRoughness), 1e-4f);
+    const float alphaC2 = alphaC * alphaC;
+    const vec3 HlocalC = sampleGGXVNDF(
+        VlocalC, alphaC, curand_uniform(rs), curand_uniform(rs));
+    const vec3 LlocalC = glm::reflect(-VlocalC, HlocalC);
+    if (LlocalC.z <= 0.0f)
+      return NextRay{Nc, vec3(0.0f)};
+    const float VdotHc = fmaxf(dot(VlocalC, HlocalC), 0.0f);
+    const float Fc =
+        CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - VdotHc);
+    const float G1c = smithG1GGX(VlocalC.z, alphaC2);
+    const float G2c = smithG2GGX(VlocalC.z, LlocalC.z, alphaC2);
+    // VNDF gives BRDF·cos/pdf = clearcoat·Fc·G2/G1; the clearcoat factor
+    // cancels against the matching factor in clearcoatPick.
+    const vec3 weight = vec3(state->clearcoat * Fc * G2c / fmaxf(G1c, 1e-8f))
+        / fmaxf(clearcoatPick, 1e-8f);
+    return NextRay{normalize(toWorldC * LlocalC), weight};
+  }
+
+  // Exit-side clearcoat attenuation, applied to every base-path return.
+  // `fabsf` handles the transmission case where L points through the surface.
+  auto clearcoatExitAttn = [&](const vec3 &Lworld) -> float {
+    if (state->clearcoat <= 0.0f)
+      return 1.0f;
+    const float NcDotL = fabsf(dot(Nc, Lworld));
+    const float FcL =
+        CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotL);
+    return glm::clamp(1.0f - state->clearcoat * FcL, 0.0f, 1.0f);
+  };
+
+  const vec3 N = state->normal;
   const mat3 toWorld = computeOrthonormalBasis(N);
   const mat3 toLocal = glm::transpose(toWorld);
   const vec3 Vlocal = toLocal * V;
@@ -568,26 +615,27 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
     const float NdotL = Lrefl.z;
     const float G1 = smithG1GGX(NdotV, alpha2);
     const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
-    const vec3 weight =
-        reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f);
-    return NextRay{normalize(toWorld * Lrefl), weight};
+    const vec3 Lworld = normalize(toWorld * Lrefl);
+    const vec3 weight = reflectEnergy * (G2 / fmaxf(G1, 1e-8f))
+        * clearcoatExitAttn(Lworld) / fmaxf(reflectProb, 1e-8f);
+    return NextRay{Lworld, weight};
   }
 
   if (u < reflectProb + transmitProb) {
     const float NdotL = -Ltrans.z; // L points through the surface.
     const float G1 = smithG1GGX(NdotV, alpha2);
     const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
+    const vec3 Lworld = normalize(toWorld * Ltrans);
     const vec3 weight = transmitEnergy * (G2 / fmaxf(G1, 1e-8f))
-        / fmaxf(transmitProb, 1e-8f);
-    return NextRay{normalize(toWorld * Ltrans),
-        weight,
-        NEXT_RAY_CONTINUES_THROUGH_SURFACE};
+        * clearcoatExitAttn(Lworld) / fmaxf(transmitProb, 1e-8f);
+    return NextRay{Lworld, weight, NEXT_RAY_CONTINUES_THROUGH_SURFACE};
   }
 
   // Diffuse: sample around the shading normal so pdf=cos/pi matches the BRDF's
   // NdotL (same axis as shadeSurface's diffuse term). Cos and pdf cancel,
   // leaving only the energy term and the lobe-pick divisor.
   const vec3 wi = sampleHemisphere(*rs, N);
-  const vec3 weight = diffuseEnergy / fmaxf(diffuseProb, 1e-8f);
+  const vec3 weight =
+      diffuseEnergy * clearcoatExitAttn(wi) / fmaxf(diffuseProb, 1e-8f);
   return NextRay{wi, weight};
 }