diff --git a/devices/rtx/device/frame/Frame.cu b/devices/rtx/device/frame/Frame.cu
index e2aa825c6..b89db4074 100644
--- a/devices/rtx/device/frame/Frame.cu
+++ b/devices/rtx/device/frame/Frame.cu
@@ -73,8 +73,12 @@ __device__ bool resolveSample(uint32_t idx,
   return divisor > 0;
 }
 
-__global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor,
+__global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor,
+    const vec3 *__restrict__ accumAlbedo,
+    const vec3 *__restrict__ accumNormal,
     vec4 *__restrict__ denoiseInput,
+    vec3 *__restrict__ denoiseAlbedo,
+    vec3 *__restrict__ denoiseNormal,
     uvec2 size,
     int frameID,
     int checkerboardID,
@@ -88,45 +92,6 @@ __global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor,
   int divisor;
   if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) {
     denoiseInput[idx] = vec4(0.f);
-    return;
-  }
-
-  vec4 c = accumColor[srcIdx] / float(divisor);
-  if (fireflyFilter)
-    c = detail::inverseTonemap(c);
-  denoiseInput[idx] = c;
-}
-
-void launchPrepareDenoiseInput(const vec4 *accumColor,
-    vec4 *denoiseInput,
-    uvec2 size,
-    int frameID,
-    int checkerboardID,
-    bool fireflyFilter,
-    cudaStream_t stream)
-{
-  const uint32_t nPixels = size.x * size.y;
-  const uint32_t blockSize = 256;
-  const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize;
-  prepareDenoiseInput<<<gridSize, blockSize, 0, stream>>>(
-      accumColor, denoiseInput, size, frameID, checkerboardID, fireflyFilter);
-}
-
-__global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo,
-    const vec3 *__restrict__ accumNormal,
-    vec3 *__restrict__ denoiseAlbedo,
-    vec3 *__restrict__ denoiseNormal,
-    uvec2 size,
-    int frameID,
-    int checkerboardID)
-{
-  const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx >= size.x * size.y)
-    return;
-
-  uint32_t srcIdx;
-  int divisor;
-  if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) {
     if (denoiseAlbedo)
       denoiseAlbedo[idx] = vec3(0.f);
     if (denoiseNormal)
@@ -135,6 +100,11 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo,
   }
 
   const float invDivisor = 1.0f / float(divisor);
+  vec4 c = accumColor[srcIdx] * invDivisor;
+  if (fireflyFilter)
+    c = detail::inverseTonemap(c);
+  denoiseInput[idx] = c;
+
   if (denoiseAlbedo)
     denoiseAlbedo[idx] = accumAlbedo[srcIdx] * invDivisor;
 
@@ -146,25 +116,31 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo,
   }
 }
 
-void launchPrepareDenoiseGuides(const vec3 *accumAlbedo,
+void launchPrepareDenoiseInputs(const vec4 *accumColor,
+    const vec3 *accumAlbedo,
     const vec3 *accumNormal,
+    vec4 *denoiseInput,
     vec3 *denoiseAlbedo,
     vec3 *denoiseNormal,
     uvec2 size,
     int frameID,
     int checkerboardID,
+    bool fireflyFilter,
     cudaStream_t stream)
 {
   const uint32_t nPixels = size.x * size.y;
   const uint32_t blockSize = 256;
   const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize;
-  prepareDenoiseGuides<<<gridSize, blockSize, 0, stream>>>(accumAlbedo,
+  prepareDenoiseInputs<<<gridSize, blockSize, 0, stream>>>(accumColor,
+      accumAlbedo,
       accumNormal,
+      denoiseInput,
       denoiseAlbedo,
       denoiseNormal,
       size,
       frameID,
-      checkerboardID);
+      checkerboardID,
+      fireflyFilter);
 }
 
 __global__ void compositeBackground(vec4 *__restrict__ accumColor,
@@ -561,25 +537,18 @@ void Frame::renderFrame()
   const bool useFloatOutput = m_denoise || m_colorType == ANARI_FLOAT32_VEC4;
 
   if (m_denoise) {
-    launchPrepareDenoiseInput(m_accumColor.ptrAs<vec4>(),
+    launchPrepareDenoiseInputs(m_accumColor.ptrAs<vec4>(),
+        m_accumAlbedo.ptrAs<vec3>(),
+        m_accumNormal.ptrAs<vec3>(),
         m_denoiseInput.ptrAs<vec4>(),
+        m_denoiseAlbedo.ptrAs<vec3>(),
+        m_denoiseNormal.ptrAs<vec3>(),
         hd.fb.size,
         hd.fb.frameID,
         hd.fb.checkerboardID,
         hd.renderer.fireflyFilter,
         state.stream);
 
-    if (m_denoiseUsingAlbedo || m_denoiseUsingNormal) {
-      launchPrepareDenoiseGuides(m_accumAlbedo.ptrAs<vec3>(),
-          m_accumNormal.ptrAs<vec3>(),
-          m_denoiseAlbedo.ptrAs<vec3>(),
-          m_denoiseNormal.ptrAs<vec3>(),
-          hd.fb.size,
-          hd.fb.frameID,
-          hd.fb.checkerboardID,
-          state.stream);
-    }
-
     m_denoiser.launch();
 
     launchCompositeBackground(m_accumColor.ptrAs<vec4>(),
diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu
index bffeb4bb0..f6c1f1f56 100644
--- a/devices/rtx/device/geometry/ComputeTangent.cu
+++ b/devices/rtx/device/geometry/ComputeTangent.cu
@@ -52,6 +52,76 @@ namespace {
 
 constexpr const auto eps = 1e-8f;
 
+__device__ glm::vec3 safeNormalize(
+    const glm::vec3 &v, const glm::vec3 &fallback)
+{
+  const float l2 = glm::dot(v, v);
+  return l2 > eps ? v * rsqrtf(l2) : fallback;
+}
+
+__device__ void makeTangentFrame(
+    const glm::vec3 &normal, glm::vec3 *tangent, glm::vec3 *bitangent)
+{
+  // https://graphics.pixar.com/library/OrthonormalB/paper.pdf
+  const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f));
+  const float sign = n.z >= 0.0f ? 1.0f : -1.0f;
+  const float a = -1.0f / (sign + n.z);
+  const float b = n.x * n.y * a;
+  *tangent = glm::vec3(1.0f + sign * n.x * n.x * a, sign * b, -sign * n.x);
+  *bitangent = glm::vec3(b, sign + n.y * n.y * a, -n.y);
+}
+
+__device__ glm::vec3 computeGeometricNormal(
+    const glm::vec3 &e1, const glm::vec3 &e2)
+{
+  return safeNormalize(glm::cross(e1, e2), glm::vec3(0.f, 0.f, 1.f));
+}
+
+// Each face-vertex's contribution to its vertex's accumulated tangent frame
+// is weighted by the triangle's interior angle at that corner — same scheme
+// MikkTSpace uses to average across incident faces. Angle weighting (over
+// uniform or area) keeps thin sliver triangles from dominating shared
+// vertices.
+__device__ float cornerAngle(
+    const glm::vec3 &a, const glm::vec3 &b, const glm::vec3 &c)
+{
+  const glm::vec3 ab = b - a;
+  const glm::vec3 ac = c - a;
+  const float lab = sqrtf(glm::dot(ab, ab));
+  const float lac = sqrtf(glm::dot(ac, ac));
+  if (lab < eps || lac < eps)
+    return 0.0f;
+  const float cosT =
+      glm::clamp(glm::dot(ab, ac) / (lab * lac), -1.0f, 1.0f);
+  return acosf(cosT);
+}
+
+__device__ void atomicAddVec3(glm::vec3 &dst, const glm::vec3 &v)
+{
+  atomicAdd(&dst.x, v.x);
+  atomicAdd(&dst.y, v.y);
+  atomicAdd(&dst.z, v.z);
+}
+
+void cudaFreeMemoryDeleter(const void *, const void *memory)
+{
+  cudaFree(const_cast<void *>(memory));
+}
+
+bool reportCudaError(
+    visrtx::Triangle *triangle, cudaError_t error, const char *operation)
+{
+  if (error == cudaSuccess)
+    return false;
+
+  triangle->reportMessage(ANARI_SEVERITY_ERROR,
+      "CUDA error while computing tangents for Triangle %p during %s: %s",
+      triangle,
+      operation,
+      cudaGetErrorString(error));
+  return true;
+}
+
 } // namespace
 
 namespace visrtx {
@@ -64,57 +134,53 @@ __device__ void __computeTangentAndBitangent(
     glm::vec3 p2,
     glm::vec2 uv0, // Input texture coordinates
     glm::vec2 uv1,
-    glm::vec2 uv2
-)
+    glm::vec2 uv2)
 {
   // Compute edges of the triangle
   glm::vec3 e1 = p1 - p0;
   glm::vec3 e2 = p2 - p0;
+  const auto normal = computeGeometricNormal(e1, e2);
 
-  if (dot(e1, e1) < eps || dot(e2, e2) < eps) {
-    // Degenerate triangle, use a default tangent and bitangent
-    *tangent = glm::vec3(1.0f, 0.0f, 0.0f);
-    *bitangent = glm::vec3(0.0f, 1.0f, 0.0f);
-  } else {
-    auto normal = normalize(cross(e1, e2));
-
-    // Compute differences in texture coordinates
-    auto s = uv1 - uv0;
-    auto t = uv2 - uv0;
-
-    auto cross = s.x * t.y - s.y * t.x;
-
-    if (abs(cross) < eps) { // degenerate triangle (null vectors or collinears)
-      // Create a default orthonormal basis:
-      // https://graphics.pixar.com/library/OrthonormalB/paper.pdf
-      float sign = normal.z >= 0.0f ? 1.0f : -1.0f;
-      float a = -1.0f / (sign + normal.z);
-      float b = normal.x * normal.y * a;
-      *tangent = glm::vec3(
-          1.0f + sign * normal.x * normal.x * a, sign * b, -sign * normal.x);
-      *bitangent = glm::vec3(b, sign + normal.y * normal.y * a, -normal.y);
-    } else {
-      // Compute the determinant
-      float invdet = 1.0f / cross;
-      *tangent = (t.y * e1 - s.y * e2) * invdet;
-      *bitangent = (t.x * e1 - s.x * e2) * invdet;
-    }
+  if (glm::dot(e1, e1) < eps || glm::dot(e2, e2) < eps) {
+    makeTangentFrame(normal, tangent, bitangent);
+    return;
+  }
+
+  // Compute differences in texture coordinates
+  auto s = uv1 - uv0;
+  auto t = uv2 - uv0;
+
+  auto det = s.x * t.y - s.y * t.x;
+
+  if (glm::abs(det) < eps) {
+    makeTangentFrame(normal, tangent, bitangent);
+    return;
   }
+
+  float invdet = 1.0f / det;
+  *tangent = (t.y * e1 - s.y * e2) * invdet;
+  *bitangent = (s.x * e2 - t.x * e1) * invdet;
 }
 
+// Pass 1 (one thread per triangle): compute the per-triangle T/B from the UV
+// gradient, then atomicAdd those vectors into per-vertex accumulators —
+// weighted by the triangle's interior angle at each corner. Per-vertex
+// normals are accumulated the same way so Pass 2 has a coordinate frame to
+// orthogonalize against, regardless of whether input normals are vertex,
+// face-varying, or absent.
 template <bool VerticesIndexed,
     bool NormalsIndexed,
     bool UVsIndexed,
     typename TexCoord>
-__global__ void __doComputeTangents(
-    glm::vec4 *tangents, // Output tangent vectors with handedness (w component)
-    glm::vec3 *bitangents, // Output bitangent vectors
-    const glm::uvec3 *indices, // Input triangle indices
-    const glm::vec3 *positions, // Input vertex positions
-    const glm::vec3 *normals, // Input vertex normals
-    const TexCoord *uvs, // Input texture coordinates
-    unsigned int numTriangles // Number of triangles
-)
+__global__ void __doAccumulateTangents(
+    glm::vec3 *tangentAccum,
+    glm::vec3 *bitangentAccum,
+    glm::vec3 *normalAccum,
+    const glm::uvec3 *indices,
+    const glm::vec3 *positions,
+    const glm::vec3 *normals,
+    const TexCoord *uvs,
+    unsigned int numTriangles)
 {
   unsigned int tri = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -139,56 +205,58 @@ __global__ void __doComputeTangents(
 
   vec2 uv0, uv1, uv2;
   if constexpr (UVsIndexed) {
-    // Use indexed UVs
     uv0 = uvs[indexedIdx.x];
     uv1 = uvs[indexedIdx.y];
     uv2 = uvs[indexedIdx.z];
   } else {
-    // Use per-face UVs
     uv0 = uvs[perFaceBaseIdx.x];
     uv1 = uvs[perFaceBaseIdx.y];
     uv2 = uvs[perFaceBaseIdx.z];
   }
 
   vec3 tangent, bitangent;
-
   __computeTangentAndBitangent(
-      &tangent, // Output tangent vectors with handedness (w component)
-      &bitangent, // Output bitangent vectors
-      p0,
-      p1,
-      p2, // Input vertex positions
-      uv0,
-      uv1,
-      uv2 // Input texture coordinates
-  );
-
-  vec3 n0, n1, n2;
-  if constexpr (NormalsIndexed) {
-    // Use indexed normals
-    n0 = normals[indexedIdx.x];
-    n1 = normals[indexedIdx.y];
-    n2 = normals[indexedIdx.z];
-  } else {
-    // Use per-face normals
-    n0 = normals[perFaceBaseIdx.x];
-    n1 = normals[perFaceBaseIdx.y];
-    n2 = normals[perFaceBaseIdx.z];
+      &tangent, &bitangent, p0, p1, p2, uv0, uv1, uv2);
+
+  const vec3 geometricNormal = computeGeometricNormal(p1 - p0, p2 - p0);
+  vec3 n0 = geometricNormal;
+  vec3 n1 = geometricNormal;
+  vec3 n2 = geometricNormal;
+  if (normals) {
+    if constexpr (NormalsIndexed) {
+      n0 = normals[indexedIdx.x];
+      n1 = normals[indexedIdx.y];
+      n2 = normals[indexedIdx.z];
+    } else {
+      n0 = normals[perFaceBaseIdx.x];
+      n1 = normals[perFaceBaseIdx.y];
+      n2 = normals[perFaceBaseIdx.z];
+    }
+    n0 = safeNormalize(n0, geometricNormal);
+    n1 = safeNormalize(n1, geometricNormal);
+    n2 = safeNormalize(n2, geometricNormal);
   }
 
-  // Gram-Schmidt orthogonalize and compute handedness
-  vec3 t0 = normalize(tangent - n0 * dot(n0, tangent));
-  float h0 = copysign(1.0f, dot(cross(n0, t0), bitangent));
+  // For indexed meshes, accumulate at the shared vertex slot so adjacent
+  // triangles average their contributions. For triangle-soup each face-vertex
+  // already has a unique slot.
+  const glm::uvec3 outIdx = VerticesIndexed ? indexedIdx : perFaceBaseIdx;
 
-  vec3 t1 = normalize(tangent - n1 * dot(n1, tangent));
-  float h1 = copysign(1.0f, dot(cross(n1, t1), bitangent));
+  const float w0 = cornerAngle(p0, p1, p2);
+  const float w1 = cornerAngle(p1, p0, p2);
+  const float w2 = cornerAngle(p2, p0, p1);
 
-  vec3 t2 = normalize(tangent - n2 * dot(n2, tangent));
-  float h2 = copysign(1.0f, dot(cross(n2, t2), bitangent));
+  atomicAddVec3(tangentAccum[outIdx.x], tangent * w0);
+  atomicAddVec3(tangentAccum[outIdx.y], tangent * w1);
+  atomicAddVec3(tangentAccum[outIdx.z], tangent * w2);
 
-  tangents[perFaceBaseIdx.x] = glm::vec4(t0, h0);
-  tangents[perFaceBaseIdx.y] = glm::vec4(t1, h1);
-  tangents[perFaceBaseIdx.z] = glm::vec4(t2, h2);
+  atomicAddVec3(bitangentAccum[outIdx.x], bitangent * w0);
+  atomicAddVec3(bitangentAccum[outIdx.y], bitangent * w1);
+  atomicAddVec3(bitangentAccum[outIdx.z], bitangent * w2);
+
+  atomicAddVec3(normalAccum[outIdx.x], n0 * w0);
+  atomicAddVec3(normalAccum[outIdx.y], n1 * w1);
+  atomicAddVec3(normalAccum[outIdx.z], n2 * w2);
 }
 
 template <bool VerticesIndexed,
@@ -196,18 +264,60 @@ template <bool VerticesIndexed,
     bool UVsIndexed,
     typename TexCoord>
 void __computeTangents(
-    glm::vec4 *tangents, // Output tangent vectors with handedness (w component)
-    glm::vec3 *bitangents, // Output bitangent vectors
-    const glm::uvec3 *indices, // Input triangle indices
-    const glm::vec3 *positions, // Input vertex positions
-    const glm::vec3 *normals, // Input vertex normals
-    const TexCoord *uvs, // Input texture coordinates
-    unsigned int numTriangles // Number of triangles
-)
+    glm::vec3 *tangentAccum,
+    glm::vec3 *bitangentAccum,
+    glm::vec3 *normalAccum,
+    const glm::uvec3 *indices,
+    const glm::vec3 *positions,
+    const glm::vec3 *normals,
+    const TexCoord *uvs,
+    unsigned int numTriangles)
 {
-  __doComputeTangents<VerticesIndexed, NormalsIndexed, UVsIndexed, TexCoord>
-      <<<(numTriangles + 63) / 64, 64>>>(
-          tangents, bitangents, indices, positions, normals, uvs, numTriangles);
+  __doAccumulateTangents<VerticesIndexed,
+      NormalsIndexed,
+      UVsIndexed,
+      TexCoord><<<(numTriangles + 63) / 64, 64>>>(tangentAccum,
+      bitangentAccum,
+      normalAccum,
+      indices,
+      positions,
+      normals,
+      uvs,
+      numTriangles);
+}
+
+// Pass 2 (one thread per vertex): normalize the accumulated frame and write
+// vec4(T_orthog, sign). The accumulated normal is used as the orthogonalization
+// basis; for vertex-indexed input it averages back to each vertex's authored
+// normal, and for face-varying or missing normals it gives the angle-weighted
+// average across incident faces.
+__global__ void __doFinalizeTangents(glm::vec4 *tangents,
+    const glm::vec3 *tangentAccum,
+    const glm::vec3 *bitangentAccum,
+    const glm::vec3 *normalAccum,
+    unsigned int numVertices)
+{
+  unsigned int v = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (v >= numVertices)
+    return;
+
+  const vec3 T_in = tangentAccum[v];
+  const vec3 B_in = bitangentAccum[v];
+  const vec3 N_in = normalAccum[v];
+
+  const vec3 n = safeNormalize(N_in, vec3(0.0f, 0.0f, 1.0f));
+
+  vec3 fallbackT, fallbackB;
+  makeTangentFrame(n, &fallbackT, &fallbackB);
+
+  const vec3 T_orth =
+      safeNormalize(T_in - n * glm::dot(n, T_in), fallbackT);
+
+  const float bitangentSign = glm::dot(glm::cross(n, T_orth), B_in);
+  const float sign = bitangentSign < 0.0f ? -1.0f : 1.0f;
+
+  tangents[v] = glm::vec4(T_orth, sign);
 }
 
 void updateGeometryTangent(Triangle *triangle)
@@ -219,9 +329,9 @@ void updateGeometryTangent(Triangle *triangle)
   auto normalsFV = triangle->getParamObject<Array1D>("faceVarying.normal");
   auto uvsFV = triangle->getParamObject<Array1D>("faceVarying.attribute0");
 
-  if (!positions || (!normals && !normalsFV)) {
+  if (!positions) {
     triangle->reportMessage(ANARI_SEVERITY_INFO,
-        "Triangle %p has no position or normals, cannot compute tangents",
+        "Triangle %p has no positions, cannot compute tangents",
         triangle);
     return;
   }
@@ -249,20 +359,73 @@ void updateGeometryTangent(Triangle *triangle)
     return;
   }
 
-  // Always go with faceVarying tangents. Rational is the following:
-  // - Correct UV and normal sharing is achieve through indexing
-  // - If faceVarying UVs/normals are used then, it should already imply correct
-  // sharing
-  //   on common vertices.
-
-  auto tangentsCount = indices ? (indices->size() * 3) : positions->size();
-  auto trianglesCount = indices ? indices->size() : positions->size() / 3;
-  glm::vec4 *tangents = {};
-  cudaMalloc(&tangents, sizeof(glm::vec4) * tangentsCount);
-  cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount);
-  glm::vec3 *bitangents = {};
-  cudaMalloc(&bitangents, sizeof(glm::vec3) * tangentsCount);
-  cudaMemset(bitangents, 0, sizeof(glm::vec3) * tangentsCount);
+  // Output is per-vertex (vertex.tangent). For indexed meshes the per-vertex
+  // buffer is what lets adjacent triangles share tangent data at common
+  // vertices — that sharing is what eliminates the per-triangle facets a
+  // face-varying buffer would produce. For triangle-soup input each face-vertex
+  // is its own slot, so the same layout works without changes.
+  const auto numVertices = static_cast<unsigned int>(positions->size());
+  const auto trianglesCount = static_cast<unsigned int>(
+      indices ? indices->size() : positions->size() / 3);
+  if (trianglesCount == 0 || numVertices == 0) {
+    triangle->reportMessage(ANARI_SEVERITY_INFO,
+        "Triangle %p has no triangles, cannot compute tangents",
+        triangle);
+    return;
+  }
+
+  glm::vec3 *tangentAccum = nullptr;
+  glm::vec3 *bitangentAccum = nullptr;
+  glm::vec3 *normalAccum = nullptr;
+  glm::vec4 *tangents = nullptr;
+
+  auto cleanup = [&] {
+    cudaFree(tangentAccum);
+    cudaFree(bitangentAccum);
+    cudaFree(normalAccum);
+    cudaFree(tangents);
+  };
+
+  auto status = cudaMalloc(reinterpret_cast<void **>(&tangentAccum),
+      sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "allocating tangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMalloc(reinterpret_cast<void **>(&bitangentAccum),
+      sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "allocating bitangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMalloc(reinterpret_cast<void **>(&normalAccum),
+      sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "allocating normal accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMalloc(
+      reinterpret_cast<void **>(&tangents), sizeof(glm::vec4) * numVertices);
+  if (reportCudaError(triangle, status, "allocating tangent output buffer")) {
+    cleanup();
+    return;
+  }
+
+  status = cudaMemset(tangentAccum, 0, sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "clearing tangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMemset(bitangentAccum, 0, sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "clearing bitangent accumulator")) {
+    cleanup();
+    return;
+  }
+  status = cudaMemset(normalAccum, 0, sizeof(glm::vec3) * numVertices);
+  if (reportCudaError(triangle, status, "clearing normal accumulator")) {
+    cleanup();
+    return;
+  }
 
   auto positionsPtr = positions->dataAs<const glm::vec3>(AddressSpace::GPU);
   if (indices) {
@@ -273,8 +436,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) {
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
           // Vertex indexed, face varying normals and face varyings vec2 UVs.
-          __computeTangents<true, false, false>(tangents,
-              bitangents,
+          __computeTangents<true, false, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -283,8 +447,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           auto uvsPtr = uvsFV->dataAs<const glm::vec3>(AddressSpace::GPU);
           // Vertex indexed, face varying normals and face varyings vec3 UVs.
-          __computeTangents<true, false, false>(tangents,
-              bitangents,
+          __computeTangents<true, false, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -295,8 +460,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
           // Vertex indexed,  face varying normals and indexed vec2 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
-          __computeTangents<true, false, true>(tangents,
-              bitangents,
+          __computeTangents<true, false, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -305,8 +471,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           // Vertex indexed,  face varying normals and indexed vec3 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
-          __computeTangents<true, false, true>(tangents,
-              bitangents,
+          __computeTangents<true, false, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -315,13 +482,16 @@ void updateGeometryTangent(Triangle *triangle)
         }
       }
     } else {
-      auto normalsPtr = normals->dataAs<const glm::vec3>(AddressSpace::GPU);
+      const auto *normalsPtr = normals
+          ? normals->dataAs<const glm::vec3>(AddressSpace::GPU)
+          : nullptr;
       if (uvsFV) {
         if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) {
           auto uvsPtr = uvsFV->dataAs<const glm::vec2>(AddressSpace::GPU);
           // Vertex indexed, index normals and face varyings vec2 UVs.
-          __computeTangents<true, true, false>(tangents,
-              bitangents,
+          __computeTangents<true, true, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -330,8 +500,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           auto uvsPtr = uvsFV->dataAs<const glm::vec3>(AddressSpace::GPU);
           // Vertex indexed, indexed normals and face varyings vec3 UVs.
-          __computeTangents<true, true, false>(tangents,
-              bitangents,
+          __computeTangents<true, true, false>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -342,8 +513,9 @@ void updateGeometryTangent(Triangle *triangle)
         if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
           // Vertex indexed, indexed normals and indexed vec2 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
-          __computeTangents<true, true, true>(tangents,
-              bitangents,
+          __computeTangents<true, true, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -352,8 +524,9 @@ void updateGeometryTangent(Triangle *triangle)
         } else {
           // Vertex indexed, indexed normals and indexed vec3 UVs.
           auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
-          __computeTangents<true, true, true>(tangents,
-              bitangents,
+          __computeTangents<true, true, true>(tangentAccum,
+              bitangentAccum,
+              normalAccum,
               indicesPtr,
               positionsPtr,
               normalsPtr,
@@ -363,17 +536,20 @@ void updateGeometryTangent(Triangle *triangle)
       }
     }
   } else {
-    auto indicesPtr = nullptr;
-    normals = normalsFV ? normalsFV : normals;
-    uvs = uvsFV ? uvsFV : uvs;
+    const glm::uvec3 *indicesPtr = nullptr;
+    auto effectiveNormals = normalsFV ? normalsFV : normals;
+    auto effectiveUvs = uvsFV ? uvsFV : uvs;
 
-    auto normalsPtr = normals->dataAs<const glm::vec3>(AddressSpace::GPU);
+    const auto *normalsPtr = effectiveNormals
+        ? effectiveNormals->dataAs<const glm::vec3>(AddressSpace::GPU)
+        : nullptr;
 
-    if (uvs->elementType() == ANARI_FLOAT32_VEC2) {
+    if (effectiveUvs->elementType() == ANARI_FLOAT32_VEC2) {
       // Non indexed vertices, face varying normals and face varyings vec2 UVs.
-      auto uvsPtr = uvs->dataAs<const glm::vec2>(AddressSpace::GPU);
-      __computeTangents<false, false, false>(tangents,
-          bitangents,
+      auto uvsPtr = effectiveUvs->dataAs<const glm::vec2>(AddressSpace::GPU);
+      __computeTangents<false, false, false>(tangentAccum,
+          bitangentAccum,
+          normalAccum,
           indicesPtr,
           positionsPtr,
           normalsPtr,
@@ -381,9 +557,10 @@ void updateGeometryTangent(Triangle *triangle)
           trianglesCount);
     } else {
       // Non indexed vertices, face varying normals and face varyings vec3 UVs.
-      auto uvsPtr = uvs->dataAs<const glm::vec3>(AddressSpace::GPU);
-      __computeTangents<false, false, false>(tangents,
-          bitangents,
+      auto uvsPtr = effectiveUvs->dataAs<const glm::vec3>(AddressSpace::GPU);
+      __computeTangents<false, false, false>(tangentAccum,
+          bitangentAccum,
+          normalAccum,
           indicesPtr,
           positionsPtr,
           normalsPtr,
@@ -392,26 +569,47 @@ void updateGeometryTangent(Triangle *triangle)
     }
   }
 
-  // Release transient bitangent store
-  cudaFree(bitangents);
+  status = cudaGetLastError();
+  if (reportCudaError(triangle, status, "launching accumulate kernel")) {
+    cleanup();
+    return;
+  }
+
+  __doFinalizeTangents<<<(numVertices + 63) / 64, 64>>>(
+      tangents, tangentAccum, bitangentAccum, normalAccum, numVertices);
+
+  status = cudaGetLastError();
+  if (reportCudaError(triangle, status, "launching finalize kernel")) {
+    cleanup();
+    return;
+  }
+
+  status = cudaDeviceSynchronize();
+  cudaFree(tangentAccum);
+  cudaFree(bitangentAccum);
+  cudaFree(normalAccum);
+  tangentAccum = nullptr;
+  bitangentAccum = nullptr;
+  normalAccum = nullptr;
+  if (reportCudaError(triangle, status, "computing tangents")) {
+    cleanup();
+    return;
+  }
 
   auto desc = Array1DMemoryDescriptor{
       {
           tangents,
-          {}, // deleter
+          cudaFreeMemoryDeleter, // deleter
           {}, // deleterPtr
           ANARI_FLOAT32_VEC4,
       },
-      tangentsCount,
+      numVertices,
   };
   auto tangentsArray = new Array1D(triangle->deviceState(), desc);
   tangentsArray->commitParameters();
   tangentsArray->finalize();
 
-  if (indices)
-    triangle->setParam("faceVarying.tangent", tangentsArray);
-  else
-    triangle->setParam("vertex.tangent", tangentsArray);
+  triangle->setParam("vertex.tangent", tangentsArray);
   triangle->commitParameters();
   triangle->finalize();
 
diff --git a/devices/rtx/device/geometry/Geometry.cpp b/devices/rtx/device/geometry/Geometry.cpp
index 127eb865e..e39552c72 100644
--- a/devices/rtx/device/geometry/Geometry.cpp
+++ b/devices/rtx/device/geometry/Geometry.cpp
@@ -129,7 +129,7 @@ void Geometry::commitParameters()
 void Geometry::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp();
 }
 
 GeometryGPUData Geometry::gpuData() const
diff --git a/devices/rtx/device/geometry/Triangle.cpp b/devices/rtx/device/geometry/Triangle.cpp
index 4369243f3..52aa41216 100644
--- a/devices/rtx/device/geometry/Triangle.cpp
+++ b/devices/rtx/device/geometry/Triangle.cpp
@@ -78,12 +78,14 @@ void Triangle::finalize()
         m_vertex->size());
   }
 
-  if (m_vertexNormalFV && 3 * m_index->size() != m_vertexNormalFV->size()) {
+  const size_t numTriangles = m_index ? m_index->size() : m_vertex->size() / 3;
+
+  if (m_vertexNormalFV && 3 * numTriangles != m_vertexNormalFV->size()) {
     reportMessage(ANARI_SEVERITY_WARNING,
         "'faceVarying.normal' on triangle geometry is not matching "
         "the number of triangles in 'primitive.index' (%zu) vs. (%zu)",
         m_vertexNormalFV->size(),
-        m_index->size());
+        numTriangles);
   }
 
   if (m_vertexTangent && m_vertex->size() != m_vertexTangent->size()) {
@@ -94,12 +96,12 @@ void Triangle::finalize()
         m_vertex->size());
   }
 
-  if (m_vertexTangentFV && 3 * m_index->size() != m_vertexTangentFV->size()) {
+  if (m_vertexTangentFV && 3 * numTriangles != m_vertexTangentFV->size()) {
     reportMessage(ANARI_SEVERITY_WARNING,
         "'faceVarying.Tangent' on triangle geometry is not matching "
         "the number of triangles in 'primitive.index' (%zu) vs. (%zu)",
         m_vertexTangentFV->size(),
-        m_index->size());
+        numTriangles);
   }
 
   if (!m_vertexTangent && !m_vertexTangentFV) {
diff --git a/devices/rtx/device/gpu/computeAO.h b/devices/rtx/device/gpu/computeAO.h
index 77861164b..3d55c3c57 100644
--- a/devices/rtx/device/gpu/computeAO.h
+++ b/devices/rtx/device/gpu/computeAO.h
@@ -45,7 +45,7 @@ VISRTX_DEVICE float computeAO(ScreenSample &ss,
   float weights = 0.0f;
   float hits = 0.0f;
   Ray aoRay;
-  aoRay.org = currentHit.hitpoint + currentHit.Ns * currentHit.epsilon;
+  aoRay.org = shadingHitpoint(currentHit) + currentHit.Ng * currentHit.epsilon;
   aoRay.t.lower = currentHit.epsilon;
   aoRay.t.upper = dist;
 
diff --git a/devices/rtx/device/gpu/gpu_math.h b/devices/rtx/device/gpu/gpu_math.h
index 0b9692792..83dddd432 100644
--- a/devices/rtx/device/gpu/gpu_math.h
+++ b/devices/rtx/device/gpu/gpu_math.h
@@ -326,4 +326,14 @@ VISRTX_HOST_DEVICE vec3 xfmPoint(const mat4 &m, const vec3 &p)
   return m * vec4(p, 1.0f);
 }
 
+// mat3x4 stores OptiX-style affine rows: glm column i = OptiX row i =
+// (m_{i,0}, m_{i,1}, m_{i,2}, m_{i,3}). The transform is
+//   out_i = sum_j m_{i,j} * p_j + m_{i,3}.
+VISRTX_HOST_DEVICE vec3 xfmPoint(const mat3x4 &m, const vec3 &p)
+{
+  return vec3(glm::dot(vec3(m[0]), p) + m[0].w,
+      glm::dot(vec3(m[1]), p) + m[1].w,
+      glm::dot(vec3(m[2]), p) + m[2].w);
+}
+
 } // namespace visrtx
diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h
index 9e17c44ba..e1b8523f5 100644
--- a/devices/rtx/device/gpu/gpu_util.h
+++ b/devices/rtx/device/gpu/gpu_util.h
@@ -195,18 +195,14 @@ VISRTX_DEVICE vec3 boolColor(bool pred)
   return pred ? vec3(0.f, 1.f, 0.f) : vec3(1.f, 0.f, 0.f);
 }
 
+// Uniform on the unit sphere via Marsaglia (1972); pdf = 1/(4*pi).
+// Downstream uses: isotropic volume scatter, AO/bounce hemisphere base.
 VISRTX_DEVICE vec3 randomDir(RandState &rs)
 {
-#if 0
-  const float r1 = curand_uniform(&rs);
-  const float r2 = curand_uniform(&rs);
-  return normalize(vec3(cos(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)),
-      sin(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)),
-      r2 * r2));
-#else
-  const auto r = curand_uniform4(&rs);
-  return normalize((2.f * vec3(r.x, r.y, r.z)) - vec3(1.f));
-#endif
+  const float cosTheta = 1.f - 2.f * curand_uniform(&rs);
+  const float sinTheta = sqrtf(fmaxf(0.f, 1.f - cosTheta * cosTheta));
+  const float phi = 2.f * float(M_PI) * curand_uniform(&rs);
+  return vec3(sinTheta * cosf(phi), sinTheta * sinf(phi), cosTheta);
 }
 
 VISRTX_DEVICE vec3 randomDir(RandState &rs, const vec3 &normal)
@@ -228,14 +224,15 @@ VISRTX_DEVICE mat3 computeOrthonormalBasis(const vec3 &normal)
   return mat3(u, v, normal);
 }
 
+// Cosine-weighted hemisphere sample (Malley's method); pdf = cos(theta)/pi.
 VISRTX_DEVICE vec3 sampleHemisphere(RandState &rs, const vec3 &normal)
 {
-  auto z = curand_uniform(&rs);
-  auto r = sqrtf(1.f - sqrt(z));
-  auto phi = 2.0f * float(M_PI) * curand_uniform(&rs);
-
-  auto sample = vec3(r * cos(phi), r * sin(phi), z);
-
+  const float u1 = curand_uniform(&rs);
+  const float u2 = curand_uniform(&rs);
+  const float r = sqrtf(u1);
+  const float z = sqrtf(fmaxf(0.f, 1.f - r * r));
+  const float phi = 2.f * float(M_PI) * u2;
+  const vec3 sample(r * cosf(phi), r * sinf(phi), z);
   return computeOrthonormalBasis(normal) * sample;
 }
 
@@ -252,9 +249,90 @@ VISRTX_DEVICE vec3 sampleUnitSphere(RandState &rs, const vec3 &normal)
 
 #define ulpEpsilon 0x1.fp-21
 
-VISRTX_DEVICE float epsilonFrom(const vec3 &P, const vec3 &dir, float t)
+VISRTX_DEVICE float epsilonFrom(const vec3 &P)
 {
-  return glm::compMax(vec4(abs(P), glm::compMax(abs(dir)) * t)) * ulpEpsilon;
+  return glm::compMax(abs(P)) * ulpEpsilon;
+}
+
+// Hanika's shadow-terminator fix (Ray Tracing Gems II, ch. 4): lifts a
+// triangle hit point onto the smooth surface implied by per-vertex normals.
+// Without this, grazing-angle shadow rays self-occlude on the planar facet
+// and produce dark bands shaped like the underlying tessellation. All inputs
+// must share a coordinate space.
+VISRTX_DEVICE vec3 shadowTerminatorOffset(const vec3 &P,
+    const vec3 &v0,
+    const vec3 &v1,
+    const vec3 &v2,
+    const vec3 &n0,
+    const vec3 &n1,
+    const vec3 &n2,
+    const vec3 &bary)
+{
+  const float du = glm::dot(P - v0, n0);
+  const float dv = glm::dot(P - v1, n1);
+  const float dw = glm::dot(P - v2, n2);
+  const vec3 lu = du < 0.f ? -du * n0 : vec3(0.f);
+  const vec3 lv = dv < 0.f ? -dv * n1 : vec3(0.f);
+  const vec3 lw = dw < 0.f ? -dw * n2 : vec3(0.f);
+  return P + bary.x * lu + bary.y * lv + bary.z * lw;
+}
+
+// World-space hit position lifted onto the smooth surface implied by
+// per-vertex normals (Hanika shadow-terminator fix). Use this as the origin
+// for direct-light/AO shadow rays so grazing-angle queries do not self-shadow
+// the planar facet. Do NOT use it for path-continuation rays — transmission
+// especially needs the original facet point, since the smoothed point can sit
+// far enough above the facet that an "into-the-surface" offset still ends up
+// outside the volume.
+VISRTX_DEVICE vec3 shadingHitpoint(const SurfaceHit &hit)
+{
+  if (hit.geometry == nullptr || hit.geometry->type != GeometryType::TRIANGLE)
+    return hit.hitpoint;
+
+  const auto &tri = hit.geometry->tri;
+  if (tri.vertexNormalsFV == nullptr && tri.vertexNormals == nullptr)
+    return hit.hitpoint;
+
+  const uvec3 idx = tri.indices ? tri.indices[hit.primID]
+                                : uvec3(0, 1, 2) + hit.primID * 3;
+  const vec3 v0 = tri.vertices[idx.x];
+  const vec3 v1 = tri.vertices[idx.y];
+  const vec3 v2 = tri.vertices[idx.z];
+
+  vec3 n0, n1, n2;
+  if (tri.vertexNormalsFV != nullptr) {
+    const uvec3 nidx = uvec3(0, 1, 2) + hit.primID * 3;
+    n0 = tri.vertexNormalsFV[nidx.x];
+    n1 = tri.vertexNormalsFV[nidx.y];
+    n2 = tri.vertexNormalsFV[nidx.z];
+  } else {
+    n0 = tri.vertexNormals[idx.x];
+    n1 = tri.vertexNormals[idx.y];
+    n2 = tri.vertexNormals[idx.z];
+  }
+
+  // Hanika's tangent-plane projection assumes unit normals; user data is
+  // not guaranteed to be normalized.
+  n0 = normalize(n0);
+  n1 = normalize(n1);
+  n2 = normalize(n2);
+
+  // populateHit.h flips hit.Ng/Ns for back-face hits so they point toward
+  // the ray origin. The per-vertex normals here are still in the original
+  // outward orientation; flip them too so the smooth surface bulges onto
+  // the ray-origin side of the facet (otherwise Hanika lifts P away from
+  // the ray origin and the trailing `+ Ng * epsilon` can land below the
+  // facet).
+  if (!hit.isFrontFace) {
+    n0 = -n0;
+    n1 = -n1;
+    n2 = -n2;
+  }
+
+  const vec3 Plocal = xfmPoint(hit.worldToObject, hit.hitpoint);
+  const vec3 Psmooth =
+      shadowTerminatorOffset(Plocal, v0, v1, v2, n0, n1, n2, hit.uvw);
+  return xfmPoint(hit.objectToWorld, Psmooth);
 }
 
 VISRTX_DEVICE bool pixelOutOfFrame(
diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h
index 4930c05e5..c5c649f5f 100644
--- a/devices/rtx/device/gpu/populateHit.h
+++ b/devices/rtx/device/gpu/populateHit.h
@@ -197,9 +197,11 @@ VISRTX_DEVICE const VolumeGPUData &volumeData(const FrameGPUData &frameData)
   return frameData.registry.volumes[idx];
 }
 
-VISRTX_DEVICE const SpatialFieldGPUData &fieldData(const FrameGPUData &frameData, const VolumeGPUData &volumeData)
+VISRTX_DEVICE const SpatialFieldGPUData &fieldData(
+    const FrameGPUData &frameData, const VolumeGPUData &volumeData)
 {
-  // Currently only TF1D volume type is supported, so assume this is what we have
+  // Currently only TF1D volume type is supported, so assume this is what we
+  // have
   return frameData.registry.fields[volumeData.data.tf1d.field];
 }
 
@@ -223,40 +225,56 @@ VISRTX_DEVICE void computeTangentSpace(
     if (!optixIsFrontFaceHit())
       hit.Ng = -hit.Ng;
 
+    vec3 n0, n1, n2;
+    bool hasVertexNormals = true;
     if (ggd.tri.vertexNormalsFV != nullptr) {
-      const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3);
-
-      const vec3 n0 = ggd.tri.vertexNormalsFV[idx.x];
-      const vec3 n1 = ggd.tri.vertexNormalsFV[idx.y];
-      const vec3 n2 = ggd.tri.vertexNormalsFV[idx.z];
-      hit.Ns = b.x * n0 + b.y * n1 + b.z * n2;
+      const uvec3 nidx = uvec3(0, 1, 2) + (hit.primID * 3);
+      n0 = ggd.tri.vertexNormalsFV[nidx.x];
+      n1 = ggd.tri.vertexNormalsFV[nidx.y];
+      n2 = ggd.tri.vertexNormalsFV[nidx.z];
     } else if (ggd.tri.vertexNormals != nullptr) {
-      const vec3 n0 = ggd.tri.vertexNormals[idx.x];
-      const vec3 n1 = ggd.tri.vertexNormals[idx.y];
-      const vec3 n2 = ggd.tri.vertexNormals[idx.z];
+      n0 = ggd.tri.vertexNormals[idx.x];
+      n1 = ggd.tri.vertexNormals[idx.y];
+      n2 = ggd.tri.vertexNormals[idx.z];
+    } else {
+      hasVertexNormals = false;
+    }
+
+    if (hasVertexNormals)
       hit.Ns = b.x * n0 + b.y * n1 + b.z * n2;
-    } else
+    else
       hit.Ns = hit.Ng;
 
     hit.Ns = normalize(hit.Ns);
 
-    if (ggd.tri.vertexTangentsFV != nullptr) {
-      const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3);
+    const bool hasTangentsFV = ggd.tri.vertexTangentsFV != nullptr;
+    const bool hasTangentsV = ggd.tri.vertexTangents != nullptr;
+    if (hasTangentsFV || hasTangentsV) {
+      const uvec3 tIdx = hasTangentsFV
+          ? uvec3(0, 1, 2) + (hit.primID * 3)
+          : idx;
+      const vec4 *tArr = hasTangentsFV
+          ? ggd.tri.vertexTangentsFV
+          : ggd.tri.vertexTangents;
+      const vec4 t0 = tArr[tIdx.x];
+      const vec4 t1 = tArr[tIdx.y];
+      const vec4 t2 = tArr[tIdx.z];
+
+      // At UV mirror seams the sign flips between adjacent vertices;
+      // barycentric-summing the signs and applying a single sign at the
+      // hit point would carve seam edges into the tangent frame.
+      // Build each vertex's bitangent with its own sign and normal,
+      // then blend B and T independently — same convention
+      // as glTF Sample Renderer, PBRT, Filament.
+      const vec3 N0 = hasVertexNormals ? n0 : hit.Ng;
+      const vec3 N1 = hasVertexNormals ? n1 : hit.Ng;
+      const vec3 N2 = hasVertexNormals ? n2 : hit.Ng;
+      const vec3 B0 = t0.w * cross(N0, vec3(t0));
+      const vec3 B1 = t1.w * cross(N1, vec3(t1));
+      const vec3 B2 = t2.w * cross(N2, vec3(t2));
 
-      const vec3 t0 = ggd.tri.vertexTangentsFV[idx.x];
-      const vec3 t1 = ggd.tri.vertexTangentsFV[idx.y];
-      const vec3 t2 = ggd.tri.vertexTangentsFV[idx.z];
-      const float handedness = ggd.tri.vertexTangentsFV[idx.x].w;
       hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2));
-      hit.tV = handedness * normalize(cross(hit.Ns, hit.tU));
-    } else if (ggd.tri.vertexTangents != nullptr) {
-      const vec3 t0 = ggd.tri.vertexTangents[idx.x];
-      const vec3 t1 = ggd.tri.vertexTangents[idx.y];
-      const vec3 t2 = ggd.tri.vertexTangents[idx.z];
-      const float handedness = ggd.tri.vertexTangents[idx.x].w;
-
-      hit.tU = normalize(b.x * t0 + b.y * t1 + b.z * t2);
-      hit.tV = handedness * normalize(cross(hit.Ns, hit.tU));
+      hit.tV = normalize(b.x * B0 + b.y * B1 + b.z * B2);
     } else {
       auto tangentSpace = computeOrthonormalBasis(hit.Ng);
       hit.tU = tangentSpace[0];
@@ -383,7 +401,7 @@ VISRTX_DEVICE void populateSurfaceHit(SurfaceHit &hit)
   hit.primID = ray::primID();
   hit.objID = sd.id;
   hit.instID = isd.id;
-  hit.epsilon = epsilonFrom(ray::hitpoint(), ray::direction(), ray::t());
+  hit.epsilon = epsilonFrom(hit.hitpoint);
   ray::computeTangentSpace(gd, ray::primID(), hit);
 
   const auto &handle = optixGetTransformListHandle(0);
diff --git a/devices/rtx/device/gpu/renderer/raygen_helpers.h b/devices/rtx/device/gpu/renderer/raygen_helpers.h
index 5f65d6405..81bfc7bd8 100644
--- a/devices/rtx/device/gpu/renderer/raygen_helpers.h
+++ b/devices/rtx/device/gpu/renderer/raygen_helpers.h
@@ -61,15 +61,6 @@ VISRTX_DEVICE float volumeAttenuation(ScreenSample &ss, const Ray &r)
   return attenuation;
 }
 
-// Evaluate opacity including transmission
-VISRTX_DEVICE float evaluateOpacity(const MaterialShadingState &shadingState)
-{
-  return materialEvaluateOpacity(shadingState)
-      * (1.0f
-          - glm::luminosity(materialEvaluateTransmission(shadingState))
-              * 0.85f);
-}
-
 // Templated rendering loop
 // ShadingPolicy must implement:
 //   static VISRTX_DEVICE vec4 shadeSurface(
diff --git a/devices/rtx/device/gpu/sampleLight.h b/devices/rtx/device/gpu/sampleLight.h
index 14bbe64cf..2fe5f5013 100644
--- a/devices/rtx/device/gpu/sampleLight.h
+++ b/devices/rtx/device/gpu/sampleLight.h
@@ -339,7 +339,7 @@ VISRTX_DEVICE LightSample sampleHDRILight(
   ls.dir = xfmVec(xfm, dir);
   ls.dist =
       std::numeric_limits<float>::infinity(); // Environment is at infinity
-  ls.radiance = radiance * ld.hdri.scale;
+  ls.radiance = radiance * ld.hdri.scale * ld.color;
   ls.pdf = pdf;
 
   return ls;
@@ -384,7 +384,7 @@ VISRTX_DEVICE LightSample sampleHDRILight(
   // instead of explicitly transposing/inverting the matrix
   ls.dir = xfmVec(xfm, sphericalCoordsToDirection(thetaPhi) * ld.hdri.xfm);
   ls.dist = 1e20f; // Environment is effectively at infinity
-  ls.radiance = radiance * ld.hdri.scale;
+  ls.radiance = radiance * ld.hdri.scale * ld.color;
   ls.pdf = pdf;
 
   return ls;
diff --git a/devices/rtx/device/gpu/shadingState.h b/devices/rtx/device/gpu/shadingState.h
index 1831d5436..e85ec911d 100644
--- a/devices/rtx/device/gpu/shadingState.h
+++ b/devices/rtx/device/gpu/shadingState.h
@@ -80,7 +80,10 @@ struct PhysicallyBasedShadingState
   float metallic;
   float roughness;
   float transmission;
-  float ior;
+  // Refraction ratio n1/n2 from the incident side: init() stores 1/md->ior
+  // for front-facing hits and md->ior for back-facing hits, so this can be
+  // plugged directly into glm::refract and Schlick's F0 formula.
+  float eta;
   vec3 emission;
 
   float occlusion;
diff --git a/devices/rtx/device/material/shaders/MatteShader_ptx.cu b/devices/rtx/device/material/shaders/MatteShader_ptx.cu
index 06429e5fa..371a4cb1c 100644
--- a/devices/rtx/device/material/shaders/MatteShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/MatteShader_ptx.cu
@@ -48,6 +48,12 @@ VISRTX_CALLABLE void __direct_callable__init(MatteShadingState *shadingState,
   shadingState->normal = hit->Ns;
   shadingState->opacity =
       adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff);
+
+  // Fall back to the geometric normal if hit->Ns is NaN (e.g. coincident
+  // curve control points) or zero-length. Negated comparison catches both
+  // since NaN compares false to anything.
+  if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f))
+    shadingState->normal = hit->Ng;
 }
 
 VISRTX_CALLABLE NextRay __direct_callable__nextRay(
diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
index 5e710b729..593457ed3 100644
--- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
+++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu
@@ -65,7 +65,11 @@ VISRTX_DEVICE vec3 sampleNormalMap(const FrameGPUData &fd,
   if (samplerIdx == ~visrtx::DeviceObjectIndex{0})
     return fallback;
   const vec3 ts = normalize(evaluateSampler(fd, samplerIdx, hit) * 2.0f - 1.0f);
-  return applyNormalMap(ts, hit, hit.Ns);
+  const vec3 N = applyNormalMap(ts, hit, hit.Ns);
+  // Negated comparison catches NaN (zero-decoded texels, zero-summed tangents)
+  // as well as zero-length results — fall back to the geometric normal so the
+  // shading frame is always usable.
+  return (dot(N, N) > 1e-12f) ? N : hit.Ng;
 }
 
 VISRTX_DEVICE float luminance(const vec3 &c)
@@ -116,7 +120,13 @@ VISRTX_DEVICE float smithG1GGX(float NdotV, float alpha2)
 
 VISRTX_DEVICE float ggxD(float NdotH, float alpha2)
 {
-  const float denom = NdotH * NdotH * (alpha2 - 1.0f) + 1.0f;
+  // The textbook denom `x·(α²−1) + 1` cancels catastrophically in fp32 once
+  // α² is below eps(1) ≈ 1.19e-7 (our α² floor is 1e-8): `α²−1` rounds to
+  // exactly −1, and at x=1 the whole denom collapses to 0. The algebraically
+  // equivalent `α²·x + (1−x)` has no near-1 subtraction so it stays exact.
+  // The fminf clamp keeps `1−x ≥ 0` against dot-product rounding above 1.
+  const float NdotH2 = fminf(NdotH * NdotH, 1.0f);
+  const float denom = alpha2 * NdotH2 + (1.0f - NdotH2);
   return alpha2 / (float(M_PI) * denom * denom);
 }
 
@@ -276,7 +286,7 @@ VISRTX_CALLABLE void __direct_callable__init(
 
   shadingState->opacity =
       adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff);
-  shadingState->ior = hit->isFrontFace ? 1.0f / md->ior : md->ior;
+  shadingState->eta = hit->isFrontFace ? 1.0f / md->ior : md->ior;
   shadingState->metallic = getMaterialParameter(*fd, md->metallic, *hit).x;
   shadingState->roughness = getMaterialParameter(*fd, md->roughness, *hit).x;
   shadingState->emission = vec3(getMaterialParameter(*fd, md->emissive, *hit));
@@ -355,7 +365,7 @@ VISRTX_CALLABLE vec3 __direct_callable__evaluateNormal(
 
 VISRTX_DEVICE vec3 computeDielectricF0(const PhysicallyBasedShadingState *state)
 {
-  const float iorF0 = pow2((1.0f - state->ior) / (1.0f + state->ior));
+  const float iorF0 = pow2((1.0f - state->eta) / (1.0f + state->eta));
   if (state->useSpecular == 0)
     return vec3(iorF0);
   return glm::min(vec3(iorF0) * state->specularColor, vec3(1.0f))
@@ -379,6 +389,21 @@ VISRTX_DEVICE vec3 schlickFresnel(vec3 F0, vec3 F90, float VdotH)
   return F0 + (F90 - F0) * pow5(1.0f - fabsf(VdotH));
 }
 
+VISRTX_DEVICE vec3 evalFresnelWithIridescence(
+    const PhysicallyBasedShadingState *state,
+    const vec3 &F0,
+    const vec3 &F90,
+    float cosTheta)
+{
+  vec3 F = schlickFresnel(F0, F90, cosTheta);
+  if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) {
+    const vec3 iridescent = evalIridescence(
+        1.0f, state->iridescenceIor, cosTheta, state->iridescenceThickness, F0);
+    F = glm::mix(F, iridescent, state->iridescence);
+  }
+  return F;
+}
+
 VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
     const PhysicallyBasedShadingState *state,
     const SurfaceHit *hit,
@@ -390,7 +415,9 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
   const vec3 L = lightSample->dir;
 
   const float NdotL = dot(N, L);
-  if (NdotL <= 0.0f)
+  // Negated form so a NaN NdotL takes this early-out — NaN compares false
+  // to everything, so `NdotL <= 0.0f` would let it pass through.
+  if (!(NdotL > 0.0f))
     return vec3(0.0f);
 
   const vec3 H = normalize(L + V);
@@ -398,15 +425,13 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
   const float NdotV = fmaxf(dot(N, V), 1e-6f);
   const float VdotH = fmaxf(dot(V, H), 0.0f);
 
-  // Base F0 / F90, optionally overridden by iridescence.
-  vec3 F0 = computeF0(state);
-  vec3 F90 = computeF90(state);
-  vec3 F = schlickFresnel(F0, F90, VdotH);
-  if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) {
-    const vec3 iridescent = evalIridescence(
-        1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0);
-    F = glm::mix(F, iridescent, state->iridescence);
-  }
+  // Base F0 / F90. Specular uses Fresnel at the microfacet (VdotH); the
+  // diffuse weight uses Fresnel at NdotV (Frostbite/Disney convention) so
+  // shadeSurface and nextRay's diffuse split agree regardless of light dir.
+  const vec3 F0 = computeF0(state);
+  const vec3 F90 = computeF90(state);
+  const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH);
+  const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV);
 
   // Base GGX specular lobe.
   const float alpha = fmaxf(pow2(state->roughness), 1e-4f);
@@ -419,7 +444,7 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
   // and transmission; metals have no diffuse).
   const vec3 diffuseColor =
       glm::mix(state->baseColor, vec3(0.0f), state->metallic);
-  const vec3 diffuseBRDF = (vec3(1.0f) - F) * float(M_1_PI) * diffuseColor
+  const vec3 diffuseBRDF = (vec3(1.0f) - Fdiff) * float(M_1_PI) * diffuseColor
       * state->occlusion * (1.0f - state->transmission);
 
   vec3 base = diffuseBRDF + specularBRDF;
@@ -464,8 +489,8 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface(
 
 //-----------------------------------------------------------------------------
 // Next-ray importance sampling: stochastic alpha, Fresnel-aware lobe pick,
-// GGX VNDF reflection/refraction. Clearcoat/sheen are NEE-only (no separate
-// lobe sampling), which matches what the base renderer is set up to consume.
+// GGX VNDF reflection/refraction, plus a clearcoat lobe sampled with
+// probability equal to its view-angle Fresnel weight. Sheen is NEE-only.
 //-----------------------------------------------------------------------------
 
 VISRTX_CALLABLE NextRay __direct_callable__nextRay(
@@ -475,8 +500,55 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
   if (curand_uniform(rs) > state->opacity)
     return NextRay{ray->dir, vec3(1.0f), NEXT_RAY_CONTINUES_THROUGH_SURFACE};
 
-  const vec3 N = state->normal;
   const vec3 V = -ray->dir;
+
+  // Clearcoat lobe: pick it with probability `clearcoat·FcV(NcDotV)`. This
+  // exact weight makes the entry-side attenuation `1 - clearcoat·FcV` cancel
+  // the `1/(1-pick)` lobe-pick divisor in the base path below, so the base
+  // returns only need the exit-side `1 - clearcoat·FcL` multiplier.
+  const vec3 Nc = state->clearcoatNormal;
+  const float NcDotV_world = fmaxf(dot(Nc, V), 0.0f);
+  const float FcV_world =
+      CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotV_world);
+  const float clearcoatPick =
+      glm::clamp(state->clearcoat * FcV_world, 0.0f, 1.0f);
+
+  if (clearcoatPick > 0.0f && curand_uniform(rs) < clearcoatPick) {
+    const mat3 toWorldC = computeOrthonormalBasis(Nc);
+    const vec3 VlocalC = glm::transpose(toWorldC) * V;
+    if (VlocalC.z <= 0.0f)
+      return NextRay{Nc, vec3(0.0f)};
+    const float alphaC = fmaxf(pow2(state->clearcoatRoughness), 1e-4f);
+    const float alphaC2 = alphaC * alphaC;
+    const vec3 HlocalC = sampleGGXVNDF(
+        VlocalC, alphaC, curand_uniform(rs), curand_uniform(rs));
+    const vec3 LlocalC = glm::reflect(-VlocalC, HlocalC);
+    if (LlocalC.z <= 0.0f)
+      return NextRay{Nc, vec3(0.0f)};
+    const float VdotHc = fmaxf(dot(VlocalC, HlocalC), 0.0f);
+    const float Fc =
+        CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - VdotHc);
+    const float G1c = smithG1GGX(VlocalC.z, alphaC2);
+    const float G2c = smithG2GGX(VlocalC.z, LlocalC.z, alphaC2);
+    // VNDF gives BRDF·cos/pdf = clearcoat·Fc·G2/G1; the clearcoat factor
+    // cancels against the matching factor in clearcoatPick.
+    const vec3 weight = vec3(state->clearcoat * Fc * G2c / fmaxf(G1c, 1e-8f))
+        / fmaxf(clearcoatPick, 1e-8f);
+    return NextRay{normalize(toWorldC * LlocalC), weight};
+  }
+
+  // Exit-side clearcoat attenuation, applied to every base-path return.
+  // `fabsf` handles the transmission case where L points through the surface.
+  auto clearcoatExitAttn = [&](const vec3 &Lworld) -> float {
+    if (state->clearcoat <= 0.0f)
+      return 1.0f;
+    const float NcDotL = fabsf(dot(Nc, Lworld));
+    const float FcL =
+        CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotL);
+    return glm::clamp(1.0f - state->clearcoat * FcL, 0.0f, 1.0f);
+  };
+
+  const vec3 N = state->normal;
   const mat3 toWorld = computeOrthonormalBasis(N);
   const mat3 toLocal = glm::transpose(toWorld);
   const vec3 Vlocal = toLocal * V;
@@ -491,19 +563,15 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
   const float NdotV = Vlocal.z;
   const float VdotH = fmaxf(dot(Vlocal, Hlocal), 0.0f);
 
-  // Fresnel at the sampled microfacet, with optional iridescence.
+  // Fresnel at the sampled microfacet (specular/transmission split) and at
+  // NdotV (diffuse weight) — matches the convention in shadeSurface.
   const vec3 F0 = computeF0(state);
   const vec3 F90 = computeF90(state);
-  vec3 F = schlickFresnel(F0, F90, VdotH);
-  if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) {
-    const vec3 iridescent = evalIridescence(
-        1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0);
-    F = glm::mix(F, iridescent, state->iridescence);
-  }
+  const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH);
+  const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV);
 
   const vec3 Lrefl = glm::reflect(-Vlocal, Hlocal);
-  const float eta = state->ior; // init() pre-inverted for front-facing hits
-  const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, eta);
+  const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, state->eta);
   const vec3 transmissionFilter = computeTransmissionFilter(state);
   const bool hasTransmission = luminance(transmissionFilter) > 0.0f;
   const bool totalInternalReflection =
@@ -514,36 +582,60 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay(
       ? vec3(0.0f)
       : glm::max(vec3(1.0f) - F, vec3(0.0f)) * transmissionFilter;
 
+  // Diffuse importance: the Lambertian throughput collapses to
+  //   (1-F) * baseColor * (1-metallic) * (1-transmission) * occlusion
+  // when sampled cosine-weighted (cos / pdf cancels with 1/pi). Mirror the
+  // factors used by shadeSurface's diffuseBRDF so the lobe split tracks the
+  // BRDF being estimated. TIR has no diffuse share (all energy is reflected).
+  const vec3 diffuseEnergy = totalInternalReflection
+      ? vec3(0.0f)
+      : glm::max(vec3(1.0f) - Fdiff, vec3(0.0f)) * state->baseColor
+          * (1.0f - state->metallic) * (1.0f - state->transmission)
+          * state->occlusion;
+
   const float reflectStrength =
       fmaxf(luminance(glm::max(reflectEnergy, vec3(0.0f))), 0.0f);
   const float transmitStrength =
       fmaxf(luminance(glm::max(transmitEnergy, vec3(0.0f))), 0.0f);
-  const float combinedStrength = reflectStrength + transmitStrength;
+  const float diffuseStrength =
+      fmaxf(luminance(glm::max(diffuseEnergy, vec3(0.0f))), 0.0f);
+  const float combinedStrength =
+      reflectStrength + transmitStrength + diffuseStrength;
   if (combinedStrength <= 0.0f)
     return NextRay{N, vec3(0.0f)};
 
   const float reflectProb = reflectStrength / combinedStrength;
-  const bool sampleTransmission = curand_uniform(rs) > reflectProb;
+  const float transmitProb = transmitStrength / combinedStrength;
+  const float diffuseProb = diffuseStrength / combinedStrength;
+
+  const float u = curand_uniform(rs);
+  if (u < reflectProb) {
+    if (Lrefl.z <= 0.0f)
+      return NextRay{N, vec3(0.0f)};
+    const float NdotL = Lrefl.z;
+    const float G1 = smithG1GGX(NdotV, alpha2);
+    const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
+    const vec3 Lworld = normalize(toWorld * Lrefl);
+    const vec3 weight = reflectEnergy * (G2 / fmaxf(G1, 1e-8f))
+        * clearcoatExitAttn(Lworld) / fmaxf(reflectProb, 1e-8f);
+    return NextRay{Lworld, weight};
+  }
 
-  if (sampleTransmission) {
+  if (u < reflectProb + transmitProb) {
     const float NdotL = -Ltrans.z; // L points through the surface.
     const float G1 = smithG1GGX(NdotV, alpha2);
     const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
+    const vec3 Lworld = normalize(toWorld * Ltrans);
     const vec3 weight = transmitEnergy * (G2 / fmaxf(G1, 1e-8f))
-        / fmaxf(1.0f - reflectProb, 1e-8f);
-    return NextRay{normalize(toWorld * Ltrans),
-        weight,
-        NEXT_RAY_CONTINUES_THROUGH_SURFACE};
+        * clearcoatExitAttn(Lworld) / fmaxf(transmitProb, 1e-8f);
+    return NextRay{Lworld, weight, NEXT_RAY_CONTINUES_THROUGH_SURFACE};
   }
 
-  // Reflection.
-  if (Lrefl.z <= 0.0f)
-    return NextRay{N, vec3(0.0f)};
-
-  const float NdotL = Lrefl.z;
-  const float G1 = smithG1GGX(NdotV, alpha2);
-  const float G2 = smithG2GGX(NdotV, NdotL, alpha2);
+  // Diffuse: sample around the shading normal so pdf=cos/pi matches the BRDF's
+  // NdotL (same axis as shadeSurface's diffuse term). Cos and pdf cancel,
+  // leaving only the energy term and the lobe-pick divisor.
+  const vec3 wi = sampleHemisphere(*rs, N);
   const vec3 weight =
-      reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f);
-  return NextRay{normalize(toWorld * Lrefl), weight};
+      diffuseEnergy * clearcoatExitAttn(wi) / fmaxf(diffuseProb, 1e-8f);
+  return NextRay{wi, weight};
 }
diff --git a/devices/rtx/device/optix_visrtx.h b/devices/rtx/device/optix_visrtx.h
index 60c330dab..2f6082ad7 100644
--- a/devices/rtx/device/optix_visrtx.h
+++ b/devices/rtx/device/optix_visrtx.h
@@ -205,7 +205,9 @@ struct DeviceGlobalState : public helium::BaseGlobalDeviceState
 
   struct ObjectUpdates
   {
-    helium::TimeStamp lastBLASChange{0};
+    helium::TimeStamp lastSurfaceBLASChange{0};
+    helium::TimeStamp lastVolumeBLASChange{0};
+    helium::TimeStamp lastLightSetChange{0};
     helium::TimeStamp lastTLASChange{0};
   } objectUpdates;
 
diff --git a/devices/rtx/device/renderer/Interactive_ptx.cu b/devices/rtx/device/renderer/Interactive_ptx.cu
index c75d0a0ad..2320e3a23 100644
--- a/devices/rtx/device/renderer/Interactive_ptx.cu
+++ b/devices/rtx/device/renderer/Interactive_ptx.cu
@@ -87,17 +87,18 @@ struct InteractiveShadingPolicy
         * materialEvaluateTint(shadingState);
 
     // Handle all lights contributions
+    const vec3 shadowOrigin = shadingHitpoint(hit) + hit.Ng * hit.epsilon;
     for (size_t i = 0; i < world.numLightInstances; i++) {
       const auto &light = world.lightInstances[i];
       const auto lightSample =
-          sampleLight(ss, hit.hitpoint, light.lightIndex, light.xfm);
+          sampleLight(ss, shadowOrigin, light.lightIndex, light.xfm);
 
       if (lightSample.pdf == 0.0f)
         continue;
 
       // Shadowing
       const Ray shadowRay = {
-          hit.hitpoint + hit.Ng * hit.epsilon,
+          shadowOrigin,
           lightSample.dir,
           {hit.epsilon, lightSample.dist},
       };
@@ -154,7 +155,7 @@ struct InteractiveShadingPolicy
       }
     }
 
-    return vec4(contrib, evaluateOpacity(shadingState));
+    return vec4(contrib, materialEvaluateOpacity(shadingState));
   }
 };
 
@@ -181,7 +182,7 @@ VISRTX_GLOBAL void __anyhit__shadow()
 
     MaterialShadingState shadingState;
     materialInitShading(&shadingState, frameData, *hit.material, hit);
-    auto opacity = evaluateOpacity(shadingState);
+    auto opacity = materialEvaluateOpacity(shadingState);
 
     auto &o = ray::rayData<float>();
 
diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu
index fe319255c..652b0e90c 100644
--- a/devices/rtx/device/renderer/Quality_ptx.cu
+++ b/devices/rtx/device/renderer/Quality_ptx.cu
@@ -153,11 +153,14 @@ VISRTX_DEVICE LightSample sampleLights(ScreenSample &ss,
   // last index is reserved for ambient light if it exists
   if (selectedIdx == world.numLightInstances) {
     const auto &rendererParams = frameData.renderer;
+    // Fold the hemisphere-sample pdf cos(theta)/pi with the uniform light pick.
+    const vec3 dir = sampleHemisphere(ss.rs, normal);
+    const float cosNs = fmaxf(0.f, dot(dir, normal));
     return LightSample{
         rendererParams.ambientColor * rendererParams.ambientIntensity,
-        sampleHemisphere(ss.rs, normal),
+        dir,
         std::numeric_limits<float>::max(),
-        lightPickPdf / (2.0f * float(M_PI)),
+        lightPickPdf * cosNs * float(M_1_PI),
     };
   } else {
     const auto &lightInstance = world.lightInstances[selectedIdx];
@@ -359,13 +362,22 @@ VISRTX_GLOBAL void __raygen__()
         }
 
         sample.color += sampleContribution * materialEmission * materialOpacity;
+        // Sample around the shading normal so the cosine-weighted hemisphere's
+        // pdf matches the BRDF's NdotL (which uses Ns). Sampling around Ng
+        // would bias the Lambertian estimator by cos_Ns/cos_Ng on smooth or
+        // bump-mapped surfaces.
+        const vec3 shadowOrigin =
+            shadingHitpoint(surfaceHit) + surfaceHit.Ng * surfaceHit.epsilon;
         LightSample lightSample =
-            sampleLights(ss, frameData, surfaceHit.hitpoint, surfaceHit.Ng);
+            sampleLights(ss, frameData, shadowOrigin, surfaceHit.Ns);
         if (lightSample.pdf >= ATTENUATION_EPSILON && lightSample.dist > 0.0f) {
-          const float lightDotNg = dot(lightSample.dir, surfaceHit.Ng);
-          if (lightDotNg > 0.0f) {
+          // Gate on the shading normal so the terminator follows the smooth
+          // surface; gating on Ng would carve the per-triangle facet shape
+          // into the lit/unlit boundary at grazing light angles.
+          const float lightDotNs = dot(lightSample.dir, surfaceHit.Ns);
+          if (lightDotNs > 0.0f) {
             const Ray shadowRay = {
-                surfaceHit.hitpoint + surfaceHit.Ng * surfaceHit.epsilon,
+                shadowOrigin,
                 lightSample.dir,
                 {surfaceHit.epsilon, lightSample.dist},
             };
diff --git a/devices/rtx/device/spatial_field/SpatialField.cpp b/devices/rtx/device/spatial_field/SpatialField.cpp
index b0404b473..2aba5be38 100644
--- a/devices/rtx/device/spatial_field/SpatialField.cpp
+++ b/devices/rtx/device/spatial_field/SpatialField.cpp
@@ -49,7 +49,7 @@ SpatialField::SpatialField(DeviceGlobalState *s)
 void SpatialField::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp();
 }
 
 SpatialField *SpatialField::createInstance(
diff --git a/devices/rtx/device/surface/Surface.cpp b/devices/rtx/device/surface/Surface.cpp
index 5458b2d79..13d9f865f 100644
--- a/devices/rtx/device/surface/Surface.cpp
+++ b/devices/rtx/device/surface/Surface.cpp
@@ -63,7 +63,7 @@ void Surface::finalize()
 void Surface::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp();
 }
 
 bool Surface::isValid() const
diff --git a/devices/rtx/device/volume/Volume.cpp b/devices/rtx/device/volume/Volume.cpp
index 10ce94912..57d10933d 100644
--- a/devices/rtx/device/volume/Volume.cpp
+++ b/devices/rtx/device/volume/Volume.cpp
@@ -51,7 +51,7 @@ void Volume::commitParameters()
 void Volume::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp();
 }
 
 bool Volume::isVisible() const
diff --git a/devices/rtx/device/world/Group.cpp b/devices/rtx/device/world/Group.cpp
index ed2a54526..7a5ef1584 100644
--- a/devices/rtx/device/world/Group.cpp
+++ b/devices/rtx/device/world/Group.cpp
@@ -102,7 +102,11 @@ void Group::finalize()
 void Group::markFinalized()
 {
   Object::markFinalized();
-  deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp();
+  auto &updates = deviceState()->objectUpdates;
+  const auto now = helium::newTimeStamp();
+  updates.lastSurfaceBLASChange = now;
+  updates.lastVolumeBLASChange = now;
+  updates.lastLightSetChange = now;
 }
 
 OptixTraversableHandle Group::optixTraversableTriangle() const
@@ -194,7 +198,8 @@ const std::vector<Light *> &Group::lights() const
 void Group::rebuildSurfaceBVHs()
 {
   const auto &state = *deviceState();
-  if (state.objectUpdates.lastBLASChange < m_objectUpdates.lastSurfaceBVHBuilt)
+  if (state.objectUpdates.lastSurfaceBLASChange
+      < m_objectUpdates.lastSurfaceBVHBuilt)
     return;
 
   partitionValidGeometriesByType();
@@ -249,10 +254,16 @@ void Group::rebuildSurfaceBVHs()
 
 void Group::rebuildVolumeBVH()
 {
+  const auto &state = *deviceState();
+  if (state.objectUpdates.lastVolumeBLASChange
+      < m_objectUpdates.lastVolumeBVHBuilt)
+    return;
+
   partitionValidVolumes();
   if (m_volumes.empty()) {
     m_volumeBounds = box3();
     m_traversableVolume = {};
+    m_objectUpdates.lastVolumeBVHBuilt = helium::newTimeStamp();
     reportMessage(
         ANARI_SEVERITY_DEBUG, "visrtx::Group skipping volume BVH build");
     return;
@@ -272,6 +283,10 @@ void Group::rebuildVolumeBVH()
 
 void Group::rebuildLights()
 {
+  const auto &state = *deviceState();
+  if (state.objectUpdates.lastLightSetChange < m_objectUpdates.lastLightRebuild)
+    return;
+
   partitionValidLights();
   buildLightGPUData();
   m_objectUpdates.lastLightRebuild = helium::newTimeStamp();
diff --git a/devices/rtx/device/world/World.cpp b/devices/rtx/device/world/World.cpp
index 90c70ef89..93d05c961 100644
--- a/devices/rtx/device/world/World.cpp
+++ b/devices/rtx/device/world/World.cpp
@@ -203,7 +203,11 @@ void World::rebuildWorld()
 {
   const auto &state = *deviceState();
 
-  if (state.objectUpdates.lastBLASChange >= m_objectUpdates.lastBLASCheck) {
+  const auto &updates = state.objectUpdates;
+  const auto lastCheck = m_objectUpdates.lastBLASCheck;
+  if (updates.lastSurfaceBLASChange >= lastCheck
+      || updates.lastVolumeBLASChange >= lastCheck
+      || updates.lastLightSetChange >= lastCheck) {
     m_objectUpdates.lastTLASBuild = 0; // BLAS changed, so need to build TLAS
     rebuildBLASs();
   }