From 939614fb802ed8f792ff923de91692cbafa4e6f7 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Thu, 30 Apr 2026 05:51:19 -0400 Subject: [PATCH 01/20] rtx: Decouple BLAS/volume/light invalidation timestamps Split the global `lastBLASChange` into per-domain stamps so a geometry edit no longer invalidates volume BVHs or per-group light index lists, and vice versa. lastSurfaceBLASChange -> bumped by Geometry, Surface lastVolumeBLASChange -> bumped by Volume, SpatialField lastLightSetChange -> bumped on Group commits (light array rebind) lastTLASChange -> unchanged (Instance) Group::markFinalized bumps all three since any of its arrays may have been rebound. Each Group::rebuild* gates on its own stamp; World fans the trigger across the three for the BLAS rebuild path. --- devices/rtx/device/geometry/Geometry.cpp | 2 +- devices/rtx/device/optix_visrtx.h | 4 +++- .../rtx/device/spatial_field/SpatialField.cpp | 2 +- devices/rtx/device/surface/Surface.cpp | 2 +- devices/rtx/device/volume/Volume.cpp | 2 +- devices/rtx/device/world/Group.cpp | 19 +++++++++++++++++-- devices/rtx/device/world/World.cpp | 6 +++++- 7 files changed, 29 insertions(+), 8 deletions(-) diff --git a/devices/rtx/device/geometry/Geometry.cpp b/devices/rtx/device/geometry/Geometry.cpp index 127eb865e..e39552c72 100644 --- a/devices/rtx/device/geometry/Geometry.cpp +++ b/devices/rtx/device/geometry/Geometry.cpp @@ -129,7 +129,7 @@ void Geometry::commitParameters() void Geometry::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp(); } GeometryGPUData Geometry::gpuData() const diff --git a/devices/rtx/device/optix_visrtx.h b/devices/rtx/device/optix_visrtx.h index 60c330dab..2f6082ad7 100644 --- a/devices/rtx/device/optix_visrtx.h +++ b/devices/rtx/device/optix_visrtx.h @@ -205,7 +205,9 @@ struct DeviceGlobalState : public helium::BaseGlobalDeviceState struct ObjectUpdates { - helium::TimeStamp lastBLASChange{0}; + helium::TimeStamp lastSurfaceBLASChange{0}; + helium::TimeStamp lastVolumeBLASChange{0}; + helium::TimeStamp lastLightSetChange{0}; helium::TimeStamp lastTLASChange{0}; } objectUpdates; diff --git a/devices/rtx/device/spatial_field/SpatialField.cpp b/devices/rtx/device/spatial_field/SpatialField.cpp index b0404b473..2aba5be38 100644 --- a/devices/rtx/device/spatial_field/SpatialField.cpp +++ b/devices/rtx/device/spatial_field/SpatialField.cpp @@ -49,7 +49,7 @@ SpatialField::SpatialField(DeviceGlobalState *s) void SpatialField::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp(); } SpatialField *SpatialField::createInstance( diff --git a/devices/rtx/device/surface/Surface.cpp b/devices/rtx/device/surface/Surface.cpp index 5458b2d79..13d9f865f 100644 --- a/devices/rtx/device/surface/Surface.cpp +++ b/devices/rtx/device/surface/Surface.cpp @@ -63,7 +63,7 @@ void Surface::finalize() void Surface::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp(); } bool Surface::isValid() const diff --git a/devices/rtx/device/volume/Volume.cpp b/devices/rtx/device/volume/Volume.cpp index 10ce94912..57d10933d 100644 --- a/devices/rtx/device/volume/Volume.cpp +++ b/devices/rtx/device/volume/Volume.cpp @@ -51,7 +51,7 @@ void Volume::commitParameters() void Volume::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp(); } bool Volume::isVisible() const diff --git a/devices/rtx/device/world/Group.cpp b/devices/rtx/device/world/Group.cpp index ed2a54526..7a5ef1584 100644 --- a/devices/rtx/device/world/Group.cpp +++ b/devices/rtx/device/world/Group.cpp @@ -102,7 +102,11 @@ void Group::finalize() void Group::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + auto &updates = deviceState()->objectUpdates; + const auto now = helium::newTimeStamp(); + updates.lastSurfaceBLASChange = now; + updates.lastVolumeBLASChange = now; + updates.lastLightSetChange = now; } OptixTraversableHandle Group::optixTraversableTriangle() const @@ -194,7 +198,8 @@ const std::vector &Group::lights() const void Group::rebuildSurfaceBVHs() { const auto &state = *deviceState(); - if (state.objectUpdates.lastBLASChange < m_objectUpdates.lastSurfaceBVHBuilt) + if (state.objectUpdates.lastSurfaceBLASChange + < m_objectUpdates.lastSurfaceBVHBuilt) return; partitionValidGeometriesByType(); @@ -249,10 +254,16 @@ void Group::rebuildSurfaceBVHs() void Group::rebuildVolumeBVH() { + const auto &state = *deviceState(); + if (state.objectUpdates.lastVolumeBLASChange + < m_objectUpdates.lastVolumeBVHBuilt) + return; + partitionValidVolumes(); if (m_volumes.empty()) { m_volumeBounds = box3(); m_traversableVolume = {}; + m_objectUpdates.lastVolumeBVHBuilt = helium::newTimeStamp(); reportMessage( ANARI_SEVERITY_DEBUG, "visrtx::Group skipping volume BVH build"); return; @@ -272,6 +283,10 @@ void Group::rebuildVolumeBVH() void Group::rebuildLights() { + const auto &state = *deviceState(); + if (state.objectUpdates.lastLightSetChange < m_objectUpdates.lastLightRebuild) + return; + partitionValidLights(); buildLightGPUData(); m_objectUpdates.lastLightRebuild = helium::newTimeStamp(); diff --git a/devices/rtx/device/world/World.cpp b/devices/rtx/device/world/World.cpp index 90c70ef89..93d05c961 100644 --- a/devices/rtx/device/world/World.cpp +++ b/devices/rtx/device/world/World.cpp @@ -203,7 +203,11 @@ void World::rebuildWorld() { const auto &state = *deviceState(); - if (state.objectUpdates.lastBLASChange >= m_objectUpdates.lastBLASCheck) { + const auto &updates = state.objectUpdates; + const auto lastCheck = m_objectUpdates.lastBLASCheck; + if (updates.lastSurfaceBLASChange >= lastCheck + || updates.lastVolumeBLASChange >= lastCheck + || updates.lastLightSetChange >= lastCheck) { m_objectUpdates.lastTLASBuild = 0; // BLAS changed, so need to build TLAS rebuildBLASs(); } From 5168cd6d74643a1889db2692ad9189680dcf25a7 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 11:35:45 -0400 Subject: [PATCH 02/20] rtx: Fix null-deref in face-varying triangle validation The faceVarying.normal/tangent count check dereferenced m_index unconditionally, crashing on triangle-soup geometry (no index buffer). --- devices/rtx/device/geometry/Triangle.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/devices/rtx/device/geometry/Triangle.cpp b/devices/rtx/device/geometry/Triangle.cpp index 4369243f3..52aa41216 100644 --- a/devices/rtx/device/geometry/Triangle.cpp +++ b/devices/rtx/device/geometry/Triangle.cpp @@ -78,12 +78,14 @@ void Triangle::finalize() m_vertex->size()); } - if (m_vertexNormalFV && 3 * m_index->size() != m_vertexNormalFV->size()) { + const size_t numTriangles = m_index ? m_index->size() : m_vertex->size() / 3; + + if (m_vertexNormalFV && 3 * numTriangles != m_vertexNormalFV->size()) { reportMessage(ANARI_SEVERITY_WARNING, "'faceVarying.normal' on triangle geometry is not matching " "the number of triangles in 'primitive.index' (%zu) vs. (%zu)", m_vertexNormalFV->size(), - m_index->size()); + numTriangles); } if (m_vertexTangent && m_vertex->size() != m_vertexTangent->size()) { @@ -94,12 +96,12 @@ void Triangle::finalize() m_vertex->size()); } - if (m_vertexTangentFV && 3 * m_index->size() != m_vertexTangentFV->size()) { + if (m_vertexTangentFV && 3 * numTriangles != m_vertexTangentFV->size()) { reportMessage(ANARI_SEVERITY_WARNING, "'faceVarying.Tangent' on triangle geometry is not matching " "the number of triangles in 'primitive.index' (%zu) vs. (%zu)", m_vertexTangentFV->size(), - m_index->size()); + numTriangles); } if (!m_vertexTangent && !m_vertexTangentFV) { From 6955236538972d2e5b2b25bb0f910dd83b32ffcc Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 11:39:12 -0400 Subject: [PATCH 03/20] rtx: Own generated tangent buffer The computed-tangent Array1D was built without a deleter, so the cudaMalloc'd memory leaked when the array was released. Also remove the unused bitangent parameter on computeTangents calls. --- devices/rtx/device/geometry/ComputeTangent.cu | 70 +++++++++++++------ 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu index bffeb4bb0..1560ffff5 100644 --- a/devices/rtx/device/geometry/ComputeTangent.cu +++ b/devices/rtx/device/geometry/ComputeTangent.cu @@ -52,6 +52,25 @@ namespace { constexpr const auto eps = 1e-8f; +void cudaFreeMemoryDeleter(const void *, const void *memory) +{ + cudaFree(const_cast(memory)); +} + +bool reportCudaError( + visrtx::Triangle *triangle, cudaError_t error, const char *operation) +{ + if (error == cudaSuccess) + return false; + + triangle->reportMessage(ANARI_SEVERITY_ERROR, + "CUDA error while computing tangents for Triangle %p during %s: %s", + triangle, + operation, + cudaGetErrorString(error)); + return true; +} + } // namespace namespace visrtx { @@ -108,7 +127,6 @@ template __global__ void __doComputeTangents( glm::vec4 *tangents, // Output tangent vectors with handedness (w component) - glm::vec3 *bitangents, // Output bitangent vectors const glm::uvec3 *indices, // Input triangle indices const glm::vec3 *positions, // Input vertex positions const glm::vec3 *normals, // Input vertex normals @@ -197,7 +215,6 @@ template void __computeTangents( glm::vec4 *tangents, // Output tangent vectors with handedness (w component) - glm::vec3 *bitangents, // Output bitangent vectors const glm::uvec3 *indices, // Input triangle indices const glm::vec3 *positions, // Input vertex positions const glm::vec3 *normals, // Input vertex normals @@ -207,7 +224,7 @@ void __computeTangents( { __doComputeTangents <<<(numTriangles + 63) / 64, 64>>>( - tangents, bitangents, indices, positions, normals, uvs, numTriangles); + tangents, indices, positions, normals, uvs, numTriangles); } void updateGeometryTangent(Triangle *triangle) @@ -257,12 +274,24 @@ void updateGeometryTangent(Triangle *triangle) auto tangentsCount = indices ? (indices->size() * 3) : positions->size(); auto trianglesCount = indices ? indices->size() : positions->size() / 3; + if (trianglesCount == 0 || tangentsCount == 0) { + triangle->reportMessage(ANARI_SEVERITY_INFO, + "Triangle %p has no triangles, cannot compute tangents", + triangle); + return; + } + glm::vec4 *tangents = {}; - cudaMalloc(&tangents, sizeof(glm::vec4) * tangentsCount); - cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount); - glm::vec3 *bitangents = {}; - cudaMalloc(&bitangents, sizeof(glm::vec3) * tangentsCount); - cudaMemset(bitangents, 0, sizeof(glm::vec3) * tangentsCount); + auto status = cudaMalloc( + reinterpret_cast(&tangents), sizeof(glm::vec4) * tangentsCount); + if (reportCudaError(triangle, status, "allocating tangent buffer")) + return; + + status = cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount); + if (reportCudaError(triangle, status, "clearing tangent buffer")) { + cudaFree(tangents); + return; + } auto positionsPtr = positions->dataAs(AddressSpace::GPU); if (indices) { @@ -274,7 +303,6 @@ void updateGeometryTangent(Triangle *triangle) auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, face varying normals and face varyings vec2 UVs. __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -284,7 +312,6 @@ void updateGeometryTangent(Triangle *triangle) auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, face varying normals and face varyings vec3 UVs. __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -296,7 +323,6 @@ void updateGeometryTangent(Triangle *triangle) // Vertex indexed, face varying normals and indexed vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -306,7 +332,6 @@ void updateGeometryTangent(Triangle *triangle) // Vertex indexed, face varying normals and indexed vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -321,7 +346,6 @@ void updateGeometryTangent(Triangle *triangle) auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, index normals and face varyings vec2 UVs. __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -331,7 +355,6 @@ void updateGeometryTangent(Triangle *triangle) auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, indexed normals and face varyings vec3 UVs. __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -343,7 +366,6 @@ void updateGeometryTangent(Triangle *triangle) // Vertex indexed, indexed normals and indexed vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -353,7 +375,6 @@ void updateGeometryTangent(Triangle *triangle) // Vertex indexed, indexed normals and indexed vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -373,7 +394,6 @@ void updateGeometryTangent(Triangle *triangle) // Non indexed vertices, face varying normals and face varyings vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -383,7 +403,6 @@ void updateGeometryTangent(Triangle *triangle) // Non indexed vertices, face varying normals and face varyings vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); __computeTangents(tangents, - bitangents, indicesPtr, positionsPtr, normalsPtr, @@ -392,13 +411,22 @@ void updateGeometryTangent(Triangle *triangle) } } - // Release transient bitangent store - cudaFree(bitangents); + status = cudaGetLastError(); + if (reportCudaError(triangle, status, "launching tangent kernel")) { + cudaFree(tangents); + return; + } + + status = cudaDeviceSynchronize(); + if (reportCudaError(triangle, status, "computing tangents")) { + cudaFree(tangents); + return; + } auto desc = Array1DMemoryDescriptor{ { tangents, - {}, // deleter + cudaFreeMemoryDeleter, // deleter {}, // deleterPtr ANARI_FLOAT32_VEC4, }, From e7bcdabde7bafad61d2dff40579b292ef87da2ee Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 11:31:12 -0400 Subject: [PATCH 04/20] rtx: Negate generated bitangent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bitangent formula was (t.x*e1 - s.x*e2)/det — sign-reversed vs. the standard (s.x*e2 - t.x*e1)/det used by MikkTSpace. Normal-mapped lighting on generated tangents was mirrored along the bitangent axis. --- devices/rtx/device/geometry/ComputeTangent.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu index 1560ffff5..9105f335b 100644 --- a/devices/rtx/device/geometry/ComputeTangent.cu +++ b/devices/rtx/device/geometry/ComputeTangent.cu @@ -83,8 +83,7 @@ __device__ void __computeTangentAndBitangent( glm::vec3 p2, glm::vec2 uv0, // Input texture coordinates glm::vec2 uv1, - glm::vec2 uv2 -) + glm::vec2 uv2) { // Compute edges of the triangle glm::vec3 e1 = p1 - p0; @@ -116,7 +115,7 @@ __device__ void __computeTangentAndBitangent( // Compute the determinant float invdet = 1.0f / cross; *tangent = (t.y * e1 - s.y * e2) * invdet; - *bitangent = (t.x * e1 - s.x * e2) * invdet; + *bitangent = (s.x * e2 - t.x * e1) * invdet; } } } From 2ff810f8cd4b0393f25c896843f83e9e4219e0ef Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 11:35:45 -0400 Subject: [PATCH 05/20] rtx: Harden tangent generation against degenerate inputs Degenerate triangles or NaN/zero input normals produced fixed (1,0,0)/(0,1,0) tangents that conflicted with the actual surface normal. Add safeNormalize, fall back to a Pixar orthonormal basis built from the geometric normal, and factor the per-corner Gram-Schmidt + handedness into a shared orthogonalizeTangent helper. --- devices/rtx/device/geometry/ComputeTangent.cu | 128 ++++++++++++------ 1 file changed, 86 insertions(+), 42 deletions(-) diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu index 9105f335b..dbad934b5 100644 --- a/devices/rtx/device/geometry/ComputeTangent.cu +++ b/devices/rtx/device/geometry/ComputeTangent.cu @@ -52,6 +52,49 @@ namespace { constexpr const auto eps = 1e-8f; +__device__ glm::vec3 safeNormalize( + const glm::vec3 &v, const glm::vec3 &fallback) +{ + const float l2 = glm::dot(v, v); + return l2 > eps ? v * rsqrtf(l2) : fallback; +} + +__device__ void makeTangentFrame( + const glm::vec3 &normal, glm::vec3 *tangent, glm::vec3 *bitangent) +{ + // https://graphics.pixar.com/library/OrthonormalB/paper.pdf + const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f)); + const float sign = n.z >= 0.0f ? 1.0f : -1.0f; + const float a = -1.0f / (sign + n.z); + const float b = n.x * n.y * a; + *tangent = glm::vec3(1.0f + sign * n.x * n.x * a, sign * b, -sign * n.x); + *bitangent = glm::vec3(b, sign + n.y * n.y * a, -n.y); +} + +__device__ glm::vec3 computeGeometricNormal( + const glm::vec3 &e1, const glm::vec3 &e2) +{ + return safeNormalize(glm::cross(e1, e2), glm::vec3(0.f, 0.f, 1.f)); +} + +__device__ void orthogonalizeTangent(const glm::vec3 &tangent, + const glm::vec3 &bitangent, + const glm::vec3 &normal, + glm::vec3 *outTangent, + float *outHandedness) +{ + glm::vec3 fallbackTangent; + glm::vec3 fallbackBitangent; + makeTangentFrame(normal, &fallbackTangent, &fallbackBitangent); + + const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f)); + *outTangent = + safeNormalize(tangent - n * glm::dot(n, tangent), fallbackTangent); + + const float bitangentSign = glm::dot(glm::cross(n, *outTangent), bitangent); + *outHandedness = bitangentSign < 0.0f ? -1.0f : 1.0f; +} + void cudaFreeMemoryDeleter(const void *, const void *memory) { cudaFree(const_cast(memory)); @@ -88,32 +131,22 @@ __device__ void __computeTangentAndBitangent( // Compute edges of the triangle glm::vec3 e1 = p1 - p0; glm::vec3 e2 = p2 - p0; + const auto normal = computeGeometricNormal(e1, e2); - if (dot(e1, e1) < eps || dot(e2, e2) < eps) { - // Degenerate triangle, use a default tangent and bitangent - *tangent = glm::vec3(1.0f, 0.0f, 0.0f); - *bitangent = glm::vec3(0.0f, 1.0f, 0.0f); + if (glm::dot(e1, e1) < eps || glm::dot(e2, e2) < eps) { + makeTangentFrame(normal, tangent, bitangent); } else { - auto normal = normalize(cross(e1, e2)); - // Compute differences in texture coordinates auto s = uv1 - uv0; auto t = uv2 - uv0; - auto cross = s.x * t.y - s.y * t.x; - - if (abs(cross) < eps) { // degenerate triangle (null vectors or collinears) - // Create a default orthonormal basis: - // https://graphics.pixar.com/library/OrthonormalB/paper.pdf - float sign = normal.z >= 0.0f ? 1.0f : -1.0f; - float a = -1.0f / (sign + normal.z); - float b = normal.x * normal.y * a; - *tangent = glm::vec3( - 1.0f + sign * normal.x * normal.x * a, sign * b, -sign * normal.x); - *bitangent = glm::vec3(b, sign + normal.y * normal.y * a, -normal.y); + auto det = s.x * t.y - s.y * t.x; + + if (glm::abs(det) < eps) { + makeTangentFrame(normal, tangent, bitangent); } else { // Compute the determinant - float invdet = 1.0f / cross; + float invdet = 1.0f / det; *tangent = (t.y * e1 - s.y * e2) * invdet; *bitangent = (s.x * e2 - t.x * e1) * invdet; } @@ -180,28 +213,36 @@ __global__ void __doComputeTangents( uv2 // Input texture coordinates ); - vec3 n0, n1, n2; - if constexpr (NormalsIndexed) { - // Use indexed normals - n0 = normals[indexedIdx.x]; - n1 = normals[indexedIdx.y]; - n2 = normals[indexedIdx.z]; - } else { - // Use per-face normals - n0 = normals[perFaceBaseIdx.x]; - n1 = normals[perFaceBaseIdx.y]; - n2 = normals[perFaceBaseIdx.z]; + const vec3 geometricNormal = computeGeometricNormal(p1 - p0, p2 - p0); + vec3 n0 = geometricNormal; + vec3 n1 = geometricNormal; + vec3 n2 = geometricNormal; + if (normals) { + if constexpr (NormalsIndexed) { + // Use indexed normals + n0 = normals[indexedIdx.x]; + n1 = normals[indexedIdx.y]; + n2 = normals[indexedIdx.z]; + } else { + // Use per-face normals + n0 = normals[perFaceBaseIdx.x]; + n1 = normals[perFaceBaseIdx.y]; + n2 = normals[perFaceBaseIdx.z]; + } + n0 = safeNormalize(n0, geometricNormal); + n1 = safeNormalize(n1, geometricNormal); + n2 = safeNormalize(n2, geometricNormal); } - // Gram-Schmidt orthogonalize and compute handedness - vec3 t0 = normalize(tangent - n0 * dot(n0, tangent)); - float h0 = copysign(1.0f, dot(cross(n0, t0), bitangent)); - - vec3 t1 = normalize(tangent - n1 * dot(n1, tangent)); - float h1 = copysign(1.0f, dot(cross(n1, t1), bitangent)); - - vec3 t2 = normalize(tangent - n2 * dot(n2, tangent)); - float h2 = copysign(1.0f, dot(cross(n2, t2), bitangent)); + vec3 t0; + vec3 t1; + vec3 t2; + float h0; + float h1; + float h2; + orthogonalizeTangent(tangent, bitangent, n0, &t0, &h0); + orthogonalizeTangent(tangent, bitangent, n1, &t1, &h1); + orthogonalizeTangent(tangent, bitangent, n2, &t2, &h2); tangents[perFaceBaseIdx.x] = glm::vec4(t0, h0); tangents[perFaceBaseIdx.y] = glm::vec4(t1, h1); @@ -235,9 +276,9 @@ void updateGeometryTangent(Triangle *triangle) auto normalsFV = triangle->getParamObject("faceVarying.normal"); auto uvsFV = triangle->getParamObject("faceVarying.attribute0"); - if (!positions || (!normals && !normalsFV)) { + if (!positions) { triangle->reportMessage(ANARI_SEVERITY_INFO, - "Triangle %p has no position or normals, cannot compute tangents", + "Triangle %p has no positions, cannot compute tangents", triangle); return; } @@ -339,7 +380,9 @@ void updateGeometryTangent(Triangle *triangle) } } } else { - auto normalsPtr = normals->dataAs(AddressSpace::GPU); + const auto *normalsPtr = normals + ? normals->dataAs(AddressSpace::GPU) + : nullptr; if (uvsFV) { if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); @@ -387,7 +430,8 @@ void updateGeometryTangent(Triangle *triangle) normals = normalsFV ? normalsFV : normals; uvs = uvsFV ? uvsFV : uvs; - auto normalsPtr = normals->dataAs(AddressSpace::GPU); + const auto *normalsPtr = + normals ? normals->dataAs(AddressSpace::GPU) : nullptr; if (uvs->elementType() == ANARI_FLOAT32_VEC2) { // Non indexed vertices, face varying normals and face varyings vec2 UVs. From e9c2b2b86682c83de083a5d55ec3e561946d3a5a Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 6 May 2026 19:36:21 +0000 Subject: [PATCH 06/20] rtx: Compute per-vertex tangents via angle-weighted accumulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-triangle T at a shared corner depended on each face's own UV gradient, so orthogonalizing against the same vertex normal still produced different tangents per face — the shading frame jumped at every triangle edge. Replace with two passes: pass 1 atomically accumulates angle-weighted T/B/N into per-vertex slots (MikkTSpace's averaging scheme); pass 2 normalizes and orthogonalizes. Output moves from face-varying (3*numTri) to per-vertex 'vertex.tangent'. UV mirror seams must be vertex-split — same constraint as MikkTSpace defaults. --- devices/rtx/device/geometry/ComputeTangent.cu | 355 ++++++++++++------ 1 file changed, 241 insertions(+), 114 deletions(-) diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu index dbad934b5..f6c1f1f56 100644 --- a/devices/rtx/device/geometry/ComputeTangent.cu +++ b/devices/rtx/device/geometry/ComputeTangent.cu @@ -77,22 +77,30 @@ __device__ glm::vec3 computeGeometricNormal( return safeNormalize(glm::cross(e1, e2), glm::vec3(0.f, 0.f, 1.f)); } -__device__ void orthogonalizeTangent(const glm::vec3 &tangent, - const glm::vec3 &bitangent, - const glm::vec3 &normal, - glm::vec3 *outTangent, - float *outHandedness) +// Each face-vertex's contribution to its vertex's accumulated tangent frame +// is weighted by the triangle's interior angle at that corner — same scheme +// MikkTSpace uses to average across incident faces. Angle weighting (over +// uniform or area) keeps thin sliver triangles from dominating shared +// vertices. +__device__ float cornerAngle( + const glm::vec3 &a, const glm::vec3 &b, const glm::vec3 &c) { - glm::vec3 fallbackTangent; - glm::vec3 fallbackBitangent; - makeTangentFrame(normal, &fallbackTangent, &fallbackBitangent); - - const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f)); - *outTangent = - safeNormalize(tangent - n * glm::dot(n, tangent), fallbackTangent); + const glm::vec3 ab = b - a; + const glm::vec3 ac = c - a; + const float lab = sqrtf(glm::dot(ab, ab)); + const float lac = sqrtf(glm::dot(ac, ac)); + if (lab < eps || lac < eps) + return 0.0f; + const float cosT = + glm::clamp(glm::dot(ab, ac) / (lab * lac), -1.0f, 1.0f); + return acosf(cosT); +} - const float bitangentSign = glm::dot(glm::cross(n, *outTangent), bitangent); - *outHandedness = bitangentSign < 0.0f ? -1.0f : 1.0f; +__device__ void atomicAddVec3(glm::vec3 &dst, const glm::vec3 &v) +{ + atomicAdd(&dst.x, v.x); + atomicAdd(&dst.y, v.y); + atomicAdd(&dst.z, v.z); } void cudaFreeMemoryDeleter(const void *, const void *memory) @@ -135,36 +143,44 @@ __device__ void __computeTangentAndBitangent( if (glm::dot(e1, e1) < eps || glm::dot(e2, e2) < eps) { makeTangentFrame(normal, tangent, bitangent); - } else { - // Compute differences in texture coordinates - auto s = uv1 - uv0; - auto t = uv2 - uv0; + return; + } - auto det = s.x * t.y - s.y * t.x; + // Compute differences in texture coordinates + auto s = uv1 - uv0; + auto t = uv2 - uv0; - if (glm::abs(det) < eps) { - makeTangentFrame(normal, tangent, bitangent); - } else { - // Compute the determinant - float invdet = 1.0f / det; - *tangent = (t.y * e1 - s.y * e2) * invdet; - *bitangent = (s.x * e2 - t.x * e1) * invdet; - } + auto det = s.x * t.y - s.y * t.x; + + if (glm::abs(det) < eps) { + makeTangentFrame(normal, tangent, bitangent); + return; } + + float invdet = 1.0f / det; + *tangent = (t.y * e1 - s.y * e2) * invdet; + *bitangent = (s.x * e2 - t.x * e1) * invdet; } +// Pass 1 (one thread per triangle): compute the per-triangle T/B from the UV +// gradient, then atomicAdd those vectors into per-vertex accumulators — +// weighted by the triangle's interior angle at each corner. Per-vertex +// normals are accumulated the same way so Pass 2 has a coordinate frame to +// orthogonalize against, regardless of whether input normals are vertex, +// face-varying, or absent. template -__global__ void __doComputeTangents( - glm::vec4 *tangents, // Output tangent vectors with handedness (w component) - const glm::uvec3 *indices, // Input triangle indices - const glm::vec3 *positions, // Input vertex positions - const glm::vec3 *normals, // Input vertex normals - const TexCoord *uvs, // Input texture coordinates - unsigned int numTriangles // Number of triangles -) +__global__ void __doAccumulateTangents( + glm::vec3 *tangentAccum, + glm::vec3 *bitangentAccum, + glm::vec3 *normalAccum, + const glm::uvec3 *indices, + const glm::vec3 *positions, + const glm::vec3 *normals, + const TexCoord *uvs, + unsigned int numTriangles) { unsigned int tri = blockIdx.x * blockDim.x + threadIdx.x; @@ -189,29 +205,18 @@ __global__ void __doComputeTangents( vec2 uv0, uv1, uv2; if constexpr (UVsIndexed) { - // Use indexed UVs uv0 = uvs[indexedIdx.x]; uv1 = uvs[indexedIdx.y]; uv2 = uvs[indexedIdx.z]; } else { - // Use per-face UVs uv0 = uvs[perFaceBaseIdx.x]; uv1 = uvs[perFaceBaseIdx.y]; uv2 = uvs[perFaceBaseIdx.z]; } vec3 tangent, bitangent; - __computeTangentAndBitangent( - &tangent, // Output tangent vectors with handedness (w component) - &bitangent, // Output bitangent vectors - p0, - p1, - p2, // Input vertex positions - uv0, - uv1, - uv2 // Input texture coordinates - ); + &tangent, &bitangent, p0, p1, p2, uv0, uv1, uv2); const vec3 geometricNormal = computeGeometricNormal(p1 - p0, p2 - p0); vec3 n0 = geometricNormal; @@ -219,12 +224,10 @@ __global__ void __doComputeTangents( vec3 n2 = geometricNormal; if (normals) { if constexpr (NormalsIndexed) { - // Use indexed normals n0 = normals[indexedIdx.x]; n1 = normals[indexedIdx.y]; n2 = normals[indexedIdx.z]; } else { - // Use per-face normals n0 = normals[perFaceBaseIdx.x]; n1 = normals[perFaceBaseIdx.y]; n2 = normals[perFaceBaseIdx.z]; @@ -234,19 +237,26 @@ __global__ void __doComputeTangents( n2 = safeNormalize(n2, geometricNormal); } - vec3 t0; - vec3 t1; - vec3 t2; - float h0; - float h1; - float h2; - orthogonalizeTangent(tangent, bitangent, n0, &t0, &h0); - orthogonalizeTangent(tangent, bitangent, n1, &t1, &h1); - orthogonalizeTangent(tangent, bitangent, n2, &t2, &h2); - - tangents[perFaceBaseIdx.x] = glm::vec4(t0, h0); - tangents[perFaceBaseIdx.y] = glm::vec4(t1, h1); - tangents[perFaceBaseIdx.z] = glm::vec4(t2, h2); + // For indexed meshes, accumulate at the shared vertex slot so adjacent + // triangles average their contributions. For triangle-soup each face-vertex + // already has a unique slot. + const glm::uvec3 outIdx = VerticesIndexed ? indexedIdx : perFaceBaseIdx; + + const float w0 = cornerAngle(p0, p1, p2); + const float w1 = cornerAngle(p1, p0, p2); + const float w2 = cornerAngle(p2, p0, p1); + + atomicAddVec3(tangentAccum[outIdx.x], tangent * w0); + atomicAddVec3(tangentAccum[outIdx.y], tangent * w1); + atomicAddVec3(tangentAccum[outIdx.z], tangent * w2); + + atomicAddVec3(bitangentAccum[outIdx.x], bitangent * w0); + atomicAddVec3(bitangentAccum[outIdx.y], bitangent * w1); + atomicAddVec3(bitangentAccum[outIdx.z], bitangent * w2); + + atomicAddVec3(normalAccum[outIdx.x], n0 * w0); + atomicAddVec3(normalAccum[outIdx.y], n1 * w1); + atomicAddVec3(normalAccum[outIdx.z], n2 * w2); } template void __computeTangents( - glm::vec4 *tangents, // Output tangent vectors with handedness (w component) - const glm::uvec3 *indices, // Input triangle indices - const glm::vec3 *positions, // Input vertex positions - const glm::vec3 *normals, // Input vertex normals - const TexCoord *uvs, // Input texture coordinates - unsigned int numTriangles // Number of triangles -) + glm::vec3 *tangentAccum, + glm::vec3 *bitangentAccum, + glm::vec3 *normalAccum, + const glm::uvec3 *indices, + const glm::vec3 *positions, + const glm::vec3 *normals, + const TexCoord *uvs, + unsigned int numTriangles) { - __doComputeTangents - <<<(numTriangles + 63) / 64, 64>>>( - tangents, indices, positions, normals, uvs, numTriangles); + __doAccumulateTangents<<<(numTriangles + 63) / 64, 64>>>(tangentAccum, + bitangentAccum, + normalAccum, + indices, + positions, + normals, + uvs, + numTriangles); +} + +// Pass 2 (one thread per vertex): normalize the accumulated frame and write +// vec4(T_orthog, sign). The accumulated normal is used as the orthogonalization +// basis; for vertex-indexed input it averages back to each vertex's authored +// normal, and for face-varying or missing normals it gives the angle-weighted +// average across incident faces. +__global__ void __doFinalizeTangents(glm::vec4 *tangents, + const glm::vec3 *tangentAccum, + const glm::vec3 *bitangentAccum, + const glm::vec3 *normalAccum, + unsigned int numVertices) +{ + unsigned int v = blockIdx.x * blockDim.x + threadIdx.x; + + if (v >= numVertices) + return; + + const vec3 T_in = tangentAccum[v]; + const vec3 B_in = bitangentAccum[v]; + const vec3 N_in = normalAccum[v]; + + const vec3 n = safeNormalize(N_in, vec3(0.0f, 0.0f, 1.0f)); + + vec3 fallbackT, fallbackB; + makeTangentFrame(n, &fallbackT, &fallbackB); + + const vec3 T_orth = + safeNormalize(T_in - n * glm::dot(n, T_in), fallbackT); + + const float bitangentSign = glm::dot(glm::cross(n, T_orth), B_in); + const float sign = bitangentSign < 0.0f ? -1.0f : 1.0f; + + tangents[v] = glm::vec4(T_orth, sign); } void updateGeometryTangent(Triangle *triangle) @@ -306,30 +359,71 @@ void updateGeometryTangent(Triangle *triangle) return; } - // Always go with faceVarying tangents. Rational is the following: - // - Correct UV and normal sharing is achieve through indexing - // - If faceVarying UVs/normals are used then, it should already imply correct - // sharing - // on common vertices. - - auto tangentsCount = indices ? (indices->size() * 3) : positions->size(); - auto trianglesCount = indices ? indices->size() : positions->size() / 3; - if (trianglesCount == 0 || tangentsCount == 0) { + // Output is per-vertex (vertex.tangent). For indexed meshes the per-vertex + // buffer is what lets adjacent triangles share tangent data at common + // vertices — that sharing is what eliminates the per-triangle facets a + // face-varying buffer would produce. For triangle-soup input each face-vertex + // is its own slot, so the same layout works without changes. + const auto numVertices = static_cast(positions->size()); + const auto trianglesCount = static_cast( + indices ? indices->size() : positions->size() / 3); + if (trianglesCount == 0 || numVertices == 0) { triangle->reportMessage(ANARI_SEVERITY_INFO, "Triangle %p has no triangles, cannot compute tangents", triangle); return; } - glm::vec4 *tangents = {}; - auto status = cudaMalloc( - reinterpret_cast(&tangents), sizeof(glm::vec4) * tangentsCount); - if (reportCudaError(triangle, status, "allocating tangent buffer")) - return; + glm::vec3 *tangentAccum = nullptr; + glm::vec3 *bitangentAccum = nullptr; + glm::vec3 *normalAccum = nullptr; + glm::vec4 *tangents = nullptr; - status = cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount); - if (reportCudaError(triangle, status, "clearing tangent buffer")) { + auto cleanup = [&] { + cudaFree(tangentAccum); + cudaFree(bitangentAccum); + cudaFree(normalAccum); cudaFree(tangents); + }; + + auto status = cudaMalloc(reinterpret_cast(&tangentAccum), + sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "allocating tangent accumulator")) { + cleanup(); + return; + } + status = cudaMalloc(reinterpret_cast(&bitangentAccum), + sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "allocating bitangent accumulator")) { + cleanup(); + return; + } + status = cudaMalloc(reinterpret_cast(&normalAccum), + sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "allocating normal accumulator")) { + cleanup(); + return; + } + status = cudaMalloc( + reinterpret_cast(&tangents), sizeof(glm::vec4) * numVertices); + if (reportCudaError(triangle, status, "allocating tangent output buffer")) { + cleanup(); + return; + } + + status = cudaMemset(tangentAccum, 0, sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "clearing tangent accumulator")) { + cleanup(); + return; + } + status = cudaMemset(bitangentAccum, 0, sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "clearing bitangent accumulator")) { + cleanup(); + return; + } + status = cudaMemset(normalAccum, 0, sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "clearing normal accumulator")) { + cleanup(); return; } @@ -342,7 +436,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, face varying normals and face varyings vec2 UVs. - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -351,7 +447,9 @@ void updateGeometryTangent(Triangle *triangle) } else { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, face varying normals and face varyings vec3 UVs. - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -362,7 +460,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvs->elementType() == ANARI_FLOAT32_VEC2) { // Vertex indexed, face varying normals and indexed vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -371,7 +471,9 @@ void updateGeometryTangent(Triangle *triangle) } else { // Vertex indexed, face varying normals and indexed vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -387,7 +489,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, index normals and face varyings vec2 UVs. - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -396,7 +500,9 @@ void updateGeometryTangent(Triangle *triangle) } else { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, indexed normals and face varyings vec3 UVs. - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -407,7 +513,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvs->elementType() == ANARI_FLOAT32_VEC2) { // Vertex indexed, indexed normals and indexed vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -416,7 +524,9 @@ void updateGeometryTangent(Triangle *triangle) } else { // Vertex indexed, indexed normals and indexed vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -426,17 +536,20 @@ void updateGeometryTangent(Triangle *triangle) } } } else { - auto indicesPtr = nullptr; - normals = normalsFV ? normalsFV : normals; - uvs = uvsFV ? uvsFV : uvs; + const glm::uvec3 *indicesPtr = nullptr; + auto effectiveNormals = normalsFV ? normalsFV : normals; + auto effectiveUvs = uvsFV ? uvsFV : uvs; - const auto *normalsPtr = - normals ? normals->dataAs(AddressSpace::GPU) : nullptr; + const auto *normalsPtr = effectiveNormals + ? effectiveNormals->dataAs(AddressSpace::GPU) + : nullptr; - if (uvs->elementType() == ANARI_FLOAT32_VEC2) { + if (effectiveUvs->elementType() == ANARI_FLOAT32_VEC2) { // Non indexed vertices, face varying normals and face varyings vec2 UVs. - auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, + auto uvsPtr = effectiveUvs->dataAs(AddressSpace::GPU); + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -444,8 +557,10 @@ void updateGeometryTangent(Triangle *triangle) trianglesCount); } else { // Non indexed vertices, face varying normals and face varyings vec3 UVs. - auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, + auto uvsPtr = effectiveUvs->dataAs(AddressSpace::GPU); + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -455,14 +570,29 @@ void updateGeometryTangent(Triangle *triangle) } status = cudaGetLastError(); - if (reportCudaError(triangle, status, "launching tangent kernel")) { - cudaFree(tangents); + if (reportCudaError(triangle, status, "launching accumulate kernel")) { + cleanup(); + return; + } + + __doFinalizeTangents<<<(numVertices + 63) / 64, 64>>>( + tangents, tangentAccum, bitangentAccum, normalAccum, numVertices); + + status = cudaGetLastError(); + if (reportCudaError(triangle, status, "launching finalize kernel")) { + cleanup(); return; } status = cudaDeviceSynchronize(); + cudaFree(tangentAccum); + cudaFree(bitangentAccum); + cudaFree(normalAccum); + tangentAccum = nullptr; + bitangentAccum = nullptr; + normalAccum = nullptr; if (reportCudaError(triangle, status, "computing tangents")) { - cudaFree(tangents); + cleanup(); return; } @@ -473,16 +603,13 @@ void updateGeometryTangent(Triangle *triangle) {}, // deleterPtr ANARI_FLOAT32_VEC4, }, - tangentsCount, + numVertices, }; auto tangentsArray = new Array1D(triangle->deviceState(), desc); tangentsArray->commitParameters(); tangentsArray->finalize(); - if (indices) - triangle->setParam("faceVarying.tangent", tangentsArray); - else - triangle->setParam("vertex.tangent", tangentsArray); + triangle->setParam("vertex.tangent", tangentsArray); triangle->commitParameters(); triangle->finalize(); From 8d2595d47c51f13a25c534670fc92f6732b73013 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 11:40:03 -0400 Subject: [PATCH 07/20] rtx: Read full vec4 tangent and interpolate handedness Tangent fetch was discarding .w (vec3 cast on a vec4 array) and reading handedness from vertex 0 alone. Read the vec4 properly and barycentric- interpolate the sign across the corners, then quantize to +-1 for the basis flip. --- devices/rtx/device/gpu/populateHit.h | 30 ++++++++++++++++------------ 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h index 4930c05e5..c2e2f652e 100644 --- a/devices/rtx/device/gpu/populateHit.h +++ b/devices/rtx/device/gpu/populateHit.h @@ -197,9 +197,11 @@ VISRTX_DEVICE const VolumeGPUData &volumeData(const FrameGPUData &frameData) return frameData.registry.volumes[idx]; } -VISRTX_DEVICE const SpatialFieldGPUData &fieldData(const FrameGPUData &frameData, const VolumeGPUData &volumeData) +VISRTX_DEVICE const SpatialFieldGPUData &fieldData( + const FrameGPUData &frameData, const VolumeGPUData &volumeData) { - // Currently only TF1D volume type is supported, so assume this is what we have + // Currently only TF1D volume type is supported, so assume this is what we + // have return frameData.registry.fields[volumeData.data.tf1d.field]; } @@ -243,20 +245,22 @@ VISRTX_DEVICE void computeTangentSpace( if (ggd.tri.vertexTangentsFV != nullptr) { const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3); - const vec3 t0 = ggd.tri.vertexTangentsFV[idx.x]; - const vec3 t1 = ggd.tri.vertexTangentsFV[idx.y]; - const vec3 t2 = ggd.tri.vertexTangentsFV[idx.z]; - const float handedness = ggd.tri.vertexTangentsFV[idx.x].w; + const vec4 t0 = ggd.tri.vertexTangentsFV[idx.x]; + const vec4 t1 = ggd.tri.vertexTangentsFV[idx.y]; + const vec4 t2 = ggd.tri.vertexTangentsFV[idx.z]; + const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w; + const float sign = handedness < 0.f ? -1.f : 1.f; hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2)); - hit.tV = handedness * normalize(cross(hit.Ns, hit.tU)); + hit.tV = sign * normalize(cross(hit.Ns, hit.tU)); } else if (ggd.tri.vertexTangents != nullptr) { - const vec3 t0 = ggd.tri.vertexTangents[idx.x]; - const vec3 t1 = ggd.tri.vertexTangents[idx.y]; - const vec3 t2 = ggd.tri.vertexTangents[idx.z]; - const float handedness = ggd.tri.vertexTangents[idx.x].w; + const vec4 t0 = ggd.tri.vertexTangents[idx.x]; + const vec4 t1 = ggd.tri.vertexTangents[idx.y]; + const vec4 t2 = ggd.tri.vertexTangents[idx.z]; + const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w; + const float sign = handedness < 0.f ? -1.f : 1.f; - hit.tU = normalize(b.x * t0 + b.y * t1 + b.z * t2); - hit.tV = handedness * normalize(cross(hit.Ns, hit.tU)); + hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2)); + hit.tV = sign * normalize(cross(hit.Ns, hit.tU)); } else { auto tangentSpace = computeOrthonormalBasis(hit.Ng); hit.tU = tangentSpace[0]; From a7d098806e040552e23a690673e735663bc839e0 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 6 May 2026 19:36:08 +0000 Subject: [PATCH 08/20] rtx: NaN-safe shading and per-vertex bitangent blending Numerical fixes: - Rewrite ggxD denom as alpha2*x + (1-x) instead of x*(alpha2-1) + 1. The textbook form cancels catastrophically when alpha2 < eps(1) and collapses to zero at x=1, producing NaN throughput. - Negate NdotL early-outs so a NaN takes the rejection path instead of slipping through (NaN compares false to <=, > etc.). - Fall back from Ns to Ng in Matte and PBR shading state when the normal length squared is non-positive (catches both NaN and zero). Mirror-seam tangents: - Compute per-vertex bitangents B_i = t_i.w * cross(N_i, T_i) and barycentric-blend B and T independently, instead of blending t.w signs and applying once at the hit. Matches glTF Sample Renderer / PBRT / Filament; avoids carving seam edges into the tangent frame. --- devices/rtx/device/gpu/populateHit.h | 60 +++++++++++-------- .../material/shaders/MatteShader_ptx.cu | 6 ++ .../shaders/PhysicallyBasedShader_ptx.cu | 21 ++++++- 3 files changed, 61 insertions(+), 26 deletions(-) diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h index c2e2f652e..c43a0b17f 100644 --- a/devices/rtx/device/gpu/populateHit.h +++ b/devices/rtx/device/gpu/populateHit.h @@ -225,42 +225,54 @@ VISRTX_DEVICE void computeTangentSpace( if (!optixIsFrontFaceHit()) hit.Ng = -hit.Ng; + const bool hasVertexNormals = ggd.tri.vertexNormalsFV != nullptr + || ggd.tri.vertexNormals != nullptr; + vec3 n0, n1, n2; if (ggd.tri.vertexNormalsFV != nullptr) { - const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3); + const uvec3 nIdx = uvec3(0, 1, 2) + (hit.primID * 3); - const vec3 n0 = ggd.tri.vertexNormalsFV[idx.x]; - const vec3 n1 = ggd.tri.vertexNormalsFV[idx.y]; - const vec3 n2 = ggd.tri.vertexNormalsFV[idx.z]; + n0 = ggd.tri.vertexNormalsFV[nIdx.x]; + n1 = ggd.tri.vertexNormalsFV[nIdx.y]; + n2 = ggd.tri.vertexNormalsFV[nIdx.z]; hit.Ns = b.x * n0 + b.y * n1 + b.z * n2; } else if (ggd.tri.vertexNormals != nullptr) { - const vec3 n0 = ggd.tri.vertexNormals[idx.x]; - const vec3 n1 = ggd.tri.vertexNormals[idx.y]; - const vec3 n2 = ggd.tri.vertexNormals[idx.z]; + n0 = ggd.tri.vertexNormals[idx.x]; + n1 = ggd.tri.vertexNormals[idx.y]; + n2 = ggd.tri.vertexNormals[idx.z]; hit.Ns = b.x * n0 + b.y * n1 + b.z * n2; } else hit.Ns = hit.Ng; hit.Ns = normalize(hit.Ns); - if (ggd.tri.vertexTangentsFV != nullptr) { - const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3); + const bool hasTangentsFV = ggd.tri.vertexTangentsFV != nullptr; + const bool hasTangentsV = ggd.tri.vertexTangents != nullptr; + if (hasTangentsFV || hasTangentsV) { + const uvec3 tIdx = hasTangentsFV + ? uvec3(0, 1, 2) + (hit.primID * 3) + : idx; + const vec4 *tArr = hasTangentsFV + ? ggd.tri.vertexTangentsFV + : ggd.tri.vertexTangents; + const vec4 t0 = tArr[tIdx.x]; + const vec4 t1 = tArr[tIdx.y]; + const vec4 t2 = tArr[tIdx.z]; + + // At UV mirror seams the sign flips between adjacent vertices; + // barycentric-summing the signs and applying a single sign at the + // hit point would carve seam edges into the tangent frame. + // Build each vertex's bitangent with its own sign and normal, + // then blend B and T independently — same convention + // as glTF Sample Renderer, PBRT, Filament. + const vec3 N0 = hasVertexNormals ? n0 : hit.Ng; + const vec3 N1 = hasVertexNormals ? n1 : hit.Ng; + const vec3 N2 = hasVertexNormals ? n2 : hit.Ng; + const vec3 B0 = t0.w * cross(N0, vec3(t0)); + const vec3 B1 = t1.w * cross(N1, vec3(t1)); + const vec3 B2 = t2.w * cross(N2, vec3(t2)); - const vec4 t0 = ggd.tri.vertexTangentsFV[idx.x]; - const vec4 t1 = ggd.tri.vertexTangentsFV[idx.y]; - const vec4 t2 = ggd.tri.vertexTangentsFV[idx.z]; - const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w; - const float sign = handedness < 0.f ? -1.f : 1.f; hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2)); - hit.tV = sign * normalize(cross(hit.Ns, hit.tU)); - } else if (ggd.tri.vertexTangents != nullptr) { - const vec4 t0 = ggd.tri.vertexTangents[idx.x]; - const vec4 t1 = ggd.tri.vertexTangents[idx.y]; - const vec4 t2 = ggd.tri.vertexTangents[idx.z]; - const float handedness = b.x * t0.w + b.y * t1.w + b.z * t2.w; - const float sign = handedness < 0.f ? -1.f : 1.f; - - hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2)); - hit.tV = sign * normalize(cross(hit.Ns, hit.tU)); + hit.tV = normalize(b.x * B0 + b.y * B1 + b.z * B2); } else { auto tangentSpace = computeOrthonormalBasis(hit.Ng); hit.tU = tangentSpace[0]; diff --git a/devices/rtx/device/material/shaders/MatteShader_ptx.cu b/devices/rtx/device/material/shaders/MatteShader_ptx.cu index 06429e5fa..371a4cb1c 100644 --- a/devices/rtx/device/material/shaders/MatteShader_ptx.cu +++ b/devices/rtx/device/material/shaders/MatteShader_ptx.cu @@ -48,6 +48,12 @@ VISRTX_CALLABLE void __direct_callable__init(MatteShadingState *shadingState, shadingState->normal = hit->Ns; shadingState->opacity = adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff); + + // Fall back to the geometric normal if hit->Ns is NaN (e.g. coincident + // curve control points) or zero-length. Negated comparison catches both + // since NaN compares false to anything. + if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f)) + shadingState->normal = hit->Ng; } VISRTX_CALLABLE NextRay __direct_callable__nextRay( diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index 5e710b729..b3d834e01 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -116,7 +116,13 @@ VISRTX_DEVICE float smithG1GGX(float NdotV, float alpha2) VISRTX_DEVICE float ggxD(float NdotH, float alpha2) { - const float denom = NdotH * NdotH * (alpha2 - 1.0f) + 1.0f; + // The textbook denom `x·(α²−1) + 1` cancels catastrophically in fp32 once + // α² is below eps(1) ≈ 1.19e-7 (our α² floor is 1e-8): `α²−1` rounds to + // exactly −1, and at x=1 the whole denom collapses to 0. The algebraically + // equivalent `α²·x + (1−x)` has no near-1 subtraction so it stays exact. + // The fminf clamp keeps `1−x ≥ 0` against dot-product rounding above 1. + const float NdotH2 = fminf(NdotH * NdotH, 1.0f); + const float denom = alpha2 * NdotH2 + (1.0f - NdotH2); return alpha2 / (float(M_PI) * denom * denom); } @@ -313,6 +319,15 @@ VISRTX_CALLABLE void __direct_callable__init( shadingState->iridescenceIor = md->iridescenceIor; shadingState->iridescenceThickness = getMaterialParameter(*fd, md->iridescenceThickness, *hit).x; + + // Fall back to the geometric normal if sampleNormalMap produced NaN + // (texel decodes to zero, or zero-summed tangents) or zero-length. The + // negated comparison catches both since NaN compares false to anything. + if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f)) + shadingState->normal = hit->Ng; + if (!(glm::dot(shadingState->clearcoatNormal, shadingState->clearcoatNormal) + > 1e-12f)) + shadingState->clearcoatNormal = hit->Ng; } //----------------------------------------------------------------------------- @@ -390,7 +405,9 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( const vec3 L = lightSample->dir; const float NdotL = dot(N, L); - if (NdotL <= 0.0f) + // Negated form so a NaN NdotL takes this early-out — NaN compares false + // to everything, so `NdotL <= 0.0f` would let it pass through. + if (!(NdotL > 0.0f)) return vec3(0.0f); const vec3 H = normalize(L + V); From cc40202017ee867e5e84ca26aabf60926004becb Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 6 May 2026 19:35:40 +0000 Subject: [PATCH 09/20] rtx: Gate Quality renderer direct light on Ns instead of Ng The lightDotNg gate and the ambient hemisphere normal both used Ng, which carved per-triangle facet shapes into the lit/unlit boundary at grazing angles on smooth-normal meshes. Switch both to Ns so the terminator follows the shading surface; the material's own NdotL guard still rejects light from below. --- devices/rtx/device/renderer/Quality_ptx.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu index fe319255c..8e3b84ce1 100644 --- a/devices/rtx/device/renderer/Quality_ptx.cu +++ b/devices/rtx/device/renderer/Quality_ptx.cu @@ -360,10 +360,13 @@ VISRTX_GLOBAL void __raygen__() sample.color += sampleContribution * materialEmission * materialOpacity; LightSample lightSample = - sampleLights(ss, frameData, surfaceHit.hitpoint, surfaceHit.Ng); + sampleLights(ss, frameData, shadowOrigin, surfaceHit.Ns); if (lightSample.pdf >= ATTENUATION_EPSILON && lightSample.dist > 0.0f) { - const float lightDotNg = dot(lightSample.dir, surfaceHit.Ng); - if (lightDotNg > 0.0f) { + // Gate on the shading normal so the terminator follows the smooth + // surface; gating on Ng would carve the per-triangle facet shape + // into the lit/unlit boundary at grazing light angles. + const float lightDotNs = dot(lightSample.dir, surfaceHit.Ns); + if (lightDotNs > 0.0f) { const Ray shadowRay = { surfaceHit.hitpoint + surfaceHit.Ng * surfaceHit.epsilon, lightSample.dir, From f3d7c064868dfa05e8b1697561c2e4c093e64d1d Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 14:55:58 -0400 Subject: [PATCH 10/20] rtx: Fix biased uniform-sphere and cosine-hemisphere samplers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit randomDir was normalizing a cube-uniform vector — that clusters samples toward cube corners, not uniform on the sphere. Replace with the analytic cosTheta = 1 - 2u mapping. sampleHemisphere had Malley's method inverted (z = u, r = sqrt(1-sqrt(u))) instead of (r = sqrt(u), z = sqrt(1-r^2)), biasing AO and diffuse estimates toward grazing. Fold cos(theta)/pi into the ambient LightSample pdf so it matches the corrected density. --- devices/rtx/device/gpu/gpu_util.h | 29 ++++++++++------------ devices/rtx/device/renderer/Quality_ptx.cu | 13 ++++++++-- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h index 9e17c44ba..88b3af0fb 100644 --- a/devices/rtx/device/gpu/gpu_util.h +++ b/devices/rtx/device/gpu/gpu_util.h @@ -195,18 +195,14 @@ VISRTX_DEVICE vec3 boolColor(bool pred) return pred ? vec3(0.f, 1.f, 0.f) : vec3(1.f, 0.f, 0.f); } +// Uniform on the unit sphere via Marsaglia (1972); pdf = 1/(4*pi). +// Downstream uses: isotropic volume scatter, AO/bounce hemisphere base. VISRTX_DEVICE vec3 randomDir(RandState &rs) { -#if 0 - const float r1 = curand_uniform(&rs); - const float r2 = curand_uniform(&rs); - return normalize(vec3(cos(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)), - sin(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)), - r2 * r2)); -#else - const auto r = curand_uniform4(&rs); - return normalize((2.f * vec3(r.x, r.y, r.z)) - vec3(1.f)); -#endif + const float cosTheta = 1.f - 2.f * curand_uniform(&rs); + const float sinTheta = sqrtf(fmaxf(0.f, 1.f - cosTheta * cosTheta)); + const float phi = 2.f * float(M_PI) * curand_uniform(&rs); + return vec3(sinTheta * cosf(phi), sinTheta * sinf(phi), cosTheta); } VISRTX_DEVICE vec3 randomDir(RandState &rs, const vec3 &normal) @@ -228,14 +224,15 @@ VISRTX_DEVICE mat3 computeOrthonormalBasis(const vec3 &normal) return mat3(u, v, normal); } +// Cosine-weighted hemisphere sample (Malley's method); pdf = cos(theta)/pi. VISRTX_DEVICE vec3 sampleHemisphere(RandState &rs, const vec3 &normal) { - auto z = curand_uniform(&rs); - auto r = sqrtf(1.f - sqrt(z)); - auto phi = 2.0f * float(M_PI) * curand_uniform(&rs); - - auto sample = vec3(r * cos(phi), r * sin(phi), z); - + const float u1 = curand_uniform(&rs); + const float u2 = curand_uniform(&rs); + const float r = sqrtf(u1); + const float z = sqrtf(fmaxf(0.f, 1.f - r * r)); + const float phi = 2.f * float(M_PI) * u2; + const vec3 sample(r * cosf(phi), r * sinf(phi), z); return computeOrthonormalBasis(normal) * sample; } diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu index 8e3b84ce1..dede6347d 100644 --- a/devices/rtx/device/renderer/Quality_ptx.cu +++ b/devices/rtx/device/renderer/Quality_ptx.cu @@ -153,11 +153,14 @@ VISRTX_DEVICE LightSample sampleLights(ScreenSample &ss, // last index is reserved for ambient light if it exists if (selectedIdx == world.numLightInstances) { const auto &rendererParams = frameData.renderer; + // Fold the hemisphere-sample pdf cos(theta)/pi with the uniform light pick. + const vec3 dir = sampleHemisphere(ss.rs, normal); + const float cosNs = fmaxf(0.f, dot(dir, normal)); return LightSample{ rendererParams.ambientColor * rendererParams.ambientIntensity, - sampleHemisphere(ss.rs, normal), + dir, std::numeric_limits::max(), - lightPickPdf / (2.0f * float(M_PI)), + lightPickPdf * cosNs * float(M_1_PI), }; } else { const auto &lightInstance = world.lightInstances[selectedIdx]; @@ -359,6 +362,12 @@ VISRTX_GLOBAL void __raygen__() } sample.color += sampleContribution * materialEmission * materialOpacity; + // Sample around the shading normal so the cosine-weighted hemisphere's + // pdf matches the BRDF's NdotL (which uses Ns). Sampling around Ng + // would bias the Lambertian estimator by cos_Ns/cos_Ng on smooth or + // bump-mapped surfaces. + const vec3 shadowOrigin = + shadingHitpoint(surfaceHit) + surfaceHit.Ng * surfaceHit.epsilon; LightSample lightSample = sampleLights(ss, frameData, shadowOrigin, surfaceHit.Ns); if (lightSample.pdf >= ATTENUATION_EPSILON && lightSample.dist > 0.0f) { From c3e34bacf8d65a25c31f8966f5eb3741565b0fe1 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 29 Apr 2026 16:01:21 -0400 Subject: [PATCH 11/20] rtx: Add diffuse lobe to PBR nextRay path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For opaque non-metals reflectProb=1 always, so the indirect bounce was purely glossy GGX and multi-bounce Lambertian light was missing entirely — rough dielectrics rendered black on indirect bounces. Add a third cosine-weighted diffuse lobe with importance proxy (1-F)*(1-metallic)*(1-transmission)*luminance(baseColor); existing reflect/transmit weights now divide by their (now <1) lobe-pick probability. Sampled around Ns to match shadeSurface's diffuseBRDF axis. --- .../shaders/PhysicallyBasedShader_ptx.cu | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index b3d834e01..f216565d4 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -531,36 +531,59 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( ? vec3(0.0f) : glm::max(vec3(1.0f) - F, vec3(0.0f)) * transmissionFilter; + // Diffuse importance: the Lambertian throughput collapses to + // (1-F) * baseColor * (1-metallic) * (1-transmission) * occlusion + // when sampled cosine-weighted (cos / pdf cancels with 1/pi). Mirror the + // factors used by shadeSurface's diffuseBRDF so the lobe split tracks the + // BRDF being estimated. TIR has no diffuse share (all energy is reflected). + const vec3 diffuseEnergy = totalInternalReflection + ? vec3(0.0f) + : glm::max(vec3(1.0f) - F, vec3(0.0f)) * state->baseColor + * (1.0f - state->metallic) * (1.0f - state->transmission) + * state->occlusion; + const float reflectStrength = fmaxf(luminance(glm::max(reflectEnergy, vec3(0.0f))), 0.0f); const float transmitStrength = fmaxf(luminance(glm::max(transmitEnergy, vec3(0.0f))), 0.0f); - const float combinedStrength = reflectStrength + transmitStrength; + const float diffuseStrength = + fmaxf(luminance(glm::max(diffuseEnergy, vec3(0.0f))), 0.0f); + const float combinedStrength = + reflectStrength + transmitStrength + diffuseStrength; if (combinedStrength <= 0.0f) return NextRay{N, vec3(0.0f)}; const float reflectProb = reflectStrength / combinedStrength; - const bool sampleTransmission = curand_uniform(rs) > reflectProb; + const float transmitProb = transmitStrength / combinedStrength; + const float diffuseProb = diffuseStrength / combinedStrength; + + const float u = curand_uniform(rs); + if (u < reflectProb) { + if (Lrefl.z <= 0.0f) + return NextRay{N, vec3(0.0f)}; + const float NdotL = Lrefl.z; + const float G1 = smithG1GGX(NdotV, alpha2); + const float G2 = smithG2GGX(NdotV, NdotL, alpha2); + const vec3 weight = + reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f); + return NextRay{normalize(toWorld * Lrefl), weight}; + } - if (sampleTransmission) { + if (u < reflectProb + transmitProb) { const float NdotL = -Ltrans.z; // L points through the surface. const float G1 = smithG1GGX(NdotV, alpha2); const float G2 = smithG2GGX(NdotV, NdotL, alpha2); const vec3 weight = transmitEnergy * (G2 / fmaxf(G1, 1e-8f)) - / fmaxf(1.0f - reflectProb, 1e-8f); + / fmaxf(transmitProb, 1e-8f); return NextRay{normalize(toWorld * Ltrans), weight, NEXT_RAY_CONTINUES_THROUGH_SURFACE}; } - // Reflection. - if (Lrefl.z <= 0.0f) - return NextRay{N, vec3(0.0f)}; - - const float NdotL = Lrefl.z; - const float G1 = smithG1GGX(NdotV, alpha2); - const float G2 = smithG2GGX(NdotV, NdotL, alpha2); - const vec3 weight = - reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f); - return NextRay{normalize(toWorld * Lrefl), weight}; + // Diffuse: sample around the shading normal so pdf=cos/pi matches the BRDF's + // NdotL (same axis as shadeSurface's diffuse term). Cos and pdf cancel, + // leaving only the energy term and the lobe-pick divisor. + const vec3 wi = sampleHemisphere(*rs, N); + const vec3 weight = diffuseEnergy / fmaxf(diffuseProb, 1e-8f); + return NextRay{wi, weight}; } From 4f6ae3de7a24f877cbdc0705350e72a1dedba049 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Thu, 30 Apr 2026 07:10:07 -0400 Subject: [PATCH 12/20] rtx: Fuse prepareDenoiseInput and prepareDenoiseGuides Two sequential kernels did identical resolveSample work for color and for the albedo/normal guides. Merge into a single kernel; null-pointer checks gate the guide writes when the guides are absent. Halves the prepare-denoise dispatch overhead per frame. --- devices/rtx/device/frame/Frame.cu | 79 ++++++++++--------------------- 1 file changed, 24 insertions(+), 55 deletions(-) diff --git a/devices/rtx/device/frame/Frame.cu b/devices/rtx/device/frame/Frame.cu index e2aa825c6..b89db4074 100644 --- a/devices/rtx/device/frame/Frame.cu +++ b/devices/rtx/device/frame/Frame.cu @@ -73,8 +73,12 @@ __device__ bool resolveSample(uint32_t idx, return divisor > 0; } -__global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor, +__global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor, + const vec3 *__restrict__ accumAlbedo, + const vec3 *__restrict__ accumNormal, vec4 *__restrict__ denoiseInput, + vec3 *__restrict__ denoiseAlbedo, + vec3 *__restrict__ denoiseNormal, uvec2 size, int frameID, int checkerboardID, @@ -88,45 +92,6 @@ __global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor, int divisor; if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) { denoiseInput[idx] = vec4(0.f); - return; - } - - vec4 c = accumColor[srcIdx] / float(divisor); - if (fireflyFilter) - c = detail::inverseTonemap(c); - denoiseInput[idx] = c; -} - -void launchPrepareDenoiseInput(const vec4 *accumColor, - vec4 *denoiseInput, - uvec2 size, - int frameID, - int checkerboardID, - bool fireflyFilter, - cudaStream_t stream) -{ - const uint32_t nPixels = size.x * size.y; - const uint32_t blockSize = 256; - const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize; - prepareDenoiseInput<<>>( - accumColor, denoiseInput, size, frameID, checkerboardID, fireflyFilter); -} - -__global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo, - const vec3 *__restrict__ accumNormal, - vec3 *__restrict__ denoiseAlbedo, - vec3 *__restrict__ denoiseNormal, - uvec2 size, - int frameID, - int checkerboardID) -{ - const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= size.x * size.y) - return; - - uint32_t srcIdx; - int divisor; - if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) { if (denoiseAlbedo) denoiseAlbedo[idx] = vec3(0.f); if (denoiseNormal) @@ -135,6 +100,11 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo, } const float invDivisor = 1.0f / float(divisor); + vec4 c = accumColor[srcIdx] * invDivisor; + if (fireflyFilter) + c = detail::inverseTonemap(c); + denoiseInput[idx] = c; + if (denoiseAlbedo) denoiseAlbedo[idx] = accumAlbedo[srcIdx] * invDivisor; @@ -146,25 +116,31 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo, } } -void launchPrepareDenoiseGuides(const vec3 *accumAlbedo, +void launchPrepareDenoiseInputs(const vec4 *accumColor, + const vec3 *accumAlbedo, const vec3 *accumNormal, + vec4 *denoiseInput, vec3 *denoiseAlbedo, vec3 *denoiseNormal, uvec2 size, int frameID, int checkerboardID, + bool fireflyFilter, cudaStream_t stream) { const uint32_t nPixels = size.x * size.y; const uint32_t blockSize = 256; const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize; - prepareDenoiseGuides<<>>(accumAlbedo, + prepareDenoiseInputs<<>>(accumColor, + accumAlbedo, accumNormal, + denoiseInput, denoiseAlbedo, denoiseNormal, size, frameID, - checkerboardID); + checkerboardID, + fireflyFilter); } __global__ void compositeBackground(vec4 *__restrict__ accumColor, @@ -561,25 +537,18 @@ void Frame::renderFrame() const bool useFloatOutput = m_denoise || m_colorType == ANARI_FLOAT32_VEC4; if (m_denoise) { - launchPrepareDenoiseInput(m_accumColor.ptrAs(), + launchPrepareDenoiseInputs(m_accumColor.ptrAs(), + m_accumAlbedo.ptrAs(), + m_accumNormal.ptrAs(), m_denoiseInput.ptrAs(), + m_denoiseAlbedo.ptrAs(), + m_denoiseNormal.ptrAs(), hd.fb.size, hd.fb.frameID, hd.fb.checkerboardID, hd.renderer.fireflyFilter, state.stream); - if (m_denoiseUsingAlbedo || m_denoiseUsingNormal) { - launchPrepareDenoiseGuides(m_accumAlbedo.ptrAs(), - m_accumNormal.ptrAs(), - m_denoiseAlbedo.ptrAs(), - m_denoiseNormal.ptrAs(), - hd.fb.size, - hd.fb.frameID, - hd.fb.checkerboardID, - state.stream); - } - m_denoiser.launch(); launchCompositeBackground(m_accumColor.ptrAs(), From 17efc098267f43e696574d1cdf4bb15a74afdfab Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Thu, 30 Apr 2026 06:56:39 -0400 Subject: [PATCH 13/20] rtx: Drop redundant dir/t term from epsilonFrom MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original `compMax(vec4(abs(P), compMax(abs(dir))*t))` was redundant: for normalized rays `hitpoint = origin + dir*t`, so `|hitpoint|_inf` already bounds the magnitude that drives ULP-scale self-intersection offsets at the surface. Drop the dir/t inputs and use `|P|_inf` alone. The sole caller (`populateSurfaceHit` in populateHit.h) only runs from __closesthit__/__anyhit__ programs, where `optixGetRayTmax()` is the finite hit distance — there was no functional bug, this is a cleanup. --- devices/rtx/device/gpu/gpu_util.h | 4 ++-- devices/rtx/device/gpu/populateHit.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h index 88b3af0fb..42dba9e8f 100644 --- a/devices/rtx/device/gpu/gpu_util.h +++ b/devices/rtx/device/gpu/gpu_util.h @@ -249,9 +249,9 @@ VISRTX_DEVICE vec3 sampleUnitSphere(RandState &rs, const vec3 &normal) #define ulpEpsilon 0x1.fp-21 -VISRTX_DEVICE float epsilonFrom(const vec3 &P, const vec3 &dir, float t) +VISRTX_DEVICE float epsilonFrom(const vec3 &P) { - return glm::compMax(vec4(abs(P), glm::compMax(abs(dir)) * t)) * ulpEpsilon; + return glm::compMax(abs(P)) * ulpEpsilon; } VISRTX_DEVICE bool pixelOutOfFrame( diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h index c43a0b17f..19088f325 100644 --- a/devices/rtx/device/gpu/populateHit.h +++ b/devices/rtx/device/gpu/populateHit.h @@ -399,7 +399,7 @@ VISRTX_DEVICE void populateSurfaceHit(SurfaceHit &hit) hit.primID = ray::primID(); hit.objID = sd.id; hit.instID = isd.id; - hit.epsilon = epsilonFrom(ray::hitpoint(), ray::direction(), ray::t()); + hit.epsilon = epsilonFrom(ray::hitpoint()); ray::computeTangentSpace(gd, ray::primID(), hit); const auto &handle = optixGetTransformListHandle(0); From 8728a4b983cc0814314bcd122b5f47e8843d4938 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Wed, 6 May 2026 19:35:11 +0000 Subject: [PATCH 14/20] rtx: Lift shadow-ray origins onto smooth surface (Hanika fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smooth-normal triangles produce dark, triangle-shaped bands at grazing light: the planar hit point lies below the smooth surface implied by per-vertex normals, so direct-light shadow rays start inside that implied curvature and self-occlude on the tessellation. Add a shadingHitpoint() helper (Hanika RTGII ch. 4: signed distance from each vertex tangent plane, projected back along the vertex normal, barycentric-blended) and call it at shadow/AO ray origins only. Continuation rays keep the unmodified facet hitpoint — transmission needs it: the smoothed point can sit far enough above the facet that the -Ng*epsilon offset still leaves the origin outside the volume, blocking the refracted path from reaching the back wall. --- devices/rtx/device/gpu/computeAO.h | 2 +- devices/rtx/device/gpu/gpu_math.h | 10 +++ devices/rtx/device/gpu/gpu_util.h | 81 +++++++++++++++++++ devices/rtx/device/gpu/populateHit.h | 22 ++--- .../rtx/device/renderer/Interactive_ptx.cu | 5 +- devices/rtx/device/renderer/Quality_ptx.cu | 2 +- 6 files changed, 108 insertions(+), 14 deletions(-) diff --git a/devices/rtx/device/gpu/computeAO.h b/devices/rtx/device/gpu/computeAO.h index 77861164b..3d55c3c57 100644 --- a/devices/rtx/device/gpu/computeAO.h +++ b/devices/rtx/device/gpu/computeAO.h @@ -45,7 +45,7 @@ VISRTX_DEVICE float computeAO(ScreenSample &ss, float weights = 0.0f; float hits = 0.0f; Ray aoRay; - aoRay.org = currentHit.hitpoint + currentHit.Ns * currentHit.epsilon; + aoRay.org = shadingHitpoint(currentHit) + currentHit.Ng * currentHit.epsilon; aoRay.t.lower = currentHit.epsilon; aoRay.t.upper = dist; diff --git a/devices/rtx/device/gpu/gpu_math.h b/devices/rtx/device/gpu/gpu_math.h index 0b9692792..83dddd432 100644 --- a/devices/rtx/device/gpu/gpu_math.h +++ b/devices/rtx/device/gpu/gpu_math.h @@ -326,4 +326,14 @@ VISRTX_HOST_DEVICE vec3 xfmPoint(const mat4 &m, const vec3 &p) return m * vec4(p, 1.0f); } +// mat3x4 stores OptiX-style affine rows: glm column i = OptiX row i = +// (m_{i,0}, m_{i,1}, m_{i,2}, m_{i,3}). The transform is +// out_i = sum_j m_{i,j} * p_j + m_{i,3}. +VISRTX_HOST_DEVICE vec3 xfmPoint(const mat3x4 &m, const vec3 &p) +{ + return vec3(glm::dot(vec3(m[0]), p) + m[0].w, + glm::dot(vec3(m[1]), p) + m[1].w, + glm::dot(vec3(m[2]), p) + m[2].w); +} + } // namespace visrtx diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h index 42dba9e8f..e1b8523f5 100644 --- a/devices/rtx/device/gpu/gpu_util.h +++ b/devices/rtx/device/gpu/gpu_util.h @@ -254,6 +254,87 @@ VISRTX_DEVICE float epsilonFrom(const vec3 &P) return glm::compMax(abs(P)) * ulpEpsilon; } +// Hanika's shadow-terminator fix (Ray Tracing Gems II, ch. 4): lifts a +// triangle hit point onto the smooth surface implied by per-vertex normals. +// Without this, grazing-angle shadow rays self-occlude on the planar facet +// and produce dark bands shaped like the underlying tessellation. All inputs +// must share a coordinate space. +VISRTX_DEVICE vec3 shadowTerminatorOffset(const vec3 &P, + const vec3 &v0, + const vec3 &v1, + const vec3 &v2, + const vec3 &n0, + const vec3 &n1, + const vec3 &n2, + const vec3 &bary) +{ + const float du = glm::dot(P - v0, n0); + const float dv = glm::dot(P - v1, n1); + const float dw = glm::dot(P - v2, n2); + const vec3 lu = du < 0.f ? -du * n0 : vec3(0.f); + const vec3 lv = dv < 0.f ? -dv * n1 : vec3(0.f); + const vec3 lw = dw < 0.f ? -dw * n2 : vec3(0.f); + return P + bary.x * lu + bary.y * lv + bary.z * lw; +} + +// World-space hit position lifted onto the smooth surface implied by +// per-vertex normals (Hanika shadow-terminator fix). Use this as the origin +// for direct-light/AO shadow rays so grazing-angle queries do not self-shadow +// the planar facet. Do NOT use it for path-continuation rays — transmission +// especially needs the original facet point, since the smoothed point can sit +// far enough above the facet that an "into-the-surface" offset still ends up +// outside the volume. +VISRTX_DEVICE vec3 shadingHitpoint(const SurfaceHit &hit) +{ + if (hit.geometry == nullptr || hit.geometry->type != GeometryType::TRIANGLE) + return hit.hitpoint; + + const auto &tri = hit.geometry->tri; + if (tri.vertexNormalsFV == nullptr && tri.vertexNormals == nullptr) + return hit.hitpoint; + + const uvec3 idx = tri.indices ? tri.indices[hit.primID] + : uvec3(0, 1, 2) + hit.primID * 3; + const vec3 v0 = tri.vertices[idx.x]; + const vec3 v1 = tri.vertices[idx.y]; + const vec3 v2 = tri.vertices[idx.z]; + + vec3 n0, n1, n2; + if (tri.vertexNormalsFV != nullptr) { + const uvec3 nidx = uvec3(0, 1, 2) + hit.primID * 3; + n0 = tri.vertexNormalsFV[nidx.x]; + n1 = tri.vertexNormalsFV[nidx.y]; + n2 = tri.vertexNormalsFV[nidx.z]; + } else { + n0 = tri.vertexNormals[idx.x]; + n1 = tri.vertexNormals[idx.y]; + n2 = tri.vertexNormals[idx.z]; + } + + // Hanika's tangent-plane projection assumes unit normals; user data is + // not guaranteed to be normalized. + n0 = normalize(n0); + n1 = normalize(n1); + n2 = normalize(n2); + + // populateHit.h flips hit.Ng/Ns for back-face hits so they point toward + // the ray origin. The per-vertex normals here are still in the original + // outward orientation; flip them too so the smooth surface bulges onto + // the ray-origin side of the facet (otherwise Hanika lifts P away from + // the ray origin and the trailing `+ Ng * epsilon` can land below the + // facet). + if (!hit.isFrontFace) { + n0 = -n0; + n1 = -n1; + n2 = -n2; + } + + const vec3 Plocal = xfmPoint(hit.worldToObject, hit.hitpoint); + const vec3 Psmooth = + shadowTerminatorOffset(Plocal, v0, v1, v2, n0, n1, n2, hit.uvw); + return xfmPoint(hit.objectToWorld, Psmooth); +} + VISRTX_DEVICE bool pixelOutOfFrame( const uvec2 &pixel, const FramebufferGPUData &fb) { diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h index 19088f325..c5c649f5f 100644 --- a/devices/rtx/device/gpu/populateHit.h +++ b/devices/rtx/device/gpu/populateHit.h @@ -225,22 +225,24 @@ VISRTX_DEVICE void computeTangentSpace( if (!optixIsFrontFaceHit()) hit.Ng = -hit.Ng; - const bool hasVertexNormals = ggd.tri.vertexNormalsFV != nullptr - || ggd.tri.vertexNormals != nullptr; vec3 n0, n1, n2; + bool hasVertexNormals = true; if (ggd.tri.vertexNormalsFV != nullptr) { - const uvec3 nIdx = uvec3(0, 1, 2) + (hit.primID * 3); - - n0 = ggd.tri.vertexNormalsFV[nIdx.x]; - n1 = ggd.tri.vertexNormalsFV[nIdx.y]; - n2 = ggd.tri.vertexNormalsFV[nIdx.z]; - hit.Ns = b.x * n0 + b.y * n1 + b.z * n2; + const uvec3 nidx = uvec3(0, 1, 2) + (hit.primID * 3); + n0 = ggd.tri.vertexNormalsFV[nidx.x]; + n1 = ggd.tri.vertexNormalsFV[nidx.y]; + n2 = ggd.tri.vertexNormalsFV[nidx.z]; } else if (ggd.tri.vertexNormals != nullptr) { n0 = ggd.tri.vertexNormals[idx.x]; n1 = ggd.tri.vertexNormals[idx.y]; n2 = ggd.tri.vertexNormals[idx.z]; + } else { + hasVertexNormals = false; + } + + if (hasVertexNormals) hit.Ns = b.x * n0 + b.y * n1 + b.z * n2; - } else + else hit.Ns = hit.Ng; hit.Ns = normalize(hit.Ns); @@ -399,7 +401,7 @@ VISRTX_DEVICE void populateSurfaceHit(SurfaceHit &hit) hit.primID = ray::primID(); hit.objID = sd.id; hit.instID = isd.id; - hit.epsilon = epsilonFrom(ray::hitpoint()); + hit.epsilon = epsilonFrom(hit.hitpoint); ray::computeTangentSpace(gd, ray::primID(), hit); const auto &handle = optixGetTransformListHandle(0); diff --git a/devices/rtx/device/renderer/Interactive_ptx.cu b/devices/rtx/device/renderer/Interactive_ptx.cu index c75d0a0ad..1a05ed5e6 100644 --- a/devices/rtx/device/renderer/Interactive_ptx.cu +++ b/devices/rtx/device/renderer/Interactive_ptx.cu @@ -87,17 +87,18 @@ struct InteractiveShadingPolicy * materialEvaluateTint(shadingState); // Handle all lights contributions + const vec3 shadowOrigin = shadingHitpoint(hit) + hit.Ng * hit.epsilon; for (size_t i = 0; i < world.numLightInstances; i++) { const auto &light = world.lightInstances[i]; const auto lightSample = - sampleLight(ss, hit.hitpoint, light.lightIndex, light.xfm); + sampleLight(ss, shadowOrigin, light.lightIndex, light.xfm); if (lightSample.pdf == 0.0f) continue; // Shadowing const Ray shadowRay = { - hit.hitpoint + hit.Ng * hit.epsilon, + shadowOrigin, lightSample.dir, {hit.epsilon, lightSample.dist}, }; diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu index dede6347d..652b0e90c 100644 --- a/devices/rtx/device/renderer/Quality_ptx.cu +++ b/devices/rtx/device/renderer/Quality_ptx.cu @@ -377,7 +377,7 @@ VISRTX_GLOBAL void __raygen__() const float lightDotNs = dot(lightSample.dir, surfaceHit.Ns); if (lightDotNs > 0.0f) { const Ray shadowRay = { - surfaceHit.hitpoint + surfaceHit.Ng * surfaceHit.epsilon, + shadowOrigin, lightSample.dir, {surfaceHit.epsilon, lightSample.dist}, }; From 18d2346a7647da8a7caae71a73366c481afad5a7 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Tue, 12 May 2026 13:17:29 -0400 Subject: [PATCH 15/20] rtx: Don't mix transmission into Interactive's surface alpha This breaks cutouts. To be revisited. --- devices/rtx/device/gpu/renderer/raygen_helpers.h | 9 --------- devices/rtx/device/renderer/Interactive_ptx.cu | 4 ++-- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/devices/rtx/device/gpu/renderer/raygen_helpers.h b/devices/rtx/device/gpu/renderer/raygen_helpers.h index 5f65d6405..81bfc7bd8 100644 --- a/devices/rtx/device/gpu/renderer/raygen_helpers.h +++ b/devices/rtx/device/gpu/renderer/raygen_helpers.h @@ -61,15 +61,6 @@ VISRTX_DEVICE float volumeAttenuation(ScreenSample &ss, const Ray &r) return attenuation; } -// Evaluate opacity including transmission -VISRTX_DEVICE float evaluateOpacity(const MaterialShadingState &shadingState) -{ - return materialEvaluateOpacity(shadingState) - * (1.0f - - glm::luminosity(materialEvaluateTransmission(shadingState)) - * 0.85f); -} - // Templated rendering loop // ShadingPolicy must implement: // static VISRTX_DEVICE vec4 shadeSurface( diff --git a/devices/rtx/device/renderer/Interactive_ptx.cu b/devices/rtx/device/renderer/Interactive_ptx.cu index 1a05ed5e6..2320e3a23 100644 --- a/devices/rtx/device/renderer/Interactive_ptx.cu +++ b/devices/rtx/device/renderer/Interactive_ptx.cu @@ -155,7 +155,7 @@ struct InteractiveShadingPolicy } } - return vec4(contrib, evaluateOpacity(shadingState)); + return vec4(contrib, materialEvaluateOpacity(shadingState)); } }; @@ -182,7 +182,7 @@ VISRTX_GLOBAL void __anyhit__shadow() MaterialShadingState shadingState; materialInitShading(&shadingState, frameData, *hit.material, hit); - auto opacity = evaluateOpacity(shadingState); + auto opacity = materialEvaluateOpacity(shadingState); auto &o = ray::rayData(); From b2f3733a1a570b06ea5bf7c9e792e0e84211c5c4 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Tue, 12 May 2026 16:10:56 -0400 Subject: [PATCH 16/20] rtx: Use NdotV Fresnel for PBR diffuse weight Both shadeSurface and nextRay evaluated (1-F) at the GGX half-vector, but the diffuse direction is independent of H so the weight didn't match between NEE and the bounce. Evaluate the diffuse Fresnel at NdotV (Frostbite/Disney) so the two estimators agree at any roughness. Also factor the Fresnel+iridescence block into evalFresnelWithIridescence to remove the duplicated block. --- .../shaders/PhysicallyBasedShader_ptx.cu | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index f216565d4..506a384cb 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -394,6 +394,21 @@ VISRTX_DEVICE vec3 schlickFresnel(vec3 F0, vec3 F90, float VdotH) return F0 + (F90 - F0) * pow5(1.0f - fabsf(VdotH)); } +VISRTX_DEVICE vec3 evalFresnelWithIridescence( + const PhysicallyBasedShadingState *state, + const vec3 &F0, + const vec3 &F90, + float cosTheta) +{ + vec3 F = schlickFresnel(F0, F90, cosTheta); + if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) { + const vec3 iridescent = evalIridescence( + 1.0f, state->iridescenceIor, cosTheta, state->iridescenceThickness, F0); + F = glm::mix(F, iridescent, state->iridescence); + } + return F; +} + VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( const PhysicallyBasedShadingState *state, const SurfaceHit *hit, @@ -415,15 +430,13 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( const float NdotV = fmaxf(dot(N, V), 1e-6f); const float VdotH = fmaxf(dot(V, H), 0.0f); - // Base F0 / F90, optionally overridden by iridescence. - vec3 F0 = computeF0(state); - vec3 F90 = computeF90(state); - vec3 F = schlickFresnel(F0, F90, VdotH); - if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) { - const vec3 iridescent = evalIridescence( - 1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0); - F = glm::mix(F, iridescent, state->iridescence); - } + // Base F0 / F90. Specular uses Fresnel at the microfacet (VdotH); the + // diffuse weight uses Fresnel at NdotV (Frostbite/Disney convention) so + // shadeSurface and nextRay's diffuse split agree regardless of light dir. + const vec3 F0 = computeF0(state); + const vec3 F90 = computeF90(state); + const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH); + const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV); // Base GGX specular lobe. const float alpha = fmaxf(pow2(state->roughness), 1e-4f); @@ -436,7 +449,7 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( // and transmission; metals have no diffuse). const vec3 diffuseColor = glm::mix(state->baseColor, vec3(0.0f), state->metallic); - const vec3 diffuseBRDF = (vec3(1.0f) - F) * float(M_1_PI) * diffuseColor + const vec3 diffuseBRDF = (vec3(1.0f) - Fdiff) * float(M_1_PI) * diffuseColor * state->occlusion * (1.0f - state->transmission); vec3 base = diffuseBRDF + specularBRDF; @@ -508,15 +521,12 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( const float NdotV = Vlocal.z; const float VdotH = fmaxf(dot(Vlocal, Hlocal), 0.0f); - // Fresnel at the sampled microfacet, with optional iridescence. + // Fresnel at the sampled microfacet (specular/transmission split) and at + // NdotV (diffuse weight) — matches the convention in shadeSurface. const vec3 F0 = computeF0(state); const vec3 F90 = computeF90(state); - vec3 F = schlickFresnel(F0, F90, VdotH); - if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) { - const vec3 iridescent = evalIridescence( - 1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0); - F = glm::mix(F, iridescent, state->iridescence); - } + const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH); + const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV); const vec3 Lrefl = glm::reflect(-Vlocal, Hlocal); const float eta = state->ior; // init() pre-inverted for front-facing hits @@ -538,7 +548,7 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( // BRDF being estimated. TIR has no diffuse share (all energy is reflected). const vec3 diffuseEnergy = totalInternalReflection ? vec3(0.0f) - : glm::max(vec3(1.0f) - F, vec3(0.0f)) * state->baseColor + : glm::max(vec3(1.0f) - Fdiff, vec3(0.0f)) * state->baseColor * (1.0f - state->metallic) * (1.0f - state->transmission) * state->occlusion; From a2df2399dfbd5bd179e2fe01cd19c317a3370775 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Tue, 12 May 2026 16:18:24 -0400 Subject: [PATCH 17/20] rtx: Fold normal-degeneracy fallback into sampleNormalMap The init() function had two post-hoc dot-product guards to swap a NaN/zero shading normal for the geometric one. Doing it inside sampleNormalMap means every caller gets a usable normal back without the cleanup pass downstream. --- .../material/shaders/PhysicallyBasedShader_ptx.cu | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index 506a384cb..39cd8d5d8 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -65,7 +65,11 @@ VISRTX_DEVICE vec3 sampleNormalMap(const FrameGPUData &fd, if (samplerIdx == ~visrtx::DeviceObjectIndex{0}) return fallback; const vec3 ts = normalize(evaluateSampler(fd, samplerIdx, hit) * 2.0f - 1.0f); - return applyNormalMap(ts, hit, hit.Ns); + const vec3 N = applyNormalMap(ts, hit, hit.Ns); + // Negated comparison catches NaN (zero-decoded texels, zero-summed tangents) + // as well as zero-length results — fall back to the geometric normal so the + // shading frame is always usable. + return (dot(N, N) > 1e-12f) ? N : hit.Ng; } VISRTX_DEVICE float luminance(const vec3 &c) @@ -319,15 +323,6 @@ VISRTX_CALLABLE void __direct_callable__init( shadingState->iridescenceIor = md->iridescenceIor; shadingState->iridescenceThickness = getMaterialParameter(*fd, md->iridescenceThickness, *hit).x; - - // Fall back to the geometric normal if sampleNormalMap produced NaN - // (texel decodes to zero, or zero-summed tangents) or zero-length. The - // negated comparison catches both since NaN compares false to anything. - if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f)) - shadingState->normal = hit->Ng; - if (!(glm::dot(shadingState->clearcoatNormal, shadingState->clearcoatNormal) - > 1e-12f)) - shadingState->clearcoatNormal = hit->Ng; } //----------------------------------------------------------------------------- From 65d35740a93aac01f5ba1492a0851e275c2e399c Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Tue, 12 May 2026 16:36:28 -0400 Subject: [PATCH 18/20] rtx: Rename PBR shading state's ior field to eta The field stores the refraction ratio (n1/n2 from the incident side), pre-inverted on front-facing hits so it can feed glm::refract directly. Calling it 'ior' invited reading it as the material's IOR. Move the 'pre-inverted' note onto the struct field so the contract lives with the data. --- devices/rtx/device/gpu/shadingState.h | 5 ++++- .../device/material/shaders/PhysicallyBasedShader_ptx.cu | 7 +++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/devices/rtx/device/gpu/shadingState.h b/devices/rtx/device/gpu/shadingState.h index 1831d5436..e85ec911d 100644 --- a/devices/rtx/device/gpu/shadingState.h +++ b/devices/rtx/device/gpu/shadingState.h @@ -80,7 +80,10 @@ struct PhysicallyBasedShadingState float metallic; float roughness; float transmission; - float ior; + // Refraction ratio n1/n2 from the incident side: init() stores 1/md->ior + // for front-facing hits and md->ior for back-facing hits, so this can be + // plugged directly into glm::refract and Schlick's F0 formula. + float eta; vec3 emission; float occlusion; diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index 39cd8d5d8..e21bd5da8 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -286,7 +286,7 @@ VISRTX_CALLABLE void __direct_callable__init( shadingState->opacity = adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff); - shadingState->ior = hit->isFrontFace ? 1.0f / md->ior : md->ior; + shadingState->eta = hit->isFrontFace ? 1.0f / md->ior : md->ior; shadingState->metallic = getMaterialParameter(*fd, md->metallic, *hit).x; shadingState->roughness = getMaterialParameter(*fd, md->roughness, *hit).x; shadingState->emission = vec3(getMaterialParameter(*fd, md->emissive, *hit)); @@ -365,7 +365,7 @@ VISRTX_CALLABLE vec3 __direct_callable__evaluateNormal( VISRTX_DEVICE vec3 computeDielectricF0(const PhysicallyBasedShadingState *state) { - const float iorF0 = pow2((1.0f - state->ior) / (1.0f + state->ior)); + const float iorF0 = pow2((1.0f - state->eta) / (1.0f + state->eta)); if (state->useSpecular == 0) return vec3(iorF0); return glm::min(vec3(iorF0) * state->specularColor, vec3(1.0f)) @@ -524,8 +524,7 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV); const vec3 Lrefl = glm::reflect(-Vlocal, Hlocal); - const float eta = state->ior; // init() pre-inverted for front-facing hits - const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, eta); + const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, state->eta); const vec3 transmissionFilter = computeTransmissionFilter(state); const bool hasTransmission = luminance(transmissionFilter) > 0.0f; const bool totalInternalReflection = From e18c2a4cae4519e4a78b69991757d9ad1606eb57 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Tue, 12 May 2026 16:36:28 -0400 Subject: [PATCH 19/20] rtx: Use the `color` light parameter when sampling an HDRI --- devices/rtx/device/gpu/sampleLight.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devices/rtx/device/gpu/sampleLight.h b/devices/rtx/device/gpu/sampleLight.h index 14bbe64cf..2fe5f5013 100644 --- a/devices/rtx/device/gpu/sampleLight.h +++ b/devices/rtx/device/gpu/sampleLight.h @@ -339,7 +339,7 @@ VISRTX_DEVICE LightSample sampleHDRILight( ls.dir = xfmVec(xfm, dir); ls.dist = std::numeric_limits::infinity(); // Environment is at infinity - ls.radiance = radiance * ld.hdri.scale; + ls.radiance = radiance * ld.hdri.scale * ld.color; ls.pdf = pdf; return ls; @@ -384,7 +384,7 @@ VISRTX_DEVICE LightSample sampleHDRILight( // instead of explicitly transposing/inverting the matrix ls.dir = xfmVec(xfm, sphericalCoordsToDirection(thetaPhi) * ld.hdri.xfm); ls.dist = 1e20f; // Environment is effectively at infinity - ls.radiance = radiance * ld.hdri.scale; + ls.radiance = radiance * ld.hdri.scale * ld.color; ls.pdf = pdf; return ls; From f436ca802d52f157c1432e04580e4eee631ea8b2 Mon Sep 17 00:00:00 2001 From: Thomas Arcila <134677+tarcila@users.noreply.github.com> Date: Tue, 12 May 2026 17:00:00 -0400 Subject: [PATCH 20/20] rtx: Sample the PBR clearcoat lobe in nextRay Clearcoat was only evaluated in NEE, so smooth clearcoats over matte bases never picked up HDRI/sky reflections on the bounce path. --- .../shaders/PhysicallyBasedShader_ptx.cu | 70 ++++++++++++++++--- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index e21bd5da8..593457ed3 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -489,8 +489,8 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( //----------------------------------------------------------------------------- // Next-ray importance sampling: stochastic alpha, Fresnel-aware lobe pick, -// GGX VNDF reflection/refraction. Clearcoat/sheen are NEE-only (no separate -// lobe sampling), which matches what the base renderer is set up to consume. +// GGX VNDF reflection/refraction, plus a clearcoat lobe sampled with +// probability equal to its view-angle Fresnel weight. Sheen is NEE-only. //----------------------------------------------------------------------------- VISRTX_CALLABLE NextRay __direct_callable__nextRay( @@ -500,8 +500,55 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( if (curand_uniform(rs) > state->opacity) return NextRay{ray->dir, vec3(1.0f), NEXT_RAY_CONTINUES_THROUGH_SURFACE}; - const vec3 N = state->normal; const vec3 V = -ray->dir; + + // Clearcoat lobe: pick it with probability `clearcoat·FcV(NcDotV)`. This + // exact weight makes the entry-side attenuation `1 - clearcoat·FcV` cancel + // the `1/(1-pick)` lobe-pick divisor in the base path below, so the base + // returns only need the exit-side `1 - clearcoat·FcL` multiplier. + const vec3 Nc = state->clearcoatNormal; + const float NcDotV_world = fmaxf(dot(Nc, V), 0.0f); + const float FcV_world = + CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotV_world); + const float clearcoatPick = + glm::clamp(state->clearcoat * FcV_world, 0.0f, 1.0f); + + if (clearcoatPick > 0.0f && curand_uniform(rs) < clearcoatPick) { + const mat3 toWorldC = computeOrthonormalBasis(Nc); + const vec3 VlocalC = glm::transpose(toWorldC) * V; + if (VlocalC.z <= 0.0f) + return NextRay{Nc, vec3(0.0f)}; + const float alphaC = fmaxf(pow2(state->clearcoatRoughness), 1e-4f); + const float alphaC2 = alphaC * alphaC; + const vec3 HlocalC = sampleGGXVNDF( + VlocalC, alphaC, curand_uniform(rs), curand_uniform(rs)); + const vec3 LlocalC = glm::reflect(-VlocalC, HlocalC); + if (LlocalC.z <= 0.0f) + return NextRay{Nc, vec3(0.0f)}; + const float VdotHc = fmaxf(dot(VlocalC, HlocalC), 0.0f); + const float Fc = + CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - VdotHc); + const float G1c = smithG1GGX(VlocalC.z, alphaC2); + const float G2c = smithG2GGX(VlocalC.z, LlocalC.z, alphaC2); + // VNDF gives BRDF·cos/pdf = clearcoat·Fc·G2/G1; the clearcoat factor + // cancels against the matching factor in clearcoatPick. + const vec3 weight = vec3(state->clearcoat * Fc * G2c / fmaxf(G1c, 1e-8f)) + / fmaxf(clearcoatPick, 1e-8f); + return NextRay{normalize(toWorldC * LlocalC), weight}; + } + + // Exit-side clearcoat attenuation, applied to every base-path return. + // `fabsf` handles the transmission case where L points through the surface. + auto clearcoatExitAttn = [&](const vec3 &Lworld) -> float { + if (state->clearcoat <= 0.0f) + return 1.0f; + const float NcDotL = fabsf(dot(Nc, Lworld)); + const float FcL = + CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotL); + return glm::clamp(1.0f - state->clearcoat * FcL, 0.0f, 1.0f); + }; + + const vec3 N = state->normal; const mat3 toWorld = computeOrthonormalBasis(N); const mat3 toLocal = glm::transpose(toWorld); const vec3 Vlocal = toLocal * V; @@ -568,26 +615,27 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( const float NdotL = Lrefl.z; const float G1 = smithG1GGX(NdotV, alpha2); const float G2 = smithG2GGX(NdotV, NdotL, alpha2); - const vec3 weight = - reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f); - return NextRay{normalize(toWorld * Lrefl), weight}; + const vec3 Lworld = normalize(toWorld * Lrefl); + const vec3 weight = reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) + * clearcoatExitAttn(Lworld) / fmaxf(reflectProb, 1e-8f); + return NextRay{Lworld, weight}; } if (u < reflectProb + transmitProb) { const float NdotL = -Ltrans.z; // L points through the surface. const float G1 = smithG1GGX(NdotV, alpha2); const float G2 = smithG2GGX(NdotV, NdotL, alpha2); + const vec3 Lworld = normalize(toWorld * Ltrans); const vec3 weight = transmitEnergy * (G2 / fmaxf(G1, 1e-8f)) - / fmaxf(transmitProb, 1e-8f); - return NextRay{normalize(toWorld * Ltrans), - weight, - NEXT_RAY_CONTINUES_THROUGH_SURFACE}; + * clearcoatExitAttn(Lworld) / fmaxf(transmitProb, 1e-8f); + return NextRay{Lworld, weight, NEXT_RAY_CONTINUES_THROUGH_SURFACE}; } // Diffuse: sample around the shading normal so pdf=cos/pi matches the BRDF's // NdotL (same axis as shadeSurface's diffuse term). Cos and pdf cancel, // leaving only the energy term and the lobe-pick divisor. const vec3 wi = sampleHemisphere(*rs, N); - const vec3 weight = diffuseEnergy / fmaxf(diffuseProb, 1e-8f); + const vec3 weight = + diffuseEnergy * clearcoatExitAttn(wi) / fmaxf(diffuseProb, 1e-8f); return NextRay{wi, weight}; }