diff --git a/devices/rtx/device/frame/Frame.cu b/devices/rtx/device/frame/Frame.cu index e2aa825c6..b89db4074 100644 --- a/devices/rtx/device/frame/Frame.cu +++ b/devices/rtx/device/frame/Frame.cu @@ -73,8 +73,12 @@ __device__ bool resolveSample(uint32_t idx, return divisor > 0; } -__global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor, +__global__ void prepareDenoiseInputs(const vec4 *__restrict__ accumColor, + const vec3 *__restrict__ accumAlbedo, + const vec3 *__restrict__ accumNormal, vec4 *__restrict__ denoiseInput, + vec3 *__restrict__ denoiseAlbedo, + vec3 *__restrict__ denoiseNormal, uvec2 size, int frameID, int checkerboardID, @@ -88,45 +92,6 @@ __global__ void prepareDenoiseInput(const vec4 *__restrict__ accumColor, int divisor; if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) { denoiseInput[idx] = vec4(0.f); - return; - } - - vec4 c = accumColor[srcIdx] / float(divisor); - if (fireflyFilter) - c = detail::inverseTonemap(c); - denoiseInput[idx] = c; -} - -void launchPrepareDenoiseInput(const vec4 *accumColor, - vec4 *denoiseInput, - uvec2 size, - int frameID, - int checkerboardID, - bool fireflyFilter, - cudaStream_t stream) -{ - const uint32_t nPixels = size.x * size.y; - const uint32_t blockSize = 256; - const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize; - prepareDenoiseInput<<>>( - accumColor, denoiseInput, size, frameID, checkerboardID, fireflyFilter); -} - -__global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo, - const vec3 *__restrict__ accumNormal, - vec3 *__restrict__ denoiseAlbedo, - vec3 *__restrict__ denoiseNormal, - uvec2 size, - int frameID, - int checkerboardID) -{ - const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= size.x * size.y) - return; - - uint32_t srcIdx; - int divisor; - if (!resolveSample(idx, size, frameID, checkerboardID, srcIdx, divisor)) { if (denoiseAlbedo) denoiseAlbedo[idx] = vec3(0.f); if (denoiseNormal) @@ -135,6 +100,11 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo, } const float invDivisor = 1.0f / float(divisor); + vec4 c = accumColor[srcIdx] * invDivisor; + if (fireflyFilter) + c = detail::inverseTonemap(c); + denoiseInput[idx] = c; + if (denoiseAlbedo) denoiseAlbedo[idx] = accumAlbedo[srcIdx] * invDivisor; @@ -146,25 +116,31 @@ __global__ void prepareDenoiseGuides(const vec3 *__restrict__ accumAlbedo, } } -void launchPrepareDenoiseGuides(const vec3 *accumAlbedo, +void launchPrepareDenoiseInputs(const vec4 *accumColor, + const vec3 *accumAlbedo, const vec3 *accumNormal, + vec4 *denoiseInput, vec3 *denoiseAlbedo, vec3 *denoiseNormal, uvec2 size, int frameID, int checkerboardID, + bool fireflyFilter, cudaStream_t stream) { const uint32_t nPixels = size.x * size.y; const uint32_t blockSize = 256; const uint32_t gridSize = (nPixels + blockSize - 1) / blockSize; - prepareDenoiseGuides<<>>(accumAlbedo, + prepareDenoiseInputs<<>>(accumColor, + accumAlbedo, accumNormal, + denoiseInput, denoiseAlbedo, denoiseNormal, size, frameID, - checkerboardID); + checkerboardID, + fireflyFilter); } __global__ void compositeBackground(vec4 *__restrict__ accumColor, @@ -561,25 +537,18 @@ void Frame::renderFrame() const bool useFloatOutput = m_denoise || m_colorType == ANARI_FLOAT32_VEC4; if (m_denoise) { - launchPrepareDenoiseInput(m_accumColor.ptrAs(), + launchPrepareDenoiseInputs(m_accumColor.ptrAs(), + m_accumAlbedo.ptrAs(), + m_accumNormal.ptrAs(), m_denoiseInput.ptrAs(), + m_denoiseAlbedo.ptrAs(), + m_denoiseNormal.ptrAs(), hd.fb.size, hd.fb.frameID, hd.fb.checkerboardID, hd.renderer.fireflyFilter, state.stream); - if (m_denoiseUsingAlbedo || m_denoiseUsingNormal) { - launchPrepareDenoiseGuides(m_accumAlbedo.ptrAs(), - m_accumNormal.ptrAs(), - m_denoiseAlbedo.ptrAs(), - m_denoiseNormal.ptrAs(), - hd.fb.size, - hd.fb.frameID, - hd.fb.checkerboardID, - state.stream); - } - m_denoiser.launch(); launchCompositeBackground(m_accumColor.ptrAs(), diff --git a/devices/rtx/device/geometry/ComputeTangent.cu b/devices/rtx/device/geometry/ComputeTangent.cu index bffeb4bb0..f6c1f1f56 100644 --- a/devices/rtx/device/geometry/ComputeTangent.cu +++ b/devices/rtx/device/geometry/ComputeTangent.cu @@ -52,6 +52,76 @@ namespace { constexpr const auto eps = 1e-8f; +__device__ glm::vec3 safeNormalize( + const glm::vec3 &v, const glm::vec3 &fallback) +{ + const float l2 = glm::dot(v, v); + return l2 > eps ? v * rsqrtf(l2) : fallback; +} + +__device__ void makeTangentFrame( + const glm::vec3 &normal, glm::vec3 *tangent, glm::vec3 *bitangent) +{ + // https://graphics.pixar.com/library/OrthonormalB/paper.pdf + const glm::vec3 n = safeNormalize(normal, glm::vec3(0.f, 0.f, 1.f)); + const float sign = n.z >= 0.0f ? 1.0f : -1.0f; + const float a = -1.0f / (sign + n.z); + const float b = n.x * n.y * a; + *tangent = glm::vec3(1.0f + sign * n.x * n.x * a, sign * b, -sign * n.x); + *bitangent = glm::vec3(b, sign + n.y * n.y * a, -n.y); +} + +__device__ glm::vec3 computeGeometricNormal( + const glm::vec3 &e1, const glm::vec3 &e2) +{ + return safeNormalize(glm::cross(e1, e2), glm::vec3(0.f, 0.f, 1.f)); +} + +// Each face-vertex's contribution to its vertex's accumulated tangent frame +// is weighted by the triangle's interior angle at that corner — same scheme +// MikkTSpace uses to average across incident faces. Angle weighting (over +// uniform or area) keeps thin sliver triangles from dominating shared +// vertices. +__device__ float cornerAngle( + const glm::vec3 &a, const glm::vec3 &b, const glm::vec3 &c) +{ + const glm::vec3 ab = b - a; + const glm::vec3 ac = c - a; + const float lab = sqrtf(glm::dot(ab, ab)); + const float lac = sqrtf(glm::dot(ac, ac)); + if (lab < eps || lac < eps) + return 0.0f; + const float cosT = + glm::clamp(glm::dot(ab, ac) / (lab * lac), -1.0f, 1.0f); + return acosf(cosT); +} + +__device__ void atomicAddVec3(glm::vec3 &dst, const glm::vec3 &v) +{ + atomicAdd(&dst.x, v.x); + atomicAdd(&dst.y, v.y); + atomicAdd(&dst.z, v.z); +} + +void cudaFreeMemoryDeleter(const void *, const void *memory) +{ + cudaFree(const_cast(memory)); +} + +bool reportCudaError( + visrtx::Triangle *triangle, cudaError_t error, const char *operation) +{ + if (error == cudaSuccess) + return false; + + triangle->reportMessage(ANARI_SEVERITY_ERROR, + "CUDA error while computing tangents for Triangle %p during %s: %s", + triangle, + operation, + cudaGetErrorString(error)); + return true; +} + } // namespace namespace visrtx { @@ -64,57 +134,53 @@ __device__ void __computeTangentAndBitangent( glm::vec3 p2, glm::vec2 uv0, // Input texture coordinates glm::vec2 uv1, - glm::vec2 uv2 -) + glm::vec2 uv2) { // Compute edges of the triangle glm::vec3 e1 = p1 - p0; glm::vec3 e2 = p2 - p0; + const auto normal = computeGeometricNormal(e1, e2); - if (dot(e1, e1) < eps || dot(e2, e2) < eps) { - // Degenerate triangle, use a default tangent and bitangent - *tangent = glm::vec3(1.0f, 0.0f, 0.0f); - *bitangent = glm::vec3(0.0f, 1.0f, 0.0f); - } else { - auto normal = normalize(cross(e1, e2)); - - // Compute differences in texture coordinates - auto s = uv1 - uv0; - auto t = uv2 - uv0; - - auto cross = s.x * t.y - s.y * t.x; - - if (abs(cross) < eps) { // degenerate triangle (null vectors or collinears) - // Create a default orthonormal basis: - // https://graphics.pixar.com/library/OrthonormalB/paper.pdf - float sign = normal.z >= 0.0f ? 1.0f : -1.0f; - float a = -1.0f / (sign + normal.z); - float b = normal.x * normal.y * a; - *tangent = glm::vec3( - 1.0f + sign * normal.x * normal.x * a, sign * b, -sign * normal.x); - *bitangent = glm::vec3(b, sign + normal.y * normal.y * a, -normal.y); - } else { - // Compute the determinant - float invdet = 1.0f / cross; - *tangent = (t.y * e1 - s.y * e2) * invdet; - *bitangent = (t.x * e1 - s.x * e2) * invdet; - } + if (glm::dot(e1, e1) < eps || glm::dot(e2, e2) < eps) { + makeTangentFrame(normal, tangent, bitangent); + return; + } + + // Compute differences in texture coordinates + auto s = uv1 - uv0; + auto t = uv2 - uv0; + + auto det = s.x * t.y - s.y * t.x; + + if (glm::abs(det) < eps) { + makeTangentFrame(normal, tangent, bitangent); + return; } + + float invdet = 1.0f / det; + *tangent = (t.y * e1 - s.y * e2) * invdet; + *bitangent = (s.x * e2 - t.x * e1) * invdet; } +// Pass 1 (one thread per triangle): compute the per-triangle T/B from the UV +// gradient, then atomicAdd those vectors into per-vertex accumulators — +// weighted by the triangle's interior angle at each corner. Per-vertex +// normals are accumulated the same way so Pass 2 has a coordinate frame to +// orthogonalize against, regardless of whether input normals are vertex, +// face-varying, or absent. template -__global__ void __doComputeTangents( - glm::vec4 *tangents, // Output tangent vectors with handedness (w component) - glm::vec3 *bitangents, // Output bitangent vectors - const glm::uvec3 *indices, // Input triangle indices - const glm::vec3 *positions, // Input vertex positions - const glm::vec3 *normals, // Input vertex normals - const TexCoord *uvs, // Input texture coordinates - unsigned int numTriangles // Number of triangles -) +__global__ void __doAccumulateTangents( + glm::vec3 *tangentAccum, + glm::vec3 *bitangentAccum, + glm::vec3 *normalAccum, + const glm::uvec3 *indices, + const glm::vec3 *positions, + const glm::vec3 *normals, + const TexCoord *uvs, + unsigned int numTriangles) { unsigned int tri = blockIdx.x * blockDim.x + threadIdx.x; @@ -139,56 +205,58 @@ __global__ void __doComputeTangents( vec2 uv0, uv1, uv2; if constexpr (UVsIndexed) { - // Use indexed UVs uv0 = uvs[indexedIdx.x]; uv1 = uvs[indexedIdx.y]; uv2 = uvs[indexedIdx.z]; } else { - // Use per-face UVs uv0 = uvs[perFaceBaseIdx.x]; uv1 = uvs[perFaceBaseIdx.y]; uv2 = uvs[perFaceBaseIdx.z]; } vec3 tangent, bitangent; - __computeTangentAndBitangent( - &tangent, // Output tangent vectors with handedness (w component) - &bitangent, // Output bitangent vectors - p0, - p1, - p2, // Input vertex positions - uv0, - uv1, - uv2 // Input texture coordinates - ); - - vec3 n0, n1, n2; - if constexpr (NormalsIndexed) { - // Use indexed normals - n0 = normals[indexedIdx.x]; - n1 = normals[indexedIdx.y]; - n2 = normals[indexedIdx.z]; - } else { - // Use per-face normals - n0 = normals[perFaceBaseIdx.x]; - n1 = normals[perFaceBaseIdx.y]; - n2 = normals[perFaceBaseIdx.z]; + &tangent, &bitangent, p0, p1, p2, uv0, uv1, uv2); + + const vec3 geometricNormal = computeGeometricNormal(p1 - p0, p2 - p0); + vec3 n0 = geometricNormal; + vec3 n1 = geometricNormal; + vec3 n2 = geometricNormal; + if (normals) { + if constexpr (NormalsIndexed) { + n0 = normals[indexedIdx.x]; + n1 = normals[indexedIdx.y]; + n2 = normals[indexedIdx.z]; + } else { + n0 = normals[perFaceBaseIdx.x]; + n1 = normals[perFaceBaseIdx.y]; + n2 = normals[perFaceBaseIdx.z]; + } + n0 = safeNormalize(n0, geometricNormal); + n1 = safeNormalize(n1, geometricNormal); + n2 = safeNormalize(n2, geometricNormal); } - // Gram-Schmidt orthogonalize and compute handedness - vec3 t0 = normalize(tangent - n0 * dot(n0, tangent)); - float h0 = copysign(1.0f, dot(cross(n0, t0), bitangent)); + // For indexed meshes, accumulate at the shared vertex slot so adjacent + // triangles average their contributions. For triangle-soup each face-vertex + // already has a unique slot. + const glm::uvec3 outIdx = VerticesIndexed ? indexedIdx : perFaceBaseIdx; - vec3 t1 = normalize(tangent - n1 * dot(n1, tangent)); - float h1 = copysign(1.0f, dot(cross(n1, t1), bitangent)); + const float w0 = cornerAngle(p0, p1, p2); + const float w1 = cornerAngle(p1, p0, p2); + const float w2 = cornerAngle(p2, p0, p1); - vec3 t2 = normalize(tangent - n2 * dot(n2, tangent)); - float h2 = copysign(1.0f, dot(cross(n2, t2), bitangent)); + atomicAddVec3(tangentAccum[outIdx.x], tangent * w0); + atomicAddVec3(tangentAccum[outIdx.y], tangent * w1); + atomicAddVec3(tangentAccum[outIdx.z], tangent * w2); - tangents[perFaceBaseIdx.x] = glm::vec4(t0, h0); - tangents[perFaceBaseIdx.y] = glm::vec4(t1, h1); - tangents[perFaceBaseIdx.z] = glm::vec4(t2, h2); + atomicAddVec3(bitangentAccum[outIdx.x], bitangent * w0); + atomicAddVec3(bitangentAccum[outIdx.y], bitangent * w1); + atomicAddVec3(bitangentAccum[outIdx.z], bitangent * w2); + + atomicAddVec3(normalAccum[outIdx.x], n0 * w0); + atomicAddVec3(normalAccum[outIdx.y], n1 * w1); + atomicAddVec3(normalAccum[outIdx.z], n2 * w2); } template void __computeTangents( - glm::vec4 *tangents, // Output tangent vectors with handedness (w component) - glm::vec3 *bitangents, // Output bitangent vectors - const glm::uvec3 *indices, // Input triangle indices - const glm::vec3 *positions, // Input vertex positions - const glm::vec3 *normals, // Input vertex normals - const TexCoord *uvs, // Input texture coordinates - unsigned int numTriangles // Number of triangles -) + glm::vec3 *tangentAccum, + glm::vec3 *bitangentAccum, + glm::vec3 *normalAccum, + const glm::uvec3 *indices, + const glm::vec3 *positions, + const glm::vec3 *normals, + const TexCoord *uvs, + unsigned int numTriangles) { - __doComputeTangents - <<<(numTriangles + 63) / 64, 64>>>( - tangents, bitangents, indices, positions, normals, uvs, numTriangles); + __doAccumulateTangents<<<(numTriangles + 63) / 64, 64>>>(tangentAccum, + bitangentAccum, + normalAccum, + indices, + positions, + normals, + uvs, + numTriangles); +} + +// Pass 2 (one thread per vertex): normalize the accumulated frame and write +// vec4(T_orthog, sign). The accumulated normal is used as the orthogonalization +// basis; for vertex-indexed input it averages back to each vertex's authored +// normal, and for face-varying or missing normals it gives the angle-weighted +// average across incident faces. +__global__ void __doFinalizeTangents(glm::vec4 *tangents, + const glm::vec3 *tangentAccum, + const glm::vec3 *bitangentAccum, + const glm::vec3 *normalAccum, + unsigned int numVertices) +{ + unsigned int v = blockIdx.x * blockDim.x + threadIdx.x; + + if (v >= numVertices) + return; + + const vec3 T_in = tangentAccum[v]; + const vec3 B_in = bitangentAccum[v]; + const vec3 N_in = normalAccum[v]; + + const vec3 n = safeNormalize(N_in, vec3(0.0f, 0.0f, 1.0f)); + + vec3 fallbackT, fallbackB; + makeTangentFrame(n, &fallbackT, &fallbackB); + + const vec3 T_orth = + safeNormalize(T_in - n * glm::dot(n, T_in), fallbackT); + + const float bitangentSign = glm::dot(glm::cross(n, T_orth), B_in); + const float sign = bitangentSign < 0.0f ? -1.0f : 1.0f; + + tangents[v] = glm::vec4(T_orth, sign); } void updateGeometryTangent(Triangle *triangle) @@ -219,9 +329,9 @@ void updateGeometryTangent(Triangle *triangle) auto normalsFV = triangle->getParamObject("faceVarying.normal"); auto uvsFV = triangle->getParamObject("faceVarying.attribute0"); - if (!positions || (!normals && !normalsFV)) { + if (!positions) { triangle->reportMessage(ANARI_SEVERITY_INFO, - "Triangle %p has no position or normals, cannot compute tangents", + "Triangle %p has no positions, cannot compute tangents", triangle); return; } @@ -249,20 +359,73 @@ void updateGeometryTangent(Triangle *triangle) return; } - // Always go with faceVarying tangents. Rational is the following: - // - Correct UV and normal sharing is achieve through indexing - // - If faceVarying UVs/normals are used then, it should already imply correct - // sharing - // on common vertices. - - auto tangentsCount = indices ? (indices->size() * 3) : positions->size(); - auto trianglesCount = indices ? indices->size() : positions->size() / 3; - glm::vec4 *tangents = {}; - cudaMalloc(&tangents, sizeof(glm::vec4) * tangentsCount); - cudaMemset(tangents, 0, sizeof(glm::vec4) * tangentsCount); - glm::vec3 *bitangents = {}; - cudaMalloc(&bitangents, sizeof(glm::vec3) * tangentsCount); - cudaMemset(bitangents, 0, sizeof(glm::vec3) * tangentsCount); + // Output is per-vertex (vertex.tangent). For indexed meshes the per-vertex + // buffer is what lets adjacent triangles share tangent data at common + // vertices — that sharing is what eliminates the per-triangle facets a + // face-varying buffer would produce. For triangle-soup input each face-vertex + // is its own slot, so the same layout works without changes. + const auto numVertices = static_cast(positions->size()); + const auto trianglesCount = static_cast( + indices ? indices->size() : positions->size() / 3); + if (trianglesCount == 0 || numVertices == 0) { + triangle->reportMessage(ANARI_SEVERITY_INFO, + "Triangle %p has no triangles, cannot compute tangents", + triangle); + return; + } + + glm::vec3 *tangentAccum = nullptr; + glm::vec3 *bitangentAccum = nullptr; + glm::vec3 *normalAccum = nullptr; + glm::vec4 *tangents = nullptr; + + auto cleanup = [&] { + cudaFree(tangentAccum); + cudaFree(bitangentAccum); + cudaFree(normalAccum); + cudaFree(tangents); + }; + + auto status = cudaMalloc(reinterpret_cast(&tangentAccum), + sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "allocating tangent accumulator")) { + cleanup(); + return; + } + status = cudaMalloc(reinterpret_cast(&bitangentAccum), + sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "allocating bitangent accumulator")) { + cleanup(); + return; + } + status = cudaMalloc(reinterpret_cast(&normalAccum), + sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "allocating normal accumulator")) { + cleanup(); + return; + } + status = cudaMalloc( + reinterpret_cast(&tangents), sizeof(glm::vec4) * numVertices); + if (reportCudaError(triangle, status, "allocating tangent output buffer")) { + cleanup(); + return; + } + + status = cudaMemset(tangentAccum, 0, sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "clearing tangent accumulator")) { + cleanup(); + return; + } + status = cudaMemset(bitangentAccum, 0, sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "clearing bitangent accumulator")) { + cleanup(); + return; + } + status = cudaMemset(normalAccum, 0, sizeof(glm::vec3) * numVertices); + if (reportCudaError(triangle, status, "clearing normal accumulator")) { + cleanup(); + return; + } auto positionsPtr = positions->dataAs(AddressSpace::GPU); if (indices) { @@ -273,8 +436,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, face varying normals and face varyings vec2 UVs. - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -283,8 +447,9 @@ void updateGeometryTangent(Triangle *triangle) } else { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, face varying normals and face varyings vec3 UVs. - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -295,8 +460,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvs->elementType() == ANARI_FLOAT32_VEC2) { // Vertex indexed, face varying normals and indexed vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -305,8 +471,9 @@ void updateGeometryTangent(Triangle *triangle) } else { // Vertex indexed, face varying normals and indexed vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -315,13 +482,16 @@ void updateGeometryTangent(Triangle *triangle) } } } else { - auto normalsPtr = normals->dataAs(AddressSpace::GPU); + const auto *normalsPtr = normals + ? normals->dataAs(AddressSpace::GPU) + : nullptr; if (uvsFV) { if (uvsFV->elementType() == ANARI_FLOAT32_VEC2) { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, index normals and face varyings vec2 UVs. - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -330,8 +500,9 @@ void updateGeometryTangent(Triangle *triangle) } else { auto uvsPtr = uvsFV->dataAs(AddressSpace::GPU); // Vertex indexed, indexed normals and face varyings vec3 UVs. - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -342,8 +513,9 @@ void updateGeometryTangent(Triangle *triangle) if (uvs->elementType() == ANARI_FLOAT32_VEC2) { // Vertex indexed, indexed normals and indexed vec2 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -352,8 +524,9 @@ void updateGeometryTangent(Triangle *triangle) } else { // Vertex indexed, indexed normals and indexed vec3 UVs. auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, - bitangents, + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -363,17 +536,20 @@ void updateGeometryTangent(Triangle *triangle) } } } else { - auto indicesPtr = nullptr; - normals = normalsFV ? normalsFV : normals; - uvs = uvsFV ? uvsFV : uvs; + const glm::uvec3 *indicesPtr = nullptr; + auto effectiveNormals = normalsFV ? normalsFV : normals; + auto effectiveUvs = uvsFV ? uvsFV : uvs; - auto normalsPtr = normals->dataAs(AddressSpace::GPU); + const auto *normalsPtr = effectiveNormals + ? effectiveNormals->dataAs(AddressSpace::GPU) + : nullptr; - if (uvs->elementType() == ANARI_FLOAT32_VEC2) { + if (effectiveUvs->elementType() == ANARI_FLOAT32_VEC2) { // Non indexed vertices, face varying normals and face varyings vec2 UVs. - auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, - bitangents, + auto uvsPtr = effectiveUvs->dataAs(AddressSpace::GPU); + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -381,9 +557,10 @@ void updateGeometryTangent(Triangle *triangle) trianglesCount); } else { // Non indexed vertices, face varying normals and face varyings vec3 UVs. - auto uvsPtr = uvs->dataAs(AddressSpace::GPU); - __computeTangents(tangents, - bitangents, + auto uvsPtr = effectiveUvs->dataAs(AddressSpace::GPU); + __computeTangents(tangentAccum, + bitangentAccum, + normalAccum, indicesPtr, positionsPtr, normalsPtr, @@ -392,26 +569,47 @@ void updateGeometryTangent(Triangle *triangle) } } - // Release transient bitangent store - cudaFree(bitangents); + status = cudaGetLastError(); + if (reportCudaError(triangle, status, "launching accumulate kernel")) { + cleanup(); + return; + } + + __doFinalizeTangents<<<(numVertices + 63) / 64, 64>>>( + tangents, tangentAccum, bitangentAccum, normalAccum, numVertices); + + status = cudaGetLastError(); + if (reportCudaError(triangle, status, "launching finalize kernel")) { + cleanup(); + return; + } + + status = cudaDeviceSynchronize(); + cudaFree(tangentAccum); + cudaFree(bitangentAccum); + cudaFree(normalAccum); + tangentAccum = nullptr; + bitangentAccum = nullptr; + normalAccum = nullptr; + if (reportCudaError(triangle, status, "computing tangents")) { + cleanup(); + return; + } auto desc = Array1DMemoryDescriptor{ { tangents, - {}, // deleter + cudaFreeMemoryDeleter, // deleter {}, // deleterPtr ANARI_FLOAT32_VEC4, }, - tangentsCount, + numVertices, }; auto tangentsArray = new Array1D(triangle->deviceState(), desc); tangentsArray->commitParameters(); tangentsArray->finalize(); - if (indices) - triangle->setParam("faceVarying.tangent", tangentsArray); - else - triangle->setParam("vertex.tangent", tangentsArray); + triangle->setParam("vertex.tangent", tangentsArray); triangle->commitParameters(); triangle->finalize(); diff --git a/devices/rtx/device/geometry/Geometry.cpp b/devices/rtx/device/geometry/Geometry.cpp index 127eb865e..e39552c72 100644 --- a/devices/rtx/device/geometry/Geometry.cpp +++ b/devices/rtx/device/geometry/Geometry.cpp @@ -129,7 +129,7 @@ void Geometry::commitParameters() void Geometry::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp(); } GeometryGPUData Geometry::gpuData() const diff --git a/devices/rtx/device/geometry/Triangle.cpp b/devices/rtx/device/geometry/Triangle.cpp index 4369243f3..52aa41216 100644 --- a/devices/rtx/device/geometry/Triangle.cpp +++ b/devices/rtx/device/geometry/Triangle.cpp @@ -78,12 +78,14 @@ void Triangle::finalize() m_vertex->size()); } - if (m_vertexNormalFV && 3 * m_index->size() != m_vertexNormalFV->size()) { + const size_t numTriangles = m_index ? m_index->size() : m_vertex->size() / 3; + + if (m_vertexNormalFV && 3 * numTriangles != m_vertexNormalFV->size()) { reportMessage(ANARI_SEVERITY_WARNING, "'faceVarying.normal' on triangle geometry is not matching " "the number of triangles in 'primitive.index' (%zu) vs. (%zu)", m_vertexNormalFV->size(), - m_index->size()); + numTriangles); } if (m_vertexTangent && m_vertex->size() != m_vertexTangent->size()) { @@ -94,12 +96,12 @@ void Triangle::finalize() m_vertex->size()); } - if (m_vertexTangentFV && 3 * m_index->size() != m_vertexTangentFV->size()) { + if (m_vertexTangentFV && 3 * numTriangles != m_vertexTangentFV->size()) { reportMessage(ANARI_SEVERITY_WARNING, "'faceVarying.Tangent' on triangle geometry is not matching " "the number of triangles in 'primitive.index' (%zu) vs. (%zu)", m_vertexTangentFV->size(), - m_index->size()); + numTriangles); } if (!m_vertexTangent && !m_vertexTangentFV) { diff --git a/devices/rtx/device/gpu/computeAO.h b/devices/rtx/device/gpu/computeAO.h index 77861164b..3d55c3c57 100644 --- a/devices/rtx/device/gpu/computeAO.h +++ b/devices/rtx/device/gpu/computeAO.h @@ -45,7 +45,7 @@ VISRTX_DEVICE float computeAO(ScreenSample &ss, float weights = 0.0f; float hits = 0.0f; Ray aoRay; - aoRay.org = currentHit.hitpoint + currentHit.Ns * currentHit.epsilon; + aoRay.org = shadingHitpoint(currentHit) + currentHit.Ng * currentHit.epsilon; aoRay.t.lower = currentHit.epsilon; aoRay.t.upper = dist; diff --git a/devices/rtx/device/gpu/gpu_math.h b/devices/rtx/device/gpu/gpu_math.h index 0b9692792..83dddd432 100644 --- a/devices/rtx/device/gpu/gpu_math.h +++ b/devices/rtx/device/gpu/gpu_math.h @@ -326,4 +326,14 @@ VISRTX_HOST_DEVICE vec3 xfmPoint(const mat4 &m, const vec3 &p) return m * vec4(p, 1.0f); } +// mat3x4 stores OptiX-style affine rows: glm column i = OptiX row i = +// (m_{i,0}, m_{i,1}, m_{i,2}, m_{i,3}). The transform is +// out_i = sum_j m_{i,j} * p_j + m_{i,3}. +VISRTX_HOST_DEVICE vec3 xfmPoint(const mat3x4 &m, const vec3 &p) +{ + return vec3(glm::dot(vec3(m[0]), p) + m[0].w, + glm::dot(vec3(m[1]), p) + m[1].w, + glm::dot(vec3(m[2]), p) + m[2].w); +} + } // namespace visrtx diff --git a/devices/rtx/device/gpu/gpu_util.h b/devices/rtx/device/gpu/gpu_util.h index 9e17c44ba..e1b8523f5 100644 --- a/devices/rtx/device/gpu/gpu_util.h +++ b/devices/rtx/device/gpu/gpu_util.h @@ -195,18 +195,14 @@ VISRTX_DEVICE vec3 boolColor(bool pred) return pred ? vec3(0.f, 1.f, 0.f) : vec3(1.f, 0.f, 0.f); } +// Uniform on the unit sphere via Marsaglia (1972); pdf = 1/(4*pi). +// Downstream uses: isotropic volume scatter, AO/bounce hemisphere base. VISRTX_DEVICE vec3 randomDir(RandState &rs) { -#if 0 - const float r1 = curand_uniform(&rs); - const float r2 = curand_uniform(&rs); - return normalize(vec3(cos(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)), - sin(2 * float(M_PI) * r1) * sqrt(1 - (r2 * r2)), - r2 * r2)); -#else - const auto r = curand_uniform4(&rs); - return normalize((2.f * vec3(r.x, r.y, r.z)) - vec3(1.f)); -#endif + const float cosTheta = 1.f - 2.f * curand_uniform(&rs); + const float sinTheta = sqrtf(fmaxf(0.f, 1.f - cosTheta * cosTheta)); + const float phi = 2.f * float(M_PI) * curand_uniform(&rs); + return vec3(sinTheta * cosf(phi), sinTheta * sinf(phi), cosTheta); } VISRTX_DEVICE vec3 randomDir(RandState &rs, const vec3 &normal) @@ -228,14 +224,15 @@ VISRTX_DEVICE mat3 computeOrthonormalBasis(const vec3 &normal) return mat3(u, v, normal); } +// Cosine-weighted hemisphere sample (Malley's method); pdf = cos(theta)/pi. VISRTX_DEVICE vec3 sampleHemisphere(RandState &rs, const vec3 &normal) { - auto z = curand_uniform(&rs); - auto r = sqrtf(1.f - sqrt(z)); - auto phi = 2.0f * float(M_PI) * curand_uniform(&rs); - - auto sample = vec3(r * cos(phi), r * sin(phi), z); - + const float u1 = curand_uniform(&rs); + const float u2 = curand_uniform(&rs); + const float r = sqrtf(u1); + const float z = sqrtf(fmaxf(0.f, 1.f - r * r)); + const float phi = 2.f * float(M_PI) * u2; + const vec3 sample(r * cosf(phi), r * sinf(phi), z); return computeOrthonormalBasis(normal) * sample; } @@ -252,9 +249,90 @@ VISRTX_DEVICE vec3 sampleUnitSphere(RandState &rs, const vec3 &normal) #define ulpEpsilon 0x1.fp-21 -VISRTX_DEVICE float epsilonFrom(const vec3 &P, const vec3 &dir, float t) +VISRTX_DEVICE float epsilonFrom(const vec3 &P) { - return glm::compMax(vec4(abs(P), glm::compMax(abs(dir)) * t)) * ulpEpsilon; + return glm::compMax(abs(P)) * ulpEpsilon; +} + +// Hanika's shadow-terminator fix (Ray Tracing Gems II, ch. 4): lifts a +// triangle hit point onto the smooth surface implied by per-vertex normals. +// Without this, grazing-angle shadow rays self-occlude on the planar facet +// and produce dark bands shaped like the underlying tessellation. All inputs +// must share a coordinate space. +VISRTX_DEVICE vec3 shadowTerminatorOffset(const vec3 &P, + const vec3 &v0, + const vec3 &v1, + const vec3 &v2, + const vec3 &n0, + const vec3 &n1, + const vec3 &n2, + const vec3 &bary) +{ + const float du = glm::dot(P - v0, n0); + const float dv = glm::dot(P - v1, n1); + const float dw = glm::dot(P - v2, n2); + const vec3 lu = du < 0.f ? -du * n0 : vec3(0.f); + const vec3 lv = dv < 0.f ? -dv * n1 : vec3(0.f); + const vec3 lw = dw < 0.f ? -dw * n2 : vec3(0.f); + return P + bary.x * lu + bary.y * lv + bary.z * lw; +} + +// World-space hit position lifted onto the smooth surface implied by +// per-vertex normals (Hanika shadow-terminator fix). Use this as the origin +// for direct-light/AO shadow rays so grazing-angle queries do not self-shadow +// the planar facet. Do NOT use it for path-continuation rays — transmission +// especially needs the original facet point, since the smoothed point can sit +// far enough above the facet that an "into-the-surface" offset still ends up +// outside the volume. +VISRTX_DEVICE vec3 shadingHitpoint(const SurfaceHit &hit) +{ + if (hit.geometry == nullptr || hit.geometry->type != GeometryType::TRIANGLE) + return hit.hitpoint; + + const auto &tri = hit.geometry->tri; + if (tri.vertexNormalsFV == nullptr && tri.vertexNormals == nullptr) + return hit.hitpoint; + + const uvec3 idx = tri.indices ? tri.indices[hit.primID] + : uvec3(0, 1, 2) + hit.primID * 3; + const vec3 v0 = tri.vertices[idx.x]; + const vec3 v1 = tri.vertices[idx.y]; + const vec3 v2 = tri.vertices[idx.z]; + + vec3 n0, n1, n2; + if (tri.vertexNormalsFV != nullptr) { + const uvec3 nidx = uvec3(0, 1, 2) + hit.primID * 3; + n0 = tri.vertexNormalsFV[nidx.x]; + n1 = tri.vertexNormalsFV[nidx.y]; + n2 = tri.vertexNormalsFV[nidx.z]; + } else { + n0 = tri.vertexNormals[idx.x]; + n1 = tri.vertexNormals[idx.y]; + n2 = tri.vertexNormals[idx.z]; + } + + // Hanika's tangent-plane projection assumes unit normals; user data is + // not guaranteed to be normalized. + n0 = normalize(n0); + n1 = normalize(n1); + n2 = normalize(n2); + + // populateHit.h flips hit.Ng/Ns for back-face hits so they point toward + // the ray origin. The per-vertex normals here are still in the original + // outward orientation; flip them too so the smooth surface bulges onto + // the ray-origin side of the facet (otherwise Hanika lifts P away from + // the ray origin and the trailing `+ Ng * epsilon` can land below the + // facet). + if (!hit.isFrontFace) { + n0 = -n0; + n1 = -n1; + n2 = -n2; + } + + const vec3 Plocal = xfmPoint(hit.worldToObject, hit.hitpoint); + const vec3 Psmooth = + shadowTerminatorOffset(Plocal, v0, v1, v2, n0, n1, n2, hit.uvw); + return xfmPoint(hit.objectToWorld, Psmooth); } VISRTX_DEVICE bool pixelOutOfFrame( diff --git a/devices/rtx/device/gpu/populateHit.h b/devices/rtx/device/gpu/populateHit.h index 4930c05e5..c5c649f5f 100644 --- a/devices/rtx/device/gpu/populateHit.h +++ b/devices/rtx/device/gpu/populateHit.h @@ -197,9 +197,11 @@ VISRTX_DEVICE const VolumeGPUData &volumeData(const FrameGPUData &frameData) return frameData.registry.volumes[idx]; } -VISRTX_DEVICE const SpatialFieldGPUData &fieldData(const FrameGPUData &frameData, const VolumeGPUData &volumeData) +VISRTX_DEVICE const SpatialFieldGPUData &fieldData( + const FrameGPUData &frameData, const VolumeGPUData &volumeData) { - // Currently only TF1D volume type is supported, so assume this is what we have + // Currently only TF1D volume type is supported, so assume this is what we + // have return frameData.registry.fields[volumeData.data.tf1d.field]; } @@ -223,40 +225,56 @@ VISRTX_DEVICE void computeTangentSpace( if (!optixIsFrontFaceHit()) hit.Ng = -hit.Ng; + vec3 n0, n1, n2; + bool hasVertexNormals = true; if (ggd.tri.vertexNormalsFV != nullptr) { - const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3); - - const vec3 n0 = ggd.tri.vertexNormalsFV[idx.x]; - const vec3 n1 = ggd.tri.vertexNormalsFV[idx.y]; - const vec3 n2 = ggd.tri.vertexNormalsFV[idx.z]; - hit.Ns = b.x * n0 + b.y * n1 + b.z * n2; + const uvec3 nidx = uvec3(0, 1, 2) + (hit.primID * 3); + n0 = ggd.tri.vertexNormalsFV[nidx.x]; + n1 = ggd.tri.vertexNormalsFV[nidx.y]; + n2 = ggd.tri.vertexNormalsFV[nidx.z]; } else if (ggd.tri.vertexNormals != nullptr) { - const vec3 n0 = ggd.tri.vertexNormals[idx.x]; - const vec3 n1 = ggd.tri.vertexNormals[idx.y]; - const vec3 n2 = ggd.tri.vertexNormals[idx.z]; + n0 = ggd.tri.vertexNormals[idx.x]; + n1 = ggd.tri.vertexNormals[idx.y]; + n2 = ggd.tri.vertexNormals[idx.z]; + } else { + hasVertexNormals = false; + } + + if (hasVertexNormals) hit.Ns = b.x * n0 + b.y * n1 + b.z * n2; - } else + else hit.Ns = hit.Ng; hit.Ns = normalize(hit.Ns); - if (ggd.tri.vertexTangentsFV != nullptr) { - const uvec3 idx = uvec3(0, 1, 2) + (hit.primID * 3); + const bool hasTangentsFV = ggd.tri.vertexTangentsFV != nullptr; + const bool hasTangentsV = ggd.tri.vertexTangents != nullptr; + if (hasTangentsFV || hasTangentsV) { + const uvec3 tIdx = hasTangentsFV + ? uvec3(0, 1, 2) + (hit.primID * 3) + : idx; + const vec4 *tArr = hasTangentsFV + ? ggd.tri.vertexTangentsFV + : ggd.tri.vertexTangents; + const vec4 t0 = tArr[tIdx.x]; + const vec4 t1 = tArr[tIdx.y]; + const vec4 t2 = tArr[tIdx.z]; + + // At UV mirror seams the sign flips between adjacent vertices; + // barycentric-summing the signs and applying a single sign at the + // hit point would carve seam edges into the tangent frame. + // Build each vertex's bitangent with its own sign and normal, + // then blend B and T independently — same convention + // as glTF Sample Renderer, PBRT, Filament. + const vec3 N0 = hasVertexNormals ? n0 : hit.Ng; + const vec3 N1 = hasVertexNormals ? n1 : hit.Ng; + const vec3 N2 = hasVertexNormals ? n2 : hit.Ng; + const vec3 B0 = t0.w * cross(N0, vec3(t0)); + const vec3 B1 = t1.w * cross(N1, vec3(t1)); + const vec3 B2 = t2.w * cross(N2, vec3(t2)); - const vec3 t0 = ggd.tri.vertexTangentsFV[idx.x]; - const vec3 t1 = ggd.tri.vertexTangentsFV[idx.y]; - const vec3 t2 = ggd.tri.vertexTangentsFV[idx.z]; - const float handedness = ggd.tri.vertexTangentsFV[idx.x].w; hit.tU = normalize(b.x * vec3(t0) + b.y * vec3(t1) + b.z * vec3(t2)); - hit.tV = handedness * normalize(cross(hit.Ns, hit.tU)); - } else if (ggd.tri.vertexTangents != nullptr) { - const vec3 t0 = ggd.tri.vertexTangents[idx.x]; - const vec3 t1 = ggd.tri.vertexTangents[idx.y]; - const vec3 t2 = ggd.tri.vertexTangents[idx.z]; - const float handedness = ggd.tri.vertexTangents[idx.x].w; - - hit.tU = normalize(b.x * t0 + b.y * t1 + b.z * t2); - hit.tV = handedness * normalize(cross(hit.Ns, hit.tU)); + hit.tV = normalize(b.x * B0 + b.y * B1 + b.z * B2); } else { auto tangentSpace = computeOrthonormalBasis(hit.Ng); hit.tU = tangentSpace[0]; @@ -383,7 +401,7 @@ VISRTX_DEVICE void populateSurfaceHit(SurfaceHit &hit) hit.primID = ray::primID(); hit.objID = sd.id; hit.instID = isd.id; - hit.epsilon = epsilonFrom(ray::hitpoint(), ray::direction(), ray::t()); + hit.epsilon = epsilonFrom(hit.hitpoint); ray::computeTangentSpace(gd, ray::primID(), hit); const auto &handle = optixGetTransformListHandle(0); diff --git a/devices/rtx/device/gpu/renderer/raygen_helpers.h b/devices/rtx/device/gpu/renderer/raygen_helpers.h index 5f65d6405..81bfc7bd8 100644 --- a/devices/rtx/device/gpu/renderer/raygen_helpers.h +++ b/devices/rtx/device/gpu/renderer/raygen_helpers.h @@ -61,15 +61,6 @@ VISRTX_DEVICE float volumeAttenuation(ScreenSample &ss, const Ray &r) return attenuation; } -// Evaluate opacity including transmission -VISRTX_DEVICE float evaluateOpacity(const MaterialShadingState &shadingState) -{ - return materialEvaluateOpacity(shadingState) - * (1.0f - - glm::luminosity(materialEvaluateTransmission(shadingState)) - * 0.85f); -} - // Templated rendering loop // ShadingPolicy must implement: // static VISRTX_DEVICE vec4 shadeSurface( diff --git a/devices/rtx/device/gpu/sampleLight.h b/devices/rtx/device/gpu/sampleLight.h index 14bbe64cf..2fe5f5013 100644 --- a/devices/rtx/device/gpu/sampleLight.h +++ b/devices/rtx/device/gpu/sampleLight.h @@ -339,7 +339,7 @@ VISRTX_DEVICE LightSample sampleHDRILight( ls.dir = xfmVec(xfm, dir); ls.dist = std::numeric_limits::infinity(); // Environment is at infinity - ls.radiance = radiance * ld.hdri.scale; + ls.radiance = radiance * ld.hdri.scale * ld.color; ls.pdf = pdf; return ls; @@ -384,7 +384,7 @@ VISRTX_DEVICE LightSample sampleHDRILight( // instead of explicitly transposing/inverting the matrix ls.dir = xfmVec(xfm, sphericalCoordsToDirection(thetaPhi) * ld.hdri.xfm); ls.dist = 1e20f; // Environment is effectively at infinity - ls.radiance = radiance * ld.hdri.scale; + ls.radiance = radiance * ld.hdri.scale * ld.color; ls.pdf = pdf; return ls; diff --git a/devices/rtx/device/gpu/shadingState.h b/devices/rtx/device/gpu/shadingState.h index 1831d5436..e85ec911d 100644 --- a/devices/rtx/device/gpu/shadingState.h +++ b/devices/rtx/device/gpu/shadingState.h @@ -80,7 +80,10 @@ struct PhysicallyBasedShadingState float metallic; float roughness; float transmission; - float ior; + // Refraction ratio n1/n2 from the incident side: init() stores 1/md->ior + // for front-facing hits and md->ior for back-facing hits, so this can be + // plugged directly into glm::refract and Schlick's F0 formula. + float eta; vec3 emission; float occlusion; diff --git a/devices/rtx/device/material/shaders/MatteShader_ptx.cu b/devices/rtx/device/material/shaders/MatteShader_ptx.cu index 06429e5fa..371a4cb1c 100644 --- a/devices/rtx/device/material/shaders/MatteShader_ptx.cu +++ b/devices/rtx/device/material/shaders/MatteShader_ptx.cu @@ -48,6 +48,12 @@ VISRTX_CALLABLE void __direct_callable__init(MatteShadingState *shadingState, shadingState->normal = hit->Ns; shadingState->opacity = adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff); + + // Fall back to the geometric normal if hit->Ns is NaN (e.g. coincident + // curve control points) or zero-length. Negated comparison catches both + // since NaN compares false to anything. + if (!(glm::dot(shadingState->normal, shadingState->normal) > 1e-12f)) + shadingState->normal = hit->Ng; } VISRTX_CALLABLE NextRay __direct_callable__nextRay( diff --git a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu index 5e710b729..593457ed3 100644 --- a/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu +++ b/devices/rtx/device/material/shaders/PhysicallyBasedShader_ptx.cu @@ -65,7 +65,11 @@ VISRTX_DEVICE vec3 sampleNormalMap(const FrameGPUData &fd, if (samplerIdx == ~visrtx::DeviceObjectIndex{0}) return fallback; const vec3 ts = normalize(evaluateSampler(fd, samplerIdx, hit) * 2.0f - 1.0f); - return applyNormalMap(ts, hit, hit.Ns); + const vec3 N = applyNormalMap(ts, hit, hit.Ns); + // Negated comparison catches NaN (zero-decoded texels, zero-summed tangents) + // as well as zero-length results — fall back to the geometric normal so the + // shading frame is always usable. + return (dot(N, N) > 1e-12f) ? N : hit.Ng; } VISRTX_DEVICE float luminance(const vec3 &c) @@ -116,7 +120,13 @@ VISRTX_DEVICE float smithG1GGX(float NdotV, float alpha2) VISRTX_DEVICE float ggxD(float NdotH, float alpha2) { - const float denom = NdotH * NdotH * (alpha2 - 1.0f) + 1.0f; + // The textbook denom `x·(α²−1) + 1` cancels catastrophically in fp32 once + // α² is below eps(1) ≈ 1.19e-7 (our α² floor is 1e-8): `α²−1` rounds to + // exactly −1, and at x=1 the whole denom collapses to 0. The algebraically + // equivalent `α²·x + (1−x)` has no near-1 subtraction so it stays exact. + // The fminf clamp keeps `1−x ≥ 0` against dot-product rounding above 1. + const float NdotH2 = fminf(NdotH * NdotH, 1.0f); + const float denom = alpha2 * NdotH2 + (1.0f - NdotH2); return alpha2 / (float(M_PI) * denom * denom); } @@ -276,7 +286,7 @@ VISRTX_CALLABLE void __direct_callable__init( shadingState->opacity = adjustedMaterialOpacity(color.w * opacity, md->alphaMode, md->cutoff); - shadingState->ior = hit->isFrontFace ? 1.0f / md->ior : md->ior; + shadingState->eta = hit->isFrontFace ? 1.0f / md->ior : md->ior; shadingState->metallic = getMaterialParameter(*fd, md->metallic, *hit).x; shadingState->roughness = getMaterialParameter(*fd, md->roughness, *hit).x; shadingState->emission = vec3(getMaterialParameter(*fd, md->emissive, *hit)); @@ -355,7 +365,7 @@ VISRTX_CALLABLE vec3 __direct_callable__evaluateNormal( VISRTX_DEVICE vec3 computeDielectricF0(const PhysicallyBasedShadingState *state) { - const float iorF0 = pow2((1.0f - state->ior) / (1.0f + state->ior)); + const float iorF0 = pow2((1.0f - state->eta) / (1.0f + state->eta)); if (state->useSpecular == 0) return vec3(iorF0); return glm::min(vec3(iorF0) * state->specularColor, vec3(1.0f)) @@ -379,6 +389,21 @@ VISRTX_DEVICE vec3 schlickFresnel(vec3 F0, vec3 F90, float VdotH) return F0 + (F90 - F0) * pow5(1.0f - fabsf(VdotH)); } +VISRTX_DEVICE vec3 evalFresnelWithIridescence( + const PhysicallyBasedShadingState *state, + const vec3 &F0, + const vec3 &F90, + float cosTheta) +{ + vec3 F = schlickFresnel(F0, F90, cosTheta); + if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) { + const vec3 iridescent = evalIridescence( + 1.0f, state->iridescenceIor, cosTheta, state->iridescenceThickness, F0); + F = glm::mix(F, iridescent, state->iridescence); + } + return F; +} + VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( const PhysicallyBasedShadingState *state, const SurfaceHit *hit, @@ -390,7 +415,9 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( const vec3 L = lightSample->dir; const float NdotL = dot(N, L); - if (NdotL <= 0.0f) + // Negated form so a NaN NdotL takes this early-out — NaN compares false + // to everything, so `NdotL <= 0.0f` would let it pass through. + if (!(NdotL > 0.0f)) return vec3(0.0f); const vec3 H = normalize(L + V); @@ -398,15 +425,13 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( const float NdotV = fmaxf(dot(N, V), 1e-6f); const float VdotH = fmaxf(dot(V, H), 0.0f); - // Base F0 / F90, optionally overridden by iridescence. - vec3 F0 = computeF0(state); - vec3 F90 = computeF90(state); - vec3 F = schlickFresnel(F0, F90, VdotH); - if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) { - const vec3 iridescent = evalIridescence( - 1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0); - F = glm::mix(F, iridescent, state->iridescence); - } + // Base F0 / F90. Specular uses Fresnel at the microfacet (VdotH); the + // diffuse weight uses Fresnel at NdotV (Frostbite/Disney convention) so + // shadeSurface and nextRay's diffuse split agree regardless of light dir. + const vec3 F0 = computeF0(state); + const vec3 F90 = computeF90(state); + const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH); + const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV); // Base GGX specular lobe. const float alpha = fmaxf(pow2(state->roughness), 1e-4f); @@ -419,7 +444,7 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( // and transmission; metals have no diffuse). const vec3 diffuseColor = glm::mix(state->baseColor, vec3(0.0f), state->metallic); - const vec3 diffuseBRDF = (vec3(1.0f) - F) * float(M_1_PI) * diffuseColor + const vec3 diffuseBRDF = (vec3(1.0f) - Fdiff) * float(M_1_PI) * diffuseColor * state->occlusion * (1.0f - state->transmission); vec3 base = diffuseBRDF + specularBRDF; @@ -464,8 +489,8 @@ VISRTX_CALLABLE vec3 __direct_callable__shadeSurface( //----------------------------------------------------------------------------- // Next-ray importance sampling: stochastic alpha, Fresnel-aware lobe pick, -// GGX VNDF reflection/refraction. Clearcoat/sheen are NEE-only (no separate -// lobe sampling), which matches what the base renderer is set up to consume. +// GGX VNDF reflection/refraction, plus a clearcoat lobe sampled with +// probability equal to its view-angle Fresnel weight. Sheen is NEE-only. //----------------------------------------------------------------------------- VISRTX_CALLABLE NextRay __direct_callable__nextRay( @@ -475,8 +500,55 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( if (curand_uniform(rs) > state->opacity) return NextRay{ray->dir, vec3(1.0f), NEXT_RAY_CONTINUES_THROUGH_SURFACE}; - const vec3 N = state->normal; const vec3 V = -ray->dir; + + // Clearcoat lobe: pick it with probability `clearcoat·FcV(NcDotV)`. This + // exact weight makes the entry-side attenuation `1 - clearcoat·FcV` cancel + // the `1/(1-pick)` lobe-pick divisor in the base path below, so the base + // returns only need the exit-side `1 - clearcoat·FcL` multiplier. + const vec3 Nc = state->clearcoatNormal; + const float NcDotV_world = fmaxf(dot(Nc, V), 0.0f); + const float FcV_world = + CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotV_world); + const float clearcoatPick = + glm::clamp(state->clearcoat * FcV_world, 0.0f, 1.0f); + + if (clearcoatPick > 0.0f && curand_uniform(rs) < clearcoatPick) { + const mat3 toWorldC = computeOrthonormalBasis(Nc); + const vec3 VlocalC = glm::transpose(toWorldC) * V; + if (VlocalC.z <= 0.0f) + return NextRay{Nc, vec3(0.0f)}; + const float alphaC = fmaxf(pow2(state->clearcoatRoughness), 1e-4f); + const float alphaC2 = alphaC * alphaC; + const vec3 HlocalC = sampleGGXVNDF( + VlocalC, alphaC, curand_uniform(rs), curand_uniform(rs)); + const vec3 LlocalC = glm::reflect(-VlocalC, HlocalC); + if (LlocalC.z <= 0.0f) + return NextRay{Nc, vec3(0.0f)}; + const float VdotHc = fmaxf(dot(VlocalC, HlocalC), 0.0f); + const float Fc = + CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - VdotHc); + const float G1c = smithG1GGX(VlocalC.z, alphaC2); + const float G2c = smithG2GGX(VlocalC.z, LlocalC.z, alphaC2); + // VNDF gives BRDF·cos/pdf = clearcoat·Fc·G2/G1; the clearcoat factor + // cancels against the matching factor in clearcoatPick. + const vec3 weight = vec3(state->clearcoat * Fc * G2c / fmaxf(G1c, 1e-8f)) + / fmaxf(clearcoatPick, 1e-8f); + return NextRay{normalize(toWorldC * LlocalC), weight}; + } + + // Exit-side clearcoat attenuation, applied to every base-path return. + // `fabsf` handles the transmission case where L points through the surface. + auto clearcoatExitAttn = [&](const vec3 &Lworld) -> float { + if (state->clearcoat <= 0.0f) + return 1.0f; + const float NcDotL = fabsf(dot(Nc, Lworld)); + const float FcL = + CLEARCOAT_F0 + (1.0f - CLEARCOAT_F0) * pow5(1.0f - NcDotL); + return glm::clamp(1.0f - state->clearcoat * FcL, 0.0f, 1.0f); + }; + + const vec3 N = state->normal; const mat3 toWorld = computeOrthonormalBasis(N); const mat3 toLocal = glm::transpose(toWorld); const vec3 Vlocal = toLocal * V; @@ -491,19 +563,15 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( const float NdotV = Vlocal.z; const float VdotH = fmaxf(dot(Vlocal, Hlocal), 0.0f); - // Fresnel at the sampled microfacet, with optional iridescence. + // Fresnel at the sampled microfacet (specular/transmission split) and at + // NdotV (diffuse weight) — matches the convention in shadeSurface. const vec3 F0 = computeF0(state); const vec3 F90 = computeF90(state); - vec3 F = schlickFresnel(F0, F90, VdotH); - if (state->iridescence > 0.0f && state->iridescenceThickness > 0.0f) { - const vec3 iridescent = evalIridescence( - 1.0f, state->iridescenceIor, VdotH, state->iridescenceThickness, F0); - F = glm::mix(F, iridescent, state->iridescence); - } + const vec3 F = evalFresnelWithIridescence(state, F0, F90, VdotH); + const vec3 Fdiff = evalFresnelWithIridescence(state, F0, F90, NdotV); const vec3 Lrefl = glm::reflect(-Vlocal, Hlocal); - const float eta = state->ior; // init() pre-inverted for front-facing hits - const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, eta); + const vec3 Ltrans = glm::refract(-Vlocal, Hlocal, state->eta); const vec3 transmissionFilter = computeTransmissionFilter(state); const bool hasTransmission = luminance(transmissionFilter) > 0.0f; const bool totalInternalReflection = @@ -514,36 +582,60 @@ VISRTX_CALLABLE NextRay __direct_callable__nextRay( ? vec3(0.0f) : glm::max(vec3(1.0f) - F, vec3(0.0f)) * transmissionFilter; + // Diffuse importance: the Lambertian throughput collapses to + // (1-F) * baseColor * (1-metallic) * (1-transmission) * occlusion + // when sampled cosine-weighted (cos / pdf cancels with 1/pi). Mirror the + // factors used by shadeSurface's diffuseBRDF so the lobe split tracks the + // BRDF being estimated. TIR has no diffuse share (all energy is reflected). + const vec3 diffuseEnergy = totalInternalReflection + ? vec3(0.0f) + : glm::max(vec3(1.0f) - Fdiff, vec3(0.0f)) * state->baseColor + * (1.0f - state->metallic) * (1.0f - state->transmission) + * state->occlusion; + const float reflectStrength = fmaxf(luminance(glm::max(reflectEnergy, vec3(0.0f))), 0.0f); const float transmitStrength = fmaxf(luminance(glm::max(transmitEnergy, vec3(0.0f))), 0.0f); - const float combinedStrength = reflectStrength + transmitStrength; + const float diffuseStrength = + fmaxf(luminance(glm::max(diffuseEnergy, vec3(0.0f))), 0.0f); + const float combinedStrength = + reflectStrength + transmitStrength + diffuseStrength; if (combinedStrength <= 0.0f) return NextRay{N, vec3(0.0f)}; const float reflectProb = reflectStrength / combinedStrength; - const bool sampleTransmission = curand_uniform(rs) > reflectProb; + const float transmitProb = transmitStrength / combinedStrength; + const float diffuseProb = diffuseStrength / combinedStrength; + + const float u = curand_uniform(rs); + if (u < reflectProb) { + if (Lrefl.z <= 0.0f) + return NextRay{N, vec3(0.0f)}; + const float NdotL = Lrefl.z; + const float G1 = smithG1GGX(NdotV, alpha2); + const float G2 = smithG2GGX(NdotV, NdotL, alpha2); + const vec3 Lworld = normalize(toWorld * Lrefl); + const vec3 weight = reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) + * clearcoatExitAttn(Lworld) / fmaxf(reflectProb, 1e-8f); + return NextRay{Lworld, weight}; + } - if (sampleTransmission) { + if (u < reflectProb + transmitProb) { const float NdotL = -Ltrans.z; // L points through the surface. const float G1 = smithG1GGX(NdotV, alpha2); const float G2 = smithG2GGX(NdotV, NdotL, alpha2); + const vec3 Lworld = normalize(toWorld * Ltrans); const vec3 weight = transmitEnergy * (G2 / fmaxf(G1, 1e-8f)) - / fmaxf(1.0f - reflectProb, 1e-8f); - return NextRay{normalize(toWorld * Ltrans), - weight, - NEXT_RAY_CONTINUES_THROUGH_SURFACE}; + * clearcoatExitAttn(Lworld) / fmaxf(transmitProb, 1e-8f); + return NextRay{Lworld, weight, NEXT_RAY_CONTINUES_THROUGH_SURFACE}; } - // Reflection. - if (Lrefl.z <= 0.0f) - return NextRay{N, vec3(0.0f)}; - - const float NdotL = Lrefl.z; - const float G1 = smithG1GGX(NdotV, alpha2); - const float G2 = smithG2GGX(NdotV, NdotL, alpha2); + // Diffuse: sample around the shading normal so pdf=cos/pi matches the BRDF's + // NdotL (same axis as shadeSurface's diffuse term). Cos and pdf cancel, + // leaving only the energy term and the lobe-pick divisor. + const vec3 wi = sampleHemisphere(*rs, N); const vec3 weight = - reflectEnergy * (G2 / fmaxf(G1, 1e-8f)) / fmaxf(reflectProb, 1e-8f); - return NextRay{normalize(toWorld * Lrefl), weight}; + diffuseEnergy * clearcoatExitAttn(wi) / fmaxf(diffuseProb, 1e-8f); + return NextRay{wi, weight}; } diff --git a/devices/rtx/device/optix_visrtx.h b/devices/rtx/device/optix_visrtx.h index 60c330dab..2f6082ad7 100644 --- a/devices/rtx/device/optix_visrtx.h +++ b/devices/rtx/device/optix_visrtx.h @@ -205,7 +205,9 @@ struct DeviceGlobalState : public helium::BaseGlobalDeviceState struct ObjectUpdates { - helium::TimeStamp lastBLASChange{0}; + helium::TimeStamp lastSurfaceBLASChange{0}; + helium::TimeStamp lastVolumeBLASChange{0}; + helium::TimeStamp lastLightSetChange{0}; helium::TimeStamp lastTLASChange{0}; } objectUpdates; diff --git a/devices/rtx/device/renderer/Interactive_ptx.cu b/devices/rtx/device/renderer/Interactive_ptx.cu index c75d0a0ad..2320e3a23 100644 --- a/devices/rtx/device/renderer/Interactive_ptx.cu +++ b/devices/rtx/device/renderer/Interactive_ptx.cu @@ -87,17 +87,18 @@ struct InteractiveShadingPolicy * materialEvaluateTint(shadingState); // Handle all lights contributions + const vec3 shadowOrigin = shadingHitpoint(hit) + hit.Ng * hit.epsilon; for (size_t i = 0; i < world.numLightInstances; i++) { const auto &light = world.lightInstances[i]; const auto lightSample = - sampleLight(ss, hit.hitpoint, light.lightIndex, light.xfm); + sampleLight(ss, shadowOrigin, light.lightIndex, light.xfm); if (lightSample.pdf == 0.0f) continue; // Shadowing const Ray shadowRay = { - hit.hitpoint + hit.Ng * hit.epsilon, + shadowOrigin, lightSample.dir, {hit.epsilon, lightSample.dist}, }; @@ -154,7 +155,7 @@ struct InteractiveShadingPolicy } } - return vec4(contrib, evaluateOpacity(shadingState)); + return vec4(contrib, materialEvaluateOpacity(shadingState)); } }; @@ -181,7 +182,7 @@ VISRTX_GLOBAL void __anyhit__shadow() MaterialShadingState shadingState; materialInitShading(&shadingState, frameData, *hit.material, hit); - auto opacity = evaluateOpacity(shadingState); + auto opacity = materialEvaluateOpacity(shadingState); auto &o = ray::rayData(); diff --git a/devices/rtx/device/renderer/Quality_ptx.cu b/devices/rtx/device/renderer/Quality_ptx.cu index fe319255c..652b0e90c 100644 --- a/devices/rtx/device/renderer/Quality_ptx.cu +++ b/devices/rtx/device/renderer/Quality_ptx.cu @@ -153,11 +153,14 @@ VISRTX_DEVICE LightSample sampleLights(ScreenSample &ss, // last index is reserved for ambient light if it exists if (selectedIdx == world.numLightInstances) { const auto &rendererParams = frameData.renderer; + // Fold the hemisphere-sample pdf cos(theta)/pi with the uniform light pick. + const vec3 dir = sampleHemisphere(ss.rs, normal); + const float cosNs = fmaxf(0.f, dot(dir, normal)); return LightSample{ rendererParams.ambientColor * rendererParams.ambientIntensity, - sampleHemisphere(ss.rs, normal), + dir, std::numeric_limits::max(), - lightPickPdf / (2.0f * float(M_PI)), + lightPickPdf * cosNs * float(M_1_PI), }; } else { const auto &lightInstance = world.lightInstances[selectedIdx]; @@ -359,13 +362,22 @@ VISRTX_GLOBAL void __raygen__() } sample.color += sampleContribution * materialEmission * materialOpacity; + // Sample around the shading normal so the cosine-weighted hemisphere's + // pdf matches the BRDF's NdotL (which uses Ns). Sampling around Ng + // would bias the Lambertian estimator by cos_Ns/cos_Ng on smooth or + // bump-mapped surfaces. + const vec3 shadowOrigin = + shadingHitpoint(surfaceHit) + surfaceHit.Ng * surfaceHit.epsilon; LightSample lightSample = - sampleLights(ss, frameData, surfaceHit.hitpoint, surfaceHit.Ng); + sampleLights(ss, frameData, shadowOrigin, surfaceHit.Ns); if (lightSample.pdf >= ATTENUATION_EPSILON && lightSample.dist > 0.0f) { - const float lightDotNg = dot(lightSample.dir, surfaceHit.Ng); - if (lightDotNg > 0.0f) { + // Gate on the shading normal so the terminator follows the smooth + // surface; gating on Ng would carve the per-triangle facet shape + // into the lit/unlit boundary at grazing light angles. + const float lightDotNs = dot(lightSample.dir, surfaceHit.Ns); + if (lightDotNs > 0.0f) { const Ray shadowRay = { - surfaceHit.hitpoint + surfaceHit.Ng * surfaceHit.epsilon, + shadowOrigin, lightSample.dir, {surfaceHit.epsilon, lightSample.dist}, }; diff --git a/devices/rtx/device/spatial_field/SpatialField.cpp b/devices/rtx/device/spatial_field/SpatialField.cpp index b0404b473..2aba5be38 100644 --- a/devices/rtx/device/spatial_field/SpatialField.cpp +++ b/devices/rtx/device/spatial_field/SpatialField.cpp @@ -49,7 +49,7 @@ SpatialField::SpatialField(DeviceGlobalState *s) void SpatialField::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp(); } SpatialField *SpatialField::createInstance( diff --git a/devices/rtx/device/surface/Surface.cpp b/devices/rtx/device/surface/Surface.cpp index 5458b2d79..13d9f865f 100644 --- a/devices/rtx/device/surface/Surface.cpp +++ b/devices/rtx/device/surface/Surface.cpp @@ -63,7 +63,7 @@ void Surface::finalize() void Surface::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastSurfaceBLASChange = helium::newTimeStamp(); } bool Surface::isValid() const diff --git a/devices/rtx/device/volume/Volume.cpp b/devices/rtx/device/volume/Volume.cpp index 10ce94912..57d10933d 100644 --- a/devices/rtx/device/volume/Volume.cpp +++ b/devices/rtx/device/volume/Volume.cpp @@ -51,7 +51,7 @@ void Volume::commitParameters() void Volume::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + deviceState()->objectUpdates.lastVolumeBLASChange = helium::newTimeStamp(); } bool Volume::isVisible() const diff --git a/devices/rtx/device/world/Group.cpp b/devices/rtx/device/world/Group.cpp index ed2a54526..7a5ef1584 100644 --- a/devices/rtx/device/world/Group.cpp +++ b/devices/rtx/device/world/Group.cpp @@ -102,7 +102,11 @@ void Group::finalize() void Group::markFinalized() { Object::markFinalized(); - deviceState()->objectUpdates.lastBLASChange = helium::newTimeStamp(); + auto &updates = deviceState()->objectUpdates; + const auto now = helium::newTimeStamp(); + updates.lastSurfaceBLASChange = now; + updates.lastVolumeBLASChange = now; + updates.lastLightSetChange = now; } OptixTraversableHandle Group::optixTraversableTriangle() const @@ -194,7 +198,8 @@ const std::vector &Group::lights() const void Group::rebuildSurfaceBVHs() { const auto &state = *deviceState(); - if (state.objectUpdates.lastBLASChange < m_objectUpdates.lastSurfaceBVHBuilt) + if (state.objectUpdates.lastSurfaceBLASChange + < m_objectUpdates.lastSurfaceBVHBuilt) return; partitionValidGeometriesByType(); @@ -249,10 +254,16 @@ void Group::rebuildSurfaceBVHs() void Group::rebuildVolumeBVH() { + const auto &state = *deviceState(); + if (state.objectUpdates.lastVolumeBLASChange + < m_objectUpdates.lastVolumeBVHBuilt) + return; + partitionValidVolumes(); if (m_volumes.empty()) { m_volumeBounds = box3(); m_traversableVolume = {}; + m_objectUpdates.lastVolumeBVHBuilt = helium::newTimeStamp(); reportMessage( ANARI_SEVERITY_DEBUG, "visrtx::Group skipping volume BVH build"); return; @@ -272,6 +283,10 @@ void Group::rebuildVolumeBVH() void Group::rebuildLights() { + const auto &state = *deviceState(); + if (state.objectUpdates.lastLightSetChange < m_objectUpdates.lastLightRebuild) + return; + partitionValidLights(); buildLightGPUData(); m_objectUpdates.lastLightRebuild = helium::newTimeStamp(); diff --git a/devices/rtx/device/world/World.cpp b/devices/rtx/device/world/World.cpp index 90c70ef89..93d05c961 100644 --- a/devices/rtx/device/world/World.cpp +++ b/devices/rtx/device/world/World.cpp @@ -203,7 +203,11 @@ void World::rebuildWorld() { const auto &state = *deviceState(); - if (state.objectUpdates.lastBLASChange >= m_objectUpdates.lastBLASCheck) { + const auto &updates = state.objectUpdates; + const auto lastCheck = m_objectUpdates.lastBLASCheck; + if (updates.lastSurfaceBLASChange >= lastCheck + || updates.lastVolumeBLASChange >= lastCheck + || updates.lastLightSetChange >= lastCheck) { m_objectUpdates.lastTLASBuild = 0; // BLAS changed, so need to build TLAS rebuildBLASs(); }