From cdf6537d27c004f12e199067ce4fcb3cf0f7215a Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Tue, 16 Apr 2024 20:44:21 -0700
Subject: [PATCH 01/18] Add cmake package manager, CPM

---
 cmake/CPM.cmake | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 cmake/CPM.cmake

diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
new file mode 100644
index 00000000..d0fd0e8e
--- /dev/null
+++ b/cmake/CPM.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: MIT
+#
+# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
+
+set(CPM_DOWNLOAD_VERSION 0.39.0)
+set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef")
+
+if(CPM_SOURCE_CACHE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+# Expand relative path. This is important if the provided path contains a tilde (~)
+get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+
+file(DOWNLOAD
+     https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+     ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
+)
+
+include(${CPM_DOWNLOAD_LOCATION})

From 21f5b34847688031d267638eaf04b73cccc584fc Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Wed, 17 Apr 2024 12:19:23 -0700
Subject: [PATCH 02/18] Add cuColletions with priority queue

---
 CMakeLists.txt | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9f53e03..3692b227 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,22 @@ FetchContent_Declare(cereal
 )
 FetchContent_Populate(cereal)
 
+# Package Management
+# TODO: Consider using CPM for the various libraries above 
+include(cmake/CPM.cmake) 
+
+# Add cuCollection with priority queue. This should eventually come from
+# NVIDIA.
+CPMAddPackage(
+    NAME cuco
+    GITHUB_REPOSITORY andrewbriand/cuCollections
+    GIT_TAG d58dd9fedde721a264c8ae960f7393a3a3b08c58
+    OPTIONS
+       "BUILD_TESTS OFF"
+       "BUILD_BENCHMARKS OFF"
+       "BUILD_EXAMPLES OFF"
+)
+
 # Auto-detect GPU architecture
 include("cmake/AutoDetectCudaArch.cmake")
 

From f02f3512ad688421eeba73772136fc7204b94dd2 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Wed, 17 Apr 2024 12:53:53 -0700
Subject: [PATCH 03/18] Add initial shortest edge collapse with priority queue.

This currently includes simple functionality to demonstrate bulk build
and pop via host interface.
---
 apps/CMakeLists.txt             |  1 +
 apps/SECPriority/CMakeLists.txt | 38 +++++++++++++
 apps/SECPriority/main.cu        | 96 +++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)
 create mode 100644 apps/SECPriority/CMakeLists.txt
 create mode 100644 apps/SECPriority/main.cu

diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index bc94993e..b6380c68 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -8,4 +8,5 @@ add_subdirectory( XPBD )
 #add_subdirectory( Simplification )
 add_subdirectory( ShortestEdgeCollapse )
 add_subdirectory( Remesh )
+add_subdirectory( SECPriority )
 
diff --git a/apps/SECPriority/CMakeLists.txt b/apps/SECPriority/CMakeLists.txt
new file mode 100644
index 00000000..930ef76e
--- /dev/null
+++ b/apps/SECPriority/CMakeLists.txt
@@ -0,0 +1,38 @@
+add_executable(SECPriority)
+
+set(SOURCE_LIST
+    main.cu
+)
+
+set(COMMON_LIST    
+    ../common/openmesh_trimesh.h
+	../common/openmesh_report.h
+)
+
+target_sources(SECPriority 
+    PRIVATE
+    ${SOURCE_LIST} ${COMMON_LIST}
+)
+
+if (WIN32)
+    target_compile_definitions(SECPriority
+      PRIVATE _USE_MATH_DEFINES 
+      PRIVATE NOMINMAX
+      PRIVATE _CRT_SECURE_NO_WARNINGS)
+endif()
+
+set_target_properties(SECPriority PROPERTIES FOLDER "apps")
+
+set_property(TARGET SECPriority PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
+source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "SECPriority" FILES ${SOURCE_LIST})
+
+target_link_libraries(SECPriority     
+    PRIVATE RXMesh
+    PRIVATE gtest_main
+	PRIVATE OpenMeshCore
+    PRIVATE OpenMeshTools
+    PRIVATE cuco
+)
+
+#gtest_discover_tests( SECPriority )
\ No newline at end of file
diff --git a/apps/SECPriority/main.cu b/apps/SECPriority/main.cu
new file mode 100644
index 00000000..e6062fd6
--- /dev/null
+++ b/apps/SECPriority/main.cu
@@ -0,0 +1,96 @@
+#include <cuco/priority_queue.cuh>
+#include <cuco/detail/pair.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include <cooperative_groups.h>
+#include <cuda_runtime.h>
+
+#include <map>
+#include <vector>
+
+#include <iostream>
+#include <random>
+
+using namespace cuco;
+namespace cg = cooperative_groups;
+
+// grab some bits from priority queue tests and benchmarks
+
+// -- simulate reading the mesh, computing edge length
+// -- cuco:pair<float, uint32_t>
+//
+// setup pair_less template
+//
+// setup device function to pop items from queue
+//
+
+template <typename T>
+struct pair_less 
+{
+    __host__ __device__ bool operator()(const T& a, const T& b) const
+    {
+        return a.first < b.first;
+    }
+};
+
+template <typename PairType, typename OutputIt>
+void generate_kv_pairs_uniform(OutputIt output_begin, OutputIt output_end)
+{
+  std::random_device rd;
+  std::mt19937 gen{rd()};
+
+  const auto num_keys = std::distance(output_begin, output_end);
+  for(auto i = 0; i < num_keys; i++)
+  {
+    output_begin[i] = {static_cast<typename PairType::first_type>(gen()),
+                       static_cast<typename PairType::second_type>(i)};
+  }
+}
+
+void sp_pair()
+{
+  // Setup the cuco::priority_queue
+  const size_t insertion_size = 200;
+  const size_t deletion_size  = 100;
+  using PairType              = cuco::pair<float, uint32_t>;
+  using Compare               = pair_less<PairType>;
+
+  cuco::priority_queue<PairType, Compare> pq(insertion_size);
+
+  // Generate data for the queue
+  std::vector<PairType> h_pairs(insertion_size);
+  generate_kv_pairs_uniform<PairType>(h_pairs.begin(), h_pairs.end());
+
+  for(auto i = 0; i < h_pairs.size(); i++)
+  {
+    std::cout << "Priority: " << h_pairs[i].first 
+              << "\tID: " << h_pairs[i].second << "\n";
+  }
+
+  // Fill the priority queue
+  thrust::device_vector<PairType> d_pairs(h_pairs);
+  pq.push(d_pairs.begin(), d_pairs.end());
+  cudaDeviceSynchronize();
+
+  // Pop the priority queue
+  thrust::device_vector<PairType> d_popped(deletion_size);
+  pq.pop(d_popped.begin(), d_popped.end());
+  cudaDeviceSynchronize();
+
+  std::cout << "-----After Pop-----\n";
+  thrust::host_vector<PairType> h_popped(d_popped);
+  for(auto i = 0; i < h_popped.size(); i++)
+  {
+    std::cout << "Priority: " << h_popped[i].first 
+              << "\tID: " << h_popped[i].second << "\n";
+  }
+}
+
+int main(int argc, char* argv[])
+{
+  sp_pair();
+
+  return 0;
+}
\ No newline at end of file

From b625753e9293f62da1005698abe66c689ee52cd4 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Thu, 18 Apr 2024 16:31:50 -0700
Subject: [PATCH 04/18] Initial rxmesh priority queue-based edge collapse

---
 apps/SECPriority/CMakeLists.txt |  3 +-
 apps/SECPriority/secp.cu        | 91 +++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+), 1 deletion(-)
 create mode 100644 apps/SECPriority/secp.cu

diff --git a/apps/SECPriority/CMakeLists.txt b/apps/SECPriority/CMakeLists.txt
index 930ef76e..e47f7ab5 100644
--- a/apps/SECPriority/CMakeLists.txt
+++ b/apps/SECPriority/CMakeLists.txt
@@ -1,7 +1,8 @@
 add_executable(SECPriority)
 
 set(SOURCE_LIST
-    main.cu
+    #main.cu
+    secp.cu
 )
 
 set(COMMON_LIST    
diff --git a/apps/SECPriority/secp.cu b/apps/SECPriority/secp.cu
new file mode 100644
index 00000000..a078ee8a
--- /dev/null
+++ b/apps/SECPriority/secp.cu
@@ -0,0 +1,91 @@
+#include "gtest/gtest.h"
+#include "rxmesh/util/log.h"
+#include "rxmesh/util/macros.h"
+#include "rxmesh/util/util.h"
+
+#include "rxmesh/rxmesh_dynamic.h"
+
+struct arg
+{
+    std::string obj_file_name = STRINGIFY(INPUT_DIR) "dragon.obj";
+    std::string output_folder = STRINGIFY(OUTPUT_DIR);
+    uint32_t    target        = 0;
+    uint32_t    device_id     = 0;
+    char**      argv;
+    int         argc;
+} Arg;
+
+//#include "secp_rxmesh.cuh"
+
+TEST(Apps, SECPriority)
+{
+    using namespace rxmesh;
+
+    // Select device
+    cuda_query(Arg.device_id);
+
+    RXMeshDynamic rx(Arg.obj_file_name);
+    rx.save(STRINGIFY(OUTPUT_DIR) + extract_file_name(Arg.obj_file_name) +
+           "_patches");
+
+    // RXMeshDynamic rx(Arg.obj_file_name,
+    //                  STRINGIFY(OUTPUT_DIR) +
+    //                      extract_file_name(Arg.obj_file_name) + "_patches",
+    //                  true);
+
+    ASSERT_TRUE(rx.is_edge_manifold());
+
+//    secp_rxmesh(rx, Arg.target);
+}
+
+
+int main(int argc, char** argv)
+{
+    using namespace rxmesh;
+    Log::init();
+
+    ::testing::InitGoogleTest(&argc, argv);
+    Arg.argv = argv;
+    Arg.argc = argc;
+
+
+    if (argc > 1) {
+        if (cmd_option_exists(argv, argc + argv, "-h")) {
+            // clang-format off
+            RXMESH_INFO("\nUsage: SECPriority.exe < -option X>\n"
+                        " -h:          Display this massage and exit\n"
+                        " -input:      Input file. Input file should be under the input/ subdirectory\n"
+                        "              Default is {} \n"
+                        "              Hint: Only accept OBJ files\n"
+                        " -target:     The final/target number of faces in the output mesh\n"
+                        " -o:          JSON file output folder. Default is {} \n"
+                        " -device_id:  GPU device ID. Default is {}",
+            Arg.obj_file_name, Arg.output_folder, Arg.device_id);
+            // clang-format on
+            exit(EXIT_SUCCESS);
+        }
+
+        if (cmd_option_exists(argv, argc + argv, "-input")) {
+            Arg.obj_file_name =
+                std::string(get_cmd_option(argv, argv + argc, "-input"));
+        }
+        if (cmd_option_exists(argv, argc + argv, "-o")) {
+            Arg.output_folder =
+                std::string(get_cmd_option(argv, argv + argc, "-o"));
+        }
+        if (cmd_option_exists(argv, argc + argv, "-device_id")) {
+            Arg.device_id =
+                atoi(get_cmd_option(argv, argv + argc, "-device_id"));
+        }
+        if (cmd_option_exists(argv, argc + argv, "-target")) {
+            Arg.target = false;
+        }
+    }
+
+    RXMESH_TRACE("input= {}", Arg.obj_file_name);
+    RXMESH_TRACE("output_folder= {}", Arg.output_folder);
+    RXMESH_TRACE("device_id= {}", Arg.device_id);
+    RXMESH_TRACE("target= {}", Arg.target);
+
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file

From 1c0a4ba9479cce5a1f9177b6b51bd92e089a6b63 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Mon, 6 May 2024 21:48:13 -0700
Subject: [PATCH 05/18] ignore build_debug directory

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index cff3e42b..8d612cc7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ input/*
 !input/plane_5.obj
 !input/sphere1.obj
 build/
+build_debug/
 include/rxmesh/util/git_sha1.cpp
 .vscode/
-scripts/*.log
\ No newline at end of file
+scripts/*.log

From f825a1d80669d993a120574049c4d3bb088017be Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Mon, 6 May 2024 22:03:40 -0700
Subject: [PATCH 06/18] Add initial cuCollections priority queue

Currently adds the pair<edge_length, local_edge_id> using
the priority queue device-side api.
---
 CMakeLists.txt                    |   2 +-
 apps/SECPriority/CMakeLists.txt   |   2 +
 apps/SECPriority/secp.cu          |   4 +-
 apps/SECPriority/secp_kernels.cuh | 380 ++++++++++++++++++++++++++++++
 apps/SECPriority/secp_rxmesh.cuh  | 316 +++++++++++++++++++++++++
 5 files changed, 701 insertions(+), 3 deletions(-)
 create mode 100644 apps/SECPriority/secp_kernels.cuh
 create mode 100644 apps/SECPriority/secp_rxmesh.cuh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3692b227..38888200 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -149,7 +149,7 @@ set(cxx_flags
 
 set(MSVC_XCOMPILER_FLAGS "/openmp:experimental /MP /std:c++17 /Zi")
 set(cuda_flags
-    -Xcompiler=$<$<CXX_COMPILER_ID:GNU>:-Wall -fopenmp -O3 -Wno-unused-function>
+    -Xcompiler=$<$<CXX_COMPILER_ID:GNU>:-rdynamic -Wall -fopenmp -O3 -Wno-unused-function>
 	-Xcompiler=$<$<CXX_COMPILER_ID:MSVC>:${MSVC_XCOMPILER_FLAGS}>
     #Disables warning
     #177-D "function XXX was declared but never referenced"
diff --git a/apps/SECPriority/CMakeLists.txt b/apps/SECPriority/CMakeLists.txt
index e47f7ab5..4f391430 100644
--- a/apps/SECPriority/CMakeLists.txt
+++ b/apps/SECPriority/CMakeLists.txt
@@ -3,6 +3,8 @@ add_executable(SECPriority)
 set(SOURCE_LIST
     #main.cu
     secp.cu
+	secp_rxmesh.cuh
+	secp_kernels.cuh
 )
 
 set(COMMON_LIST    
diff --git a/apps/SECPriority/secp.cu b/apps/SECPriority/secp.cu
index a078ee8a..1cdd2b2d 100644
--- a/apps/SECPriority/secp.cu
+++ b/apps/SECPriority/secp.cu
@@ -15,7 +15,7 @@ struct arg
     int         argc;
 } Arg;
 
-//#include "secp_rxmesh.cuh"
+#include "secp_rxmesh.cuh"
 
 TEST(Apps, SECPriority)
 {
@@ -35,7 +35,7 @@ TEST(Apps, SECPriority)
 
     ASSERT_TRUE(rx.is_edge_manifold());
 
-//    secp_rxmesh(rx, Arg.target);
+    secp_rxmesh(rx, Arg.target);
 }
 
 
diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
new file mode 100644
index 00000000..1fc2ca39
--- /dev/null
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -0,0 +1,380 @@
+#pragma once
+#include "rxmesh/cavity_manager.cuh"
+
+#include <cooperative_groups.h>
+#include <cuda_runtime.h>
+
+template <typename T, uint32_t blockThreads>
+__global__ static void secp(rxmesh::Context                   context,
+                            rxmesh::VertexAttribute<T>        coords,
+                           // const CostHistogram<T>            histo,
+                            const int                         reduce_threshold,
+                            rxmesh::EdgeAttribute<EdgeStatus> edge_status,
+                            rxmesh::EdgeAttribute<T>          e_attr,
+                            int*                              d_num_cavities)
+{
+    using namespace rxmesh;
+    auto           block = cooperative_groups::this_thread_block();
+    ShmemAllocator shrd_alloc;
+    CavityManager<blockThreads, CavityOp::EV> cavity(
+        block, context, shrd_alloc, true);
+
+    const uint32_t pid = cavity.patch_id();
+
+    if (pid == INVALID32) {
+        return;
+    }
+
+    // we first use this mask to set the edge we want to collapse (and then
+    // filter them). Then after cavity.prologue, we reuse this bitmask to mark
+    // the newly added edges
+    Bitmask edge_mask(cavity.patch_info().edges_capacity[0], shrd_alloc);
+
+    // we use this bitmask to mark the other end of to-be-collapse edge during
+    // checking for the link condition
+    Bitmask v0_mask(cavity.patch_info().num_vertices[0], shrd_alloc);
+    Bitmask v1_mask(cavity.patch_info().num_vertices[0], shrd_alloc);
+
+
+    // Precompute EV
+    Query<blockThreads> ev_query(context, pid);
+    ev_query.prologue<Op::EV>(block, shrd_alloc);
+    block.sync();
+
+    // 1) mark edge we want to collapse
+    for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
+        assert(eh.local_id() < cavity.patch_info().num_edges[0]);
+
+        if (edge_status(eh) != UNSEEN) {
+            return;
+        }
+        const VertexIterator iter =
+            ev_query.template get_iterator<VertexIterator>(eh.local_id());
+
+        const VertexHandle v0 = iter[0];
+        const VertexHandle v1 = iter[1];
+
+        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
+        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
+
+        T len2 = glm::distance2(p0, p1);
+
+        /*if (histo.get_bin(len2) <= reduce_threshold) {
+            //::atomicAdd(d_num_cavities + 1, 1);
+            // cavity.create(eh);
+            edge_mask.set(eh.local_id(), true);
+        }*/
+    });
+    block.sync();
+
+
+    // 2) check edge link condition. Here, for each edge marked in edge_mask,
+    // all threads in the block collaborate to check the edge link condition of
+    // this edge
+    __shared__ int s_num_shared_one_ring;
+    for (uint16_t e = 0; e < edge_mask.size(); ++e) {
+
+        if (edge_mask(e)) {
+            // the edge two end vertices
+            const VertexIterator iter =
+                ev_query.template get_iterator<VertexIterator>(e);
+
+            const uint16_t v0 = iter.local(0);
+            const uint16_t v1 = iter.local(1);
+
+            if (threadIdx.x == 0) {
+                s_num_shared_one_ring = 0;
+            }
+
+            v0_mask.reset(block);
+            v1_mask.reset(block);
+            block.sync();
+
+            // each thread will be assigned to an edge (including not-owned one)
+            // and mark in v0_mask/v1_mask if one of its two ends are v0/v1
+            for_each_edge(
+                cavity.patch_info(),
+                [&](EdgeHandle eh) {
+                    if (eh.local_id() == e) {
+                        return;
+                    }
+                    const VertexIterator v_iter =
+                        ev_query.template get_iterator<VertexIterator>(
+                            eh.local_id());
+
+                    const uint16_t vv0 = v_iter.local(0);
+                    const uint16_t vv1 = v_iter.local(1);
+
+
+                    if (vv0 == v0) {
+                        v0_mask.set(vv1, true);
+                    }
+                    if (vv0 == v1) {
+                        v1_mask.set(vv1, true);
+                    }
+
+                    if (vv1 == v0) {
+                        v0_mask.set(vv0, true);
+                    }
+                    if (vv1 == v1) {
+                        v1_mask.set(vv0, true);
+                    }
+                },
+                true);
+            block.sync();
+
+            for (int v = threadIdx.x; v < v0_mask.size(); v += blockThreads) {
+                if (v0_mask(v) && v1_mask(v)) {
+                    ::atomicAdd(&s_num_shared_one_ring, 1);
+                }
+            }
+
+            block.sync();
+            if (s_num_shared_one_ring > 2) {
+                edge_mask.reset(e, true);
+            }
+        }
+    }
+    block.sync();
+
+    for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
+        assert(eh.local_id() < cavity.patch_info().num_edges[0]);
+        if (edge_mask(eh.local_id())) {
+            cavity.create(eh);
+        } else {
+            edge_status(eh) = OKAY;
+        }
+    });
+    block.sync();
+
+    ev_query.epilogue(block, shrd_alloc);
+
+    // create the cavity
+    if (cavity.prologue(block, shrd_alloc, coords, edge_status, e_attr)) {
+
+        // if (threadIdx.x == 0) {
+        //     uint16_t num_actual_cavities = 0;
+        //     for (int i = 0; i < cavity.m_s_active_cavity_bitmask.size(); ++i)
+        //     {
+        //         if (cavity.m_s_active_cavity_bitmask(i)) {
+        //             num_actual_cavities++;
+        //         }
+        //     }
+        //     ::atomicAdd(d_num_cavities, num_actual_cavities);
+        // }
+        edge_mask.reset(block);
+        block.sync();
+
+        // fill in the cavities
+        cavity.for_each_cavity(block, [&](uint16_t c, uint16_t size) {
+            const EdgeHandle src = cavity.template get_creator<EdgeHandle>(c);
+
+            // TODO handle boundary edges
+
+            VertexHandle v0, v1;
+
+            cavity.get_vertices(src, v0, v1);
+
+            const VertexHandle new_v = cavity.add_vertex();
+
+            if (new_v.is_valid()) {
+
+                coords(new_v, 0) = (coords(v0, 0) + coords(v1, 0)) * 0.5;
+                coords(new_v, 1) = (coords(v0, 1) + coords(v1, 1)) * 0.5;
+                coords(new_v, 2) = (coords(v0, 2) + coords(v1, 2)) * 0.5;
+
+
+                DEdgeHandle e0 =
+                    cavity.add_edge(new_v, cavity.get_cavity_vertex(c, 0));
+
+                e_attr(e0.get_edge_handle())++;
+
+                if (e0.is_valid()) {
+                    edge_mask.set(e0.local_id(), true);
+
+                    const DEdgeHandle e_init = e0;
+
+                    for (uint16_t i = 0; i < size; ++i) {
+                        const DEdgeHandle e = cavity.get_cavity_edge(c, i);
+
+                        const VertexHandle v_end =
+                            cavity.get_cavity_vertex(c, (i + 1) % size);
+
+                        const DEdgeHandle e1 =
+                            (i == size - 1) ?
+                                e_init.get_flip_dedge() :
+                                cavity.add_edge(
+                                    cavity.get_cavity_vertex(c, i + 1), new_v);
+
+                        if (!e1.is_valid()) {
+                            break;
+                        }
+
+                        if (i != size - 1) {
+                            edge_mask.set(e1.local_id(), true);
+                        }
+
+                        const FaceHandle new_f = cavity.add_face(e0, e, e1);
+
+                        if (!new_f.is_valid()) {
+                            break;
+                        }
+                        e0 = e1.get_flip_dedge();
+                    }
+                }
+            }
+        });
+    }
+
+
+    cavity.epilogue(block);
+    block.sync();
+
+    if (cavity.is_successful()) {
+        for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
+            if (edge_mask(eh.local_id())) {
+                edge_status(eh) = ADDED;
+            }
+        });
+    }
+}
+
+//template <typename View, typename InputIt>
+template <typename T, uint32_t blockThreads>
+__global__ static void  compute_edge_priorities(
+    rxmesh::Context                  context,
+    const rxmesh::VertexAttribute<T> coords,
+    PQView_t                         pq_view,
+    size_t                           pq_num_bytes)
+{
+    using namespace rxmesh;
+
+    // shared mem variable
+    // Pair_t my_local_pair_array
+    // index into above
+    // sdp: needs to be the number of edges in the batch,
+    // which is not known at compile time
+    //__shared__ PriorityPair_t intermediatePairs[blockThreads];
+
+//    __shared__ int my_pair_count;
+    // some thread needs to initialize the above to 0
+    // sync
+    //extern __shared__ int shmem[];
+    namespace cg = cooperative_groups;
+    cg::thread_block g = cg::this_thread_block();
+    ShmemAllocator      shrd_alloc;
+
+    Query<blockThreads> query(context);
+    auto intermediatePairs = shrd_alloc.alloc<PriorityPair_t>(query.get_patch_info().num_edges[0]);
+    __shared__ int pair_counter;
+    pair_counter = 0;
+
+/* 05/06
+    char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
+    printf("blockThreads:%u\n", blockThreads);
+    printf("pq_shrd_mem:%p\t\n", (void*)pq_shrd_mem);
+    */
+
+    auto edge_len = [&](const EdgeHandle& eh, const VertexIterator& iter) {
+        const VertexHandle v0 = iter[0];
+        const VertexHandle v1 = iter[1];
+
+        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
+        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
+
+        T len2 = glm::distance2(p0, p1);
+
+        //PriorityPair_t p{(double)len2, (double)len2};
+        //PriorityPair_t p{len2, len2};
+        auto p_e = rxmesh::detail::unpack(eh.unique_id());
+        //printf("e_id:%llu\t, len:%f\n", eh.unique_id(), len2);
+        PriorityPair_t p{len2, p_e.second};
+        //PriorityPair_t p{len2, eh};
+
+        auto val_counter = atomicAdd(&pair_counter, 1);
+        intermediatePairs[val_counter] = p;
+
+        //PriorityPair_t p{len2, eh};
+        //PriorityPair_t p{len2, eh.patch_id()};
+        //PriorityPair_t p{len2, eh.m_handle}; //same as just eh
+        //alignas(64) PriorityPair_t arr_p[1] = {p};
+        //PriorityPair_t p{len2, eh};
+        //PriorityPair_t p{len2, eh.patch_id()};
+        //PriorityPair_t p{len2, eh.m_handle}; //same as just eh
+        //alignas(64) PriorityPair_t arr_p[1] = {p};
+/* 05/06 
+        alignas(16) PriorityPair_t arr_p[1] = {p};
+        //pq_view.push(g, arr_p, *(&arr_p + 1), shmem);
+        pq_view.push(g, arr_p, arr_p + 1, pq_shrd_mem);
+        */
+        //SDP: looks like the push might need to happen outside of this
+        //lambda. I think I need a static array where a thread?/EdgeHandle
+        //can be stored. Then, after the query.dispatch<Op::EV> below,
+        //I can do a pq_view.push(cg, &array[0], &array[size], shmem)
+
+        //Alternatively, this kernel computes the edge_length for EdgeHandle
+        //and saves to a thrust::device_vector<PairType> d_pairs. Given an EdgeHandle,
+        //what's my index into the device_vector?
+        //Then, call pq_view.push(d_pairs.begin(), d_pairs.end()) from host side.
+//SDP remove for build        auto my_pair = {len2, eh};
+//SDP remove for build until can revisit        pq_view.push(cg, &my_pair[0], &my_pair[0]+1, <temp_storage_needed>);
+
+//SDP remove for build        pq_view.push();
+        //atomicMin(histo.min_value(), len2);
+        //atomicMax(histo.max_value(), len2);
+//        int old_value = atomicAdd(&my_pair_count, 1);
+    };
+
+    auto block = cooperative_groups::this_thread_block();
+
+    //ShmemAllocator      shrd_alloc;
+
+    // need to account for this when the kernel is launched
+//SDP remove for build    PriorityPair_t* my_local_array = shrd_alloc.alloc<PriorityPair_t>(query.patch_info().m_edge_capacity[0]);
+    // allocate memory for the priority_queue and keep that pointer to pass
+    query.dispatch<Op::EV>(block, shrd_alloc, edge_len);
+    // sync just in case for now
+    // actually do the push to the queue
+    block.sync();
+    //if(block.thread_rank() == 0) {
+     //  printf("iterators: begin = %p\t end = %p\n", (void*)intermediatePairs, (void*)(intermediatePairs + blockThreads));
+
+       char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
+      // printf("blockThreads:%u\n", blockThreads);
+      // printf("pq_shrd_mem:%p\t\n", (void*)pq_shrd_mem);
+      // for(size_t i = 0; i < blockThreads; i++) {
+      //  printf("imp:%d\t%f\t%u\t%p\n", i, intermediatePairs[i].first, intermediatePairs[i].second, &(intermediatePairs[i]));
+      // }
+       pq_view.push(block, intermediatePairs, intermediatePairs + pair_counter, pq_shrd_mem);
+       //shrd_alloc.dealloc(pq_num_bytes);
+    //}
+}
+
+/*
+template <typename T, uint32_t blockThreads>
+__global__ static void populate_histogram(
+    rxmesh::Context                  context,
+    const rxmesh::VertexAttribute<T> coords,
+    CostHistogram<T>                 histo)
+{
+    using namespace rxmesh;
+
+    auto edge_len = [&](const EdgeHandle& eh, const VertexIterator& iter) {
+        const VertexHandle v0 = iter[0];
+        const VertexHandle v1 = iter[1];
+
+        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
+        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
+
+        T len2 = glm::distance2(p0, p1);
+
+        histo.insert(len2);
+    };
+
+    auto block = cooperative_groups::this_thread_block();
+
+    Query<blockThreads> query(context);
+    ShmemAllocator      shrd_alloc;
+    query.dispatch<Op::EV>(block, shrd_alloc, edge_len);
+}
+*/
\ No newline at end of file
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
new file mode 100644
index 00000000..c1d7eee3
--- /dev/null
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -0,0 +1,316 @@
+#pragma once
+
+#define GLM_ENABLE_EXPERIMENTAL
+#include <glm/glm.hpp>
+#include <glm/gtx/norm.hpp>
+
+
+#include "rxmesh/query.cuh"
+#include "rxmesh/rxmesh_dynamic.h"
+
+// Priority Queue related includes
+#include <cuco/priority_queue.cuh>
+#include <cuco/detail/pair.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+// Priority queue setup. Use 'pair_less' to prioritize smaller values.
+template <typename T>
+struct pair_less 
+{
+    __host__ __device__ bool operator()(const T& a, const T& b) const
+    {
+        return a.first < b.first;
+    }
+};
+//using PriorityPair_t        = cuco::pair<float, float>;
+//using PriorityPair_t        = cuco::pair<float, rxmesh::EdgeHandle>;
+
+using PriorityPair_t        = cuco::pair<float, uint32_t>;
+
+//using PriorityPair_t        = cuco::pair<float, uint64_t>; 
+//using PriorityPair_t        = cuco::pair<double, double>; 
+using PriorityCompare       = pair_less<PriorityPair_t>;
+using PriorityQueue_t       = cuco::priority_queue<PriorityPair_t, PriorityCompare>;
+using PQView_t              = PriorityQueue_t::device_mutable_view;
+
+
+template <typename T>
+using Vec3 = glm::vec<3, T, glm::defaultp>;
+
+using EdgeStatus = int8_t;
+enum : EdgeStatus
+{
+    UNSEEN = 0,  // means we have not tested it before for e.g., split/flip/col
+    OKAY   = 1,  // means we have tested it and it is okay to skip
+    UPDATE = 2,  // means we should update it i.e., we have tested it before
+    ADDED  = 3,  // means it has been added to during the split/flip/collapse
+};
+
+//#include "histogram.cuh"
+#include "secp_kernels.cuh"
+
+
+inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
+                       const uint32_t         final_num_faces)
+{
+    //EXPECT_TRUE(rx.validate());
+//return;
+    using namespace rxmesh;
+    constexpr uint32_t blockThreads = 32;
+
+
+    auto coords = rx.get_input_vertex_coordinates();
+
+    auto edge_status = rx.add_edge_attribute<EdgeStatus>("EdgeStatus", 1);
+
+    LaunchBox<blockThreads> launch_box;
+
+    float total_time   = 0;
+    float app_time     = 0;
+    float slice_time   = 0;
+    float cleanup_time = 0;
+
+    const int num_bins = 256;
+
+//    CostHistogram<float> histo(num_bins);
+    PriorityQueue_t pq(rx.get_num_edges());
+
+    auto e_attr = rx.add_edge_attribute<float>("eMark", 1);
+
+
+#if USE_POLYSCOPE
+    rx.render_vertex_patch();
+    rx.render_edge_patch();
+    rx.render_face_patch();
+    // polyscope::show();
+#endif
+
+    bool validate = false;
+
+    int* d_num_cavities = nullptr;
+    CUDA_ERROR(cudaMalloc((void**)&d_num_cavities, 2 * sizeof(int)));
+    CUDA_ERROR(cudaMemset(d_num_cavities, 0, 2 * sizeof(int)));
+
+    float reduce_ratio = 0.05;
+
+    // Priority Queue Setup
+    //
+
+    //
+    // Priority Queue Setup End
+
+    CUDA_ERROR(cudaProfilerStart());
+    GPUTimer timer;
+    timer.start();
+    while (rx.get_num_faces() > final_num_faces) {
+
+    rx.prepare_launch_box({Op::EV},
+                          launch_box,
+                          (void*)compute_edge_priorities<float, blockThreads>,
+                          false,
+                          false,
+                          false,
+                          false,
+                          [&](uint32_t v, uint32_t e, uint32_t f){
+                            // Allocate enough additional memory
+                            // for the priority queue.
+                            // SDP: not enough info here to
+                            //  determine the memory the
+                            //  priority queue needs. Probably don't
+                            //  want to use this. 
+                            //  See below.
+                            //return 0;//e*sizeof(PriorityPair_t);
+                            return pq.get_shmem_size(blockThreads) + (e*sizeof(PriorityPair_t));
+                          });
+    //launch_box.smem_bytes_dyn += pq.get_shmem_size(blockThreads);
+
+    RXMESH_TRACE("hello serban 11111111\n");
+    RXMESH_TRACE("pair_alignment<float, rxmesh::EdgeHandle>(){}", cuco::detail::pair_alignment<float, rxmesh::EdgeHandle>());
+    RXMESH_TRACE("pair_alignment<float, uint32_t>(){}", cuco::detail::pair_alignment<float, uint32_t>());
+    RXMESH_TRACE("pair_alignment<float, uint64_t>(){}", cuco::detail::pair_alignment<float, uint64_t>());
+    RXMESH_TRACE("pair_alignment<float, float>(){}", cuco::detail::pair_alignment<float, float>());
+    RXMESH_TRACE("pair_alignment<double, double>(){}", cuco::detail::pair_alignment<double, double>());
+    RXMESH_TRACE("sizeof(PriorityPair_t){}", sizeof(PriorityPair_t));
+    compute_edge_priorities<float, blockThreads>
+        <<<launch_box.blocks,
+           launch_box.num_threads,
+           launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, pq.get_mutable_device_view(), pq.get_shmem_size(blockThreads));
+
+    cudaDeviceSynchronize();
+    RXMESH_TRACE("hello serban 2222222\n");
+    RXMESH_TRACE("launch_box.smem_bytes_dyn = {}", launch_box.smem_bytes_dyn);
+    RXMESH_TRACE("pq.get_shmem_size = {}", pq.get_shmem_size(blockThreads));
+
+    // now pop all the elements
+    //
+    thrust::device_vector<PriorityPair_t> d_popped(rx.get_num_edges());
+    pq.pop(d_popped.begin(), d_popped.end());
+    cudaDeviceSynchronize();
+    const thrust::host_vector<PriorityPair_t> h_popped(d_popped);
+    for(size_t i = 0; i < h_popped.size(); i++)
+    {
+        std::cout << i << "\t" << h_popped[i].first
+            << "\t" << h_popped[i].second << "\n";
+    }
+    return;
+        // compute max-min histogram
+        //histo.init();
+
+        //rx.prepare_launch_box({Op::EV},
+        //                      launch_box,
+        //                      (void*)compute_min_max_cost<float, blockThreads>,
+        //                      false);
+        //compute_min_max_cost<float, blockThreads>
+        //    <<<launch_box.blocks,
+        //       launch_box.num_threads,
+        //       launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, histo);
+
+        //// compute histogram bins
+        //rx.prepare_launch_box({Op::EV},
+        //                      launch_box,
+        //                      (void*)populate_histogram<float, blockThreads>,
+        //                      false);
+        //populate_histogram<float, blockThreads>
+        //    <<<launch_box.blocks,
+        //       launch_box.num_threads,
+        //       launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, histo);
+
+        //histo.scan();
+
+
+        // how much we can reduce the number of edge at each iterations
+        reduce_ratio = reduce_ratio + 0.05;
+
+        // loop over the mesh, and try to collapse
+        const int reduce_threshold =
+            std::max(1, int(reduce_ratio * float(rx.get_num_edges())));
+
+        // reset edge status
+        edge_status->reset(UNSEEN, DEVICE);
+
+        rx.reset_scheduler();
+        while (!rx.is_queue_empty() && rx.get_num_faces() > final_num_faces) {
+            RXMESH_INFO(" Queue size = {}",
+                        rx.get_context().m_patch_scheduler.size());
+
+            rx.prepare_launch_box(
+                {Op::EV},
+                launch_box,
+                (void*)secp<float, blockThreads>,
+                true,
+                false,
+                false,
+                false,
+                [&](uint32_t v, uint32_t e, uint32_t f) {
+                    return detail::mask_num_bytes(e) +
+                           2 * detail::mask_num_bytes(v) +
+                           3 * ShmemAllocator::default_alignment;
+                });
+
+            e_attr->reset(0, DEVICE);
+
+            GPUTimer app_timer;
+            app_timer.start();
+            secp<float, blockThreads>
+                 <<<launch_box.blocks,
+                    launch_box.num_threads,
+                    launch_box.smem_bytes_dyn>>>(rx.get_context(),
+                                                 *coords,
+                                                 //histo,
+                                                 reduce_threshold,
+                                                 *edge_status,
+                                                 *e_attr,
+                                                 d_num_cavities);
+
+            app_timer.stop();
+
+            GPUTimer cleanup_timer;
+            cleanup_timer.start();
+            rx.cleanup();
+            cleanup_timer.stop();
+
+            GPUTimer slice_timer;
+            slice_timer.start();
+            rx.slice_patches(*coords, *edge_status, *e_attr);
+            slice_timer.stop();
+
+            GPUTimer cleanup_timer2;
+            cleanup_timer2.start();
+            rx.cleanup();
+            cleanup_timer2.stop();
+
+
+            CUDA_ERROR(cudaDeviceSynchronize());
+            CUDA_ERROR(cudaGetLastError());
+
+            app_time += app_timer.elapsed_millis();
+            slice_time += slice_timer.elapsed_millis();
+            cleanup_time += cleanup_timer.elapsed_millis();
+            cleanup_time += cleanup_timer2.elapsed_millis();
+
+
+            if (validate) {
+                rx.update_host();
+                EXPECT_TRUE(rx.validate());
+                RXMESH_INFO(" num_vertices = {}, num_edges= {}, num_faces= {}",
+                            rx.get_num_vertices(),
+                            rx.get_num_edges(),
+                            rx.get_num_faces());
+            }
+        }
+
+        {
+            int h_num_cavities[2];
+            CUDA_ERROR(cudaMemcpy(&h_num_cavities,
+                                  d_num_cavities,
+                                  2 * sizeof(int),
+                                  cudaMemcpyDeviceToHost));
+            RXMESH_INFO(" Requested cavities = {}, executed cavities  = {}",
+                        h_num_cavities[1],
+                        h_num_cavities[0]);
+
+            coords->move(DEVICE, HOST);
+            e_attr->move(DEVICE, HOST);
+            rx.update_host();
+            rx.update_polyscope();
+            auto ps_mesh = rx.get_polyscope_mesh();
+            ps_mesh->updateVertexPositions(*coords);
+            ps_mesh->setEnabled(false);
+            ps_mesh->addEdgeScalarQuantity("eMark", *e_attr);
+            rx.render_vertex_patch();
+            rx.render_edge_patch();
+            rx.render_face_patch();
+            polyscope::show();
+        }
+    }
+    timer.stop();
+    total_time += timer.elapsed_millis();
+    CUDA_ERROR(cudaProfilerStop());
+
+    RXMESH_INFO("secp_rxmesh() RXMesh simplification took {} (ms)", total_time);
+    RXMESH_INFO("secp_rxmesh() App time {} (ms)", app_time);
+    RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)", slice_time);
+    RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)", cleanup_time);
+
+    if (!validate) {
+        rx.update_host();
+    }
+    coords->move(DEVICE, HOST);
+
+#if USE_POLYSCOPE
+    rx.update_polyscope();
+
+    auto ps_mesh = rx.get_polyscope_mesh();
+    ps_mesh->updateVertexPositions(*coords);
+    ps_mesh->setEnabled(false);
+
+    rx.render_vertex_patch();
+    rx.render_edge_patch();
+    rx.render_face_patch();
+    polyscope::show();
+#endif
+
+//    histo.free();
+}
\ No newline at end of file

From c889d45f9b1034a157c70dc8836622bd1b76084e Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Tue, 7 May 2024 01:34:04 -0700
Subject: [PATCH 07/18] Encode patch and local into 32 bits, clean up some dead
 code

---
 apps/SECPriority/secp_kernels.cuh | 113 ++++--------------------------
 apps/SECPriority/secp_rxmesh.cuh  |  82 +++++++++++++---------
 2 files changed, 62 insertions(+), 133 deletions(-)

diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
index 1fc2ca39..2fef5992 100644
--- a/apps/SECPriority/secp_kernels.cuh
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -248,18 +248,6 @@ __global__ static void  compute_edge_priorities(
     size_t                           pq_num_bytes)
 {
     using namespace rxmesh;
-
-    // shared mem variable
-    // Pair_t my_local_pair_array
-    // index into above
-    // sdp: needs to be the number of edges in the batch,
-    // which is not known at compile time
-    //__shared__ PriorityPair_t intermediatePairs[blockThreads];
-
-//    __shared__ int my_pair_count;
-    // some thread needs to initialize the above to 0
-    // sync
-    //extern __shared__ int shmem[];
     namespace cg = cooperative_groups;
     cg::thread_block g = cg::this_thread_block();
     ShmemAllocator      shrd_alloc;
@@ -269,12 +257,6 @@ __global__ static void  compute_edge_priorities(
     __shared__ int pair_counter;
     pair_counter = 0;
 
-/* 05/06
-    char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
-    printf("blockThreads:%u\n", blockThreads);
-    printf("pq_shrd_mem:%p\t\n", (void*)pq_shrd_mem);
-    */
-
     auto edge_len = [&](const EdgeHandle& eh, const VertexIterator& iter) {
         const VertexHandle v0 = iter[0];
         const VertexHandle v1 = iter[1];
@@ -284,97 +266,28 @@ __global__ static void  compute_edge_priorities(
 
         T len2 = glm::distance2(p0, p1);
 
-        //PriorityPair_t p{(double)len2, (double)len2};
-        //PriorityPair_t p{len2, len2};
         auto p_e = rxmesh::detail::unpack(eh.unique_id());
+        //printf("p_id:%u\te_id:%hu\n", p_e.first, p_e.second);
         //printf("e_id:%llu\t, len:%f\n", eh.unique_id(), len2);
-        PriorityPair_t p{len2, p_e.second};
+
+        // repack the EdgeHandle into smaller 32 bits for
+        // use with priority queue. Need to check elsewhere
+        // that there are less than 2^16 patches.
+        auto id32 = unique_id32(p_e.second, (uint16_t)p_e.first);
+        //auto p_e_32 = unpack32(id32);
+        //printf("32bit p_id:%hu\te_id:%hu\n", p_e_32.first, p_e_32.second);
+
+        PriorityPair_t p{len2, id32};
         //PriorityPair_t p{len2, eh};
 
         auto val_counter = atomicAdd(&pair_counter, 1);
         intermediatePairs[val_counter] = p;
-
-        //PriorityPair_t p{len2, eh};
-        //PriorityPair_t p{len2, eh.patch_id()};
-        //PriorityPair_t p{len2, eh.m_handle}; //same as just eh
-        //alignas(64) PriorityPair_t arr_p[1] = {p};
-        //PriorityPair_t p{len2, eh};
-        //PriorityPair_t p{len2, eh.patch_id()};
-        //PriorityPair_t p{len2, eh.m_handle}; //same as just eh
-        //alignas(64) PriorityPair_t arr_p[1] = {p};
-/* 05/06 
-        alignas(16) PriorityPair_t arr_p[1] = {p};
-        //pq_view.push(g, arr_p, *(&arr_p + 1), shmem);
-        pq_view.push(g, arr_p, arr_p + 1, pq_shrd_mem);
-        */
-        //SDP: looks like the push might need to happen outside of this
-        //lambda. I think I need a static array where a thread?/EdgeHandle
-        //can be stored. Then, after the query.dispatch<Op::EV> below,
-        //I can do a pq_view.push(cg, &array[0], &array[size], shmem)
-
-        //Alternatively, this kernel computes the edge_length for EdgeHandle
-        //and saves to a thrust::device_vector<PairType> d_pairs. Given an EdgeHandle,
-        //what's my index into the device_vector?
-        //Then, call pq_view.push(d_pairs.begin(), d_pairs.end()) from host side.
-//SDP remove for build        auto my_pair = {len2, eh};
-//SDP remove for build until can revisit        pq_view.push(cg, &my_pair[0], &my_pair[0]+1, <temp_storage_needed>);
-
-//SDP remove for build        pq_view.push();
-        //atomicMin(histo.min_value(), len2);
-        //atomicMax(histo.max_value(), len2);
-//        int old_value = atomicAdd(&my_pair_count, 1);
     };
 
     auto block = cooperative_groups::this_thread_block();
-
-    //ShmemAllocator      shrd_alloc;
-
-    // need to account for this when the kernel is launched
-//SDP remove for build    PriorityPair_t* my_local_array = shrd_alloc.alloc<PriorityPair_t>(query.patch_info().m_edge_capacity[0]);
-    // allocate memory for the priority_queue and keep that pointer to pass
     query.dispatch<Op::EV>(block, shrd_alloc, edge_len);
-    // sync just in case for now
-    // actually do the push to the queue
     block.sync();
-    //if(block.thread_rank() == 0) {
-     //  printf("iterators: begin = %p\t end = %p\n", (void*)intermediatePairs, (void*)(intermediatePairs + blockThreads));
-
-       char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
-      // printf("blockThreads:%u\n", blockThreads);
-      // printf("pq_shrd_mem:%p\t\n", (void*)pq_shrd_mem);
-      // for(size_t i = 0; i < blockThreads; i++) {
-      //  printf("imp:%d\t%f\t%u\t%p\n", i, intermediatePairs[i].first, intermediatePairs[i].second, &(intermediatePairs[i]));
-      // }
-       pq_view.push(block, intermediatePairs, intermediatePairs + pair_counter, pq_shrd_mem);
-       //shrd_alloc.dealloc(pq_num_bytes);
-    //}
-}
 
-/*
-template <typename T, uint32_t blockThreads>
-__global__ static void populate_histogram(
-    rxmesh::Context                  context,
-    const rxmesh::VertexAttribute<T> coords,
-    CostHistogram<T>                 histo)
-{
-    using namespace rxmesh;
-
-    auto edge_len = [&](const EdgeHandle& eh, const VertexIterator& iter) {
-        const VertexHandle v0 = iter[0];
-        const VertexHandle v1 = iter[1];
-
-        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
-        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
-
-        T len2 = glm::distance2(p0, p1);
-
-        histo.insert(len2);
-    };
-
-    auto block = cooperative_groups::this_thread_block();
-
-    Query<blockThreads> query(context);
-    ShmemAllocator      shrd_alloc;
-    query.dispatch<Op::EV>(block, shrd_alloc, edge_len);
-}
-*/
\ No newline at end of file
+    char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
+    pq_view.push(block, intermediatePairs, intermediatePairs + pair_counter, pq_shrd_mem);
+}
\ No newline at end of file
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index c1d7eee3..163db23a 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -15,6 +15,38 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 
+/**
+ * @brief Return unique index of the local mesh element composed by the
+ * patch id and the local index
+ *
+ * @param local_id the local within-patch mesh element id
+ * @param patch_id the patch owning the mesh element
+ * @return
+ */
+constexpr __device__ __host__ __forceinline__ uint32_t
+unique_id32(const uint16_t local_id, const uint16_t patch_id)
+{
+    uint32_t ret = patch_id;
+    ret          = (ret << 16);
+    ret |= local_id;
+    return ret;
+}
+
+/**
+ * @brief unpack a 32 uint to its high and low 16 bits. 
+ * This is used to convert the unique id to its local id (16
+ * low bit) and patch id (high 16 bit)
+ * @param uid unique id
+ * @return a std::pair storing the patch id and local id
+ */
+constexpr __device__ __host__ __forceinline__ std::pair<uint16_t, uint16_t>
+                                              unpack32(uint32_t uid)
+{
+    uint16_t local_id = uid & ((1 << 16) - 1);
+    uint16_t patch_id = uid >> 16;
+    return std::make_pair(patch_id, local_id);
+}
+
 // Priority queue setup. Use 'pair_less' to prioritize smaller values.
 template <typename T>
 struct pair_less 
@@ -24,13 +56,8 @@ struct pair_less
         return a.first < b.first;
     }
 };
-//using PriorityPair_t        = cuco::pair<float, float>;
-//using PriorityPair_t        = cuco::pair<float, rxmesh::EdgeHandle>;
 
 using PriorityPair_t        = cuco::pair<float, uint32_t>;
-
-//using PriorityPair_t        = cuco::pair<float, uint64_t>; 
-//using PriorityPair_t        = cuco::pair<double, double>; 
 using PriorityCompare       = pair_less<PriorityPair_t>;
 using PriorityQueue_t       = cuco::priority_queue<PriorityPair_t, PriorityCompare>;
 using PQView_t              = PriorityQueue_t::device_mutable_view;
@@ -48,15 +75,14 @@ enum : EdgeStatus
     ADDED  = 3,  // means it has been added to during the split/flip/collapse
 };
 
-//#include "histogram.cuh"
 #include "secp_kernels.cuh"
 
 
 inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                        const uint32_t         final_num_faces)
 {
-    //EXPECT_TRUE(rx.validate());
-//return;
+    EXPECT_TRUE(rx.validate());
+
     using namespace rxmesh;
     constexpr uint32_t blockThreads = 32;
 
@@ -74,7 +100,6 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     const int num_bins = 256;
 
-//    CostHistogram<float> histo(num_bins);
     PriorityQueue_t pq(rx.get_num_edges());
 
     auto e_attr = rx.add_edge_attribute<float>("eMark", 1);
@@ -95,12 +120,6 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     float reduce_ratio = 0.05;
 
-    // Priority Queue Setup
-    //
-
-    //
-    // Priority Queue Setup End
-
     CUDA_ERROR(cudaProfilerStart());
     GPUTimer timer;
     timer.start();
@@ -115,23 +134,14 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                           false,
                           [&](uint32_t v, uint32_t e, uint32_t f){
                             // Allocate enough additional memory
-                            // for the priority queue.
-                            // SDP: not enough info here to
-                            //  determine the memory the
-                            //  priority queue needs. Probably don't
-                            //  want to use this. 
-                            //  See below.
-                            //return 0;//e*sizeof(PriorityPair_t);
+                            // for the priority queue and the intermediate
+                            // array of PriorityPait_t.
                             return pq.get_shmem_size(blockThreads) + (e*sizeof(PriorityPair_t));
                           });
-    //launch_box.smem_bytes_dyn += pq.get_shmem_size(blockThreads);
 
-    RXMESH_TRACE("hello serban 11111111\n");
     RXMESH_TRACE("pair_alignment<float, rxmesh::EdgeHandle>(){}", cuco::detail::pair_alignment<float, rxmesh::EdgeHandle>());
     RXMESH_TRACE("pair_alignment<float, uint32_t>(){}", cuco::detail::pair_alignment<float, uint32_t>());
     RXMESH_TRACE("pair_alignment<float, uint64_t>(){}", cuco::detail::pair_alignment<float, uint64_t>());
-    RXMESH_TRACE("pair_alignment<float, float>(){}", cuco::detail::pair_alignment<float, float>());
-    RXMESH_TRACE("pair_alignment<double, double>(){}", cuco::detail::pair_alignment<double, double>());
     RXMESH_TRACE("sizeof(PriorityPair_t){}", sizeof(PriorityPair_t));
     compute_edge_priorities<float, blockThreads>
         <<<launch_box.blocks,
@@ -139,21 +149,27 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
            launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, pq.get_mutable_device_view(), pq.get_shmem_size(blockThreads));
 
     cudaDeviceSynchronize();
-    RXMESH_TRACE("hello serban 2222222\n");
     RXMESH_TRACE("launch_box.smem_bytes_dyn = {}", launch_box.smem_bytes_dyn);
     RXMESH_TRACE("pq.get_shmem_size = {}", pq.get_shmem_size(blockThreads));
 
-    // now pop all the elements
+    // next kernel needs to pop some percentage of the top
+    // elements in the priority queue and store popped elements
+    // to be used by the next kernel that actually does the collapses
+    //
+    // mark some sort of
+    // associated edge attribute
+
+    // now pop all the elements to ouput on host
     //
     thrust::device_vector<PriorityPair_t> d_popped(rx.get_num_edges());
     pq.pop(d_popped.begin(), d_popped.end());
     cudaDeviceSynchronize();
     const thrust::host_vector<PriorityPair_t> h_popped(d_popped);
-    for(size_t i = 0; i < h_popped.size(); i++)
-    {
-        std::cout << i << "\t" << h_popped[i].first
-            << "\t" << h_popped[i].second << "\n";
-    }
+   // for(size_t i = 0; i < h_popped.size(); i++)
+   // {
+   //     std::cout << i << "\t" << h_popped[i].first
+   //         << "\t" << h_popped[i].second << "\n";
+   // }
     return;
         // compute max-min histogram
         //histo.init();

From 65eb418ce7febef380910953f02af4442e43c3b8 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Tue, 7 May 2024 22:04:27 -0700
Subject: [PATCH 08/18] Add kernel to pop and mark edges to be collapsed

---
 apps/SECPriority/secp_kernels.cuh | 37 +++++++++++++++++++++++++++-
 apps/SECPriority/secp_rxmesh.cuh  | 41 +++++++++++++++++++------------
 2 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
index 2fef5992..7ca64516 100644
--- a/apps/SECPriority/secp_kernels.cuh
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -290,4 +290,39 @@ __global__ static void  compute_edge_priorities(
 
     char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
     pq_view.push(block, intermediatePairs, intermediatePairs + pair_counter, pq_shrd_mem);
-}
\ No newline at end of file
+}
+
+template <uint32_t blockThreads>
+__global__ static void pop_and_mark_edges_to_collapse(
+    PQView_t pq_view,
+    rxmesh::EdgeAttribute<bool> marked_edges,
+    uint32_t pop_num_edges)
+{
+    // setup shared memory array to store the popped pairs
+    // 
+    // device api pop pairs
+    namespace cg = cooperative_groups;
+    using namespace rxmesh;
+    ShmemAllocator      shrd_alloc;
+
+    auto intermediatePairs = shrd_alloc.alloc<PriorityPair_t>(blockThreads);
+    char * pq_shrd_mem = shrd_alloc.alloc(pq_view.get_shmem_size(blockThreads));
+    cg::thread_block g = cg::this_thread_block();
+    pq_view.pop(g, intermediatePairs, intermediatePairs + blockThreads, pq_shrd_mem);
+
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int local_tid = threadIdx.x;
+
+    // Make sure the index is within bounds
+    if(tid < pop_num_edges)
+    {
+        //printf("tid: %d\n", tid);
+        // unpack the uid to get the patch and edge ids
+        auto p_e = unpack32(intermediatePairs[local_tid].second);
+        //printf("32bit p_id:%hu\te_id:%hu\n", p_e.first, p_e.second);
+        rxmesh::EdgeHandle eh(p_e.first, rxmesh::LocalEdgeT(p_e.second));
+
+        //use the eh to index into a passed in edge attribute
+        marked_edges(eh) = true;
+    }
+}
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 163db23a..7faeafc2 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -84,8 +84,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     EXPECT_TRUE(rx.validate());
 
     using namespace rxmesh;
-    constexpr uint32_t blockThreads = 32;
-
+    constexpr uint32_t blockThreads = 256;
 
     auto coords = rx.get_input_vertex_coordinates();
 
@@ -103,6 +102,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     PriorityQueue_t pq(rx.get_num_edges());
 
     auto e_attr = rx.add_edge_attribute<float>("eMark", 1);
+    auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", false);
 
 
 #if USE_POLYSCOPE
@@ -155,21 +155,30 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     // next kernel needs to pop some percentage of the top
     // elements in the priority queue and store popped elements
     // to be used by the next kernel that actually does the collapses
-    //
-    // mark some sort of
-    // associated edge attribute
-
-    // now pop all the elements to ouput on host
-    //
-    thrust::device_vector<PriorityPair_t> d_popped(rx.get_num_edges());
-    pq.pop(d_popped.begin(), d_popped.end());
+
+    float reduce_ratio = 0.1f;
+
+    // Mark the edge attributes to be flipped
+    uint32_t pop_num_edges = reduce_ratio * rx.get_num_edges();
+    RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
+
+    constexpr uint32_t threads_per_block = 1024;
+    uint32_t number_of_blocks = (pop_num_edges + threads_per_block - 1) / threads_per_block;
+    int shared_mem_bytes = pq.get_shmem_size(threads_per_block) +
+                           (threads_per_block * sizeof(PriorityPair_t));
+    RXMESH_TRACE("threads_per_block: {}", threads_per_block);
+    RXMESH_TRACE("number_of_blocks: {}", number_of_blocks);
+    RXMESH_TRACE("shared_mem_bytes: {}", shared_mem_bytes);
+
+    pop_and_mark_edges_to_collapse<threads_per_block>
+        <<<number_of_blocks, threads_per_block, shared_mem_bytes>>>
+            (pq.get_mutable_device_view(),
+             *e_pop_attr,
+             pop_num_edges);
+
     cudaDeviceSynchronize();
-    const thrust::host_vector<PriorityPair_t> h_popped(d_popped);
-   // for(size_t i = 0; i < h_popped.size(); i++)
-   // {
-   //     std::cout << i << "\t" << h_popped[i].first
-   //         << "\t" << h_popped[i].second << "\n";
-   // }
+    RXMESH_TRACE("Made it past cudaDeviceSynchronize()");
+    
     return;
         // compute max-min histogram
         //histo.init();

From 39053442c0bb67c62691636ffdfdbcbc82b68d48 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Wed, 8 May 2024 12:21:56 -0700
Subject: [PATCH 09/18] Remove dead code

---
 apps/SECPriority/secp_rxmesh.cuh | 191 +------------------------------
 1 file changed, 1 insertion(+), 190 deletions(-)

diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 7faeafc2..ebe0d4fb 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -85,26 +85,13 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
-
     auto coords = rx.get_input_vertex_coordinates();
-
-    auto edge_status = rx.add_edge_attribute<EdgeStatus>("EdgeStatus", 1);
-
     LaunchBox<blockThreads> launch_box;
 
-    float total_time   = 0;
-    float app_time     = 0;
-    float slice_time   = 0;
-    float cleanup_time = 0;
-
-    const int num_bins = 256;
-
     PriorityQueue_t pq(rx.get_num_edges());
 
-    auto e_attr = rx.add_edge_attribute<float>("eMark", 1);
     auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", false);
 
-
 #if USE_POLYSCOPE
     rx.render_vertex_patch();
     rx.render_edge_patch();
@@ -112,19 +99,6 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     // polyscope::show();
 #endif
 
-    bool validate = false;
-
-    int* d_num_cavities = nullptr;
-    CUDA_ERROR(cudaMalloc((void**)&d_num_cavities, 2 * sizeof(int)));
-    CUDA_ERROR(cudaMemset(d_num_cavities, 0, 2 * sizeof(int)));
-
-    float reduce_ratio = 0.05;
-
-    CUDA_ERROR(cudaProfilerStart());
-    GPUTimer timer;
-    timer.start();
-    while (rx.get_num_faces() > final_num_faces) {
-
     rx.prepare_launch_box({Op::EV},
                           launch_box,
                           (void*)compute_edge_priorities<float, blockThreads>,
@@ -139,10 +113,6 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                             return pq.get_shmem_size(blockThreads) + (e*sizeof(PriorityPair_t));
                           });
 
-    RXMESH_TRACE("pair_alignment<float, rxmesh::EdgeHandle>(){}", cuco::detail::pair_alignment<float, rxmesh::EdgeHandle>());
-    RXMESH_TRACE("pair_alignment<float, uint32_t>(){}", cuco::detail::pair_alignment<float, uint32_t>());
-    RXMESH_TRACE("pair_alignment<float, uint64_t>(){}", cuco::detail::pair_alignment<float, uint64_t>());
-    RXMESH_TRACE("sizeof(PriorityPair_t){}", sizeof(PriorityPair_t));
     compute_edge_priorities<float, blockThreads>
         <<<launch_box.blocks,
            launch_box.num_threads,
@@ -158,7 +128,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     float reduce_ratio = 0.1f;
 
-    // Mark the edge attributes to be flipped
+    // Mark the edge attributes to be collapsed
     uint32_t pop_num_edges = reduce_ratio * rx.get_num_edges();
     RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
 
@@ -179,163 +149,4 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     cudaDeviceSynchronize();
     RXMESH_TRACE("Made it past cudaDeviceSynchronize()");
     
-    return;
-        // compute max-min histogram
-        //histo.init();
-
-        //rx.prepare_launch_box({Op::EV},
-        //                      launch_box,
-        //                      (void*)compute_min_max_cost<float, blockThreads>,
-        //                      false);
-        //compute_min_max_cost<float, blockThreads>
-        //    <<<launch_box.blocks,
-        //       launch_box.num_threads,
-        //       launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, histo);
-
-        //// compute histogram bins
-        //rx.prepare_launch_box({Op::EV},
-        //                      launch_box,
-        //                      (void*)populate_histogram<float, blockThreads>,
-        //                      false);
-        //populate_histogram<float, blockThreads>
-        //    <<<launch_box.blocks,
-        //       launch_box.num_threads,
-        //       launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, histo);
-
-        //histo.scan();
-
-
-        // how much we can reduce the number of edge at each iterations
-        reduce_ratio = reduce_ratio + 0.05;
-
-        // loop over the mesh, and try to collapse
-        const int reduce_threshold =
-            std::max(1, int(reduce_ratio * float(rx.get_num_edges())));
-
-        // reset edge status
-        edge_status->reset(UNSEEN, DEVICE);
-
-        rx.reset_scheduler();
-        while (!rx.is_queue_empty() && rx.get_num_faces() > final_num_faces) {
-            RXMESH_INFO(" Queue size = {}",
-                        rx.get_context().m_patch_scheduler.size());
-
-            rx.prepare_launch_box(
-                {Op::EV},
-                launch_box,
-                (void*)secp<float, blockThreads>,
-                true,
-                false,
-                false,
-                false,
-                [&](uint32_t v, uint32_t e, uint32_t f) {
-                    return detail::mask_num_bytes(e) +
-                           2 * detail::mask_num_bytes(v) +
-                           3 * ShmemAllocator::default_alignment;
-                });
-
-            e_attr->reset(0, DEVICE);
-
-            GPUTimer app_timer;
-            app_timer.start();
-            secp<float, blockThreads>
-                 <<<launch_box.blocks,
-                    launch_box.num_threads,
-                    launch_box.smem_bytes_dyn>>>(rx.get_context(),
-                                                 *coords,
-                                                 //histo,
-                                                 reduce_threshold,
-                                                 *edge_status,
-                                                 *e_attr,
-                                                 d_num_cavities);
-
-            app_timer.stop();
-
-            GPUTimer cleanup_timer;
-            cleanup_timer.start();
-            rx.cleanup();
-            cleanup_timer.stop();
-
-            GPUTimer slice_timer;
-            slice_timer.start();
-            rx.slice_patches(*coords, *edge_status, *e_attr);
-            slice_timer.stop();
-
-            GPUTimer cleanup_timer2;
-            cleanup_timer2.start();
-            rx.cleanup();
-            cleanup_timer2.stop();
-
-
-            CUDA_ERROR(cudaDeviceSynchronize());
-            CUDA_ERROR(cudaGetLastError());
-
-            app_time += app_timer.elapsed_millis();
-            slice_time += slice_timer.elapsed_millis();
-            cleanup_time += cleanup_timer.elapsed_millis();
-            cleanup_time += cleanup_timer2.elapsed_millis();
-
-
-            if (validate) {
-                rx.update_host();
-                EXPECT_TRUE(rx.validate());
-                RXMESH_INFO(" num_vertices = {}, num_edges= {}, num_faces= {}",
-                            rx.get_num_vertices(),
-                            rx.get_num_edges(),
-                            rx.get_num_faces());
-            }
-        }
-
-        {
-            int h_num_cavities[2];
-            CUDA_ERROR(cudaMemcpy(&h_num_cavities,
-                                  d_num_cavities,
-                                  2 * sizeof(int),
-                                  cudaMemcpyDeviceToHost));
-            RXMESH_INFO(" Requested cavities = {}, executed cavities  = {}",
-                        h_num_cavities[1],
-                        h_num_cavities[0]);
-
-            coords->move(DEVICE, HOST);
-            e_attr->move(DEVICE, HOST);
-            rx.update_host();
-            rx.update_polyscope();
-            auto ps_mesh = rx.get_polyscope_mesh();
-            ps_mesh->updateVertexPositions(*coords);
-            ps_mesh->setEnabled(false);
-            ps_mesh->addEdgeScalarQuantity("eMark", *e_attr);
-            rx.render_vertex_patch();
-            rx.render_edge_patch();
-            rx.render_face_patch();
-            polyscope::show();
-        }
-    }
-    timer.stop();
-    total_time += timer.elapsed_millis();
-    CUDA_ERROR(cudaProfilerStop());
-
-    RXMESH_INFO("secp_rxmesh() RXMesh simplification took {} (ms)", total_time);
-    RXMESH_INFO("secp_rxmesh() App time {} (ms)", app_time);
-    RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)", slice_time);
-    RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)", cleanup_time);
-
-    if (!validate) {
-        rx.update_host();
-    }
-    coords->move(DEVICE, HOST);
-
-#if USE_POLYSCOPE
-    rx.update_polyscope();
-
-    auto ps_mesh = rx.get_polyscope_mesh();
-    ps_mesh->updateVertexPositions(*coords);
-    ps_mesh->setEnabled(false);
-
-    rx.render_vertex_patch();
-    rx.render_edge_patch();
-    rx.render_face_patch();
-    polyscope::show();
-#endif
-
-//    histo.free();
 }
\ No newline at end of file

From 352c8e78766a324f43eb0b75c225242ef6e5f262 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Wed, 8 May 2024 17:04:36 -0700
Subject: [PATCH 10/18] Use priority queue to actually simplify

---
 apps/SECPriority/secp.cu          |  23 +--
 apps/SECPriority/secp_kernels.cuh | 125 +++------------
 apps/SECPriority/secp_rxmesh.cuh  | 246 +++++++++++++++++++++++-------
 3 files changed, 225 insertions(+), 169 deletions(-)

diff --git a/apps/SECPriority/secp.cu b/apps/SECPriority/secp.cu
index 1cdd2b2d..cedc1a64 100644
--- a/apps/SECPriority/secp.cu
+++ b/apps/SECPriority/secp.cu
@@ -5,11 +5,13 @@
 
 #include "rxmesh/rxmesh_dynamic.h"
 
+#include <filesystem>
+
 struct arg
 {
     std::string obj_file_name = STRINGIFY(INPUT_DIR) "dragon.obj";
     std::string output_folder = STRINGIFY(OUTPUT_DIR);
-    uint32_t    target        = 0;
+    float       target        = 0.1;
     uint32_t    device_id     = 0;
     char**      argv;
     int         argc;
@@ -24,18 +26,21 @@ TEST(Apps, SECPriority)
     // Select device
     cuda_query(Arg.device_id);
 
-    RXMeshDynamic rx(Arg.obj_file_name);
-    rx.save(STRINGIFY(OUTPUT_DIR) + extract_file_name(Arg.obj_file_name) +
-           "_patches");
+    // RXMeshDynamic rx(Arg.obj_file_name);
 
-    // RXMeshDynamic rx(Arg.obj_file_name,
-    //                  STRINGIFY(OUTPUT_DIR) +
-    //                      extract_file_name(Arg.obj_file_name) + "_patches",
-    //                  true);
+    const std::string p_file = STRINGIFY(OUTPUT_DIR) +
+                               extract_file_name(Arg.obj_file_name) +
+                               "_patches";
+    RXMeshDynamic rx(Arg.obj_file_name, p_file);
+    if (!std::filesystem::exists(p_file)) {
+        rx.save(p_file);
+    }
 
     ASSERT_TRUE(rx.is_edge_manifold());
 
-    secp_rxmesh(rx, Arg.target);
+    uint32_t final_num_vertices = Arg.target * rx.get_num_vertices();
+
+    secp_rxmesh(rx, final_num_vertices);
 }
 
 
diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
index 7ca64516..5807c085 100644
--- a/apps/SECPriority/secp_kernels.cuh
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -1,5 +1,6 @@
 #pragma once
 #include "rxmesh/cavity_manager.cuh"
+#include "../ShortestEdgeCollapse/link_condition.cuh"
 
 #include <cooperative_groups.h>
 #include <cuda_runtime.h>
@@ -7,11 +8,9 @@
 template <typename T, uint32_t blockThreads>
 __global__ static void secp(rxmesh::Context                   context,
                             rxmesh::VertexAttribute<T>        coords,
-                           // const CostHistogram<T>            histo,
                             const int                         reduce_threshold,
                             rxmesh::EdgeAttribute<EdgeStatus> edge_status,
-                            rxmesh::EdgeAttribute<T>          e_attr,
-                            int*                              d_num_cavities)
+                            rxmesh::EdgeAttribute<bool>       e_pop_attr)
 {
     using namespace rxmesh;
     auto           block = cooperative_groups::this_thread_block();
@@ -29,6 +28,7 @@ __global__ static void secp(rxmesh::Context                   context,
     // filter them). Then after cavity.prologue, we reuse this bitmask to mark
     // the newly added edges
     Bitmask edge_mask(cavity.patch_info().edges_capacity[0], shrd_alloc);
+    edge_mask.reset(block);
 
     // we use this bitmask to mark the other end of to-be-collapse edge during
     // checking for the link condition
@@ -41,100 +41,26 @@ __global__ static void secp(rxmesh::Context                   context,
     ev_query.prologue<Op::EV>(block, shrd_alloc);
     block.sync();
 
-    // 1) mark edge we want to collapse
+    // 1a) mark edge we want to collapse given e_pop_attr
     for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
         assert(eh.local_id() < cavity.patch_info().num_edges[0]);
 
-        if (edge_status(eh) != UNSEEN) {
+        if (edge_status(eh) != UNSEEN) 
+        {
             return;
         }
-        const VertexIterator iter =
-            ev_query.template get_iterator<VertexIterator>(eh.local_id());
 
-        const VertexHandle v0 = iter[0];
-        const VertexHandle v1 = iter[1];
-
-        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
-        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
-
-        T len2 = glm::distance2(p0, p1);
+        if(true == e_pop_attr(eh))
+        {
+           edge_mask.set(eh.local_id(), true);
+        }
 
-        /*if (histo.get_bin(len2) <= reduce_threshold) {
-            //::atomicAdd(d_num_cavities + 1, 1);
-            // cavity.create(eh);
-            edge_mask.set(eh.local_id(), true);
-        }*/
     });
     block.sync();
 
-
-    // 2) check edge link condition. Here, for each edge marked in edge_mask,
-    // all threads in the block collaborate to check the edge link condition of
-    // this edge
-    __shared__ int s_num_shared_one_ring;
-    for (uint16_t e = 0; e < edge_mask.size(); ++e) {
-
-        if (edge_mask(e)) {
-            // the edge two end vertices
-            const VertexIterator iter =
-                ev_query.template get_iterator<VertexIterator>(e);
-
-            const uint16_t v0 = iter.local(0);
-            const uint16_t v1 = iter.local(1);
-
-            if (threadIdx.x == 0) {
-                s_num_shared_one_ring = 0;
-            }
-
-            v0_mask.reset(block);
-            v1_mask.reset(block);
-            block.sync();
-
-            // each thread will be assigned to an edge (including not-owned one)
-            // and mark in v0_mask/v1_mask if one of its two ends are v0/v1
-            for_each_edge(
-                cavity.patch_info(),
-                [&](EdgeHandle eh) {
-                    if (eh.local_id() == e) {
-                        return;
-                    }
-                    const VertexIterator v_iter =
-                        ev_query.template get_iterator<VertexIterator>(
-                            eh.local_id());
-
-                    const uint16_t vv0 = v_iter.local(0);
-                    const uint16_t vv1 = v_iter.local(1);
-
-
-                    if (vv0 == v0) {
-                        v0_mask.set(vv1, true);
-                    }
-                    if (vv0 == v1) {
-                        v1_mask.set(vv1, true);
-                    }
-
-                    if (vv1 == v0) {
-                        v0_mask.set(vv0, true);
-                    }
-                    if (vv1 == v1) {
-                        v1_mask.set(vv0, true);
-                    }
-                },
-                true);
-            block.sync();
-
-            for (int v = threadIdx.x; v < v0_mask.size(); v += blockThreads) {
-                if (v0_mask(v) && v1_mask(v)) {
-                    ::atomicAdd(&s_num_shared_one_ring, 1);
-                }
-            }
-
-            block.sync();
-            if (s_num_shared_one_ring > 2) {
-                edge_mask.reset(e, true);
-            }
-        }
-    }
+    // 2a) check edge link condition.
+    link_condition(block, cavity.patch_info(), ev_query, 
+                   edge_mask, v0_mask, v1_mask);
     block.sync();
 
     for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
@@ -149,19 +75,7 @@ __global__ static void secp(rxmesh::Context                   context,
 
     ev_query.epilogue(block, shrd_alloc);
 
-    // create the cavity
-    if (cavity.prologue(block, shrd_alloc, coords, edge_status, e_attr)) {
-
-        // if (threadIdx.x == 0) {
-        //     uint16_t num_actual_cavities = 0;
-        //     for (int i = 0; i < cavity.m_s_active_cavity_bitmask.size(); ++i)
-        //     {
-        //         if (cavity.m_s_active_cavity_bitmask(i)) {
-        //             num_actual_cavities++;
-        //         }
-        //     }
-        //     ::atomicAdd(d_num_cavities, num_actual_cavities);
-        // }
+    if (cavity.prologue(block, shrd_alloc, coords, edge_status)) {
         edge_mask.reset(block);
         block.sync();
 
@@ -179,16 +93,14 @@ __global__ static void secp(rxmesh::Context                   context,
 
             if (new_v.is_valid()) {
 
-                coords(new_v, 0) = (coords(v0, 0) + coords(v1, 0)) * 0.5;
-                coords(new_v, 1) = (coords(v0, 1) + coords(v1, 1)) * 0.5;
-                coords(new_v, 2) = (coords(v0, 2) + coords(v1, 2)) * 0.5;
+                coords(new_v, 0) = (coords(v0, 0) + coords(v1, 0)) * T(0.5);
+                coords(new_v, 1) = (coords(v0, 1) + coords(v1, 1)) * T(0.5);
+                coords(new_v, 2) = (coords(v0, 2) + coords(v1, 2)) * T(0.5);
 
 
                 DEdgeHandle e0 =
                     cavity.add_edge(new_v, cavity.get_cavity_vertex(c, 0));
 
-                e_attr(e0.get_edge_handle())++;
-
                 if (e0.is_valid()) {
                     edge_mask.set(e0.local_id(), true);
 
@@ -226,7 +138,6 @@ __global__ static void secp(rxmesh::Context                   context,
         });
     }
 
-
     cavity.epilogue(block);
     block.sync();
 
@@ -317,7 +228,7 @@ __global__ static void pop_and_mark_edges_to_collapse(
     if(tid < pop_num_edges)
     {
         //printf("tid: %d\n", tid);
-        // unpack the uid to get the patch and edge ids
+        //unpack the uid to get the patch and edge ids
         auto p_e = unpack32(intermediatePairs[local_tid].second);
         //printf("32bit p_id:%hu\te_id:%hu\n", p_e.first, p_e.second);
         rxmesh::EdgeHandle eh(p_e.first, rxmesh::LocalEdgeT(p_e.second));
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index ebe0d4fb..c5fad43c 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -79,18 +79,23 @@ enum : EdgeStatus
 
 
 inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
-                       const uint32_t         final_num_faces)
+                       const uint32_t         final_num_vertices)
 {
     EXPECT_TRUE(rx.validate());
 
     using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
     auto coords = rx.get_input_vertex_coordinates();
+    auto edge_status = rx.add_edge_attribute<EdgeStatus>("EdgeStatus", 1);
     LaunchBox<blockThreads> launch_box;
 
-    PriorityQueue_t pq(rx.get_num_edges());
+    float total_time   = 0;
+    float app_time     = 0;
+    float slice_time   = 0;
+    float cleanup_time = 0;
 
-    auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", false);
+
+    auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", 1);
 
 #if USE_POLYSCOPE
     rx.render_vertex_patch();
@@ -99,54 +104,189 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     // polyscope::show();
 #endif
 
-    rx.prepare_launch_box({Op::EV},
-                          launch_box,
-                          (void*)compute_edge_priorities<float, blockThreads>,
-                          false,
-                          false,
-                          false,
-                          false,
-                          [&](uint32_t v, uint32_t e, uint32_t f){
-                            // Allocate enough additional memory
-                            // for the priority queue and the intermediate
-                            // array of PriorityPait_t.
-                            return pq.get_shmem_size(blockThreads) + (e*sizeof(PriorityPair_t));
-                          });
-
-    compute_edge_priorities<float, blockThreads>
-        <<<launch_box.blocks,
-           launch_box.num_threads,
-           launch_box.smem_bytes_dyn>>>(rx.get_context(), *coords, pq.get_mutable_device_view(), pq.get_shmem_size(blockThreads));
-
-    cudaDeviceSynchronize();
-    RXMESH_TRACE("launch_box.smem_bytes_dyn = {}", launch_box.smem_bytes_dyn);
-    RXMESH_TRACE("pq.get_shmem_size = {}", pq.get_shmem_size(blockThreads));
-
-    // next kernel needs to pop some percentage of the top
-    // elements in the priority queue and store popped elements
-    // to be used by the next kernel that actually does the collapses
-
-    float reduce_ratio = 0.1f;
-
-    // Mark the edge attributes to be collapsed
-    uint32_t pop_num_edges = reduce_ratio * rx.get_num_edges();
-    RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
-
-    constexpr uint32_t threads_per_block = 1024;
-    uint32_t number_of_blocks = (pop_num_edges + threads_per_block - 1) / threads_per_block;
-    int shared_mem_bytes = pq.get_shmem_size(threads_per_block) +
-                           (threads_per_block * sizeof(PriorityPair_t));
-    RXMESH_TRACE("threads_per_block: {}", threads_per_block);
-    RXMESH_TRACE("number_of_blocks: {}", number_of_blocks);
-    RXMESH_TRACE("shared_mem_bytes: {}", shared_mem_bytes);
-
-    pop_and_mark_edges_to_collapse<threads_per_block>
-        <<<number_of_blocks, threads_per_block, shared_mem_bytes>>>
-            (pq.get_mutable_device_view(),
-             *e_pop_attr,
-             pop_num_edges);
-
-    cudaDeviceSynchronize();
-    RXMESH_TRACE("Made it past cudaDeviceSynchronize()");
-    
+    bool validate = false;
+
+    CUDA_ERROR(cudaProfilerStart());
+    GPUTimer timer;
+    timer.start();
+    while(rx.get_num_vertices(true) > final_num_vertices)
+    {
+        // rebuild every round?
+        PriorityQueue_t pq(rx.get_num_edges());
+        e_pop_attr->reset(DEVICE, false);
+
+        //rx.prepare_launch_box(
+        rx.update_launch_box(
+            {Op::EV},
+            launch_box,
+            (void*)compute_edge_priorities<float, blockThreads>,
+            false, false, false, false,
+            [&](uint32_t v, uint32_t e, uint32_t f){
+              // Allocate enough additional memory
+              // for the priority queue and the intermediate
+              // array of PriorityPair_t.
+              return pq.get_shmem_size(blockThreads) + (e*sizeof(PriorityPair_t));
+            }
+        );
+
+        compute_edge_priorities<float, blockThreads>
+            <<<launch_box.blocks,
+               launch_box.num_threads,
+               launch_box.smem_bytes_dyn>>>( rx.get_context(), *coords, pq.get_mutable_device_view(), pq.get_shmem_size(blockThreads));
+        cudaDeviceSynchronize();
+        //RXMESH_TRACE("launch_box.smem_bytes_dyn = {}", launch_box.smem_bytes_dyn);
+        //RXMESH_TRACE("pq.get_shmem_size = {}", pq.get_shmem_size(blockThreads));
+
+        // Next kernel needs to pop some percentage of the top
+        // elements in the priority queue and store popped elements
+        // to be used by the next kernel that actually does the collapses
+
+        float reduce_ratio = 0.1f;
+        const int num_edges_before = int(rx.get_num_edges());
+        const int reduce_threshold =
+            std::max(1, int(reduce_ratio * float(num_edges_before)));
+        // Mark the edge attributes to be collapsed
+        uint32_t pop_num_edges = reduce_threshold; //reduce_ratio * rx.get_num_edges();
+        //RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
+
+        constexpr uint32_t threads_per_block = 32;
+        uint32_t number_of_blocks = (pop_num_edges + threads_per_block - 1) / threads_per_block;
+        int shared_mem_bytes = pq.get_shmem_size(threads_per_block) +
+                               (threads_per_block * sizeof(PriorityPair_t));
+        //RXMESH_TRACE("threads_per_block: {}", threads_per_block);
+        //RXMESH_TRACE("number_of_blocks: {}", number_of_blocks);
+        //RXMESH_TRACE("shared_mem_bytes: {}", shared_mem_bytes);
+
+        pop_and_mark_edges_to_collapse<threads_per_block>
+            <<<number_of_blocks, threads_per_block, shared_mem_bytes>>>
+                (pq.get_mutable_device_view(),
+                 *e_pop_attr,
+                 pop_num_edges);
+
+        CUDA_ERROR(cudaDeviceSynchronize());
+        CUDA_ERROR(cudaGetLastError());
+        //RXMESH_TRACE("Made it past cudaDeviceSynchronize()");
+
+        // loop over the mesh, and try to collapse
+        // reset edge status
+        edge_status->reset(UNSEEN, DEVICE);
+
+        rx.reset_scheduler();
+        while(!rx.is_queue_empty() &&
+              rx.get_num_vertices(true) > final_num_vertices)
+        {
+
+            RXMESH_INFO(" Queue size = {}",
+                        rx.get_context().m_patch_scheduler.size());
+
+            //rx.prepare_launch_box(
+            rx.update_launch_box(
+                {Op::EV},
+                launch_box,
+                (void*)secp<float, blockThreads>,
+                true, false, false, false,
+                [&](uint32_t v, uint32_t e, uint32_t f) {
+                    return detail::mask_num_bytes(e) +
+                           2 * detail::mask_num_bytes(v) +
+                           3 * ShmemAllocator::default_alignment;
+                }
+            );
+
+            GPUTimer app_timer;
+            app_timer.start();
+            secp<float, blockThreads>
+                <<<launch_box.blocks,
+                   launch_box.num_threads,
+                   launch_box.smem_bytes_dyn>>>(rx.get_context(),
+                                                *coords,
+                                                reduce_threshold,
+                                                *edge_status,
+                                                *e_pop_attr);
+            // should we cudaDeviceSyn here? stopping timers too soon?
+            //CUDA_ERROR(cudaDeviceSynchronize());
+            //CUDA_ERROR(cudaGetLastError());
+            
+            app_timer.stop();
+
+            GPUTimer cleanup_timer;
+            cleanup_timer.start();
+            rx.cleanup();
+            cleanup_timer.stop();
+
+            GPUTimer slice_timer;
+            slice_timer.start();
+            rx.slice_patches(*coords, *edge_status);
+            slice_timer.stop();
+
+            GPUTimer cleanup_timer2;
+            cleanup_timer2.start();
+            rx.cleanup();
+            cleanup_timer2.stop();
+
+
+            CUDA_ERROR(cudaDeviceSynchronize());
+            CUDA_ERROR(cudaGetLastError());
+
+            app_time += app_timer.elapsed_millis();
+            slice_time += slice_timer.elapsed_millis();
+            cleanup_time += cleanup_timer.elapsed_millis();
+            cleanup_time += cleanup_timer2.elapsed_millis();
+
+            if (validate) {
+                rx.update_host();
+                EXPECT_TRUE(rx.validate());
+            }
+        }
+
+        if (false) {
+
+            RXMESH_INFO("#Vertices {}", rx.get_num_vertices(true));
+            RXMESH_INFO("#Edges {}", rx.get_num_edges(true));
+            RXMESH_INFO("#Faces {}", rx.get_num_faces(true));
+            RXMESH_INFO("#Patches {}", rx.get_num_patches(true));
+            RXMESH_INFO("request reduction = {}, achieved reduction= {}",
+                        reduce_threshold,
+                        num_edges_before - int(rx.get_num_edges(true)));
+
+            if (false) {
+                rx.update_host();
+                coords->move(DEVICE, HOST);                
+                rx.update_polyscope();
+                auto ps_mesh = rx.get_polyscope_mesh();
+                ps_mesh->updateVertexPositions(*coords);
+                ps_mesh->setEnabled(false);                
+                // rx.render_vertex_patch();
+                // rx.render_edge_patch();
+                // rx.render_face_patch();
+
+                polyscope::show();
+            }
+        }
+    }
+    timer.stop();
+    total_time += timer.elapsed_millis();
+    CUDA_ERROR(cudaProfilerStop());
+
+    RXMESH_INFO("secp_rxmesh() RXMesh simplification took {} (ms)", total_time);
+    RXMESH_INFO("secp_rxmesh() App time {} (ms)", app_time);
+    RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)", slice_time);
+    RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)", cleanup_time);
+
+    if (!validate) {
+        rx.update_host();
+    }
+    coords->move(DEVICE, HOST);
+
+#if USE_POLYSCOPE
+    rx.update_polyscope();
+
+    auto ps_mesh = rx.get_polyscope_mesh();
+    ps_mesh->updateVertexPositions(*coords);
+    ps_mesh->setEnabled(false);
+
+    rx.render_vertex_patch();
+    rx.render_edge_patch();
+    rx.render_face_patch();
+    polyscope::show();
+#endif
 }
\ No newline at end of file

From 35fea4417b6a59e5a161f5fa15813e5109eff373 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Thu, 9 May 2024 00:19:09 -0700
Subject: [PATCH 11/18] Update to match histogram and add reporting

---
 apps/SECPriority/secp.cu          |   8 +-
 apps/SECPriority/secp_kernels.cuh |  22 +----
 apps/SECPriority/secp_rxmesh.cuh  | 145 ++++++++++++++++++------------
 3 files changed, 98 insertions(+), 77 deletions(-)

diff --git a/apps/SECPriority/secp.cu b/apps/SECPriority/secp.cu
index cedc1a64..7a115746 100644
--- a/apps/SECPriority/secp.cu
+++ b/apps/SECPriority/secp.cu
@@ -3,8 +3,6 @@
 #include "rxmesh/util/macros.h"
 #include "rxmesh/util/util.h"
 
-#include "rxmesh/rxmesh_dynamic.h"
-
 #include <filesystem>
 
 struct arg
@@ -38,6 +36,8 @@ TEST(Apps, SECPriority)
 
     ASSERT_TRUE(rx.is_edge_manifold());
 
+    ASSERT_TRUE(rx.is_closed());
+
     uint32_t final_num_vertices = Arg.target * rx.get_num_vertices();
 
     secp_rxmesh(rx, final_num_vertices);
@@ -62,7 +62,7 @@ int main(int argc, char** argv)
                         " -input:      Input file. Input file should be under the input/ subdirectory\n"
                         "              Default is {} \n"
                         "              Hint: Only accept OBJ files\n"
-                        " -target:     The final/target number of faces in the output mesh\n"
+                        " -target:     The fraction of output #vertices from the input\n"
                         " -o:          JSON file output folder. Default is {} \n"
                         " -device_id:  GPU device ID. Default is {}",
             Arg.obj_file_name, Arg.output_folder, Arg.device_id);
@@ -83,7 +83,7 @@ int main(int argc, char** argv)
                 atoi(get_cmd_option(argv, argv + argc, "-device_id"));
         }
         if (cmd_option_exists(argv, argc + argv, "-target")) {
-            Arg.target = false;
+            Arg.target = atof(get_cmd_option(argv, argv + argc, "-target"));
         }
     }
 
diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
index 5807c085..bd05b62d 100644
--- a/apps/SECPriority/secp_kernels.cuh
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -9,7 +9,6 @@ template <typename T, uint32_t blockThreads>
 __global__ static void secp(rxmesh::Context                   context,
                             rxmesh::VertexAttribute<T>        coords,
                             const int                         reduce_threshold,
-                            rxmesh::EdgeAttribute<EdgeStatus> edge_status,
                             rxmesh::EdgeAttribute<bool>       e_pop_attr)
 {
     using namespace rxmesh;
@@ -45,11 +44,7 @@ __global__ static void secp(rxmesh::Context                   context,
     for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
         assert(eh.local_id() < cavity.patch_info().num_edges[0]);
 
-        if (edge_status(eh) != UNSEEN) 
-        {
-            return;
-        }
-
+        //edge_mask.set(eh.local_id(), e_pop_attr(eh));
         if(true == e_pop_attr(eh))
         {
            edge_mask.set(eh.local_id(), true);
@@ -67,15 +62,14 @@ __global__ static void secp(rxmesh::Context                   context,
         assert(eh.local_id() < cavity.patch_info().num_edges[0]);
         if (edge_mask(eh.local_id())) {
             cavity.create(eh);
-        } else {
-            edge_status(eh) = OKAY;
-        }
+        } 
     });
     block.sync();
 
     ev_query.epilogue(block, shrd_alloc);
 
-    if (cavity.prologue(block, shrd_alloc, coords, edge_status)) {
+    // create the cavity
+    if (cavity.prologue(block, shrd_alloc, coords)) {
         edge_mask.reset(block);
         block.sync();
 
@@ -140,14 +134,6 @@ __global__ static void secp(rxmesh::Context                   context,
 
     cavity.epilogue(block);
     block.sync();
-
-    if (cavity.is_successful()) {
-        for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
-            if (edge_mask(eh.local_id())) {
-                edge_status(eh) = ADDED;
-            }
-        });
-    }
 }
 
 //template <typename View, typename InputIt>
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index c5fad43c..4b7cb523 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -66,17 +66,9 @@ using PQView_t              = PriorityQueue_t::device_mutable_view;
 template <typename T>
 using Vec3 = glm::vec<3, T, glm::defaultp>;
 
-using EdgeStatus = int8_t;
-enum : EdgeStatus
-{
-    UNSEEN = 0,  // means we have not tested it before for e.g., split/flip/col
-    OKAY   = 1,  // means we have tested it and it is okay to skip
-    UPDATE = 2,  // means we should update it i.e., we have tested it before
-    ADDED  = 3,  // means it has been added to during the split/flip/collapse
-};
-
 #include "secp_kernels.cuh"
 
+#include "rxmesh/util/report.h"
 
 inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                        const uint32_t         final_num_vertices)
@@ -85,18 +77,39 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
+
+    rxmesh::Report report("SECP_RXMesh");
+    report.command_line(Arg.argc, Arg.argv);
+    report.device();
+    report.system();
+    report.model_data(Arg.obj_file_name + "_before", rx, "model_before");
+    report.add_member("method", std::string("RXMesh"));
+    report.add_member("blockThreads", blockThreads);
+
     auto coords = rx.get_input_vertex_coordinates();
-    auto edge_status = rx.add_edge_attribute<EdgeStatus>("EdgeStatus", 1);
+
     LaunchBox<blockThreads> launch_box;
 
     float total_time   = 0;
     float app_time     = 0;
     float slice_time   = 0;
     float cleanup_time = 0;
-
+    float pq_time      = 0;
+    float pop_mark_time      = 0;
+    float e_priority_time      = 0;
 
     auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", 1);
 
+    RXMESH_INFO("#Vertices {}", rx.get_num_vertices());
+    RXMESH_INFO("#Edges {}", rx.get_num_edges());
+    RXMESH_INFO("#Faces {}", rx.get_num_faces());
+    RXMESH_INFO("#Patches {}", rx.get_num_patches());
+
+    size_t   max_smem_bytes_dyn           = 0;
+    size_t   max_smem_bytes_static        = 0;
+    uint32_t max_num_registers_per_thread = 0;
+    uint32_t max_num_blocks               = 0;
+
 #if USE_POLYSCOPE
     rx.render_vertex_patch();
     rx.render_edge_patch();
@@ -106,12 +119,19 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     bool validate = false;
 
+    int num_passes = 0;
+
     CUDA_ERROR(cudaProfilerStart());
     GPUTimer timer;
     timer.start();
     while(rx.get_num_vertices(true) > final_num_vertices)
     {
-        // rebuild every round?
+        ++num_passes;
+
+        GPUTimer pq_timer;
+        pq_timer.start();
+
+        // rebuild every round? Not necessarily a great way to use a pq.
         PriorityQueue_t pq(rx.get_num_edges());
         e_pop_attr->reset(DEVICE, false);
 
@@ -129,11 +149,15 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
             }
         );
 
+        GPUTimer edge_priorities_timer;
+        edge_priorities_timer.start();
         compute_edge_priorities<float, blockThreads>
             <<<launch_box.blocks,
                launch_box.num_threads,
                launch_box.smem_bytes_dyn>>>( rx.get_context(), *coords, pq.get_mutable_device_view(), pq.get_shmem_size(blockThreads));
-        cudaDeviceSynchronize();
+        edge_priorities_timer.stop();
+        e_priority_time += edge_priorities_timer.elapsed_millis();
+        //cudaDeviceSynchronize();
         //RXMESH_TRACE("launch_box.smem_bytes_dyn = {}", launch_box.smem_bytes_dyn);
         //RXMESH_TRACE("pq.get_shmem_size = {}", pq.get_shmem_size(blockThreads));
 
@@ -149,7 +173,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
         uint32_t pop_num_edges = reduce_threshold; //reduce_ratio * rx.get_num_edges();
         //RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
 
-        constexpr uint32_t threads_per_block = 32;
+        constexpr uint32_t threads_per_block = 256;
         uint32_t number_of_blocks = (pop_num_edges + threads_per_block - 1) / threads_per_block;
         int shared_mem_bytes = pq.get_shmem_size(threads_per_block) +
                                (threads_per_block * sizeof(PriorityPair_t));
@@ -157,6 +181,8 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
         //RXMESH_TRACE("number_of_blocks: {}", number_of_blocks);
         //RXMESH_TRACE("shared_mem_bytes: {}", shared_mem_bytes);
 
+        GPUTimer pop_mark_timer;
+        pop_mark_timer.start();
         pop_and_mark_edges_to_collapse<threads_per_block>
             <<<number_of_blocks, threads_per_block, shared_mem_bytes>>>
                 (pq.get_mutable_device_view(),
@@ -165,19 +191,22 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
         CUDA_ERROR(cudaDeviceSynchronize());
         CUDA_ERROR(cudaGetLastError());
-        //RXMESH_TRACE("Made it past cudaDeviceSynchronize()");
+        pop_mark_timer.stop();
+        pop_mark_time += pop_mark_timer.elapsed_millis();
+
+        pq_timer.stop();
+
+        pq_time += pq_timer.elapsed_millis();
 
         // loop over the mesh, and try to collapse
-        // reset edge status
-        edge_status->reset(UNSEEN, DEVICE);
 
         rx.reset_scheduler();
         while(!rx.is_queue_empty() &&
               rx.get_num_vertices(true) > final_num_vertices)
         {
 
-            RXMESH_INFO(" Queue size = {}",
-                        rx.get_context().m_patch_scheduler.size());
+            //RXMESH_INFO(" Queue size = {}",
+            //            rx.get_context().m_patch_scheduler.size());
 
             //rx.prepare_launch_box(
             rx.update_launch_box(
@@ -192,7 +221,17 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                 }
             );
 
+            max_smem_bytes_dyn =
+                std::max(max_smem_bytes_dyn, launch_box.smem_bytes_dyn);
+            max_smem_bytes_static =
+                std::max(max_smem_bytes_static, launch_box.smem_bytes_static);
+            max_num_registers_per_thread =
+                std::max(max_num_registers_per_thread,
+                         launch_box.num_registers_per_thread);
+            max_num_blocks =
+                std::max(max_num_blocks, DIVIDE_UP(launch_box.blocks, 8));
             GPUTimer app_timer;
+
             app_timer.start();
             secp<float, blockThreads>
                 <<<launch_box.blocks,
@@ -200,7 +239,6 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                    launch_box.smem_bytes_dyn>>>(rx.get_context(),
                                                 *coords,
                                                 reduce_threshold,
-                                                *edge_status,
                                                 *e_pop_attr);
             // should we cudaDeviceSyn here? stopping timers too soon?
             //CUDA_ERROR(cudaDeviceSynchronize());
@@ -215,7 +253,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
             GPUTimer slice_timer;
             slice_timer.start();
-            rx.slice_patches(*coords, *edge_status);
+            rx.slice_patches(*coords);
             slice_timer.stop();
 
             GPUTimer cleanup_timer2;
@@ -231,52 +269,46 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
             slice_time += slice_timer.elapsed_millis();
             cleanup_time += cleanup_timer.elapsed_millis();
             cleanup_time += cleanup_timer2.elapsed_millis();
-
-            if (validate) {
-                rx.update_host();
-                EXPECT_TRUE(rx.validate());
-            }
-        }
-
-        if (false) {
-
-            RXMESH_INFO("#Vertices {}", rx.get_num_vertices(true));
-            RXMESH_INFO("#Edges {}", rx.get_num_edges(true));
-            RXMESH_INFO("#Faces {}", rx.get_num_faces(true));
-            RXMESH_INFO("#Patches {}", rx.get_num_patches(true));
-            RXMESH_INFO("request reduction = {}, achieved reduction= {}",
-                        reduce_threshold,
-                        num_edges_before - int(rx.get_num_edges(true)));
-
-            if (false) {
-                rx.update_host();
-                coords->move(DEVICE, HOST);                
-                rx.update_polyscope();
-                auto ps_mesh = rx.get_polyscope_mesh();
-                ps_mesh->updateVertexPositions(*coords);
-                ps_mesh->setEnabled(false);                
-                // rx.render_vertex_patch();
-                // rx.render_edge_patch();
-                // rx.render_face_patch();
-
-                polyscope::show();
-            }
         }
     }
     timer.stop();
     total_time += timer.elapsed_millis();
     CUDA_ERROR(cudaProfilerStop());
 
-    RXMESH_INFO("secp_rxmesh() RXMesh simplification took {} (ms)", total_time);
+    RXMESH_INFO("secp_rxmesh() RXMesh SEC took {} (ms), num_passes= {}",
+                total_time,
+                num_passes);
+    RXMESH_INFO("secp_rxmesh() PriorityQ time {} (ms)", pq_time);
+    RXMESH_INFO("secp_rxmesh() |-Edge priorities time {} (ms)", e_priority_time);
+    RXMESH_INFO("secp_rxmesh() |-Pop and Mark time {} (ms)", pop_mark_time);
     RXMESH_INFO("secp_rxmesh() App time {} (ms)", app_time);
     RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)", slice_time);
     RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)", cleanup_time);
 
-    if (!validate) {
-        rx.update_host();
-    }
+    RXMESH_INFO("#Vertices {}", rx.get_num_vertices(true));
+    RXMESH_INFO("#Edges {}", rx.get_num_edges(true));
+    RXMESH_INFO("#Faces {}", rx.get_num_faces(true));
+    RXMESH_INFO("#Patches {}", rx.get_num_patches(true));
+
+
+    rx.update_host();
+    
     coords->move(DEVICE, HOST);
 
+    report.add_member("num_passes", num_passes);
+    report.add_member("max_smem_bytes_dyn", max_smem_bytes_dyn);
+    report.add_member("max_smem_bytes_static", max_smem_bytes_static);
+    report.add_member("max_num_registers_per_thread",
+                      max_num_registers_per_thread);
+    report.add_member("max_num_blocks", max_num_blocks);
+    report.add_member("secs_remesh_time", total_time);
+    report.add_member("priority_queue_time", pq_time);
+    report.add_member("app_time", app_time);
+    report.add_member("slice_time", slice_time);
+    report.add_member("cleanup_time", cleanup_time);
+    report.add_member("attributes_memory_mg", coords->get_memory_mg());
+    report.model_data(Arg.obj_file_name + "_after", rx, "model_after");
+
 #if USE_POLYSCOPE
     rx.update_polyscope();
 
@@ -289,4 +321,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     rx.render_face_patch();
     polyscope::show();
 #endif
+
+    report.write(Arg.output_folder + "/rxmesh_secp",
+                 "SECP_RXMesh_" + extract_file_name(Arg.obj_file_name));
 }
\ No newline at end of file

From 7a0bf73f0a5c784311ce4b4f80d324390116c7ea Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Tue, 14 May 2024 13:34:06 -0700
Subject: [PATCH 12/18] Use DIVIDE_UP for number of kernel blocks

---
 apps/SECPriority/secp_rxmesh.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 4b7cb523..551e09c2 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -234,7 +234,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
             app_timer.start();
             secp<float, blockThreads>
-                <<<launch_box.blocks,
+                <<<DIVIDE_UP(launch_box.blocks, 8),
                    launch_box.num_threads,
                    launch_box.smem_bytes_dyn>>>(rx.get_context(),
                                                 *coords,

From 18925fc36c47a9c7afbab22286f9188af86acf17 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Tue, 14 May 2024 16:17:12 -0700
Subject: [PATCH 13/18] Fix edge attr reset bug that swapped value with device

---
 apps/SECPriority/secp_rxmesh.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 551e09c2..9540cd62 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -133,7 +133,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
         // rebuild every round? Not necessarily a great way to use a pq.
         PriorityQueue_t pq(rx.get_num_edges());
-        e_pop_attr->reset(DEVICE, false);
+        e_pop_attr->reset(false, DEVICE);
 
         //rx.prepare_launch_box(
         rx.update_launch_box(

From 3cae5d650e5f44ca8296e70ae1f188634b95f023 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Tue, 14 May 2024 17:51:09 -0700
Subject: [PATCH 14/18] Add debug function to view edges to be collaped

---
 apps/SECPriority/secp_rxmesh.cuh | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 9540cd62..2dd09059 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -70,6 +70,34 @@ using Vec3 = glm::vec<3, T, glm::defaultp>;
 
 #include "rxmesh/util/report.h"
 
+template <typename T>
+void render_edge_attr(rxmesh::RXMeshDynamic& rx,
+    const std::shared_ptr<rxmesh::EdgeAttribute<T>>& edge_attr)
+{
+    using namespace rxmesh;
+    //make sure the attribute is on the HOST
+    edge_attr->move(DEVICE, HOST);
+
+    std::vector<float> edgeColors(rx.get_num_edges());
+    rx.for_each_edge(HOST,
+        [&](EdgeHandle eh) {
+            if(true == (*edge_attr)(eh))
+            {
+                //save a red color
+                edgeColors[rx.linear_id(eh)] = 1.0f;
+            }
+            else
+            {
+                //save a black color
+                edgeColors[rx.linear_id(eh)] = 0.0f;
+            }
+        });
+
+    auto ps_mesh = rx.get_polyscope_mesh();
+    auto edge_colors = ps_mesh->addEdgeScalarQuantity("Edges to Collapse", edgeColors);
+    edge_colors->setEnabled(true);
+}
+
 inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                        const uint32_t         final_num_vertices)
 {
@@ -189,6 +217,10 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                  *e_pop_attr,
                  pop_num_edges);
 
+       // if(num_passes == 1)
+       // {
+       //     render_edge_attr<bool>(rx, e_pop_attr);
+       // }
         CUDA_ERROR(cudaDeviceSynchronize());
         CUDA_ERROR(cudaGetLastError());
         pop_mark_timer.stop();

From 23145d84fecfcd3977b9acf22efeff6aa6d1b888 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Wed, 15 May 2024 11:29:28 -0700
Subject: [PATCH 15/18] Add a bash script to test a range of target vertex
 counts

---
 sweep_test.sh | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100755 sweep_test.sh

diff --git a/sweep_test.sh b/sweep_test.sh
new file mode 100755
index 00000000..6d7ee81e
--- /dev/null
+++ b/sweep_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Start value
+start=0.001
+# End value
+end=0.9
+# Step value
+step=0.031
+
+# Command path
+command="./build/bin/SECPriority"
+# Input file
+input_file="./input/rocker-arm.obj"
+
+# Loop through the range
+for target in $(seq $start $step $end)
+do
+    echo "Running with target = $target"
+    $command -input $input_file -target $target
+done
+

From 452bb39f6b83fa44430184fcf8e66ce81e8ed222 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Thu, 16 May 2024 10:22:09 -0700
Subject: [PATCH 16/18] Render edges to collapse with patch boundaries

---
 apps/SECPriority/secp_rxmesh.cuh | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 2dd09059..98e48ca4 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -83,13 +83,11 @@ void render_edge_attr(rxmesh::RXMeshDynamic& rx,
         [&](EdgeHandle eh) {
             if(true == (*edge_attr)(eh))
             {
-                //save a red color
-                edgeColors[rx.linear_id(eh)] = 1.0f;
+                edgeColors[rx.linear_id(eh)] = 200.0f;
             }
             else
             {
-                //save a black color
-                edgeColors[rx.linear_id(eh)] = 0.0f;
+                edgeColors[rx.linear_id(eh)] = eh.patch_id();
             }
         });
 
@@ -333,7 +331,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     report.add_member("max_num_registers_per_thread",
                       max_num_registers_per_thread);
     report.add_member("max_num_blocks", max_num_blocks);
-    report.add_member("secs_remesh_time", total_time);
+    report.add_member("secp_remesh_time", total_time);
     report.add_member("priority_queue_time", pq_time);
     report.add_member("app_time", app_time);
     report.add_member("slice_time", slice_time);

From b6e4cb16fefa66abead58035e1dc329ba2aaba18 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Thu, 16 May 2024 11:58:16 -0700
Subject: [PATCH 17/18] Add edgefrac(tion) to collapse per round

---
 apps/SECPriority/secp.cu         | 8 +++++++-
 apps/SECPriority/secp_rxmesh.cuh | 5 +++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/apps/SECPriority/secp.cu b/apps/SECPriority/secp.cu
index 7a115746..8da2cfe8 100644
--- a/apps/SECPriority/secp.cu
+++ b/apps/SECPriority/secp.cu
@@ -10,6 +10,7 @@ struct arg
     std::string obj_file_name = STRINGIFY(INPUT_DIR) "dragon.obj";
     std::string output_folder = STRINGIFY(OUTPUT_DIR);
     float       target        = 0.1;
+    float       edgefrac = 0.1;
     uint32_t    device_id     = 0;
     char**      argv;
     int         argc;
@@ -40,7 +41,7 @@ TEST(Apps, SECPriority)
 
     uint32_t final_num_vertices = Arg.target * rx.get_num_vertices();
 
-    secp_rxmesh(rx, final_num_vertices);
+    secp_rxmesh(rx, final_num_vertices, Arg.edgefrac);
 }
 
 
@@ -63,6 +64,7 @@ int main(int argc, char** argv)
                         "              Default is {} \n"
                         "              Hint: Only accept OBJ files\n"
                         " -target:     The fraction of output #vertices from the input\n"
+                        " -edgefrac:   The fraction of edges to collapse in a round\n"
                         " -o:          JSON file output folder. Default is {} \n"
                         " -device_id:  GPU device ID. Default is {}",
             Arg.obj_file_name, Arg.output_folder, Arg.device_id);
@@ -85,12 +87,16 @@ int main(int argc, char** argv)
         if (cmd_option_exists(argv, argc + argv, "-target")) {
             Arg.target = atof(get_cmd_option(argv, argv + argc, "-target"));
         }
+        if (cmd_option_exists(argv, argc + argv, "-edgefrac")) {
+            Arg.edgefrac = atof(get_cmd_option(argv, argv + argc, "-edgefrac"));
+        }
     }
 
     RXMESH_TRACE("input= {}", Arg.obj_file_name);
     RXMESH_TRACE("output_folder= {}", Arg.output_folder);
     RXMESH_TRACE("device_id= {}", Arg.device_id);
     RXMESH_TRACE("target= {}", Arg.target);
+    RXMESH_TRACE("edgefrac= {}", Arg.edgefrac);
 
     return RUN_ALL_TESTS();
 }
\ No newline at end of file
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index 98e48ca4..76270d97 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -97,7 +97,8 @@ void render_edge_attr(rxmesh::RXMeshDynamic& rx,
 }
 
 inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
-                       const uint32_t         final_num_vertices)
+                       const uint32_t         final_num_vertices,
+                       const float edge_reduce_ratio)
 {
     EXPECT_TRUE(rx.validate());
 
@@ -191,7 +192,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
         // elements in the priority queue and store popped elements
         // to be used by the next kernel that actually does the collapses
 
-        float reduce_ratio = 0.1f;
+        float reduce_ratio = edge_reduce_ratio;
         const int num_edges_before = int(rx.get_num_edges());
         const int reduce_threshold =
             std::max(1, int(reduce_ratio * float(num_edges_before)));

From 024a11bd15e0fa7025b442000665c8c51b57d448 Mon Sep 17 00:00:00 2001
From: "Serban D. Porumbescu" <sdporumbescu@ucdavis.edu>
Date: Thu, 16 May 2024 11:58:40 -0700
Subject: [PATCH 18/18] Add bash script to sweep edgefrac(tion)

---
 sweep_edgefrac_test.sh | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100755 sweep_edgefrac_test.sh

diff --git a/sweep_edgefrac_test.sh b/sweep_edgefrac_test.sh
new file mode 100755
index 00000000..9ffb5195
--- /dev/null
+++ b/sweep_edgefrac_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Start value
+start=0.001
+# End value
+end=0.9
+# Step value
+step=0.031
+
+# Command path
+command="./build/bin/SECPriority"
+# Input file
+input_file="./input/rocker-arm.obj"
+
+# Loop through the range
+for target in $(seq $start $step $end)
+do
+    echo "Running with edgefrac = $target"
+    $command -input $input_file -edgefrac $target
+done
+