From 48c175c00ce9858c88a5900c19afbf7cbe42a2c5 Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 27 May 2026 17:20:57 +0100 Subject: [PATCH 1/2] Move kokkos IS-view storage onto per-PCAIR handle IS_fine_views_local / IS_coarse_views_local in src/VecISCopyLocalk.kokkos.cxx were file-scope globals indexed by multigrid level and shared across every PCAIR instance, so two concurrent PCAIRs overwrote each other's per-level shared_ptr to the device IS view and the older PCAIR's apply read the newer one's indices, leading to an out-of-bounds device write and a Vec_Kokkos double-free during PCAIR destruction. Move the storage into a per-PCAIR opaque handle hung off air_multigrid_data; add tests/ex6_two_airg as a regression test. Co-Authored-By: Claude Opus 4.7 --- Makefile | 3 +- include/kokkos_helper.hpp | 10 +-- src/AIR_Data_Type.F90 | 9 ++- src/AIR_MG_Setup.F90 | 9 ++- src/AIR_Operators_Setup.F90 | 36 ++++++--- src/C_PETSc_Interfaces.F90 | 54 +++++++------ src/FC_Smooth.F90 | 17 ++-- src/PETSc_Helper.F90 | 29 +++++-- src/PETSc_Helperk.kokkos.cxx | 30 +++---- src/VecISCopyLocalk.kokkos.cxx | 142 +++++++++++++++++++-------------- tests/Makefile | 7 ++ tests/ex6_two_airg.c | 130 ++++++++++++++++++++++++++++++ 12 files changed, 333 insertions(+), 143 deletions(-) create mode 100644 tests/ex6_two_airg.c diff --git a/Makefile b/Makefile index afdc773b..00f61369 100644 --- a/Makefile +++ b/Makefile @@ -161,7 +161,8 @@ export TEST_TARGETS = ex12f \ matrandom_check_reset \ ex12f_gmres_poly \ mat_diag \ - adv_dg_upwind + adv_dg_upwind \ + ex6_two_airg # Include kokkos examples ifeq ($(PETSC_HAVE_KOKKOS),1) export TEST_TARGETS := $(TEST_TARGETS) adv_1dk diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 338d0d66..69901913 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -42,11 +42,11 @@ PETSC_INTERN void pmisr_existing_measure_implicit_transpose_kokkos(Mat *strength PETSC_INTERN void copy_diag_dom_ratio_d2h(PetscReal *diag_dom_ratio_local); PETSC_INTERN void delete_device_diag_dom_ratio(); -// Define array of shared pointers representing fine and coarse IS's -// on each level on the device -extern ViewPetscIntPtr* IS_fine_views_local; -extern ViewPetscIntPtr* IS_coarse_views_local; -extern int max_levels; +// Per-PCAIR IS views (fine/coarse per multigrid level) live behind an opaque +// handle owned by the air_data on the Fortran side; see VecISCopyLocalk for +// the definition of the storage struct and accessors. +PETSC_INTERN PetscIntKokkosView VecISCopyLocal_kokkos_get_view(void *handle, int our_level, int fine_int); + extern intKokkosView cf_markers_local_d; extern PetscScalarKokkosView diag_dom_ratio_local_d; diff --git a/src/AIR_Data_Type.F90 b/src/AIR_Data_Type.F90 index 87a50131..8873a093 100644 --- a/src/AIR_Data_Type.F90 +++ b/src/AIR_Data_Type.F90 @@ -337,9 +337,16 @@ module air_data_type type(gmres_poly_data) :: inv_coarsest_poly_data ! Temporary storage - type(petsc_vec_array), dimension(4) :: temp_vecs_fine, temp_vecs_coarse + type(petsc_vec_array), dimension(4) :: temp_vecs_fine, temp_vecs_coarse type(petsc_vec_array), dimension(1) :: temp_vecs + ! Per-PCAIR opaque handle to the kokkos-side IS view storage. Set up by + ! create_VecISCopyLocal_kokkos, torn down by destroy_VecISCopyLocal_kokkos. + ! Stored here (instead of as a file-scope global in + ! src/VecISCopyLocalk.kokkos.cxx) so concurrent PCAIR instances do not + ! overwrite each other's per-level IS views. + type(c_ptr) :: kokkos_is_views_handle = c_null_ptr + ! Temporary reuse type(air_reuse_data), allocatable, dimension(:) :: reuse diff --git a/src/AIR_MG_Setup.F90 b/src/AIR_MG_Setup.F90 index 244c9788..48d80970 100644 --- a/src/AIR_MG_Setup.F90 +++ b/src/AIR_MG_Setup.F90 @@ -421,18 +421,21 @@ subroutine setup_air_pcmg(amat, pmat, air_data, pcmg_input) call MatCreateSubMatrixWrapper(air_data%coarse_matrix(our_level), & air_data%IS_fine_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & air_data%A_ff(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) else call MatCreateSubMatrixWrapper(air_data%coarse_matrix(our_level), & air_data%IS_fine_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & air_data%A_ff(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if else call MatCreateSubMatrixWrapper(air_data%coarse_matrix(our_level), & air_data%IS_fine_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & air_data%A_ff(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if call timer_finish(TIMER_ID_AIR_EXTRACT) diff --git a/src/AIR_Operators_Setup.F90 b/src/AIR_Operators_Setup.F90 index 25cff957..cfd3ba65 100644 --- a/src/AIR_Operators_Setup.F90 +++ b/src/AIR_Operators_Setup.F90 @@ -92,12 +92,14 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data call MatCreateSubMatrixWrapper(air_data%reuse(our_level)%reuse_mat(MAT_A_DROP), & air_data%IS_fine_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & air_data%reuse(our_level)%reuse_mat(MAT_AFF_DROP), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) else call MatCreateSubMatrixWrapper(air_data%reuse(our_level)%reuse_mat(MAT_A_DROP), & air_data%IS_fine_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & air_data%reuse(our_level)%reuse_mat(MAT_AFF_DROP), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if call timer_finish(TIMER_ID_AIR_EXTRACT) @@ -162,12 +164,14 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & air_data%A_cc(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .FALSE.) + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .FALSE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) else call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & air_data%A_cc(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .FALSE.) + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .FALSE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if call timer_start(TIMER_ID_AIR_INVERSE) @@ -200,20 +204,24 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) else call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & air_data%A_fc(our_level), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) call MatCreateSubMatrixWrapper(input_mat, & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & air_data%A_cf(our_level), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if call timer_finish(TIMER_ID_AIR_EXTRACT) @@ -241,12 +249,14 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data call MatCreateSubMatrixWrapper(air_data%reuse(our_level)%reuse_mat(MAT_A_DROP), & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_REUSE_MATRIX, & air_data%reuse(our_level)%reuse_mat(MAT_ACF_DROP), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) else call MatCreateSubMatrixWrapper(air_data%reuse(our_level)%reuse_mat(MAT_A_DROP), & air_data%IS_coarse_index(our_level), air_data%IS_fine_index(our_level), MAT_INITIAL_MATRIX, & air_data%reuse(our_level)%reuse_mat(MAT_ACF_DROP), & - our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE.) + our_level = our_level, is_row_fine = .FALSE., is_col_fine = .TRUE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if if (.NOT. air_data%options%one_point_classical_prolong) then @@ -256,12 +266,14 @@ subroutine get_submatrices_start_poly_coeff_comms(input_mat, our_level, air_data call MatCreateSubMatrixWrapper(air_data%reuse(our_level)%reuse_mat(MAT_A_DROP), & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_REUSE_MATRIX, & air_data%reuse(our_level)%reuse_mat(MAT_AFC_DROP), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) else call MatCreateSubMatrixWrapper(air_data%reuse(our_level)%reuse_mat(MAT_A_DROP), & air_data%IS_fine_index(our_level), air_data%IS_coarse_index(our_level), MAT_INITIAL_MATRIX, & air_data%reuse(our_level)%reuse_mat(MAT_AFC_DROP), & - our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE.) + our_level = our_level, is_row_fine = .TRUE., is_col_fine = .FALSE., & + kokkos_is_views_handle = air_data%kokkos_is_views_handle) end if end if diff --git a/src/C_PETSc_Interfaces.F90 b/src/C_PETSc_Interfaces.F90 index 5a9161f8..d13bfb6b 100644 --- a/src/C_PETSc_Interfaces.F90 +++ b/src/C_PETSc_Interfaces.F90 @@ -337,47 +337,51 @@ end subroutine MatSetAllValues_cpu interface - subroutine create_VecISCopyLocal_kokkos(max_levels_input) & + subroutine create_VecISCopyLocal_kokkos(max_levels_input, handle) & bind(c, name="create_VecISCopyLocal_kokkos") use iso_c_binding integer(c_int), value :: max_levels_input - end subroutine create_VecISCopyLocal_kokkos - - end interface - - interface - - subroutine destroy_VecISCopyLocal_kokkos() & + type(c_ptr) :: handle + end subroutine create_VecISCopyLocal_kokkos + + end interface + + interface + + subroutine destroy_VecISCopyLocal_kokkos(handle) & bind(c, name="destroy_VecISCopyLocal_kokkos") use iso_c_binding - end subroutine destroy_VecISCopyLocal_kokkos - - end interface - - interface - - subroutine set_VecISCopyLocal_kokkos_our_level(our_level, global_row_start, index_fine, index_coarse) & + type(c_ptr) :: handle + end subroutine destroy_VecISCopyLocal_kokkos + + end interface + + interface + + subroutine set_VecISCopyLocal_kokkos_our_level(handle, our_level, global_row_start, index_fine, index_coarse) & bind(c, name="set_VecISCopyLocal_kokkos_our_level") use iso_c_binding + type(c_ptr), value :: handle integer(c_int), value :: our_level integer(PFLARE_PETSCINT_C_KIND), value :: global_row_start integer(c_long_long) :: index_fine integer(c_long_long) :: index_coarse - end subroutine set_VecISCopyLocal_kokkos_our_level - + end subroutine set_VecISCopyLocal_kokkos_our_level + end interface - - interface - - subroutine VecISCopyLocal_kokkos(our_level, fine_int, vfull, mode_int, vreduced) & + + interface + + subroutine VecISCopyLocal_kokkos(handle, our_level, fine_int, vfull, mode_int, vreduced) & bind(c, name="VecISCopyLocal_kokkos") use iso_c_binding + type(c_ptr), value :: handle integer(c_int), value :: our_level, fine_int, mode_int integer(c_long_long) :: vfull integer(c_long_long) :: vreduced - end subroutine VecISCopyLocal_kokkos - - end interface + end subroutine VecISCopyLocal_kokkos + + end interface interface @@ -594,12 +598,14 @@ end subroutine MatAXPY_kokkos subroutine MatCreateSubMatrix_kokkos(A_array, is_row, is_col, & reuse_int, B_array, & + kokkos_is_views_handle, & our_level, is_row_fine_int, is_col_fine_int) & bind(c, name="MatCreateSubMatrix_kokkos") use iso_c_binding integer(c_long_long) :: A_array integer(c_long_long) :: B_array integer(c_long_long) :: is_row, is_col + type(c_ptr), value :: kokkos_is_views_handle integer(c_int), value :: our_level, is_row_fine_int, is_col_fine_int, reuse_int end subroutine MatCreateSubMatrix_kokkos diff --git a/src/FC_Smooth.F90 b/src/FC_Smooth.F90 index ff34438a..0bbd59ba 100644 --- a/src/FC_Smooth.F90 +++ b/src/FC_Smooth.F90 @@ -82,13 +82,14 @@ subroutine create_VecISCopyLocalWrapper(air_data, our_level, mat_type, input_mat mat_type == MATAIJKOKKOS) then ! Build in case not built yet - call create_VecISCopyLocal_kokkos(air_data%options%max_levels) + call create_VecISCopyLocal_kokkos(air_data%options%max_levels, air_data%kokkos_is_views_handle) call MatGetOwnershipRange(input_mat, global_row_start, global_row_end_plus_one, ierr) ! Copy the IS's over to the device is_fine_array = air_data%IS_fine_index(our_level)%v is_coarse_array = air_data%IS_coarse_index(our_level)%v - call set_VecISCopyLocal_kokkos_our_level(our_level, global_row_start, is_fine_array, is_coarse_array) + call set_VecISCopyLocal_kokkos_our_level(air_data%kokkos_is_views_handle, our_level, & + global_row_start, is_fine_array, is_coarse_array) end if #endif @@ -122,8 +123,8 @@ subroutine destroy_VecISCopyLocalWrapper(air_data, our_level) end if else -#if defined(PETSC_HAVE_KOKKOS) - call destroy_VecISCopyLocal_kokkos() +#if defined(PETSC_HAVE_KOKKOS) + call destroy_VecISCopyLocal_kokkos(air_data%kokkos_is_views_handle) #endif end if @@ -181,7 +182,7 @@ subroutine VecISCopyLocalWrapper(air_data, our_level, fine, vfull, mode, vreduce if (fine) fine_int = 1 vfull_array = vfull%v vreduced_array = vreduced%v - call VecISCopyLocal_kokkos(our_level, fine_int, vfull_array, & + call VecISCopyLocal_kokkos(air_data%kokkos_is_views_handle, our_level, fine_int, vfull_array, & mode_int, vreduced_array) ! If debugging do a comparison between CPU and Kokkos results @@ -245,7 +246,7 @@ subroutine VecISCopyLocalWrapper(air_data, our_level, fine, vfull, mode, vreduce if (fine) fine_int = 1 vfull_array = vfull%v vreduced_array = vreduced%v - call VecISCopyLocal_kokkos(our_level, fine_int, vfull_array, & + call VecISCopyLocal_kokkos(air_data%kokkos_is_views_handle, our_level, fine_int, vfull_array, & mode_int, vreduced_array) ! If debugging do a comparison between CPU and Kokkos results @@ -293,7 +294,7 @@ subroutine VecISCopyLocalWrapper(air_data, our_level, fine, vfull, mode, vreduce if (fine) fine_int = 1 vfull_array = vfull%v vreduced_array = vreduced%v - call VecISCopyLocal_kokkos(our_level, fine_int, vfull_array, & + call VecISCopyLocal_kokkos(air_data%kokkos_is_views_handle, our_level, fine_int, vfull_array, & mode_int, vreduced_array) ! If debugging do a comparison between CPU and Kokkos results @@ -357,7 +358,7 @@ subroutine VecISCopyLocalWrapper(air_data, our_level, fine, vfull, mode, vreduce if (fine) fine_int = 1 vfull_array = vfull%v vreduced_array = vreduced%v - call VecISCopyLocal_kokkos(our_level, fine_int, vfull_array, & + call VecISCopyLocal_kokkos(air_data%kokkos_is_views_handle, our_level, fine_int, vfull_array, & mode_int, vreduced_array) ! If debugging do a comparison between CPU and Kokkos results diff --git a/src/PETSc_Helper.F90 b/src/PETSc_Helper.F90 index fd34786f..efff3fdf 100644 --- a/src/PETSc_Helper.F90 +++ b/src/PETSc_Helper.F90 @@ -1,5 +1,6 @@ module petsc_helper + use iso_c_binding, only: c_ptr, c_null_ptr, c_associated use petscmat use c_petsc_interfaces, only: remove_small_from_sparse_kokkos, & remove_from_sparse_match_kokkos, & @@ -1067,24 +1068,28 @@ end subroutine MatAXPYWrapper subroutine MatCreateSubMatrixWrapper(input_mat, is_row, is_col, & reuse, output_mat, & - our_level, is_row_fine, is_col_fine) + our_level, is_row_fine, is_col_fine, kokkos_is_views_handle) ! Wrapper around MatCreateSubMatrix_kokkos - ! Only works if the input IS have the same parallel row/column distribution + ! Only works if the input IS have the same parallel row/column distribution ! as the matrices ! is_col must be sorted - + ! When our_level is supplied (i.e. the kokkos path uses precomputed per-level + ! fine/coarse IS views), kokkos_is_views_handle MUST also be supplied; it + ! is the per-PCAIR handle stored on air_multigrid_data. + ! ~~~~~~~~~~ - ! Input + ! Input type(tMat), intent(in) :: input_mat IS, intent(in) :: is_row, is_col integer, intent(in), optional :: our_level logical, intent(in), optional :: is_row_fine, is_col_fine + type(c_ptr), intent(in), optional :: kokkos_is_views_handle MatReuse, intent(in) :: reuse type(tMat), intent(inout) :: output_mat PetscErrorCode :: ierr -#if defined(PETSC_HAVE_KOKKOS) +#if defined(PETSC_HAVE_KOKKOS) MPIU_Comm :: MPI_COMM_MATRIX integer :: comm_size, errorcode integer(c_long_long) :: A_array, B_array, is_row_ptr, is_col_ptr @@ -1094,8 +1099,9 @@ subroutine MatCreateSubMatrixWrapper(input_mat, is_row, is_col, & Mat :: temp_mat PetscScalar :: normy type(tVec) :: max_vec - PetscInt :: row_loc -#endif + PetscInt :: row_loc + type(c_ptr) :: handle_local +#endif ! ~~~~~~~~~~ #if defined(PETSC_HAVE_KOKKOS) @@ -1122,6 +1128,7 @@ subroutine MatCreateSubMatrixWrapper(input_mat, is_row, is_col, & our_level_int = -1 is_row_fine_int = 0 is_col_fine_int = 0 + handle_local = c_null_ptr if (present(our_level)) then our_level_int = our_level @@ -1132,9 +1139,17 @@ subroutine MatCreateSubMatrixWrapper(input_mat, is_row, is_col, & if (present(is_col_fine)) then if (is_col_fine) is_col_fine_int = 1 end if + if (present(kokkos_is_views_handle)) then + handle_local = kokkos_is_views_handle + end if + if (our_level_int /= -1 .AND. .NOT. c_associated(handle_local)) then + print *, "MatCreateSubMatrixWrapper: our_level set but kokkos_is_views_handle is null" + call MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER, errorcode) + end if call MatCreateSubMatrix_kokkos(A_array, is_row_ptr, is_col_ptr, & reuse_int, B_array, & + handle_local, & our_level_int, is_row_fine_int, is_col_fine_int) output_mat%v = B_array diff --git a/src/PETSc_Helperk.kokkos.cxx b/src/PETSc_Helperk.kokkos.cxx index 464001a5..3122abef 100644 --- a/src/PETSc_Helperk.kokkos.cxx +++ b/src/PETSc_Helperk.kokkos.cxx @@ -2435,13 +2435,16 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos_view(Mat *input_mat, PetscIntKokkosV //------------------------------------------------------------------------------------------------------------------------ // Does a MatGetSubMatrix for a Kokkos matrix - the petsc version currently uses the host making it very slow -// This version only works works if the input IS have the same parallel row/column distribution +// This version only works works if the input IS have the same parallel row/column distribution // as the matrices, ie equivalent to MatCreateSubMatrix_MPIAIJ_SameRowDist // is_col must be sorted -// If you pass in our_level != -1 then it uses the fine/coarse indices stored in IS_fine_views_local -// and IS_coarse_views_local - they should match the passed in is_row and is_col though! +// If you pass in our_level != -1 then it uses the fine/coarse indices stored in +// the per-PCAIR handle (built by set_VecISCopyLocal_kokkos_our_level) - they +// should match the passed in is_row and is_col though! Pass NULL for handle if +// our_level == -1. PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_col, \ const int reuse_int, Mat *output_mat, \ + void *kokkos_is_views_handle, \ const int our_level, const int is_row_fine_int, const int is_col_fine_int) { PetscInt global_row_start, global_row_end_plus_one; @@ -2453,7 +2456,6 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c PetscCallVoid(ISGetSize(*is_col, &global_cols_col)); PetscIntKokkosView is_row_d_d, is_col_d_d; - const int level_idx = our_level - 1; auto exec = PetscGetKokkosExecutionSpace(); // If we want the input is_row and is_col to be used @@ -2505,23 +2507,9 @@ PETSC_INTERN void MatCreateSubMatrix_kokkos(Mat *input_mat, IS *is_row, IS *is_c // that already are on the device else { - if (is_row_fine_int) - { - is_row_d_d = *IS_fine_views_local[level_idx]; - } - else - { - is_row_d_d = *IS_coarse_views_local[level_idx]; - } - if (is_col_fine_int) - { - is_col_d_d = *IS_fine_views_local[level_idx]; - } - else - { - is_col_d_d = *IS_coarse_views_local[level_idx]; - } - } + is_row_d_d = VecISCopyLocal_kokkos_get_view(kokkos_is_views_handle, our_level, is_row_fine_int); + is_col_d_d = VecISCopyLocal_kokkos_get_view(kokkos_is_views_handle, our_level, is_col_fine_int); + } MatCreateSubMatrix_kokkos_view(input_mat, is_row_d_d, global_rows_row, is_col_d_d, global_cols_col, reuse_int, output_mat); diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index f5b1ff74..139fde70 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -2,60 +2,67 @@ #include "kokkos_helper.hpp" #include -// These are defined as extern in kokkos_helper.hpp but we allocate -// them in this .cxx -ViewPetscIntPtr* IS_fine_views_local = nullptr; -ViewPetscIntPtr* IS_coarse_views_local = nullptr; -int max_levels = -1; +// Per-PCAIR storage for the device-side fine/coarse index views, indexed by +// multigrid level. Previously these arrays were file-scope globals, which +// caused two concurrent PCAIR instances to collide on the same level slot +// (overwriting each other's shared_ptr to the view, leading to use-after-free +// in the older PCAIR's apply path). The handle is now owned by the air_data +// structure on the Fortran side and threaded through every kokkos call that +// needs it. +struct VecISCopyLocalKokkosCtx { + ViewPetscIntPtr* IS_fine_views_local = nullptr; + ViewPetscIntPtr* IS_coarse_views_local = nullptr; + int max_levels = -1; +}; //------------------------------------------------------------------------------------------------------------------------ -// Destroys the data -PETSC_INTERN void destroy_VecISCopyLocal_kokkos() +// Destroys the data. handle is a pointer to a void* (Fortran c_ptr by ref); +// sets it to NULL on exit so the caller's c_ptr field becomes c_null_ptr. +PETSC_INTERN void destroy_VecISCopyLocal_kokkos(void **handle) { - if (IS_fine_views_local) { - // Will automatically call the destructor on each element - delete[] IS_fine_views_local; - IS_fine_views_local = nullptr; + if (!handle || !*handle) return; + auto *ctx = static_cast(*handle); + if (ctx->IS_fine_views_local) { + delete[] ctx->IS_fine_views_local; + ctx->IS_fine_views_local = nullptr; } - if (IS_coarse_views_local) { - delete[] IS_coarse_views_local; - IS_coarse_views_local = nullptr; - } + if (ctx->IS_coarse_views_local) { + delete[] ctx->IS_coarse_views_local; + ctx->IS_coarse_views_local = nullptr; + } + delete ctx; + *handle = nullptr; - return; + return; } //------------------------------------------------------------------------------------------------------------------------ -// Creates the data we need to do the equivalent of veciscopy on local data in kokkos -PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) +// Creates the per-PCAIR data we need to do the equivalent of veciscopy on +// local data in kokkos. handle is a pointer to a void* (Fortran c_ptr by +// ref); on entry, if *handle is NULL we allocate fresh; if it is already +// allocated with a matching max_levels we keep it; if max_levels differs we +// destroy and re-create. +PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input, void **handle) { - // If not built - if (!IS_fine_views_local) - { - // Allocate array of pointers - max_levels = max_levels_input; - - // Initialise fine - IS_fine_views_local = new ViewPetscIntPtr[max_levels]; - // Initialize each element as null until it's set - // we don't want to accidently call the constructor on any of the views - for (int i = 0; i < max_levels; i++) { - IS_fine_views_local[i] = nullptr; - } - // Initialise coarse - IS_coarse_views_local = new ViewPetscIntPtr[max_levels]; - for (int i = 0; i < max_levels; i++) { - IS_coarse_views_local[i] = nullptr; - } + if (!handle) return; + + if (*handle) { + auto *existing = static_cast(*handle); + if (existing->max_levels == max_levels_input) return; // already correct size + destroy_VecISCopyLocal_kokkos(handle); } - // Built but different max size, destroy and rebuild - else if (max_levels_input != max_levels) - { - destroy_VecISCopyLocal_kokkos(); - create_VecISCopyLocal_kokkos(max_levels_input); + + auto *ctx = new VecISCopyLocalKokkosCtx; + ctx->max_levels = max_levels_input; + ctx->IS_fine_views_local = new ViewPetscIntPtr[max_levels_input]; + ctx->IS_coarse_views_local = new ViewPetscIntPtr[max_levels_input]; + for (int i = 0; i < max_levels_input; i++) { + ctx->IS_fine_views_local[i] = nullptr; + ctx->IS_coarse_views_local[i] = nullptr; } + *handle = ctx; return; } @@ -63,8 +70,9 @@ PETSC_INTERN void create_VecISCopyLocal_kokkos(int max_levels_input) //------------------------------------------------------------------------------------------------------------------------ // Copy the input IS's to the device for our_level -PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt global_row_start, IS *index_fine, IS *index_coarse) +PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(void *handle, int our_level, PetscInt global_row_start, IS *index_fine, IS *index_coarse) { + auto *ctx = static_cast(handle); auto exec = PetscGetKokkosExecutionSpace(); const int level_idx = our_level - 1; @@ -80,9 +88,9 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl // Create a host view of the existing indices auto fine_view_h = PetscIntConstKokkosViewHost(fine_indices_ptr, fine_local_size); // Create a device view - make sure to index with 0 based - IS_fine_views_local[level_idx] = std::make_shared("IS_fine_view_" + std::to_string(our_level), fine_local_size); + ctx->IS_fine_views_local[level_idx] = std::make_shared("IS_fine_view_" + std::to_string(our_level), fine_local_size); // Copy the indices over to the device - Kokkos::deep_copy(exec, *IS_fine_views_local[level_idx], fine_view_h); + Kokkos::deep_copy(exec, *ctx->IS_fine_views_local[level_idx], fine_view_h); // Log copy with petsc size_t bytes = fine_view_h.extent(0) * sizeof(PetscInt); PetscCallVoid(PetscLogCpuToGpu(bytes)); @@ -90,7 +98,7 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl // Rewrite the indices as local - save us a minus during VecISCopyLocal_kokkos PetscIntKokkosView is_d; - is_d = *IS_fine_views_local[level_idx]; + is_d = *ctx->IS_fine_views_local[level_idx]; Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, is_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { is_d[i] -= global_row_start; @@ -99,22 +107,22 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl PetscCallVoid(ISGetIndices(*index_coarse, &coarse_indices_ptr)); auto coarse_view_h = PetscIntConstKokkosViewHost(coarse_indices_ptr, coarse_local_size); // Create a device view - make sure to index with 0 based - IS_coarse_views_local[level_idx] = std::make_shared("IS_coarse_view_" + std::to_string(our_level), coarse_local_size); + ctx->IS_coarse_views_local[level_idx] = std::make_shared("IS_coarse_view_" + std::to_string(our_level), coarse_local_size); // Copy the indices over to the device - Kokkos::deep_copy(exec, *IS_coarse_views_local[level_idx], coarse_view_h); + Kokkos::deep_copy(exec, *ctx->IS_coarse_views_local[level_idx], coarse_view_h); // Log copy with petsc bytes = coarse_view_h.extent(0) * sizeof(PetscInt); - PetscCallVoid(PetscLogCpuToGpu(bytes)); + PetscCallVoid(PetscLogCpuToGpu(bytes)); PetscCallVoid(ISRestoreIndices(*index_coarse, &coarse_indices_ptr)); - + // Rewrite the indices as local - save us a minus during VecISCopyLocal_kokkos - is_d = *IS_coarse_views_local[level_idx]; + is_d = *ctx->IS_coarse_views_local[level_idx]; Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, is_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { is_d[i] -= global_row_start; }); // Ensure we're finished before we exit - Kokkos::fence(); + Kokkos::fence(); return; } @@ -122,23 +130,24 @@ PETSC_INTERN void set_VecISCopyLocal_kokkos_our_level(int our_level, PetscInt gl //------------------------------------------------------------------------------------------------------------------------ // Do the equivalent of veciscopy on local data using the IS data on the device -PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, int mode_int, Vec *vreduced) +PETSC_INTERN void VecISCopyLocal_kokkos(void *handle, int our_level, int fine_int, Vec *vfull, int mode_int, Vec *vreduced) { + auto *ctx = static_cast(handle); const int level_idx = our_level - 1; - // Can't use the shared pointer directly within the parallel + // Can't use the shared pointer directly within the parallel // regions on the device PetscIntKokkosView is_d; auto exec = PetscGetKokkosExecutionSpace(); // Make sure to index with 0 based if (fine_int) { - is_d = *IS_fine_views_local[level_idx]; + is_d = *ctx->IS_fine_views_local[level_idx]; } else { - is_d = *IS_coarse_views_local[level_idx]; - } + is_d = *ctx->IS_coarse_views_local[level_idx]; + } // SCATTER_REVERSE=1 // vreduced[i] = vfull[is[i]] @@ -155,9 +164,9 @@ PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, }); PetscCallVoid(VecRestoreKokkosViewWrite(*vreduced, &vreduced_d)); - PetscCallVoid(VecRestoreKokkosView(*vfull, &vfull_d)); + PetscCallVoid(VecRestoreKokkosView(*vfull, &vfull_d)); - } + } // SCATTER_FORWARD=0 // vfull[is[i]] = vreduced[i] else if (mode_int == 0) @@ -170,10 +179,10 @@ PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, Kokkos::parallel_for( Kokkos::RangePolicy<>(exec, 0, is_d.extent(0)), KOKKOS_LAMBDA(PetscInt i) { vfull_d[is_d(i)] = vreduced_d[i]; - }); + }); PetscCallVoid(VecRestoreKokkosView(*vreduced, &vreduced_d)); - PetscCallVoid(VecRestoreKokkosViewWrite(*vfull, &vfull_d)); + PetscCallVoid(VecRestoreKokkosViewWrite(*vfull, &vfull_d)); } // Ensure we're done before we exit Kokkos::fence(); @@ -181,4 +190,15 @@ PETSC_INTERN void VecISCopyLocal_kokkos(int our_level, int fine_int, Vec *vfull, return; } -//------------------------------------------------------------------------------------------------------------------------ \ No newline at end of file +//------------------------------------------------------------------------------------------------------------------------ + +// Accessor used by MatCreateSubMatrix_kokkos to read the per-PCAIR IS view +// for a given level/fine-or-coarse. Returns a copy of the shared view (cheap; +// just bumps the underlying refcount of the kokkos View handle). +PETSC_INTERN PetscIntKokkosView VecISCopyLocal_kokkos_get_view(void *handle, int our_level, int fine_int) +{ + auto *ctx = static_cast(handle); + const int level_idx = our_level - 1; + if (fine_int) return *ctx->IS_fine_views_local[level_idx]; + return *ctx->IS_coarse_views_local[level_idx]; +} diff --git a/tests/Makefile b/tests/Makefile index 16f11872..298023bb 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -81,6 +81,10 @@ run_tests_load_serial: @echo "" @echo "Test AIRG with GMRES polynomials for hyperbolic streaming problem in C" ./ex6 -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -ksp_max_it 5 +# + @echo "" + @echo "Test two concurrent AIRG solves don't corrupt per-level IS state" + ./ex6_two_airg -f data/mat_stream_2364 # @echo "" @echo "Test lAIR with GMRES polynomial smoothing for hyperbolic streaming problem" @@ -181,6 +185,9 @@ run_tests_load_parallel: # @echo "Test AIRG with GMRES polynomials for hyperbolic streaming problem, matrix-free smoothing in C in parallel" $(MPIEXEC) -n 2 ./ex6 -f data/mat_stream_2364 -pc_air_a_drop 1e-3 -pc_air_inverse_type power -pc_air_matrix_free_polys -ksp_max_it 5 +# + @echo "Test two concurrent AIRG solves don't corrupt per-level IS state in parallel" + $(MPIEXEC) -n 2 ./ex6_two_airg -f data/mat_stream_2364 # @echo "Test single level GMRES polynomial preconditioning for hyperbolic streaming problem in C in parallel" $(MPIEXEC) -n 2 ./ex6 -f data/mat_stream_2364 -pc_type pflareinv -pc_pflareinv_type power -ksp_max_it 21 diff --git a/tests/ex6_two_airg.c b/tests/ex6_two_airg.c new file mode 100644 index 00000000..38726c9f --- /dev/null +++ b/tests/ex6_two_airg.c @@ -0,0 +1,130 @@ +static char help[] = "Reads a PETSc matrix and sets up two concurrent PCAIR\n\ +preconditioners on it, this checks the data structures are per PCAIR.\n\ +\n\ +Input arguments:\n\ + -f : matrix to load (see $PETSC_DIR/share/petsc/datafiles/matrices)\n\n"; + +#include +#include "pflare.h" + +/* Build a single KSP wrapping PCAIR on the given operator. KSPSetUp is called + explicitly so that the per-PCAIR multigrid hierarchy (and the per-level + fine/coarse IS views, on the Kokkos path) is constructed up front. Pass a + distinct strong_threshold per instance so the two PCAIRs produce different + CF splittings (and therefore different per-level IS views) — that is what + makes the file-scope-globals bug observable; with identical splittings the + overwritten view happens to have the same contents and the bug is silent. */ +static PetscErrorCode BuildAIRKSP(MPI_Comm comm, Mat A, const char *prefix, PetscReal strong_threshold, KSP *ksp) +{ + PC pc; + PetscFunctionBeginUser; + PetscCall(KSPCreate(comm, ksp)); + PetscCall(KSPSetOperators(*ksp, A, A)); + PetscCall(KSPSetTolerances(*ksp, 1e-6, PETSC_DEFAULT, PETSC_DEFAULT, 100)); + PetscCall(KSPGetPC(*ksp, &pc)); + PetscCall(PCSetType(pc, PCAIR)); + PetscCall(PCAIRSetStrongThreshold(pc, strong_threshold)); + if (prefix) PetscCall(KSPSetOptionsPrefix(*ksp, prefix)); + PetscCall(KSPSetFromOptions(*ksp)); + PetscCall(KSPSetUp(*ksp)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +int main(int argc, char **args) +{ + Mat A, A_diff_type; + Vec b, x1, x2; + PetscRandom rnd; + PetscViewer fd; + char file[PETSC_MAX_PATH_LEN]; + PetscBool flg; + PetscInt m, n, M, N, one = 1; + MatType mtype, mtype_input; + KSP ksp1, ksp2; + KSPConvergedReason reason1, reason2; + int npe; + + PetscCall(PetscInitialize(&argc, &args, (char *)0, help)); + PCRegister_PFLARE(); + + PetscCall(PetscOptionsGetString(NULL, NULL, "-f", file, sizeof(file), &flg)); + if (!flg) SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must indicate binary file with the -f option"); + PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, file, FILE_MODE_READ, &fd)); + PetscCall(MatCreate(PETSC_COMM_WORLD, &A)); + PetscCall(MatLoad(A, fd)); + PetscCall(PetscViewerDestroy(&fd)); + + /* Partition the loaded matrix when in parallel (copy from ex6.c). */ + PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &npe)); + if (npe != 1) { + MatPartitioning part; + IS is, isrows; + Mat A_partitioned; + PetscCall(MatPartitioningCreate(PETSC_COMM_WORLD, &part)); + PetscCall(MatPartitioningSetAdjacency(part, A)); + PetscCall(MatPartitioningSetNParts(part, npe)); + PetscCall(MatPartitioningSetFromOptions(part)); + PetscCall(MatPartitioningApply(part, &is)); + PetscCall(ISBuildTwoSided(is, NULL, &isrows)); + PetscCall(MatCreateSubMatrix(A, isrows, isrows, MAT_INITIAL_MATRIX, &A_partitioned)); + PetscCall(MatDestroy(&A)); + PetscCall(MatPartitioningDestroy(&part)); + PetscCall(ISDestroy(&is)); + PetscCall(ISDestroy(&isrows)); + A = A_partitioned; + } + + /* Convert A to the user-requested matrix type (e.g. -mat_type aijkokkos). */ + PetscCall(MatGetLocalSize(A, &m, &n)); + PetscCall(MatGetSize(A, &M, &N)); + PetscCall(MatCreateFromOptions(PETSC_COMM_WORLD, NULL, one, m, n, M, N, &A_diff_type)); + PetscCall(MatAssemblyBegin(A_diff_type, MAT_FINAL_ASSEMBLY)); + PetscCall(MatAssemblyEnd(A_diff_type, MAT_FINAL_ASSEMBLY)); + + PetscCall(MatGetType(A, &mtype)); + PetscCall(MatGetType(A_diff_type, &mtype_input)); + if (strcmp(mtype, mtype_input) != 0) { + PetscCall(MatCopy(A, A_diff_type, DIFFERENT_NONZERO_PATTERN)); + PetscCall(MatDestroy(&A)); + A = A_diff_type; + } else { + PetscCall(MatDestroy(&A_diff_type)); + } + + /* Build a random RHS that matches A's vec type. */ + PetscCall(MatCreateVecs(A, &b, &x1)); + PetscCall(VecDuplicate(x1, &x2)); + PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rnd)); + PetscCall(PetscRandomSetFromOptions(rnd)); + PetscCall(VecSetRandom(b, rnd)); + + /* Build BOTH KSPs (each with its own PCAIR on A) before applying either. + Different strong thresholds give different CF splittings — without that, + the two PCAIRs end up with identical per-level IS views and the bug is + hidden behind matching contents. */ + PetscCall(BuildAIRKSP(PETSC_COMM_WORLD, A, "air1_", 0.3, &ksp1)); + PetscCall(BuildAIRKSP(PETSC_COMM_WORLD, A, "air2_", 0.8, &ksp2)); + + PetscCall(KSPSolve(ksp1, b, x1)); + PetscCall(KSPGetConvergedReason(ksp1, &reason1)); + + /* ksp2's solve, the second AIRG application. */ + PetscCall(KSPSolve(ksp2, b, x2)); + PetscCall(KSPGetConvergedReason(ksp2, &reason2)); + + PetscCall(PetscPrintf(PETSC_COMM_WORLD, + "ksp1 reason = %d, ksp2 reason = %d\n", + (int)reason1, (int)reason2)); + + int exit_code = (reason1 >= 0 && reason2 >= 0) ? 0 : 1; + + PetscCall(KSPDestroy(&ksp1)); + PetscCall(KSPDestroy(&ksp2)); + PetscCall(VecDestroy(&b)); + PetscCall(VecDestroy(&x1)); + PetscCall(VecDestroy(&x2)); + PetscCall(PetscRandomDestroy(&rnd)); + PetscCall(MatDestroy(&A)); + PetscCall(PetscFinalize()); + return exit_code; +} From 68e41d67708790b4be78b033eacd18131774c72a Mon Sep 17 00:00:00 2001 From: sdargavi Date: Wed, 27 May 2026 21:23:46 +0100 Subject: [PATCH 2/2] Have to given different PETSC_INTERN for C++ return types --- include/kokkos_helper.hpp | 4 +++- src/VecISCopyLocalk.kokkos.cxx | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/kokkos_helper.hpp b/include/kokkos_helper.hpp index 69901913..d8c1efec 100644 --- a/include/kokkos_helper.hpp +++ b/include/kokkos_helper.hpp @@ -45,7 +45,9 @@ PETSC_INTERN void delete_device_diag_dom_ratio(); // Per-PCAIR IS views (fine/coarse per multigrid level) live behind an opaque // handle owned by the air_data on the Fortran side; see VecISCopyLocalk for // the definition of the storage struct and accessors. -PETSC_INTERN PetscIntKokkosView VecISCopyLocal_kokkos_get_view(void *handle, int our_level, int fine_int); +// Not PETSC_INTERN: that forces extern "C" linkage, which is incompatible with +// returning a C++ Kokkos View. Callers are all C++ (.kokkos.cxx). +PETSC_VISIBILITY_INTERNAL PetscIntKokkosView VecISCopyLocal_kokkos_get_view(void *handle, int our_level, int fine_int); extern intKokkosView cf_markers_local_d; extern PetscScalarKokkosView diag_dom_ratio_local_d; diff --git a/src/VecISCopyLocalk.kokkos.cxx b/src/VecISCopyLocalk.kokkos.cxx index 139fde70..5d2743c5 100644 --- a/src/VecISCopyLocalk.kokkos.cxx +++ b/src/VecISCopyLocalk.kokkos.cxx @@ -195,7 +195,7 @@ PETSC_INTERN void VecISCopyLocal_kokkos(void *handle, int our_level, int fine_in // Accessor used by MatCreateSubMatrix_kokkos to read the per-PCAIR IS view // for a given level/fine-or-coarse. Returns a copy of the shared view (cheap; // just bumps the underlying refcount of the kokkos View handle). -PETSC_INTERN PetscIntKokkosView VecISCopyLocal_kokkos_get_view(void *handle, int our_level, int fine_int) +PETSC_VISIBILITY_INTERNAL PetscIntKokkosView VecISCopyLocal_kokkos_get_view(void *handle, int our_level, int fine_int) { auto *ctx = static_cast(handle); const int level_idx = our_level - 1;