diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index abdc732f..542c2a70 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,6 +25,8 @@ jobs:
         - {name: "TEST_UBUNTU_OPENMP_DEBUG", os: ubuntu-latest, debug: "enabled", cc: "gcc", cxx: "g++", kokkos_backend: "openmp"}
         - {name: "TEST_UBUNTU_SERIAL_RELEASE", os: ubuntu-latest, debug: "disabled", cc: "gcc", cxx: "g++", kokkos_backend: "serial"}
         - {name: "TEST_UBUNTU_OPENMP_RELEASE", os: ubuntu-latest, debug: "disabled", cc: "gcc", cxx: "g++", kokkos_backend: "openmp"}
+        # Kokkos + MPI: builds MATAR with MPI, registers MPICArrayKokkos_mpi_suite (mpirun from CTest).
+        - {name: "TEST_UBUNTU_SERIAL_MPI_DEBUG", os: ubuntu-latest, debug: "enabled", cc: "gcc", cxx: "g++", kokkos_backend: "serial_mpi"}
         - {name: "TEST_MAC_SERIAL_DEBUG", os: macos-14, debug: "enabled", cc: "clang", cxx: "clang++", kokkos_backend: "serial"}
         - {name: "TEST_MAC_SERIAL_RELEASE", os: macos-14, debug: "disabled", cc: "clang", cxx: "clang++", kokkos_backend: "serial"}
 
@@ -35,6 +37,18 @@ jobs:
     - name: Checkout submodules
       run: git submodule update --init --recursive
 
+    - name: Install Open MPI
+      if: contains(matrix.config.kokkos_backend, 'mpi') && matrix.config.os == 'ubuntu-latest'
+      env:
+        DEBIAN_FRONTEND: noninteractive
+      run: |
+        sudo apt-get update -qq
+        sudo apt-get install -y --no-install-recommends \
+          libopenmpi-dev \
+          openmpi-bin \
+          mpi-default-bin \
+          mpi-default-dev
+
     # Build MATAR tests using the build-matar.sh script
     - name: Build MATAR Tests
       shell: bash
@@ -52,6 +66,6 @@ jobs:
     - name: Run Tests
       shell: bash
       run: |
-        cd ${{ github.workspace }}/build-matar-${{ matrix.config.kokkos_backend }}/test_cases
+        cd ${{ github.workspace }}/build-matar-${{ matrix.config.kokkos_backend }}
         ctest --output-on-failure
- 
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 32238913..6026c9db 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -163,6 +163,9 @@ if (KOKKOS)
 
     include_directories(mesh_decomp)
     add_subdirectory(mesh_decomp)
+
+    add_executable(matar_mpi matar_mpi.cpp)
+    target_link_libraries(matar_mpi ${LINKING_LIBRARIES})
   endif()
 
 endif()
diff --git a/examples/matar_mpi.cpp b/examples/matar_mpi.cpp
new file mode 100644
index 00000000..8f616fbf
--- /dev/null
+++ b/examples/matar_mpi.cpp
@@ -0,0 +1,301 @@
+// Example: basic MPI features in MATAR (requires HAVE_MPI and HAVE_KOKKOS).
+// - MPICArrayKokkos: distributed array with optional halo exchange via communicate()
+// - CommunicationPlan: MPI distributed graph + per-neighbor index lists
+// - MPICArrayKokkos::all_reduce: 1D, and fixed trailing indices for rank 2/3/4
+//   (mpi_type_map + mpi_op_for in mpi_types.h / communication_plan.h)
+
+#if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS)
+
+#include <iostream>
+
+int main() {
+    std::cerr
+        << "This example requires MATAR built with HAVE_MPI and HAVE_KOKKOS.\n";
+    return 0;
+}
+
+#else
+
+#include <cmath>
+#include <iostream>
+
+#include <mpi.h>
+#include <matar.h>
+
+using namespace mtr;
+
+int main(int argc, char* argv[]) {
+
+MPI_Init(&argc, &argv);
+MATAR_INITIALIZE(argc, argv);
+{
+
+    int rank = 0;
+    int size = 1;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Create a basic communication plan, this handles things like storing MPI world comms and 
+    // allows for basic reduction operations.  For more complex communication patterns, you can
+    // use the CommunicationPlan to build a more complex communication plan.
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    // -------------------------------------------------------------------------
+    // MPICArrayKokkos::all_reduce — uses mpi_type_map<T> for MPI_Datatype and
+    // mpi_op_for(operation) for MPI_Op (no CommunicationPlan required).
+    // -------------------------------------------------------------------------
+    
+    // Same size arrays on each rank, just do a simple reduction.
+    const int num_values = 100;
+    MPICArrayKokkos<double> locals(num_values, "values");
+    locals.initialize_comm_plan(comm_plan);
+    locals.set_values(1.0);
+    locals.update_device();
+
+    double global_sum = locals.all_reduce(operation::sum);
+    
+    double expected_sum = static_cast<double>(num_values * size);
+    if (rank == 0) {
+        std::cout << "all_reduce(sum): " << global_sum << " (expect "
+                  << expected_sum << ")\n";
+    }
+
+    // Different size arrays on each rank, do a simple reduction.
+    const int num_values_per_rank = 10*(1+rank);
+    MPICArrayKokkos<double> rank_locals(num_values_per_rank, "values");
+    rank_locals.initialize_comm_plan(comm_plan);
+    rank_locals.set_values(1.0);
+    rank_locals.update_device();
+
+    global_sum = rank_locals.all_reduce(operation::sum);
+    // Rank r has 10*(r+1) entries of 1 → total = 10 * (1 + … + size).
+    expected_sum = 10.0 * static_cast<double>(size * (size + 1) / 2);
+
+
+    if (rank == 0) {
+        std::cout << "all_reduce(sum) with different size arrays on each rank (10*rank_id): " << global_sum << " (expect "
+                  << expected_sum << ")\n";
+    }
+
+
+    // Different size arrays on each rank, find the minimum value
+    MPICArrayKokkos<float> min_locals(num_values_per_rank, "values");
+    min_locals.initialize_comm_plan(comm_plan);
+
+    FOR_ALL(i, 0, num_values_per_rank, {
+        min_locals(i) = static_cast<float>(10*rank + i);
+    });
+
+    min_locals.update_device();
+    float global_min = min_locals.all_reduce(operation::min);
+    float expected_min = 0.0F;
+    if (rank == 0) {
+        std::cout << "all_reduce(min) with different size arrays on each rank (10*rank_id): " << global_min << " (expect "
+                  << expected_min << ")\n";
+    }
+
+    float global_max = min_locals.all_reduce(operation::max);
+    // Largest entry is on rank (size-1) at i = 10*size - 1.
+    const float expected_max = static_cast<float>(10 * (size - 1) + (10 * size - 1));
+    if (rank == 0) {
+        std::cout << "all_reduce(max) with different size arrays on each rank (10*rank_id): " << global_max << " (expect "
+                  << expected_max << ")\n";
+    }
+
+
+    
+    // Example: all_reduce with product
+    // Initialize a MPICArrayKokkos<double> with all values 2.0
+    MPICArrayKokkos<double> prod_locals(4, "prod_values");
+    prod_locals.initialize_comm_plan(comm_plan);
+    prod_locals.set_values(2.0);
+    prod_locals.update_device();
+
+    // Compute the product across all ranks and all values
+    double global_product = prod_locals.all_reduce(operation::product);
+
+    // The expected product is pow(2, 4*size), i.e., each rank contributes 4 twos
+    double expected_product = std::pow(2.0, 4 * size);
+    if (rank == 0) {
+        std::cout << "all_reduce(product): " << global_product << " (expect "
+                  << expected_product << ")\n";
+    }
+
+    // -------------------------------------------------------------------------
+    // all_reduce with fixed trailing indices (multi-dimensional arrays).
+    // -------------------------------------------------------------------------
+    const size_t n_elem = size * 10;
+
+    // Rank-2: element centroid coordinates — elem_centroids(elem_id, elem_position)
+    // with elem_position ∈ {0,1,2} as x, y, z. Reduce over elem_id for each axis.
+    {
+        size_t n_elem = 3;
+        size_t num_coords = 3;
+        MPICArrayKokkos<double> elem_centroids(n_elem, num_coords,"elem_centroids");
+        elem_centroids.initialize_comm_plan(comm_plan);
+        FOR_ALL(elem_id, 0, n_elem,
+                elem_position, 0, num_coords, {
+            const double base = 1000.0 * rank + 100.0 * elem_id;
+            elem_centroids(elem_id, elem_position) =
+                base + 10.0 * static_cast<double>(elem_position);
+        });
+        MATAR_FENCE();
+        elem_centroids.update_device();
+
+        const double max_x = elem_centroids.all_reduce(operation::max, 0);
+        const double max_y = elem_centroids.all_reduce(operation::max, 1);
+        const double max_z = elem_centroids.all_reduce(operation::max, 2);
+
+        const double base_rank = 1000.0 * static_cast<double>(size - 1);
+        const double base_elem = 100.0 * static_cast<double>(n_elem - 1);
+        const double expect_max_x = base_rank + base_elem + 0.0;
+        const double expect_max_y = base_rank + base_elem + 10.0;
+        const double expect_max_z = base_rank + base_elem + 20.0;
+
+        if (rank == 0) {
+            std::cout << "all_reduce(max, coord) rank-2 centroids — max x: " << max_x
+                      << " (expect " << expect_max_x << ")\n";
+            std::cout << "all_reduce(max, coord) rank-2 centroids — max y: " << max_y
+                      << " (expect " << expect_max_y << ")\n";
+            std::cout << "all_reduce(max, coord) rank-2 centroids — max z: " << max_z
+                      << " (expect " << expect_max_z << ")\n";
+        }
+    }
+
+    // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1).
+    {
+        // Rank-3: reduce over e at fixed tensor component — e.g. stress(e, 0, 1).
+        MPICArrayKokkos<double> stress(n_elem, 3, 3, "stress");
+        stress.initialize_comm_plan(comm_plan);
+        FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, {
+            stress(e, r, c) =
+                10000.0 * rank + 1000.0 * e + 100.0 * r + c;
+        });
+        MATAR_FENCE();
+        stress.update_device();
+        const double max_comp = stress.all_reduce(operation::max,  0, 1);
+        
+        const double expect_3d =
+            10000.0 * static_cast<double>(size - 1) +
+            1000.0 * static_cast<double>(n_elem - 1) + 1.0;
+        if (rank == 0) {
+            std::cout << "all_reduce(max, i, j) rank-3 s(e,i,j): " << max_comp
+                      << " (expect " << expect_3d << ")\n";
+        }
+    }
+
+    // Rank-4: reduce over element at fixed Gauss point and tensor component.
+    {
+        const size_t n_gauss = 2;
+        MPICArrayKokkos<double> s4(n_elem, n_gauss, 3, 3, "s4");
+        s4.initialize_comm_plan(comm_plan);
+        for (size_t e = 0; e < n_elem; ++e) {
+            for (size_t g = 0; g < n_gauss; ++g) {
+                for (size_t r = 0; r < 3; ++r) {
+                    for (size_t c = 0; c < 3; ++c) {
+                        s4.host(e, g, r, c) =
+                            100000.0 * rank + 1000.0 * static_cast<double>(e) +
+                            100.0 * static_cast<double>(g) +
+                            10.0 * static_cast<double>(r) +
+                            static_cast<double>(c);
+                    }
+                }
+            }
+        }
+        s4.update_device();
+        const size_t g_fix = 1;
+        const size_t ti_fix = 0;
+        const size_t tj_fix = 1;
+        const double max_qp =
+            s4.all_reduce(operation::max, g_fix, ti_fix, tj_fix);
+        const double expect_4d =
+            100000.0 * static_cast<double>(size - 1) +
+            1000.0 * static_cast<double>(n_elem - 1) + 101.0;
+        if (rank == 0) {
+            std::cout << "all_reduce(max, g, i, j) rank-4 stress(e,g,i,j): "
+                      << max_qp << " (expect " << expect_4d << ")\n";
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // CommunicationPlan + communicate(): periodic 1D halo (needs 2+ ranks).
+    // Layout per rank: index 0 = left ghost, 1..L = owned, L+1 = right ghost.
+    // -------------------------------------------------------------------------
+    // if (size < 2) {
+    //     if (rank == 0) {
+    //         std::cout
+    //             << "Re-run with 2+ MPI processes to run the halo exchange demo.\n";
+    //     }
+    // } else {
+    //     const int L = 4;
+    //     const int left = (rank + size - 1) % size;
+    //     const int right = (rank + 1) % size;
+
+    //     CommunicationPlan comm_plan;
+    //     comm_plan.initialize(MPI_COMM_WORLD);
+
+    //     int send_ranks[2] = { left, right };
+    //     int recv_ranks[2] = { left, right };
+    //     comm_plan.initialize_graph_communicator(2, send_ranks, 2, recv_ranks);
+
+    //     DCArrayKokkos<size_t> send_strides(2, "send_strides");
+    //     send_strides.host(0) = 1;
+    //     send_strides.host(1) = 1;
+    //     send_strides.update_device();
+    //     DRaggedRightArrayKokkos<int> rank_send_ids(send_strides,
+    //                                                "rank_send_ids");
+    //     // Send first owned cell to the left neighbor; last owned to the right.
+    //     rank_send_ids.host(0, 0) = 1;
+    //     rank_send_ids.host(1, 0) = L;
+    //     rank_send_ids.update_device();
+
+    //     DCArrayKokkos<size_t> recv_strides(2, "recv_strides");
+    //     recv_strides.host(0) = 1;
+    //     recv_strides.host(1) = 1;
+    //     recv_strides.update_device();
+    //     DRaggedRightArrayKokkos<int> rank_recv_ids(recv_strides,
+    //                                                "rank_recv_ids");
+    //     rank_recv_ids.host(0, 0) = 0;
+    //     rank_recv_ids.host(1, 0) = L + 1;
+    //     rank_recv_ids.update_device();
+
+    //     MATAR_FENCE();
+    //     comm_plan.setup_send_recv(rank_send_ids, rank_recv_ids);
+
+    //     MPICArrayKokkos<double> field(static_cast<size_t>(L + 2), "field");
+    //     field.initialize_comm_plan(comm_plan);
+
+    //     FOR_ALL(i, 0, L + 2, {
+    //         field(i) = -1.0;
+    //     });
+    //     FOR_ALL(i, 1, L + 1, { field(i) = static_cast<double>(rank); });
+    //     MATAR_FENCE();
+
+    //     field.communicate();
+
+    //     field.update_host();
+    //     MATAR_FENCE();
+
+    //     const double gl = field.host(0);
+    //     const double gr = field.host(L + 1);
+    //     const bool halo_ok =
+    //         (std::fabs(gl - static_cast<double>(left)) < 1.0e-14) &&
+    //         (std::fabs(gr - static_cast<double>(right)) < 1.0e-14);
+
+    //     if (rank == 0) {
+    //         std::cout << "After halo exchange (periodic 1D): ";
+    //         if (halo_ok) {
+    //             std::cout << "ghost values match neighbor ranks.\n";
+    //         } else {
+    //             std::cout << "verification failed.\n";
+    //         }
+    //     }
+    // }
+}
+MATAR_FINALIZE();
+MPI_Finalize();
+return 0;
+}
+
+#endif
diff --git a/scripts/cmake_build_test.sh b/scripts/cmake_build_test.sh
index 21e9a62b..07d2e987 100644
--- a/scripts/cmake_build_test.sh
+++ b/scripts/cmake_build_test.sh
@@ -32,6 +32,10 @@ else
     )
 fi
 
+if [[ "${kokkos_build_type}" == *"mpi"* ]]; then
+    cmake_options+=(-D MATAR_TEST_USE_MPI=1)
+fi
+
 # Print CMake options for reference
 echo "CMake Options: ${cmake_options[@]}"
 
diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h
index 0f642b2e..619ad158 100644
--- a/src/include/communication_plan.h
+++ b/src/include/communication_plan.h
@@ -7,14 +7,38 @@
 
 #include <set>
 
-using namespace mtr;
-
+namespace mtr
+{
 
 enum class communication_plan_type {
     no_communication,
     all_to_all_graph
 };
 
+/// Reduction operation for @c MPICArrayKokkos::all_reduce (maps to @c MPI_Op for the MPI stage).
+enum class operation {
+    sum,
+    product,
+    max,
+    min
+};
+
+/// Map @ref operation to the corresponding @c MPI_Op (host-only; call after @c MPI_Init).
+inline MPI_Op mpi_op_for(operation op) {
+    switch (op) {
+    case operation::sum:
+        return MPI_SUM;
+    case operation::product:
+        return MPI_PROD;
+    case operation::max:
+        return MPI_MAX;
+    case operation::min:
+        return MPI_MIN;
+    default:
+        return MPI_SUM;
+    }
+}
+
 
 struct CommunicationPlan {
     
@@ -65,7 +89,6 @@ struct CommunicationPlan {
     DCArrayKokkos<int> send_counts_; // [size: num_send_ranks] Number of items to send to each rank
     DCArrayKokkos<int> recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank
     
-    
     DCArrayKokkos<int> send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank
     DCArrayKokkos<int> recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank
 
@@ -91,8 +114,7 @@ struct CommunicationPlan {
             MPI_Comm_free(&mpi_comm_graph);
         }
     }
-    
-    
+
     void initialize(MPI_Comm comm_world){
         int mpi_init = 0;
         MPI_Initialized(&mpi_init);
@@ -512,6 +534,8 @@ struct CommunicationPlan {
     }
 }; // End of CommunicationPlan
 
+} // end namespace mtr
+
 #endif // end if HAVE_MPI
 #endif // end if COMMUNICATION_PLAN_H
 
diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h
index bc00d852..78c02f4c 100644
--- a/src/include/mpi_types.h
+++ b/src/include/mpi_types.h
@@ -87,16 +87,11 @@ class MPICArrayKokkos {
     size_t length_ = 0;
     size_t order_ = 0;  // tensor order (rank)
 
-    MPI_Comm mpi_comm_;
+    MPI_Comm mpi_comm_ = MPI_COMM_NULL;
     MPI_Status mpi_status_;
     MPI_Datatype mpi_datatype_;
     MPI_Request mpi_request_;
 
-    
-    // --- Ghost Communication Support ---
-    CommunicationPlan* comm_plan_ = NULL;      // Pointer to shared communication plan
-
-
     DCArrayKokkos<int> send_counts_; // [size: num_send_ranks] Number of items to send to each rank
     DCArrayKokkos<int> recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank
     DCArrayKokkos<int> send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank
@@ -109,10 +104,15 @@ class MPICArrayKokkos {
     DRaggedRightArrayKokkos<int> recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank
     
     
-    size_t num_owned_;            // Number of owned items (nodes/elements)
-    size_t num_ghost_;            // Number of ghost items (nodes/elements)
+    size_t num_owned_ = 0;            // Number of owned items (nodes/elements); optional override
+    size_t num_ghost_ = 0;            // Number of ghost items (nodes/elements); informational when user-set
+
 
 public:
+
+    // --- Ghost Communication Support ---
+    CommunicationPlan* comm_plan_ = NULL;      // Pointer to shared communication plan
+
     // Data member to access host view (initialized as pointer to this_array_.host_pointer())
     ViewCArray <T> host;
 
@@ -358,6 +358,20 @@ class MPICArrayKokkos {
         MATAR_FENCE();
     };
 
+    /// Reduce over index 0 (e.g. elements); array must be rank 1.
+    T all_reduce(operation op);
+
+    /// Reduce over leading dimension only; fix one trailing index (rank-2 arrays).
+    T all_reduce(operation op, size_t j);
+
+    /// Reduce over leading dimension only; fix two trailing indices (e.g.
+    /// stress(elem,3,3) → all_reduce(op,0,1) is max of stress(e,0,1) over owned elements e).
+    T all_reduce(operation op, size_t j, size_t k);
+
+    /// Reduce over elements only; fix Gauss point and tensor indices (rank-4), e.g.
+    /// stress(elem, gauss, i, j) → all_reduce(op, g, i, j) over owned elem.
+    T all_reduce(operation op, size_t g, size_t ti, size_t tj);
+
     void set_values(const T& value){
         this_array_.set_values(value);
     };
@@ -635,12 +649,286 @@ void MPICArrayKokkos<T,Layout,ExecSpace,MemoryTraits>::update_device() {
     this_array_.update_device();
 }
 
+
+
 template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
-KOKKOS_INLINE_FUNCTION
-MPICArrayKokkos<T,Layout,ExecSpace,MemoryTraits>::~MPICArrayKokkos() {
+T MPICArrayKokkos<T, Layout, ExecSpace, MemoryTraits>::all_reduce(operation op) {
+   
+    // assert(order_ == 1 && stride_ == 1 && "MPICArrayKokkos::all_reduce requires a 1D array (stride 1).");
+    assert(!(op == operation::product && sizeof(T) == sizeof(bool)) &&
+           "MPICArrayKokkos::all_reduce: product reduction is not supported for bool.");
+
+
+    assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0.");
+    const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0];
+    assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range.");
+
+    // Local varaible for on node reduction
+    T local;
+
+    if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) {
+
+        switch (op) {
+        case operation::sum: {
+            local = 0;
+            T loc_sum = 0;
+            FOR_REDUCE_SUM(i, 0, owned_len,
+                        loc_sum, {
+                loc_sum += this_array_(i);
+            }, local);
+            break;
+        }
+        case operation::product: {
+            local = T(1);
+            T loc_prod = 1;
+            FOR_REDUCE_PRODUCT(i, 0, owned_len,
+                        loc_prod, {
+                loc_prod *= this_array_(i);
+            }, local);
+            break;
+        }
+        case operation::max: {
+            local = this_array_(0);
+            T loc_max = this_array_(0);
+            FOR_REDUCE_MAX(i, 0, owned_len,
+                        loc_max, {
+                loc_max = (this_array_(i) > loc_max) ? this_array_(i) : loc_max;
+            }, local);
+            break;
+        }
+        case operation::min: {
+            local = this_array_(0);
+            T loc_min = this_array_(0);
+            FOR_REDUCE_MIN(i, 0, owned_len,
+                        loc_min, {
+                loc_min = (this_array_(i) < loc_min) ? this_array_(i) : loc_min;
+            }, local);
+            break;
+        }
+        default:
+            printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op);
+            printf("Supported operations are: sum, product, max, min\n");
+            Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation");
+            break;
+        }
+    } else {
+        printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n");
+    }
 
+    T global = local;
+    MPI_Comm comm = MPI_COMM_WORLD;
+    if (comm_plan_ != nullptr && comm_plan_->has_comm_world) {
+        comm = comm_plan_->mpi_comm_world;
+    }
+    MPI_Allreduce(&local, &global, 1, mpi_type_map<T>::value(), mpi_op_for(op), comm);
+    return global;
 }
 
+template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
+T MPICArrayKokkos<T, Layout, ExecSpace, MemoryTraits>::all_reduce(operation op, size_t j) {
+    assert(order_ == 2 && "MPICArrayKokkos::all_reduce(op,j) requires a rank-2 array.");
+    assert(j < dims_[1] && "Fixed index j is out of bounds.");
+    assert(!(op == operation::product && sizeof(T) == sizeof(bool)) &&
+           "MPICArrayKokkos::all_reduce: product reduction is not supported for bool.");
+    assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0.");
+    const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0];
+    assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range.");
+
+    T local;
+    if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) {
+        switch (op) {
+        case operation::sum: {
+            local = 0;
+            T loc_sum = 0;
+            FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, {
+                loc_sum += this_array_(e, j);
+            }, local);
+            break;
+        }
+        case operation::product: {
+            local = T(1);
+            T loc_prod = 1;
+            FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, {
+                loc_prod *= this_array_(e, j);
+            }, local);
+            break;
+        }
+        case operation::max: {
+            local = this_array_(0, j);
+            T loc_max = local;
+            FOR_REDUCE_MAX(e, 0, owned_len, loc_max, {
+                const T v = this_array_(e, j);
+                loc_max = (v > loc_max) ? v : loc_max;
+            }, local);
+            break;
+        }
+        case operation::min: {
+            local = this_array_(0, j);
+            T loc_min = local;
+            FOR_REDUCE_MIN(e, 0, owned_len, loc_min, {
+                const T v = this_array_(e, j);
+                loc_min = (v < loc_min) ? v : loc_min;
+            }, local);
+            break;
+        }
+        default:
+            printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op);
+            printf("Supported operations are: sum, product, max, min\n");
+            Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation");
+            break;
+        }
+    } else {
+        printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n");
+    }
+
+    T global = local;
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    if (comm_plan_ != nullptr && comm_plan_->has_comm_world) {
+        mpi_comm = comm_plan_->mpi_comm_world;
+    }
+    MPI_Allreduce(&local, &global, 1, mpi_type_map<T>::value(), mpi_op_for(op), mpi_comm);
+    return global;
+}
+
+template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
+T MPICArrayKokkos<T, Layout, ExecSpace, MemoryTraits>::all_reduce(operation op, size_t j, size_t k) {
+    assert(order_ == 3 && "MPICArrayKokkos::all_reduce(op,j,k) requires a rank-3 array.");
+    assert(j < dims_[1] && k < dims_[2] && "Fixed tensor indices (j,k) are out of bounds.");
+    assert(!(op == operation::product && sizeof(T) == sizeof(bool)) &&
+           "MPICArrayKokkos::all_reduce: product reduction is not supported for bool.");
+    assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0.");
+    const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0];
+    assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range.");
+
+    T local;
+    if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) {
+        switch (op) {
+        case operation::sum: {
+            local = 0;
+            T loc_sum = 0;
+            FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, {
+                loc_sum += this_array_(e, j, k);
+            }, local);
+            break;
+        }
+        case operation::product: {
+            local = T(1);
+            T loc_prod = 1;
+            FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, {
+                loc_prod *= this_array_(e, j, k);
+            }, local);
+            break;
+        }
+        case operation::max: {
+            local = this_array_(0, j, k);
+            T loc_max = local;
+            FOR_REDUCE_MAX(e, 0, owned_len, loc_max, {
+                const T v = this_array_(e, j, k);
+                loc_max = (v > loc_max) ? v : loc_max;
+            }, local);
+            break;
+        }
+        case operation::min: {
+            local = this_array_(0, j, k);
+            T loc_min = local;
+            FOR_REDUCE_MIN(e, 0, owned_len, loc_min, {
+                const T v = this_array_(e, j, k);
+                loc_min = (v < loc_min) ? v : loc_min;
+            }, local);
+            break;
+        }
+        default:
+            printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op);
+            printf("Supported operations are: sum, product, max, min\n");
+            Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation");
+            break;
+        }
+    } else {
+        printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n");
+    }
+
+    T global = local;
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    if (comm_plan_ != nullptr && comm_plan_->has_comm_world) {
+        mpi_comm = comm_plan_->mpi_comm_world;
+    }
+    MPI_Allreduce(&local, &global, 1, mpi_type_map<T>::value(), mpi_op_for(op), mpi_comm);
+    return global;
+}
+
+template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
+T MPICArrayKokkos<T, Layout, ExecSpace, MemoryTraits>::all_reduce(operation op, size_t g, size_t ti,
+                                                                  size_t tj) {
+    assert(order_ == 4 && "MPICArrayKokkos::all_reduce(op,g,ti,tj) requires a rank-4 array.");
+    assert(g < dims_[1] && ti < dims_[2] && tj < dims_[3] &&
+           "Fixed indices (Gauss, tensor i, j) are out of bounds.");
+    assert(!(op == operation::product && sizeof(T) == sizeof(bool)) &&
+           "MPICArrayKokkos::all_reduce: product reduction is not supported for bool.");
+    assert(num_owned_ <= dims_[0] && "MPICArrayKokkos::all_reduce: num_owned exceeds dim0.");
+    const size_t owned_len = (num_owned_ > 0) ? num_owned_ : dims_[0];
+    assert(owned_len > 0 && "MPICArrayKokkos::all_reduce: empty reduction range.");
+
+    T local;
+    if (comm_plan_ == nullptr || comm_plan_->comm_type == communication_plan_type::no_communication) {
+        switch (op) {
+        case operation::sum: {
+            local = 0;
+            T loc_sum = 0;
+            FOR_REDUCE_SUM(e, 0, owned_len, loc_sum, {
+                loc_sum += this_array_(e, g, ti, tj);
+            }, local);
+            break;
+        }
+        case operation::product: {
+            local = T(1);
+            T loc_prod = 1;
+            FOR_REDUCE_PRODUCT(e, 0, owned_len, loc_prod, {
+                loc_prod *= this_array_(e, g, ti, tj);
+            }, local);
+            break;
+        }
+        case operation::max: {
+            local = this_array_(0, g, ti, tj);
+            T loc_max = local;
+            FOR_REDUCE_MAX(e, 0, owned_len, loc_max, {
+                const T v = this_array_(e, g, ti, tj);
+                loc_max = (v > loc_max) ? v : loc_max;
+            }, local);
+            break;
+        }
+        case operation::min: {
+            local = this_array_(0, g, ti, tj);
+            T loc_min = local;
+            FOR_REDUCE_MIN(e, 0, owned_len, loc_min, {
+                const T v = this_array_(e, g, ti, tj);
+                loc_min = (v < loc_min) ? v : loc_min;
+            }, local);
+            break;
+        }
+        default:
+            printf("MPICArrayKokkos::all_reduce: unsupported operation %d\n", op);
+            printf("Supported operations are: sum, product, max, min\n");
+            Kokkos::abort("MPICArrayKokkos::all_reduce: unsupported operation");
+            break;
+        }
+    } else {
+        printf("MPICArrayKokkos::all_reduce: communication plan requires info on ghost vs owned\n");
+    }
+
+    T global = local;
+    MPI_Comm mpi_comm = MPI_COMM_WORLD;
+    if (comm_plan_ != nullptr && comm_plan_->has_comm_world) {
+        mpi_comm = comm_plan_->mpi_comm_world;
+    }
+    MPI_Allreduce(&local, &global, 1, mpi_type_map<T>::value(), mpi_op_for(op), mpi_comm);
+    return global;
+}
+
+
+template <typename T, typename Layout, typename ExecSpace, typename MemoryTraits>
+KOKKOS_INLINE_FUNCTION
+MPICArrayKokkos<T,Layout,ExecSpace,MemoryTraits>::~MPICArrayKokkos() {}
+
 } // end namespace mtr
 
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index e6c2bfaf..55a0a7d4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,6 +2,16 @@ cmake_minimum_required(VERSION 3.5)
 
 project (matartest)
 
+# Required at top level so CMake generates ${CMAKE_BINARY_DIR}/CTestTestfile.cmake.
+# Without this, ctest from the build root finds no tests (enable_testing in a subdirectory is not enough).
+enable_testing()
+
+# MATAR built with MPI lists MPI::MPI_CXX on imported target `matar`; MatarConfig loads targets during
+# find_package(Matar). Only pre-find MPI for those test builds (see cmake_build_test.sh).
+option(MATAR_TEST_USE_MPI "Configure tests against MPI-enabled MATAR (requires MPI before find_package)" OFF)
+if(MATAR_TEST_USE_MPI)
+    find_package(MPI REQUIRED COMPONENTS CXX)
+endif()
 find_package(Matar REQUIRED)
 
 #-------------------------------------------
diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt
index a0e07edd..b9a7ee07 100644
--- a/test/test_cases/CMakeLists.txt
+++ b/test/test_cases/CMakeLists.txt
@@ -1,7 +1,10 @@
 cmake_minimum_required(VERSION 3.5)
 
-# Find all test files in the current directory except test_main.cpp
+# Optional MPI for matar_mpi_tests: loaded in ../CMakeLists.txt when MATAR_TEST_USE_MPI is set.
+
+# Find all test files in the current directory except test_main.cpp and MPI-only sources.
 file(GLOB TEST_SOURCES "test_*.cpp")
+list(REMOVE_ITEM TEST_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/test_mpi_types.cpp")
 
 # Create a single test executable that includes all test files
 add_executable(matar_tests test_main.cpp ${TEST_SOURCES})
@@ -14,6 +17,18 @@ endif()
 
 # Add test discovery
 include(GoogleTest)
-enable_testing()
 
 gtest_discover_tests(matar_tests)
+
+# --- MPI + Kokkos: MPICArrayKokkos / CommunicationPlan (run suite under mpirun) ---
+if(KOKKOS AND TARGET MPI::MPI_CXX)
+    add_executable(matar_mpi_tests mpi_test_main.cpp test_mpi_types.cpp)
+    target_link_libraries(matar_mpi_tests PRIVATE matar gtest Kokkos::kokkos MPI::MPI_CXX)
+    target_compile_definitions(matar_mpi_tests PRIVATE HAVE_MPI=1 HAVE_KOKKOS=1)
+
+    # --oversubscribe: GitHub Actions / small VMs often expose fewer slots than ranks (Open MPI).
+    add_test(
+        NAME MPICArrayKokkos_mpi_suite
+        COMMAND ${MPIEXEC_EXECUTABLE} --oversubscribe ${MPIEXEC_NUMPROC_FLAG} 4 $<TARGET_FILE:matar_mpi_tests>
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/test/test_cases/mpi_test_main.cpp b/test/test_cases/mpi_test_main.cpp
new file mode 100644
index 00000000..570e6ed6
--- /dev/null
+++ b/test/test_cases/mpi_test_main.cpp
@@ -0,0 +1,15 @@
+#include <gtest/gtest.h>
+#include <mpi.h>
+
+#include <matar.h>
+
+// MPI must initialize before Kokkos (MATAR_INITIALIZE).
+int main(int argc, char** argv) {
+    MPI_Init(&argc, &argv);
+    MATAR_INITIALIZE(argc, argv);
+    ::testing::InitGoogleTest(&argc, argv);
+    const int result = RUN_ALL_TESTS();
+    MATAR_FINALIZE();
+    MPI_Finalize();
+    return result;
+}
diff --git a/test/test_cases/test_mpi_types.cpp b/test/test_cases/test_mpi_types.cpp
new file mode 100644
index 00000000..3c262411
--- /dev/null
+++ b/test/test_cases/test_mpi_types.cpp
@@ -0,0 +1,208 @@
+// Google tests for MPICArrayKokkos / CommunicationPlan (HAVE_MPI + HAVE_KOKKOS).
+// Run the mpi_test_main executable under mpirun (see test_cases/CMakeLists.txt).
+
+#if !defined(HAVE_MPI) || !defined(HAVE_KOKKOS)
+
+#include <gtest/gtest.h>
+
+TEST(MPI_Types, SkippedWithoutMpiKokkos) {
+    GTEST_SKIP() << "Build MATAR tests with MPI and Kokkos enabled.";
+}
+
+#else
+
+#include <cmath>
+#include <mpi.h>
+
+#include <gtest/gtest.h>
+#include <matar.h>
+
+using namespace mtr;
+
+namespace {
+
+void mpi_rank_size(int* rank, int* size) {
+    MPI_Comm_rank(MPI_COMM_WORLD, rank);
+    MPI_Comm_size(MPI_COMM_WORLD, size);
+}
+
+} // namespace
+
+TEST(MPICArrayKokkos, AllReduce_Sum_1D) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    const int num_values = 100;
+    MPICArrayKokkos<double> locals(num_values, "ut_values");
+    locals.initialize_comm_plan(comm_plan);
+    locals.set_values(1.0);
+    locals.update_device();
+
+    const double global_sum = locals.all_reduce(operation::sum);
+    const double expected = static_cast<double>(num_values * size);
+    EXPECT_DOUBLE_EQ(global_sum, expected);
+}
+
+TEST(MPICArrayKokkos, AllReduce_Sum_VariableLengthPerRank) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    const int num_values_per_rank = 10 * (1 + rank);
+    MPICArrayKokkos<double> rank_locals(num_values_per_rank, "ut_varlen");
+    rank_locals.initialize_comm_plan(comm_plan);
+    rank_locals.set_values(1.0);
+    rank_locals.update_device();
+
+    const double global_sum = rank_locals.all_reduce(operation::sum);
+    const double expected =
+        10.0 * static_cast<double>(size * (size + 1) / 2);
+    EXPECT_DOUBLE_EQ(global_sum, expected);
+}
+
+TEST(MPICArrayKokkos, AllReduce_MinMax_1D) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    const int num_values_per_rank = 10 * (1 + rank);
+    MPICArrayKokkos<float> vals(num_values_per_rank, "ut_minmax");
+    vals.initialize_comm_plan(comm_plan);
+
+    FOR_ALL(i, 0, num_values_per_rank, {
+        vals(i) = static_cast<float>(10 * rank + i);
+    });
+    MATAR_FENCE();
+    vals.update_device();
+
+    const float global_min = vals.all_reduce(operation::min);
+    const float global_max = vals.all_reduce(operation::max);
+    EXPECT_FLOAT_EQ(global_min, 0.0F);
+    const float expected_max =
+        static_cast<float>(10 * (size - 1) + (10 * size - 1));
+    EXPECT_FLOAT_EQ(global_max, expected_max);
+}
+
+TEST(MPICArrayKokkos, AllReduce_Product) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    MPICArrayKokkos<double> prod_locals(4, "ut_prod");
+    prod_locals.initialize_comm_plan(comm_plan);
+    prod_locals.set_values(2.0);
+    prod_locals.update_device();
+
+    const double global_product = prod_locals.all_reduce(operation::product);
+    const double expected = std::pow(2.0, 4 * size);
+    EXPECT_DOUBLE_EQ(global_product, expected);
+}
+
+TEST(MPICArrayKokkos, AllReduce_Rank2_CentroidXYZ) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    const size_t n_elem = 3;
+    constexpr int num_coords = 3;
+
+    MPICArrayKokkos<double> elem_centroids(n_elem, static_cast<size_t>(num_coords),
+                                           "ut_centroids");
+    elem_centroids.initialize_comm_plan(comm_plan);
+    FOR_ALL(elem_id, 0, n_elem, elem_position, 0, num_coords, {
+        const double base = 1000.0 * rank + 100.0 * elem_id;
+        elem_centroids(elem_id, elem_position) =
+            base + 10.0 * static_cast<double>(elem_position);
+    });
+    MATAR_FENCE();
+    elem_centroids.update_device();
+
+    const double max_x = elem_centroids.all_reduce(operation::max, 0U);
+    const double max_y = elem_centroids.all_reduce(operation::max, 1U);
+    const double max_z = elem_centroids.all_reduce(operation::max, 2U);
+
+    const double base_rank = 1000.0 * static_cast<double>(size - 1);
+    const double base_elem = 100.0 * static_cast<double>(n_elem - 1);
+    EXPECT_DOUBLE_EQ(max_x, base_rank + base_elem + 0.0);
+    EXPECT_DOUBLE_EQ(max_y, base_rank + base_elem + 10.0);
+    EXPECT_DOUBLE_EQ(max_z, base_rank + base_elem + 20.0);
+}
+
+TEST(MPICArrayKokkos, AllReduce_Rank3_StressComponent) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    const size_t n_elem = 3;
+    MPICArrayKokkos<double> stress(n_elem, 3, 3, "ut_stress");
+    stress.initialize_comm_plan(comm_plan);
+    FOR_ALL(e, 0, n_elem, r, 0, 3, c, 0, 3, {
+        stress(e, r, c) = 10000.0 * rank + 1000.0 * e + 100.0 * r + c;
+    });
+    MATAR_FENCE();
+    stress.update_device();
+
+    const double max_comp =
+        stress.all_reduce(operation::max, static_cast<size_t>(0),
+                          static_cast<size_t>(1));
+    const double expected = 10000.0 * static_cast<double>(size - 1) +
+                            1000.0 * static_cast<double>(n_elem - 1) + 1.0;
+    EXPECT_DOUBLE_EQ(max_comp, expected);
+}
+
+TEST(MPICArrayKokkos, AllReduce_Rank4_GaussStressComponent) {
+    int rank = 0;
+    int size = 1;
+    mpi_rank_size(&rank, &size);
+
+    CommunicationPlan comm_plan;
+    comm_plan.initialize(MPI_COMM_WORLD);
+
+    const size_t n_elem = 3;
+    const size_t n_gauss = 2;
+
+    MPICArrayKokkos<double> s4(n_elem, n_gauss, 3, 3, "ut_s4");
+    s4.initialize_comm_plan(comm_plan);
+    for (size_t e = 0; e < n_elem; ++e) {
+        for (size_t g = 0; g < n_gauss; ++g) {
+            for (size_t r = 0; r < 3; ++r) {
+                for (size_t c = 0; c < 3; ++c) {
+                    s4.host(e, g, r, c) =
+                        100000.0 * rank + 1000.0 * static_cast<double>(e) +
+                        100.0 * static_cast<double>(g) +
+                        10.0 * static_cast<double>(r) +
+                        static_cast<double>(c);
+                }
+            }
+        }
+    }
+    s4.update_device();
+
+    const double max_qp =
+        s4.all_reduce(operation::max, static_cast<size_t>(1),
+                      static_cast<size_t>(0), static_cast<size_t>(1));
+    const double expected = 100000.0 * static_cast<double>(size - 1) +
+                            1000.0 * static_cast<double>(n_elem - 1) + 101.0;
+    EXPECT_DOUBLE_EQ(max_qp, expected);
+}
+
+#endif // HAVE_MPI && HAVE_KOKKOS