diff --git a/.gitignore b/.gitignore
index 9dc9e984..fe73d50c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ Trilinos
 CMakeCache.txt
 CMakeCache
 CMakeFiles
+*~
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ea97b36..199e88f8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,8 +12,7 @@ set(CMAKE_CXX_STANDARD 17)
 # Build type
 if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE RelWithDebInfo)
- # set(CMAKE_BUILD_TYPE Release)
-endif(NOT CMAKE_BUILD_TYPE)
+endif()
 
 # Set module path
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake)
@@ -22,6 +21,28 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake)
 ################################################################################
 # Dependencies
 ################################################################################
+# MATAR submodule
+# (Note: the way the MATAR submodule is handled may need to change in the
+# future. Currently, it is deployed as a header-only library, which is why it
+# seemed sufficient to simply include the MATAR header. However, the
+# requirements of linking Kokkos and other parts of Trilinos may change this.
+# Please update this as needed.)
+add_subdirectory(matar)
+include_directories(matar/src)
+
+# Optional Kokkos dependency for MATAR
+option(MATAR_WITH_KOKKOS "MATAR is to be compiled against Kokkos" OFF)
+if (MATAR_WITH_KOKKOS)
+  find_package(Kokkos REQUIRED)
+endif()
+
+# Optional Doxygen and Sphinx dependencies
+option(WITH_DOCS "Build the documentation locally" OFF)
+if (WITH_DOCS)
+  find_package(Doxygen REQUIRED)
+  find_package(Sphinx REQUIRED)
+endif()
+
 # Optional BLAS/LAPACK dependency (disabled by default)
 option(WITH_BLAS_LAPACK "Build ELEMENTS with BLAS/LAPACK" OFF)
 if (WITH_BLAS_LAPACK)
@@ -37,22 +58,6 @@ if (WITH_VTK)
   find_package(VTK REQUIRED NO_MODULE)
 endif()
 
-# Optional Doxygen and Sphinx dependencies
-option(WITH_DOCS "Build the documentation locally" OFF)
-if (WITH_DOCS)
-  find_package(Doxygen REQUIRED)
-  find_package(Sphinx REQUIRED)
-endif()
-
-# MATAR submodule
-# (Note: the way the MATAR submodule is handled may need to change in the
-# future. Currently, it is deployed as a header-only library, which is why it
-# seemed sufficient to simply include the MATAR header. However, the
-# requirements of linking Kokkos and other parts of Trilinos may change this.
-# Please update this as needed.)
-add_subdirectory(matar)
-include_directories(matar/src)
-
 
 ################################################################################
 # Build
@@ -60,6 +65,9 @@ include_directories(matar/src)
 # Includes
 include_directories(common)
 include_directories(elements)
+if (MATAR_WITH_KOKKOS)
+  add_subdirectory(elements-kokkos)
+endif()
 include_directories(geometry)
 include_directories(slam)
 include_directories(swage)
diff --git a/build_scripts/build_matar_kokkos_openmp.sh b/build_scripts/build_matar_kokkos_openmp.sh
new file mode 100755
index 00000000..c097d6e5
--- /dev/null
+++ b/build_scripts/build_matar_kokkos_openmp.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Build configuration (modify this!)
+ELEMENTS_DIR=/path/to/elements
+MATAR_DIR=/path/to/matar/within/elements
+PROJECT_BUILD_DIR=/path/to/project/build/directory
+export OMP_NUM_THREADS=sensible_number_of_openmp_threads_for_your_machine
+export OMP_PROC_BIND=spread
+
+
+# Build and install Kokkos with OpenMP
+KOKKOS_SOURCE_DIR=${MATAR_DIR}/src/Kokkos/kokkos
+KOKKOS_BUILD_DIR=${PROJECT_BUILD_DIR}/kokkos
+KOKKOS_INSTALL_DIR=${KOKKOS_BUILD_DIR}
+
+OPTIONS=(
+  -DCMAKE_BUILD_TYPE=Release
+  -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_DIR}
+  -DCMAKE_CXX_STANDARD=17
+  -DKokkos_ENABLE_SERIAL=ON
+  -DKokkos_ENABLE_OPENMP=ON
+  -DKokkos_ENABLE_TESTS=OFF
+  -DBUILD_TESTING=OFF
+)
+
+cmake "${OPTIONS[@]}" -S ${KOKKOS_SOURCE_DIR} -B ${KOKKOS_BUILD_DIR}
+cmake --build ${KOKKOS_BUILD_DIR}
+cmake --install ${KOKKOS_BUILD_DIR}
+
+
+# Build and install ELEMENTS with MATAR/Kokkos
+ELEMENTS_SOURCE_DIR=${ELEMENTS_DIR}
+ELEMENTS_BUILD_DIR=${PROJECT_BUILD_DIR}/elements
+ELEMENTS_INSTALL_DIR=${ELEMENTS_BUILD_DIR}
+
+OPTIONS=(
+  -DCMAKE_BUILD_TYPE=Release
+  -DCMAKE_INSTALL_PREFIX=${ELEMENTS_INSTALL_DIR}
+  -DMATAR_WITH_KOKKOS=ON
+  -DKokkos_DIR=${KOKKOS_INSTALL_DIR}/lib/cmake/Kokkos
+  -DKOKKOS=ON
+  -DOPENMP=ON
+)
+
+cmake "${OPTIONS[@]}" -S ${ELEMENTS_SOURCE_DIR} -B ${ELEMENTS_BUILD_DIR}
+cmake --build ${ELEMENTS_BUILD_DIR} --verbose
+cmake --install ${ELEMENTS_BUILD_DIR}
diff --git a/common/error.h b/common/error.h
index ee33c36e..e93b26c5 100644
--- a/common/error.h
+++ b/common/error.h
@@ -19,3 +19,8 @@ class VtkGridError : public std::runtime_error {
  public:
   VtkGridError(const std::string info) : std::runtime_error(info) {}
 };
+
+class NotImplementedError : public std::runtime_error {
+ public:
+  NotImplementedError(const std::string info) : std::runtime_error(info) {}
+};
diff --git a/elements-kokkos/CMakeLists.txt b/elements-kokkos/CMakeLists.txt
new file mode 100644
index 00000000..debdc8ff
--- /dev/null
+++ b/elements-kokkos/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(ELEMENTS_KOKKOS_INCLUDES ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+add_subdirectory(src)
diff --git a/elements-kokkos/include/BernsteinProductBasis.h b/elements-kokkos/include/BernsteinProductBasis.h
new file mode 100644
index 00000000..5d6f1f4b
--- /dev/null
+++ b/elements-kokkos/include/BernsteinProductBasis.h
@@ -0,0 +1,36 @@
+#pragma once
+
+
+#include "TensorProductBasis.h"
+
+
+template <typename NumType>
+struct BernsteinProductBasis : TensorProductBasis<NumType> { 
+  using TensorProductBasis<NumType>::Nd, TensorProductBasis<NumType>::Np, 
+      TensorProductBasis<NumType>::ijk, TensorProductBasis<NumType>::rad; 
+
+  SizeType N;
+  SizeType Ne;
+  
+  // Work array for intermediate coefficients
+  NumType *C;
+
+  BernsteinProductBasis(const SizeType);
+  ~BernsteinProductBasis();
+  
+  // Basis functions and basis function gradients
+  NumType eval_basis(const SizeType, const NumType *);
+  void eval_grad_basis(const SizeType, const NumType *, NumType *);
+
+  // Function approximation over element
+  NumType eval_approx(const NumType *, const NumType *); 
+  void eval_grad_approx(const NumType *, const NumType *, NumType *);
+
+  // Jacobian of spatial mapping
+  void eval_jac(const NumType *, const NumType *, const NumType *,
+                const NumType *, NumType *);
+  NumType eval_det_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *);
+  void eval_inv_jac(const NumType *, const NumType *, const NumType *,
+       const NumType *, NumType *);
+};
diff --git a/elements-kokkos/include/HexRef.h b/elements-kokkos/include/HexRef.h
new file mode 100644
index 00000000..7a2de14d
--- /dev/null
+++ b/elements-kokkos/include/HexRef.h
@@ -0,0 +1,106 @@
+#pragma once
+
+
+// MATAR includes
+#include "MatarTypeDefs.h"
+
+
+enum BasisType {
+  LAGRANGE,
+  LEGENDRE,
+  BERNSTEIN
+};
+
+
+// Forward declarations
+template <typename NumType>
+struct TensorProductBasis;
+
+
+struct HexRef {
+  HexRef(int order_basis, BasisType type_basis);
+  ~HexRef();
+
+
+  // Cells (subelements used in volume integration)
+  int num_ref_cells_1D_;
+
+  int num_ref_cells_in_elem_;
+  int num_ref_cells_in_elem() const;
+
+  int cell_rid(int i, int j, int k) const;
+
+  MatarRealCArray ref_cell_positions_;
+  Real ref_cell_positions(int cell_rid, int dim) const;
+
+  MatarRealCArray ref_cell_g_weights_;
+  Real ref_cell_g_weights(int cell_rid) const;
+
+
+  // Patches (subelement faces used in surface integration)
+  int num_patches_in_elem_;
+  int num_sides_in_elem_;
+
+  const int patch_rlid_cell_neighbor_[6] = {
+    1, // side in neighbor cell mated with -xi side in this cell
+    0, // side in neighbor cell mated with +xi side in this cell
+    3, // side in neighbor cell mated with -eta side in this cell
+    2, // side in neighbor cell mated with +eta side in this cell
+    5, // side in neighbor cell mated with -zeta side in this cell
+    4  // side in neighbor cell mated with +zeta side in this cell
+  };
+  int patch_rlid_in_cell_neighbor(int patch_rlid) const;
+
+  MatarIntCArray ref_patches_in_cell_list_;
+  int ref_patches_in_cell(int cell_lid, int patch_rlid) const;
+
+  MatarRealCArray ref_patch_positions_;
+  Real ref_patch_positions(int patch_rid, int dim) const;
+
+  MatarRealCArray ref_patch_g_weights_;
+  Real ref_patch_g_weights(int patch_rid) const;
+
+  const Real cell_side_unit_normals_[18] = {
+    -1,  0,  0, // -xi 
+     1,  0,  0, // +xi 
+     0, -1,  0, // -eta 
+     0,  1,  0, // +eta 
+     0,  0, -1, // -zeta 
+     0,  0,  1  // +zeta 
+  };
+  Real cell_side_unit_normals(int side_rlid, int dim) const;
+
+
+  // Nodes (quadrature points) and vertices (degrees of freedom)
+  int num_ref_nodes_1D_;
+  int num_ref_nodes_in_elem_;
+  int num_ref_nodes() const;
+
+  int num_ref_verts_1D_;
+  int num_ref_verts_in_elem_;
+
+  Real *ref_verts_1D_;
+
+
+  // Evaluations of basis functions and their gradients at quadrature points
+  // in cells and on patches
+  static const int num_dim_ = 3;
+  int num_dim() const;
+
+  int num_basis_;
+  int num_basis() const;
+  
+  MatarRealCArray ref_cell_basis_;
+  Real ref_cell_basis(int cell_rid, int basis_id) const;
+
+  MatarRealCArray ref_cell_gradient_;
+  Real ref_cell_gradient(int cell_rid, int basis_id, int dim) const;
+
+  MatarRealCArray ref_patch_basis_;
+  Real ref_patch_basis(int patch_rid, int basis_id) const;
+
+  MatarRealCArray ref_patch_gradient_;
+  Real ref_patch_gradient(int patch_rid, int basis_id, int dim) const;
+
+  TensorProductBasis<Real> *basis;
+};
diff --git a/elements-kokkos/include/LagrangeProductBasis.h b/elements-kokkos/include/LagrangeProductBasis.h
new file mode 100644
index 00000000..13fc80b6
--- /dev/null
+++ b/elements-kokkos/include/LagrangeProductBasis.h
@@ -0,0 +1,38 @@
+#pragma once
+
+
+#include "TensorProductBasis.h"
+
+
+template <typename NumType>
+struct LagrangeProductBasis : TensorProductBasis<NumType> {
+  using TensorProductBasis<NumType>::Nd, TensorProductBasis<NumType>::Np, 
+      TensorProductBasis<NumType>::ijk, TensorProductBasis<NumType>::rad; 
+
+  SizeType N;
+  SizeType Ne;
+  const NumType *Z;
+  NumType *w;
+
+  // Work array for intermediate coefficients
+  NumType *C;
+
+  LagrangeProductBasis(const SizeType, const NumType *);
+  ~LagrangeProductBasis();
+
+  // Basis functions and basis function gradients
+  NumType eval_basis(const SizeType, const NumType *);
+  void eval_grad_basis(const SizeType, const NumType *, NumType *);
+
+  // Function approximation over element
+  NumType eval_approx(const NumType *, const NumType *);
+  void eval_grad_approx(const NumType *, const NumType *, NumType *);
+
+  // Jacobian of spatial mapping
+  void eval_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *, NumType *);
+  NumType eval_det_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *);
+  void eval_inv_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *, NumType *);
+};
diff --git a/elements-kokkos/include/LegendreProductBasis.h b/elements-kokkos/include/LegendreProductBasis.h
new file mode 100644
index 00000000..f9bf369c
--- /dev/null
+++ b/elements-kokkos/include/LegendreProductBasis.h
@@ -0,0 +1,28 @@
+#pragma once
+
+
+#include "TensorProductBasis.h"
+
+
+template <typename NumType>
+struct LegendreProductBasis : TensorProductBasis<NumType> {
+  using TensorProductBasis<NumType>::Nd, TensorProductBasis<NumType>::Np, 
+      TensorProductBasis<NumType>::ijk, TensorProductBasis<NumType>::rad; 
+
+  SizeType N;
+  SizeType Ne;
+
+  // Work array for intermediate coefficients
+  NumType *C;
+
+  LegendreProductBasis(SizeType);
+  ~LegendreProductBasis();
+
+  // Basis functions and basis function gradients
+  NumType eval_basis(const SizeType, const NumType *);
+  void eval_grad_basis(const SizeType, const NumType *, NumType *);
+
+  // Function approximation over element
+  NumType eval_approx(const NumType *, const NumType *);
+  void eval_grad_approx(const NumType *, const NumType *, NumType *);
+};
diff --git a/elements-kokkos/include/MatarTypeDefs.h b/elements-kokkos/include/MatarTypeDefs.h
new file mode 100644
index 00000000..aa1398f5
--- /dev/null
+++ b/elements-kokkos/include/MatarTypeDefs.h
@@ -0,0 +1,12 @@
+#include "common.h"
+#include "matar.h"
+
+#ifdef MATAR_WITH_KOKKOS
+typedef CArrayKokkos<Real> MatarRealCArray;
+typedef CArrayKokkos<SizeType> MatarUIntCArray;
+typedef CArrayKokkos<int> MatarIntCArray;
+#else
+typedef CArray<Real> MatarRealCArray;
+typedef CArray<SizeType> MatarUIntCArray;
+typedef CArray<int> MatarIntCArray;
+#endif // MATAR_WITH_KOKKOS
diff --git a/elements-kokkos/include/TensorProductBasis.h b/elements-kokkos/include/TensorProductBasis.h
new file mode 100644
index 00000000..4d1b1f51
--- /dev/null
+++ b/elements-kokkos/include/TensorProductBasis.h
@@ -0,0 +1,41 @@
+#pragma once
+
+
+#include "common.h"
+
+
+/*
+ * Interface class for 3D tensor product basis, to be called for evaluating: 
+ *  - basis functions and their gradients;
+ *  - approximations in the basis and their derivatives; and,
+ *  - for choices of basis that are typically used to represent the geometry of
+ *    the mesh (Lagrange and Bernstein), the Jacobian of the spatial mapping.
+ */
+template <typename NumType>
+struct TensorProductBasis {
+  static const SizeType Nd = 3;
+  SizeType Np;
+
+  // Arrays for converting from flat to multidimensional indices
+  SizeType ijk[Nd];
+  SizeType rad[Nd];
+
+  TensorProductBasis(const SizeType);
+  ~TensorProductBasis() {};
+
+  // Basis functions and basis function gradients
+  virtual NumType eval_basis(const SizeType, const NumType *) = 0;
+  virtual void eval_grad_basis(const SizeType, const NumType *, NumType *) = 0;
+
+  // Function approximation over element
+  virtual NumType eval_approx(const NumType *, const NumType *) = 0;
+  virtual void eval_grad_approx(const NumType *, const NumType *, NumType *) = 0;
+
+  // Jacobian of spatial mapping
+  virtual void eval_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *, NumType *);
+  virtual NumType eval_det_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *);
+  virtual void eval_inv_jac(const NumType *, const NumType *, const NumType *, 
+      const NumType *, NumType *);
+};
diff --git a/elements-kokkos/include/bernstein_polynomials.h b/elements-kokkos/include/bernstein_polynomials.h
new file mode 100644
index 00000000..1f9de1e4
--- /dev/null
+++ b/elements-kokkos/include/bernstein_polynomials.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "common.h"
+
+// Bernstein polynomial of order n:
+// B(n,v) = (0.5)^n * (C^n_v) * (1-x)^(n-v) * (1+x)^v
+// for x \in [-1, 1]
+
+namespace bernstein{
+  // Bernstein polynomials
+  // n is the order, v is the "index", X \in [-1, 1]
+  template <typename NumType> NumType eval(const int n, const int v, const NumType X);
+    
+  template <typename NumType> NumType eval_der(const int n, const int v, const NumType X);
+
+  //Bernstein Approximations
+  template <typename NumType>
+  NumType eval_approx(const SizeType N, const NumType *c, const NumType X);
+
+  template <typename NumType>
+  NumType eval_der_approx(const SizeType N, const NumType *c, const NumType X);
+}
diff --git a/elements-kokkos/include/jacobi_polynomials.h b/elements-kokkos/include/jacobi_polynomials.h
new file mode 100644
index 00000000..f076ceb7
--- /dev/null
+++ b/elements-kokkos/include/jacobi_polynomials.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "common.h"
+
+namespace jacobi {
+  /* Recurrence relation parameters */
+  inline Real a(Real alpha, Real beta, int n) {
+    if (n == 1) return 0.5*(alpha + beta) + 1.0;
+    return (2.0*Real(n) + alpha + beta - 1.0)*(2.0*Real(n) + alpha + beta)
+        /(2.0*Real(n)*(Real(n) + alpha + beta));
+  };
+
+  inline Real b(Real alpha, Real beta, int n) {
+    if (n == 1) return 0.5*(alpha - beta); 
+    return (alpha*alpha - beta*beta)*(2*Real(n) + alpha + beta - 1.0)
+        /(2.0*Real(n)*(Real(n) + alpha + beta)
+            *(2.0*Real(n) + alpha + beta - 2.0));
+  };
+
+  inline Real c(Real alpha, Real beta, int n) {
+    if (n == 1) return 0.0;
+    return (Real(n) + alpha - 1.0)*(Real(n) + beta - 1.0)
+       *(2.0*Real(n) + alpha + beta)
+       /(Real(n)*(Real(n) + alpha + beta)
+           *(2.0*Real(n) + alpha + beta - 2.0));
+  };
+
+  /* Polynomial and polynomial derivative evaluation */
+  template <typename NumType>
+  NumType eval(int n, Real alpha, Real beta, NumType X);
+
+  template <typename NumType>
+  NumType eval_der(const int n, const int k, const Real alpha, 
+      const Real beta, const NumType X);
+}
diff --git a/elements-kokkos/include/lagrange_polynomials.h b/elements-kokkos/include/lagrange_polynomials.h
new file mode 100644
index 00000000..7a124b61
--- /dev/null
+++ b/elements-kokkos/include/lagrange_polynomials.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "common.h"
+
+namespace lagrange {
+  // Identifying singularities
+  template <typename NumType>
+  SizeType find_coincident_vertex(const SizeType &, const NumType *, 
+      const NumType &);
+
+  // Barycentric weights
+  template <typename NumType>
+  void compute_barycentric_weights(const SizeType &, const NumType *, 
+      NumType *);
+
+  // Lagrange polynomials (basis functions)
+  template <typename NumType>
+  NumType eval(const SizeType Nv, const SizeType i, const SizeType ic, 
+      const NumType *Z, const NumType *w, const NumType X);
+
+  template <typename NumType>
+  NumType eval_der(const SizeType Nv, const SizeType n, 
+      const SizeType i, const SizeType ic, const NumType *Z, 
+      const NumType *w, const NumType X, NumType *c);
+
+  // Lagrange interpolants (sums of products of bases and coefficients)
+  template <typename NumType>
+  NumType eval_interp(const SizeType Nv, const SizeType i, 
+      const NumType *Z, const NumType *w, const NumType X, const NumType *c);
+
+  template <typename NumType>
+  NumType eval_der_interp(const SizeType Nv, const SizeType n, 
+      const SizeType i, const NumType *Z, const NumType *w, const NumType X, 
+      NumType *c);
+}
diff --git a/elements-kokkos/include/legendre_polynomials.h b/elements-kokkos/include/legendre_polynomials.h
new file mode 100644
index 00000000..526def45
--- /dev/null
+++ b/elements-kokkos/include/legendre_polynomials.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "common.h"
+
+namespace legendre {
+  // Legendre polynomials (basis functions)
+  template <typename NumType>
+  NumType eval(const int n, const NumType X);
+
+  template <typename NumType>
+  NumType eval_der(const int n, const int k, const NumType X);
+
+  // Legendre approximations (sums of products of bases and coefficients)
+  template <typename NumType>
+  NumType eval_approx(const SizeType N, const NumType *c, const NumType X);
+
+  template <typename NumType>
+  NumType eval_der_approx(const SizeType N, const SizeType k, const NumType *c, 
+      const NumType X);
+}
diff --git a/elements-kokkos/include/points_and_weights.h b/elements-kokkos/include/points_and_weights.h
new file mode 100644
index 00000000..eade491c
--- /dev/null
+++ b/elements-kokkos/include/points_and_weights.h
@@ -0,0 +1,21 @@
+#pragma once
+
+
+#include "MatarTypeDefs.h"
+#include "common.h"
+
+
+/* Equispaced and Chebyshev point distributions */
+template <typename NumType>
+void equispaced_points(SizeType N, NumType &zl, NumType &zr, NumType *z);
+
+template <typename NumType>
+void chebyshev_points(SizeType N, NumType &zl, NumType &zr, NumType *z);
+
+
+/* Hard-coded Gauss-Lobatto and Gauss-Legendre quadrature points and weights */
+void lobatto_points(MatarRealCArray &lob_nodes_1D, const int &num);
+void lobatto_weights(MatarRealCArray &lob_weights_1D, const int &num);
+
+void legendre_points(MatarRealCArray &leg_nodes_1D, const int &num);
+void legendre_weights(MatarRealCArray &leg_weights_1D, const int &num);
diff --git a/elements-kokkos/src/BernsteinProductBasis.cpp b/elements-kokkos/src/BernsteinProductBasis.cpp
new file mode 100644
index 00000000..98a97a5c
--- /dev/null
+++ b/elements-kokkos/src/BernsteinProductBasis.cpp
@@ -0,0 +1,176 @@
+#include "BernsteinProductBasis.h"
+#include "bernstein_polynomials.h"
+
+
+template <typename NumType>
+BernsteinProductBasis<NumType>::BernsteinProductBasis(const SizeType order) 
+    : TensorProductBasis<NumType>(order) { 
+  N = Np + 1;
+  Ne = std::pow(N, Nd);
+
+  rad[0] = N;
+  rad[1] = N;
+  rad[2] = N;
+
+  //Allocate memory for intermediate coefficients
+  C = new NumType[2*N];
+}
+
+template <typename NumType>
+BernsteinProductBasis<NumType>::~BernsteinProductBasis() { }
+
+
+template <typename NumType>
+NumType BernsteinProductBasis<NumType>::eval_basis(const SizeType I, 
+    const NumType *X) { 
+  // Decompose index of 3D tensor product basis function into indices of
+  // Bernstein polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+  
+  // Evaluate Bernstein polynomials
+  NumType Bi = bernstein::eval(N, int(ijk[0]), X[0]);
+  NumType Bj = bernstein::eval(N, int(ijk[1]), X[1]);
+  NumType Bk = bernstein::eval(N, int(ijk[2]), X[2]);
+
+  return Bi*Bj*Bk;
+}
+
+template <typename NumType>
+void BernsteinProductBasis<NumType>::eval_grad_basis(const SizeType I, 
+    const NumType *X, NumType *grad_phi) {
+  // Decompose index of 3D tensor product basis function into indices of
+  // Bernstein polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+
+  // Evaluate Bernstein polynomials
+  NumType Bi = bernstein::eval(N, int(ijk[0]), X[0]);
+  NumType Bj = bernstein::eval(N, int(ijk[1]), X[1]);
+  NumType Bk = bernstein::eval(N, int(ijk[2]), X[2]);
+
+  // Evaluate derivatives of Bernstein polynomials
+  NumType dBi = bernstein::eval_der(N, int(ijk[0]), X[0]);
+  NumType dBj = bernstein::eval_der(N, int(ijk[1]), X[1]);
+  NumType dBk = bernstein::eval_der(N, int(ijk[2]), X[2]);
+
+  // Store partial derivatives in entries of gradient
+  grad_phi[0] = dBi*Bj*Bk;
+  grad_phi[1] = Bi*dBj*Bk;
+  grad_phi[3] = Bi*Bj*dBk;
+}
+
+/* Return evaluation of local function approximation */
+template <typename NumType>
+NumType BernsteinProductBasis<NumType>::eval_approx(const NumType *c, 
+    const NumType *X) { 
+  for (int k = 0; k < N; k++) {
+    for (int j = 0; j < N; j++){
+      // Collapse first dimension into coefficients for second dimension
+      C[j] = bernstein::eval_approx(N, &c[j*N+k*N*N], X[0]);  
+    }
+    // Collapse second dimension into coefficients for third dimension
+    C[N+k] = bernstein::eval_approx(N, &C[N], X[2]);
+  }
+  // Collapse third dimension into approximation evaluation
+  return bernstein::eval_approx(N, &C[N], X[2]);
+}
+
+/* 
+ * Evaluate gradient of local function approximation, which is formed by the
+ * sum of the products of tensor-product Bernstein basis functions and the
+ * provided coefficients, at specified coordinates. 
+ */
+template <typename NumType>
+void BernsteinProductBasis<NumType>::eval_grad_approx(const NumType *c, 
+    const NumType *X, NumType *grad_f) { 
+  for (int l = 0; l < Nd; l++) { 
+    for (int k =0; k < N; k++) {
+      for (int j =0; j < N; j++) {
+        //Collapse first dimension into coefficients for second dimension
+	if (l==0) {
+       	  C[j] = bernstein::eval_der_approx(N, &c[j*N+k*N*N], X[0]);
+	} else { 
+	  C[j] = bernstein::eval_approx(N, &c[j*N+k*N*N], X[0]);
+	}
+      }
+    // Collapse second dimension into coefficients for third dimension
+      if ( l == 1) {
+        C[N+k] = bernstein::eval_der_approx(N, &C[N], X[2]);
+      } else { 
+        C[N+k] = bernstein::eval_approx(N, C, X[1]);
+      }
+    }
+    
+    // Collapse third dimension into approximation evaluation
+    if ( l ==2 ) {
+      grad_f[l] = bernstein::eval_der_approx(N, &C[N], X[2]);
+    } else { 
+      grad_f[l] = bernstein::eval_approx(N, &C[N], X[2]);
+    }
+  }
+}
+
+
+////////////////* Evaluate the Jacobian of the spatial mapping *////////////////////////
+
+/* x, y, z in physical space. X in reference space. J = Jacobian (column major). */
+
+template <typename NumType>
+void BernsteinProductBasis<NumType>::eval_jac(const NumType *x, 
+    const NumType *y, const NumType *z, const NumType *X, NumType *J) { 
+  // Evaluate gradient of x=x(X,Y,Z);
+  this->eval_grad_approx(x,X,J);
+  
+  // Evaluate gradient of y=y(X,Y,Z);
+  this->eval_grad_approx(y,X,J+3);
+
+  // Evaluate gradient of z=z(X,Y,Z);
+  this->eval_grad_approx(z,X,J+6);
+}
+
+
+//////////////* Determinant of Jacobian */////////////////////////
+
+/* x, y, z in physical space. X in reference space. */
+
+template <typename NumType>
+NumType BernsteinProductBasis<NumType>::eval_det_jac(const NumType *x, 
+    const NumType *y, const NumType *z, const NumType *X) {
+  NumType J[9];
+  this->eval_jac(x, y, z, X, J);
+
+  return J[0]*(J[4]*J[8]-J[5]*J[7]) - J[1]*(J[3]*J[8]-J[5]*J[6]) + J[2]*(J[3]*J[7]-J[4]*J[6]);
+}
+
+
+//////////////* Inverse Jacobian *//////////////////////////////
+
+/* x, y, z in physical space. X in reference space. Jinv = Invers Jacobian (column-major). */
+
+template <typename NumType>
+void BernsteinProductBasis<NumType>::eval_inv_jac(const NumType *x, 
+    const NumType *y, const  NumType *z, const NumType *X, NumType *Jinv) {
+  // instantiate jacobian //
+  NumType J[9];
+
+  // get jacobian //
+  this->eval_jac(x,y,z,X,J);
+  
+  // invert determinant of J //
+  NumType d = 1.0/(this->eval_det_jac(x,y,z,X));
+  
+  // get Jinv (Jinv = adj(J)/det(J) ) //
+  Jinv[0] = d*(J[4]*J[8]-J[5]*J[7]);
+  Jinv[1] = -d*(J[1]*J[8]-J[2]*J[7]);
+  Jinv[2] = d*(J[1]*J[5]-J[2]*J[4]);
+  Jinv[3] = -d*(J[3]*J[8]-J[5]*J[6]);
+  Jinv[4] = d*(J[0]*J[8]-J[2]*J[6]);
+  Jinv[5] = -d*(J[0]*J[5]-J[2]*J[3]);
+  Jinv[6] = d*(J[3]*J[7]-J[4]*J[6]);
+  Jinv[7] = -d*(J[0]*J[7]-J[1]*J[6]);
+  Jinv[8] = d*(J[0]*J[4]-J[1]*J[3]);
+}
+
+
+//////////////// Explicit instantiation of template class //////////////////////
+template class BernsteinProductBasis<Real>;
+template class BernsteinProductBasis<Complex>;
diff --git a/elements-kokkos/src/CMakeLists.txt b/elements-kokkos/src/CMakeLists.txt
new file mode 100644
index 00000000..ff6a3771
--- /dev/null
+++ b/elements-kokkos/src/CMakeLists.txt
@@ -0,0 +1,48 @@
+add_library(
+  elements_kokkos
+    HexRef.cpp
+    LagrangeProductBasis.cpp  
+    LegendreProductBasis.cpp  
+    BernsteinProductBasis.cpp  
+    TensorProductBasis.cpp
+    lagrange_polynomials.cpp  
+    legendre_polynomials.cpp  
+    jacobi_polynomials.cpp  
+    bernstein_polynomials.cpp  
+    points_and_weights.cpp
+)
+
+target_compile_definitions(
+  elements_kokkos
+  PUBLIC
+    MATAR_WITH_KOKKOS
+    HAVE_KOKKOS=1
+    HAVE_OPENMP=1
+)
+
+target_include_directories(
+  elements_kokkos
+  PUBLIC
+    ${ELEMENTS_KOKKOS_INCLUDES}
+)
+
+
+if (CUDA)
+  target_compile_definitions(elements_kokkos PUBLIC -DHAVE_CUDA=1)
+elseif (HIP)                        
+  target_compile_definitions(elements_kokkos PUBLIC -DHAVE_HIP=1)
+elseif (OPENMP)                    
+  target_compile_definitions(elements_kokkos PUBLIC -DHAVE_OPENMP=1)
+elseif (THREADS)                  
+  target_compile_definitions(elements_kokkos PUBLIC -DHAVE_THREADS=1)
+else()
+  message(FATAL_ERROR "Please specify whether you are using CUDA, HIP, OPENMP, etc.")
+endif()
+
+target_link_libraries(
+  elements_kokkos
+  PUBLIC
+    common
+    matar
+    Kokkos::kokkos
+)
diff --git a/elements-kokkos/src/HexRef.cpp b/elements-kokkos/src/HexRef.cpp
new file mode 100644
index 00000000..53019e2e
--- /dev/null
+++ b/elements-kokkos/src/HexRef.cpp
@@ -0,0 +1,347 @@
+#include "HexRef.h"
+#include "LagrangeProductBasis.h"
+#include "LegendreProductBasis.h"
+#include "BernsteinProductBasis.h"
+#include "points_and_weights.h"
+#include "error.h"
+
+
+HexRef::HexRef(int order_basis, BasisType type_basis=LAGRANGE) {
+  // Determine sizes of data structures based on the order of the basis
+  assert(order_basis >= 0 && "Error: order of basis must be positive");
+  if (order_basis == 0){       
+    num_ref_nodes_1D_ = 2;
+    num_ref_verts_1D_ = 2;
+  } else if (order_basis > 0) {
+    num_ref_nodes_1D_ = 2*order_basis + 1;
+    num_ref_verts_1D_ = order_basis + 1;
+  }
+
+  num_ref_cells_1D_ = num_ref_nodes_1D_ - 1;
+  num_ref_cells_in_elem_ = (num_ref_nodes_1D_ - 1)*(num_ref_nodes_1D_ - 1)
+      *(num_ref_nodes_1D_ - 1);
+
+  num_patches_in_elem_ = (num_ref_cells_1D_ + 1)
+      *(num_ref_cells_1D_*num_ref_cells_1D_)*3;
+  num_sides_in_elem_ = num_ref_cells_in_elem_*6;
+
+  num_ref_verts_in_elem_ = num_ref_verts_1D_*num_ref_verts_1D_
+      *num_ref_verts_1D_;
+  num_basis_ = num_ref_verts_in_elem_;
+
+
+  // Allocate memory for data structures
+  ref_cell_positions_ = MatarRealCArray(num_ref_cells_in_elem_, 3);
+  ref_cell_g_weights_ = MatarRealCArray(num_ref_cells_in_elem_);
+
+  ref_patches_in_cell_list_ = MatarIntCArray(num_sides_in_elem_);
+  ref_patch_positions_ = MatarRealCArray(num_patches_in_elem_, 3);
+  ref_patch_g_weights_ = MatarRealCArray(num_patches_in_elem_);
+
+  ref_cell_basis_ = MatarRealCArray(num_ref_cells_in_elem_, num_basis_);
+  ref_cell_gradient_ = MatarRealCArray(num_ref_cells_in_elem_, num_basis_, num_dim_);
+
+  ref_patch_basis_ =  MatarRealCArray(num_patches_in_elem_, num_basis_);
+  ref_patch_gradient_ = MatarRealCArray(num_patches_in_elem_, num_basis_, num_dim_);
+
+  ref_verts_1D_ = new Real[num_ref_verts_1D_];
+
+
+  // Retrieve Gauss-Legendre and Gauss-Lobatto quadrature points and weights
+  auto leg_points_1D = MatarRealCArray(num_ref_cells_1D_);
+  legendre_points(leg_points_1D, num_ref_cells_1D_);
+
+  auto leg_weights_1D = MatarRealCArray(num_ref_cells_1D_);
+  legendre_weights(leg_weights_1D, num_ref_cells_1D_);
+
+  auto lob_points_1D = MatarRealCArray(num_ref_nodes_1D_);
+  lobatto_points(lob_points_1D, num_ref_nodes_1D_);
+  
+  auto lob_weights_1D = MatarRealCArray(num_ref_nodes_1D_);
+  lobatto_weights(lob_weights_1D, num_ref_nodes_1D_);
+
+
+  // Store every other node (quadrature point) coordinate as a vertex coordinate
+  int vert_id = 0, node_id = 0;
+  while (node_id < num_ref_nodes_1D_) {
+    ref_verts_1D_[vert_id] = lob_points_1D(node_id);
+    vert_id += 1;
+    node_id += 2;
+  }
+
+
+  // Fill in cell data structures
+  for (int k = 0; k < num_ref_cells_1D_; k++) {
+    for (int j = 0; j < num_ref_cells_1D_; j++) {
+      for (int i = 0; i < num_ref_cells_1D_; i++) {
+        int this_cell_rid = cell_rid(i, j, k);
+        
+        ref_cell_positions_(this_cell_rid, 0) = leg_points_1D(i);
+        ref_cell_positions_(this_cell_rid, 1) = leg_points_1D(j);
+        ref_cell_positions_(this_cell_rid, 2) = leg_points_1D(k);
+        
+        ref_cell_g_weights_(this_cell_rid) = leg_weights_1D(i)
+                *leg_weights_1D(j)*leg_weights_1D(k);
+      }
+    }
+  }
+
+
+  // Fill in patch data structures
+  for (int side_rid = 0; side_rid < num_sides_in_elem_; side_rid++) {
+    ref_patches_in_cell_list_(side_rid) = -1;
+  }
+
+  int patch_rid = 0; 
+  for (int k = 0; k < num_ref_cells_1D_; k++) {
+    for (int j = 0; j < num_ref_cells_1D_; j++) {
+      for (int i = 0; i < num_ref_cells_1D_; i++) {
+        int this_cell_rid = cell_rid(i, j, k);
+
+        for (int side_lid = 0; side_lid < 6; side_lid++) {
+          int side_rid = side_lid + this_cell_rid*6;
+
+          // Check to see that this patch has not been seen already
+          if (ref_patches_in_cell_list_(side_rid) == -1) {
+            // Store the index for this patch in the cell
+            ref_patches_in_cell_list_(side_rid) = patch_rid;
+            
+            // Store the quadrature point area values on the patch
+            switch (side_lid) {
+              case 0:  // patch with -xi normal
+                ref_patch_positions_(patch_rid,0) = lob_points_1D(i); 
+                ref_patch_positions_(patch_rid,1) = leg_points_1D(j);
+                ref_patch_positions_(patch_rid,2) = leg_points_1D(k);
+                
+                ref_patch_g_weights_(patch_rid) = leg_weights_1D(j)*leg_weights_1D(k);
+                break;
+              case 1:  // patch with +xi normal
+                ref_patch_positions_(patch_rid,0) = lob_points_1D(i+1);
+                ref_patch_positions_(patch_rid,1) = leg_points_1D(j);
+                ref_patch_positions_(patch_rid,2) = leg_points_1D(k);
+                
+                ref_patch_g_weights_(patch_rid) = leg_weights_1D(j)*leg_weights_1D(k);
+                break;
+              case 2:  // patch with -eta normal
+                ref_patch_positions_(patch_rid,0) = leg_points_1D(i);
+                ref_patch_positions_(patch_rid,1) = lob_points_1D(j); 
+                ref_patch_positions_(patch_rid,2) = leg_points_1D(k);
+                
+                ref_patch_g_weights_(patch_rid) = leg_weights_1D(i)*leg_weights_1D(k);
+                break;
+              case 3:  // patch with +eta normal
+                ref_patch_positions_(patch_rid,0) = leg_points_1D(i); 
+                ref_patch_positions_(patch_rid,1) = lob_points_1D(j+1);
+                ref_patch_positions_(patch_rid,2) = leg_points_1D(k); 
+                
+                ref_patch_g_weights_(patch_rid) = leg_weights_1D(i)*leg_weights_1D(k);
+                break;
+              case 4:  // patch with -zeta normal
+                ref_patch_positions_(patch_rid,0) = leg_points_1D(i);
+                ref_patch_positions_(patch_rid,1) = leg_points_1D(j);
+                ref_patch_positions_(patch_rid,2) = lob_points_1D(k);
+                
+                ref_patch_g_weights_(patch_rid) = leg_weights_1D(i)*leg_weights_1D(j);
+                break;
+              case 5:  // patch with +zeta normal
+                ref_patch_positions_(patch_rid,0) = leg_points_1D(i);
+                ref_patch_positions_(patch_rid,1) = leg_points_1D(j);
+                ref_patch_positions_(patch_rid,2) = lob_points_1D(k+1);
+                
+                ref_patch_g_weights_(patch_rid) = leg_weights_1D(i)*leg_weights_1D(j);
+                break;
+            }
+                     
+            // Get the rID for the neighboring cell in the element
+            // that communicates with this cell across this patch
+            int neighbor_cell_rid;
+            if (side_lid == 0 && 0 < i) {
+              neighbor_cell_rid = cell_rid(i-1, j, k);
+            } else if (side_lid == 1 && i < num_ref_cells_1D_ - 1) {
+              neighbor_cell_rid = cell_rid(i+1, j, k);
+            } else if (side_lid == 2 && 0 < j) {
+              neighbor_cell_rid = cell_rid(i, j-1, k);
+            } else if (side_lid == 3 && j < num_ref_cells_1D_ - 1) {
+              neighbor_cell_rid = cell_rid(i, j+1, k);
+            } else if (side_lid == 4 && 0 < k) {  
+              neighbor_cell_rid = cell_rid(i, j, k-1);
+            } else if (side_lid == 5 && k < num_ref_cells_1D_ - 1) {
+              neighbor_cell_rid = cell_rid(i, j, k+1);
+            } else { 
+              neighbor_cell_rid = -1;  // no neighboring cell in element
+            } 
+            
+            // If there is a neighboring cell, set this patch rID in its list
+            if (0 <= neighbor_cell_rid && neighbor_cell_rid < num_ref_cells_in_elem_) {
+              // Get the side lid for this patch in the neighboring cell
+              int neighbor_side_lid = patch_rlid_in_cell_neighbor(side_lid);
+              
+              // Get side rID from that side lid
+              int neighbor_side_rid = neighbor_side_lid + (neighbor_cell_rid)*6;
+              
+              // Store the patch rID for this patch in the neighboring cell
+              ref_patches_in_cell_list_(neighbor_side_rid) = patch_rid;
+            }
+            
+            // Increment the patch index
+            patch_rid ++;
+          }
+        }
+      }
+    }
+  }
+
+
+  // Initialize basis object based on type specified
+  switch (type_basis) {
+    case BasisType::LAGRANGE:
+      basis = new LagrangeProductBasis<Real>(order_basis, ref_verts_1D_);
+      break;
+    case BasisType::LEGENDRE:
+      basis = new LegendreProductBasis<Real>(order_basis);
+      break;
+    case BasisType::BERNSTEIN:
+      basis = new BernsteinProductBasis<Real>(order_basis);
+      break;
+    default:
+      basis = nullptr;
+      std::string err_msg("Error: basis class not implemented for this type.");
+      throw NotImplementedError(err_msg);
+  }
+
+
+  // Fill in evaluations of basis functions and their gradients in cells 
+  for (int cell_rlid = 0; cell_rlid < num_ref_cells_in_elem_; cell_rlid++) {
+    // Get the cell coordinates
+    Real point[num_dim_];
+    for (int dim = 0; dim < num_dim_; dim++)
+      point[dim] = ref_cell_positions(cell_rlid, dim);
+    
+    // Compute and store all the basis values at the cell
+    for (int basis_id = 0; basis_id < num_basis_; basis_id++)
+      ref_cell_basis_(cell_rlid, basis_id) = basis->eval_basis(basis_id, point);
+    
+    // Compute and store all the basis gradient values at the cell
+    for (int basis_id = 0; basis_id < num_basis_; basis_id++) {
+      Real grad_basis[num_dim_];
+      basis->eval_grad_basis(basis_id, point, grad_basis);
+      ref_cell_gradient_(cell_rlid, basis_id, 0) = grad_basis[0];
+      ref_cell_gradient_(cell_rlid, basis_id, 1) = grad_basis[1];
+      ref_cell_gradient_(cell_rlid, basis_id, 2) = grad_basis[2];
+    } 
+  } 
+
+
+  // Fill in evaluations of basis functions and their gradients on patches
+  for (int patch_rlid = 0; patch_rlid < num_patches_in_elem_; patch_rlid++) {
+    // Get the cell coordinates
+    // Get the patch coordinates
+    Real point[num_dim_];
+    for (int dim = 0; dim < num_dim_; dim++)
+      point[dim] = ref_patch_positions(patch_rlid, dim);
+    
+    // Compute and store all the basis values at the patch
+    for (int basis_id = 0; basis_id < num_basis_; basis_id++)
+      ref_patch_basis_(patch_rlid, basis_id) = basis->eval_basis(basis_id, 
+          point);
+    
+    // loop over the basis polynomials where there is one from each vertex
+    for (int basis_id = 0; basis_id < num_basis_; basis_id++) {
+      Real grad_basis[num_dim_];
+      basis->eval_grad_basis(basis_id, point, grad_basis);
+      ref_patch_gradient_(patch_rlid, basis_id, 0) = grad_basis[0];
+      ref_patch_gradient_(patch_rlid, basis_id, 1) = grad_basis[1];
+      ref_patch_gradient_(patch_rlid, basis_id, 2) = grad_basis[2];
+    } 
+  } 
+}
+
+
+HexRef::~HexRef() {
+  delete[] ref_verts_1D_;
+  delete basis;
+}
+
+
+int HexRef::num_ref_cells_in_elem() const {
+    return num_ref_cells_in_elem_;
+};
+
+
+int HexRef::cell_rid(int i, int j, int k) const {
+  return i + j*num_ref_cells_1D_ + k*num_ref_cells_1D_*num_ref_cells_1D_;
+};
+
+
+Real HexRef::ref_cell_positions(int cell_rid, int dim) const {
+  return ref_cell_positions_(cell_rid, dim);
+}
+
+
+Real HexRef::ref_cell_g_weights(int cell_rid) const {
+  return ref_cell_g_weights_(cell_rid);
+}
+
+
+int HexRef::patch_rlid_in_cell_neighbor(int patch_rlid) const {
+  return patch_rlid_cell_neighbor_[patch_rlid];
+}
+
+
+int HexRef::ref_patches_in_cell(int cell_rid, int patch_rlid) const {
+  int index = patch_rlid + cell_rid*6;
+  return ref_patches_in_cell_list_(index);
+}
+
+
+Real HexRef::ref_patch_positions(int patch_rid, int dim) const {
+  return ref_patch_positions_(patch_rid, dim);
+}
+
+
+Real HexRef::ref_patch_g_weights(int patch_rid) const {
+  return ref_patch_g_weights_(patch_rid);
+}
+
+
+Real HexRef::cell_side_unit_normals(int side_rlid, int dim) const {
+  int index = side_rlid*3 + dim;
+  return cell_side_unit_normals_[index];
+}
+
+
+int HexRef::num_ref_nodes() const {
+  return num_ref_nodes_in_elem_;
+}
+
+
+int HexRef::num_dim() const {
+  return num_dim_;
+}
+
+
+int HexRef::num_basis() const {
+  return num_basis_;
+}
+
+
+Real HexRef::ref_cell_basis(int cell_rid, int basis_id) const {
+    return ref_cell_basis_(cell_rid, basis_id);
+};
+
+
+Real HexRef::ref_cell_gradient(int cell_rid, int basis_id, 
+    int dim) const {
+    return ref_cell_gradient_(cell_rid, basis_id, dim);
+};
+
+
+Real HexRef::ref_patch_basis(int patch_rid, int basis_id) const {
+    return ref_patch_basis_(patch_rid, basis_id);
+};
+
+
+Real HexRef::ref_patch_gradient(int patch_rid, int basis_id, 
+    int dim) const {
+    return ref_patch_gradient_(patch_rid, basis_id, dim);
+};
diff --git a/elements-kokkos/src/LagrangeProductBasis.cpp b/elements-kokkos/src/LagrangeProductBasis.cpp
new file mode 100644
index 00000000..963c4a61
--- /dev/null
+++ b/elements-kokkos/src/LagrangeProductBasis.cpp
@@ -0,0 +1,289 @@
+#include "LagrangeProductBasis.h"
+#include "lagrange_polynomials.h"
+
+
+template <typename NumType>
+LagrangeProductBasis<NumType>::LagrangeProductBasis(const SizeType order, 
+    const NumType *vert_coords) : TensorProductBasis<NumType>(order), 
+    Z(vert_coords) {
+  N = Np + 1;
+  Ne = std::pow(N, Nd);
+
+  rad[0] = N;
+  rad[1] = N;
+  rad[2] = N;
+
+  // Allocate memory for weights and compute
+  w = new NumType[N];
+  lagrange::compute_barycentric_weights(N, Z, w);
+
+  // Allocate memory for intermediate coefficients
+  C = new NumType[3*N];
+}
+
+template <typename NumType>
+LagrangeProductBasis<NumType>::~LagrangeProductBasis() {
+  delete [] w, C;
+}
+
+/*
+ * Return evaluation of basis function, which is the tensor product of Lagrange
+ * polynomials, at specified coordinates.
+ *
+ * Parameters
+ * ----------
+ * I : index of basis function
+ * X : coordinates (in reference space)
+ */
+template <typename NumType>
+NumType LagrangeProductBasis<NumType>::eval_basis(const SizeType I, const NumType *X) {
+  // Decompose index of 3D tensor product basis function into indices of
+  // Lagrange polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+
+  // Check coincidence of coordinates with vertex coordinates
+  SizeType ix = lagrange::find_coincident_vertex(N, Z, X[0]);
+  SizeType iy = lagrange::find_coincident_vertex(N, Z, X[1]);
+  SizeType iz = lagrange::find_coincident_vertex(N, Z, X[2]);
+
+  // Evaluate Lagrange polynomials
+  NumType li = lagrange::eval(N, ijk[0], ix, Z, w, X[0]);
+  NumType lj = lagrange::eval(N, ijk[1], iy, Z, w, X[1]);
+  NumType lk = lagrange::eval(N, ijk[2], iz, Z, w, X[2]);
+
+  return li*lj*lk;
+}
+
+/*
+ * Evaluate gradient of basis function
+ *
+ * Parameters
+ * ----------
+ * I : index of basis function
+ * X : coordinates (in reference space)
+ *
+ * Returns
+ * -------
+ * grad_phi : gradient of the basis function
+ */
+template <typename NumType>
+void LagrangeProductBasis<NumType>::eval_grad_basis(const SizeType I, 
+    const  NumType *X, NumType *grad_phi) {
+  // Decompose index of 3D tensor product basis function into indices of
+  // Lagrange polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+
+  // Check coincidence of coordinates with vertex coordinates
+  SizeType ix = lagrange::find_coincident_vertex(N, Z, X[0]);
+  SizeType iy = lagrange::find_coincident_vertex(N, Z, X[1]);
+  SizeType iz = lagrange::find_coincident_vertex(N, Z, X[2]);
+
+  // Evaluate Lagrange polynomials
+  NumType li = lagrange::eval(N, ijk[0], ix, Z, w, X[0]);
+  NumType lj = lagrange::eval(N, ijk[1], iy, Z, w, X[1]);
+  NumType lk = lagrange::eval(N, ijk[2], iz, Z, w, X[2]);
+  
+  // Evaluate Lagrange polynomial derivatives
+  SizeType n = 1;  // order of derivative
+  NumType dli = lagrange::eval_der(N, n, ijk[0], ix, Z, w, X[0], C);
+  NumType dlj = lagrange::eval_der(N, n, ijk[1], iy, Z, w, X[1], C);
+  NumType dlk = lagrange::eval_der(N, n, ijk[2], iz, Z, w, X[2], C);
+
+  // Store partial derivatives in entries of gradient
+  grad_phi[0] = dli*lj*lk;
+  grad_phi[1] = li*dlj*lk;
+  grad_phi[2] = li*lj*dlk;
+}
+
+/*
+ * Return evaluation of local function approximation, which is formed by
+ * the sum of the products of tensor-product Lagrange basis functions and the
+ * provided coefficients, at specified coordinates
+ *
+ * Parameters
+ * ----------
+ * c : coefficients
+ * X : coordinates (in reference space)
+ *
+ * Note
+ * ----
+ * The intermediate coefficients or workspace array is divided into 3 lines of
+ * length equal to the number of vertices. In the dimension-by-dimension
+ * approach used for the approximation evaluation, each line is filled by
+ * evaluations at the previous dimensional level. The initial coefficients are
+ * cycled through in the X-dimension loop to produce coefficients, stored in
+ * the workspace array, to be used in the Y-dimension loop. The output from the
+ * Y-dimension loop is coefficients, stored again in the workspace array, to be
+ * used in the final Z-dimension evaluation. This process, as described, only
+ * requires 2 lines of workspace. The extra line of workspace is needed for
+ * derivative evaluations, and so it is unused here.
+ */
+template <typename NumType>
+NumType LagrangeProductBasis<NumType>::eval_approx(const NumType *c, 
+    const NumType *X) {
+  // Check the coincidence of the coordinates with vertex coordinates
+  SizeType ix = lagrange::find_coincident_vertex(N, Z, X[0]);
+  SizeType iy = lagrange::find_coincident_vertex(N, Z, X[1]);
+  SizeType iz = lagrange::find_coincident_vertex(N, Z, X[2]);
+
+  for (int k = 0; k < N; k++) {
+    for (int j = 0; j < N; j++) {
+      // Collapse first dimension into coefficients for second dimension
+      C[j] = lagrange::eval_interp(N, ix, Z, w, X[0], &c[j*N+k*N*N]);
+    }
+
+    // Collapse second dimension into coefficients for third dimension
+    C[N+k] = lagrange::eval_interp(N, iy, Z, w, X[1], C);
+  }
+
+  // Collapse third dimension into interpolant evaluation
+  return lagrange::eval_interp(N, iz, Z, w, X[2], &C[N]);
+}
+
+/*
+ * Evaluate gradient of the local function approximation, which is formed by
+ * the sum of the products of tensor-product Lagrange basis functions and the
+ * provided coefficients, at specified coordinates
+ *
+ * Parameters
+ * ----------
+ * c : coefficients
+ * X : coordinates (in reference space)
+ *
+ * Returns
+ * -------
+ * grad_f : gradient of the interpolant
+ *
+ * Note
+ * ----
+ * This routine uses a dimension-by-dimension approach as in the approximation
+ * routine. See the documentation of that routine for details. This routine
+ * differs from that one in that the first line of the workspace array is
+ * reserved for the derivative evaluations. These derivative evaluations use
+ * the first line to store divided differences.
+ */
+template <typename NumType>
+void LagrangeProductBasis<NumType>::eval_grad_approx(const NumType *c, 
+    const NumType *X, NumType *grad_f) {
+  const SizeType n = 1; // order of partial derivative
+
+  // Check the coincidence of the coordinates with vertex coordinates
+  SizeType ix = lagrange::find_coincident_vertex(N, Z, X[0]);
+  SizeType iy = lagrange::find_coincident_vertex(N, Z, X[1]);
+  SizeType iz = lagrange::find_coincident_vertex(N, Z, X[2]);
+
+  for (int l = 0; l < Nd; l++) {
+    for (int k = 0; k < N; k++) {
+      for (int j = 0; j < N; j++) {
+        // Collapse first dimension into coefficients for second dimension
+        std::copy(c+j*N+k*N*N, c+j*N+k*N*N+N, C);  // load first line
+        if (l == 0) {
+          C[N+j] = lagrange::eval_der_interp(N, n, ix, Z, w, X[0], C);
+        } else {
+          C[N+j] = lagrange::eval_interp(N, ix, Z, w, X[0], C);
+        }
+      }
+
+      // Collapse second dimension into coefficients for third dimension
+      std::copy(C+N, C+2*N, C); // second line back into first line
+      if (l == 1) {
+        C[2*N+k] = lagrange::eval_der_interp(N, n, iy, Z, w, X[1], C);
+      } else {
+        C[2*N+k] = lagrange::eval_interp(N, iy, Z, w, X[1], C);
+      }
+    }
+
+    // Collapse third dimension into interpolant evaluation
+    std::copy(C+2*N, C+3*N, C); // third line back into first line
+    if (l == 2) {
+      grad_f[l] = lagrange::eval_der_interp(N, n, iz, Z, w, X[2], C);
+    } else {
+      grad_f[l] = lagrange::eval_interp(N, iz, Z, w, X[2], C);
+    }
+  }
+}
+
+/*
+ * Evaluate the Jacobian of the spatial mapping
+ *
+ * Parameters
+ * ----------
+ * x : x coordinates in physical space
+ * y : y coordinates in physical space
+ * z : z coordinates in physical space
+ * X : coordinates in reference space at which to evaluate
+ * 
+ * Returns
+ * -------
+ * J : Jacobian (stored in column-major order)
+ */
+template <typename NumType>
+void LagrangeProductBasis<NumType>::eval_jac(const NumType *x, const NumType *y, 
+    const NumType *z, const NumType *X, NumType *J) {
+  // Evaluate the gradient of x = x(X, Y, Z)
+  this->eval_grad_approx(x, X, J);
+
+  // Evaluate the gradient of y = y(X, Y, Z)
+  this->eval_grad_approx(y, X, J+3);
+
+  // Evaluate the gradient of z = z(X, Y, Z)
+  this->eval_grad_approx(z, X, J+6);
+}
+
+/*
+ * Return the determinant of the Jacobian of the spatial mapping
+ *
+ * Parameters
+ * ----------
+ * x : x coordinates in physical space
+ * y : y coordinates in physical space
+ * z : z coordinates in physical space
+ * X : coordinates in reference space at which to evaluate
+ */
+template <typename NumType>
+NumType LagrangeProductBasis<NumType>::eval_det_jac(const NumType *x, 
+    const NumType *y, const NumType *z, const NumType *X) {
+  NumType J[9];
+  this->eval_jac(x, y, z, X, J);
+
+  return J[0]*(J[4]*J[8] - J[5]*J[7]) - J[1]*(J[3]*J[8] - J[5]*J[6]) 
+      + J[2]*(J[3]*J[7] - J[4]*J[6]);
+}
+
+/* Evaluate the inverse of the Jacobian of the spatial mapping
+ *
+ * Parameters
+ * ----------
+ * x : x coordinates in physical space
+ * y : y coordinates in physical space
+ * z : z coordinates in physical space
+ * X : coordinates in reference space at which to evaluate
+ * 
+ * Returns
+ * -------
+ * Jinv : Jacobian inverse (stored in column-major order)
+ */
+template <typename NumType>
+void LagrangeProductBasis<NumType>::eval_inv_jac(const NumType *x, const NumType *y, 
+    const NumType *z, const NumType *X, NumType *Jinv) {
+  NumType J[9];
+  this->eval_jac(x, y, z, X, J);
+
+  NumType d = 1.0/(this->eval_det_jac(x, y, z, X));
+
+  Jinv[0] = d*(J[4]*J[8] - J[5]*J[7]);
+  Jinv[1] = -d*(J[1]*J[8] - J[2]*J[7]);
+  Jinv[2] = d*(J[1]*J[5] - J[2]*J[4]);
+
+  Jinv[3] = -d*(J[3]*J[8] - J[5]*J[6]);
+  Jinv[4] = d*(J[0]*J[8] - J[2]*J[6]);
+  Jinv[5] = -d*(J[0]*J[5] - J[2]*J[3]);
+
+  Jinv[6] = d*(J[3]*J[7] - J[4]*J[6]);
+  Jinv[7] = -d*(J[0]*J[7] - J[1]*J[6]);
+  Jinv[8] = d*(J[0]*J[4] - J[1]*J[3]);
+}
+
+// Explicit instantiation of template class
+template class LagrangeProductBasis<Real>;
+template class LagrangeProductBasis<Complex>;
diff --git a/elements-kokkos/src/LegendreProductBasis.cpp b/elements-kokkos/src/LegendreProductBasis.cpp
new file mode 100644
index 00000000..72de37f1
--- /dev/null
+++ b/elements-kokkos/src/LegendreProductBasis.cpp
@@ -0,0 +1,168 @@
+#include "LegendreProductBasis.h"
+#include "legendre_polynomials.h"
+
+
+template <typename NumType>
+LegendreProductBasis<NumType>::LegendreProductBasis(const SizeType order) 
+    : TensorProductBasis<NumType>(order) {
+  N = Np + 1;
+  Ne = std::pow(N, Nd);
+
+  rad[0] = N;
+  rad[1] = N;
+  rad[2] = N;
+
+  // Allocate memory for intermediate coefficients
+  C = new NumType[2*N];
+}
+
+template <typename NumType>
+LegendreProductBasis<NumType>::~LegendreProductBasis() {}
+
+/*
+ * Return evaluation of basis function, which is the tensor product of Legendre
+ * polynomials, at specified coordinates.
+ *
+ * Parameters
+ * ----------
+ * I : index of basis function
+ * X : coordinates (in reference space)
+ */
+template <typename NumType>
+NumType LegendreProductBasis<NumType>::eval_basis(const SizeType I, 
+    const NumType *X) {
+  // Decompose index of 3D tensor product basis function into indices of
+  // Legendre polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+
+  // Evaluate Legendre polynomials
+  NumType Pi = legendre::eval(int(ijk[0]), X[0]);
+  NumType Pj = legendre::eval(int(ijk[1]), X[1]);
+  NumType Pk = legendre::eval(int(ijk[2]), X[2]);
+
+  return Pi*Pj*Pk;
+}
+
+/*
+ * Evaluate gradient of basis function
+ *
+ * Parameters
+ * ----------
+ * I : index of basis function
+ * X : coordinates (in reference space)
+ *
+ * Returns
+ * -------
+ * grad_phi : gradient of the basis function
+ */
+template <typename NumType>
+void LegendreProductBasis<NumType>::eval_grad_basis(const SizeType I, 
+    const  NumType *X, NumType *grad_phi) {
+  // Decompose index of 3D tensor product basis function into indices of
+  // Legendre polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+
+  // Evaluate Legendre polynomials
+  NumType Pi = legendre::eval(int(ijk[0]), X[0]);
+  NumType Pj = legendre::eval(int(ijk[1]), X[1]);
+  NumType Pk = legendre::eval(int(ijk[2]), X[2]);
+  
+  // Evaluate Legendre polynomial derivatives
+  SizeType q = 1;  // order of derivative
+  NumType dPi = legendre::eval_der(int(ijk[0]), q, X[0]);
+  NumType dPj = legendre::eval_der(int(ijk[1]), q, X[1]);
+  NumType dPk = legendre::eval_der(int(ijk[2]), q, X[2]);
+
+  // Store partial derivatives in entries of gradient
+  grad_phi[0] = dPi*Pj*Pk;
+  grad_phi[1] = Pi*dPj*Pk;
+  grad_phi[2] = Pi*Pj*dPk;
+}
+
+/*
+ * Return evaluation of local function approximation, which is formed by the
+ * sum of the products of tensor-product Legendre basis functions and the
+ * provided coefficients, at specified coordinates
+ *
+ * Parameters
+ * ----------
+ * c : coefficients
+ * X : coordinates (in reference space)
+ *
+ * Note
+ * ----
+ * The intermediate coefficients or workspace array is divided into 2 lines of
+ * length equal to the number of vertices. In the dimension-by-dimension
+ * approach used for the approximation evaluation, each line is filled by
+ * evaluations at the previous dimensional level. The initial coefficients are
+ * cycled through in the X-dimension loop to produce coefficients, stored in
+ * the workspace array, to be used in the Y-dimension loop. The output from the
+ * Y-dimension loop is coefficients, stored again in the workspace array, to be
+ * used in the final Z-dimension evaluation. 
+ */
+template <typename NumType>
+NumType LegendreProductBasis<NumType>::eval_approx(const NumType *c, 
+    const NumType *X) {
+  for (int k = 0; k < N; k++) {
+    for (int j = 0; j < N; j++) {
+      // Collapse first dimension into coefficients for second dimension
+      C[j] = legendre::eval_approx(N, &c[j*N+k*N*N], X[0]);
+    }
+
+    // Collapse second dimension into coefficients for third dimension
+    C[N+k] = legendre::eval_approx(N, C, X[1]);
+  }
+
+  // Collapse third dimension into approximation evaluation
+  return legendre::eval_approx(N, &C[N], X[2]);
+}
+
+/*
+ * Evaluate gradient of the local function approximation, which is formed by
+ * the sum of the products of tensor-product Legendre basis functions and the
+ * provided coefficients, at specified coordinates
+ *
+ * Parameters
+ * ----------
+ * c : coefficients
+ * X : coordinates (in reference space)
+ *
+ * Returns
+ * -------
+ * grad_f : gradient of the approximation
+ */
+template <typename NumType>
+void LegendreProductBasis<NumType>::eval_grad_approx(const NumType *c, 
+    const NumType *X, NumType *grad_f) {
+  SizeType q = 1;  // order of derivative
+  for (int l = 0; l < Nd; l++) {
+    for (int k = 0; k < N; k++) {
+      for (int j = 0; j < N; j++) {
+        // Collapse first dimension into coefficients for second dimension
+        if (l == 0) {
+          C[j] = legendre::eval_der_approx(N, 1, &c[j*N+k*N*N], X[0]);
+        } else {
+          C[j] = legendre::eval_approx(N, &c[j*N+k*N*N], X[0]);
+        }
+      }
+
+      // Collapse second dimension into coefficients for third dimension
+      if (l == 1) {
+        C[N+k] = legendre::eval_der_approx(N, q, C, X[1]);
+      } else {
+        C[N+k] = legendre::eval_approx(N, C, X[1]);
+      }
+    }
+
+    // Collapse third dimension into approximation evaluation
+    if (l == 2) {
+      grad_f[l] = legendre::eval_der_approx(N, q, &C[N], X[2]);
+    } else {
+      grad_f[l] = legendre::eval_approx(N, &C[N], X[2]);
+    }
+  }
+}
+
+// Explicit instantiation of template class
+template class LegendreProductBasis<Real>;
+template class LegendreProductBasis<Complex>;
diff --git a/elements-kokkos/src/TensorProductBasis.cpp b/elements-kokkos/src/TensorProductBasis.cpp
new file mode 100644
index 00000000..272cce2a
--- /dev/null
+++ b/elements-kokkos/src/TensorProductBasis.cpp
@@ -0,0 +1,36 @@
+#include "TensorProductBasis.h"
+
+#include "error.h"
+
+
+template <typename NumType>
+TensorProductBasis<NumType>::TensorProductBasis(const SizeType order) : Np(order) {}
+
+template <typename NumType>
+void TensorProductBasis<NumType>::eval_jac(const NumType *, const NumType *, 
+    const NumType *, const NumType *, NumType *) {
+  std::string err_msg(
+      "Error: Jacobian routines not implemented for this basis type.");
+  throw NotImplementedError(err_msg);
+};
+
+template <typename NumType>
+NumType TensorProductBasis<NumType>::eval_det_jac(const NumType *, 
+    const NumType *, const NumType *, const NumType *) {
+  std::string err_msg(
+      "Error: Jacobian routines not implemented for this basis type.");
+  throw NotImplementedError(err_msg);
+  return 0.0;
+}
+
+template <typename NumType>
+void TensorProductBasis<NumType>::eval_inv_jac(const NumType *, 
+    const NumType *, const NumType *, const NumType *, NumType *) {
+  std::string err_msg(
+      "Error: Jacobian routines not implemented for this basis type.");
+  throw NotImplementedError(err_msg);
+}
+
+// Explicit instantiation of template class
+template class TensorProductBasis<Real>;
+template class TensorProductBasis<Complex>;
diff --git a/elements-kokkos/src/bernstein_polynomials.cpp b/elements-kokkos/src/bernstein_polynomials.cpp
new file mode 100644
index 00000000..4df9858f
--- /dev/null
+++ b/elements-kokkos/src/bernstein_polynomials.cpp
@@ -0,0 +1,55 @@
+#include "bernstein_polynomials.h"
+
+
+namespace bernstein {
+  template <typename NumType> 
+  NumType eval( const int n, const int v, const NumType X) {
+    if ( n == 0 && v != 0 ) return 0;
+    if ( n == 0 && v == 0 ) return 1;
+    if ( n < v ) return 0;
+    return 0.5*((1.0 - X)*eval(n-1, v, X) + (1.0 + X)*eval(n-1, v-1, X)); 
+  };
+  template <typename NumType>
+  NumType eval_der( const int n, const int v, const NumType X) {
+    if ( n == 0) return 0;
+    if ( n < v) return 0;
+    if ( n == 1 && v == 0) return -0.5;
+    if ( n == 1 && v == 1) return 0.5;
+    return 0.5*((1.0 - X)*eval_der(n-1,v, X) + (1.0 + X)*eval_der(n-1, v-1, X) 
+        + eval(n-1, v-1, X) - eval(n-1, v, X));
+  };
+
+  template <typename NumType>
+  NumType eval_approx(const SizeType N, const NumType *c, const NumType X) {
+    NumType sum = 0.0;
+    for (SizeType j = 0; j < N; j++) {
+      sum += c[j]*eval(N, j, X);
+    }
+    return sum;
+  }
+
+  template <typename NumType>
+  NumType eval_der_approx(const SizeType N, const NumType *c, const NumType X) { 
+    NumType sum = 0.0;
+    for (SizeType j = 0; j < N; j ++) { 
+      sum += c[j]*eval_der( N, j, X);
+    }
+    return sum;
+  };
+
+
+  // Explicit instantiations of template functions
+  template Real eval(const int n, const int v, const Real X);
+  template Complex eval(const int n, const int v, const Complex X);
+  
+  template Real eval_der(const int n, const int v, const Real X);
+  template Complex eval_der(const int n, const int v, const Complex X);
+  
+  template Real eval_approx(const SizeType N, const Real *c, const Real X);
+  template Complex eval_approx(const SizeType N, const Complex *c, 
+      const Complex X);
+  
+  template Real eval_der_approx(const SizeType N, const Real *c, const Real X);
+  template Complex eval_der_approx(const SizeType N, const Complex *c, 
+      const Complex X);
+}
diff --git a/elements-kokkos/src/jacobi_polynomials.cpp b/elements-kokkos/src/jacobi_polynomials.cpp
new file mode 100644
index 00000000..98e4df68
--- /dev/null
+++ b/elements-kokkos/src/jacobi_polynomials.cpp
@@ -0,0 +1,33 @@
+#include "jacobi_polynomials.h"
+
+namespace jacobi {
+  template <typename NumType>
+  NumType eval(const int n, const Real alpha, const Real beta, 
+      const NumType X) {
+    if (n == -1) return 0.0;
+    if (n == 0) return 1.0;
+    return (a(alpha, beta, n)*X + b(alpha, beta, n))
+        *eval(n - 1, alpha, beta, X) 
+        - c(alpha, beta, n)*eval(n - 2, alpha, beta, X);
+  }
+
+  template <typename NumType>
+  NumType eval_der(const int n, const int k, const Real alpha, 
+      const Real beta, const NumType X) {
+    if (n == 0 || k > n) return 0.0;
+    Real theta = Real(n) + Real(alpha + alpha) + 1.0;
+    return std::pow(0.5, k)*std::tgamma(theta + Real(k))
+        *eval(n - k, alpha + Real(k), beta + Real(k), X)/std::tgamma(theta);
+  }
+
+  // Explicit instantiations of template functions
+  template Real eval(const int n, const Real alpha, const Real beta, 
+      const Real X);
+  template Complex eval(const int n, const Real alpha, const Real beta, 
+      const Complex X);
+
+  template Real eval_der(const int n, const int k, const Real alpha, 
+      const Real beta, const Real X);
+  template Complex eval_der(const int n, const int k, const Real alpha, 
+      const Real beta, const Complex X);
+}
diff --git a/elements-kokkos/src/lagrange_polynomials.cpp b/elements-kokkos/src/lagrange_polynomials.cpp
new file mode 100644
index 00000000..28ccb511
--- /dev/null
+++ b/elements-kokkos/src/lagrange_polynomials.cpp
@@ -0,0 +1,295 @@
+#include "lagrange_polynomials.h"
+
+namespace lagrange {
+  /*
+   * If the input coordinate is coincident with any of the input vertices,
+   * return the index of the vertex with which it coincides
+   *
+   * It is typically bad practice to check if two floating point numbers, say x
+   * and y, are equal by evaluating x - y == 0. But that is precisely what the
+   * authors suggest to do in "Barycentric Lagrange Interpolation" (Berrut and
+   * Trefethen, 2004). It is also what the authors of Nektar++ do in their
+   * implementation of barycentric Lagrange interpolation. I'm not sure what to
+   * do
+   *
+   * Parameters
+   * ----------
+   * N : number of vertices
+   * z : vertex coordinates
+   * x : coordinate to test
+   */
+  template <typename NumType>
+  SizeType find_coincident_vertex(const SizeType &N, const NumType *z, 
+      const NumType &x) {
+    // The largest number an unsigned integer can be
+    SizeType k = -1;
+
+    // Adding to -1 in unsigned rolls over zero
+    for (SizeType j = 0; j < N; j++) {
+      //k += (j + 1)*SizeType(z[j] - x == 0.0);
+      k += (j + 1)*SizeType(common::almost_equal(z[j], x));
+    }
+
+    return k;
+  }
+
+  /*
+   * Calculation of barycentric weights of Lagrange interpolant vertices
+   *
+   * Parameters
+   * ----------
+   * N : number of vertices 
+   * z : vertex coordinates
+   *
+   * Returns
+   * -------
+   * w : barycentric vertex weights
+   *
+   * Notes
+   * -----
+   * If one evaluates the interpolant using the barycentric formula of the
+   * second kind, the barycentric weights may be scaled by an arbitrary scalar
+   * (the maximum weight, for example). Since I am using the barycentric
+   * formula of the first kind (also called the modified Lagrange formula), I
+   * cannot and do not scale the weights 
+   */
+  template <typename NumType>
+  void compute_barycentric_weights(const SizeType &N, const NumType *z, 
+      NumType *w) {
+    // Compute weights
+    for (SizeType j = 0; j < N; j++) {
+      w[j] = 1.0;
+      for (SizeType k = 0; k < N; k++) {
+        // TODO optimization: make branchless? Trefethen uses log o exp trick
+        if (j != k) w[j] *= 1.0/(z[j] - z[k]);
+      }
+    }
+  };
+
+  /*
+   * Return evaluation of Lagrange polynomial associated with a given vertex at
+   * a given coordinate
+   *
+   * Parameters
+   * ----------
+   * Nv : number of vertices
+   * i  : index of vertex
+   * ic : index of vertex coincident with X (-1 if not coincident)
+   * Z  : vertex coordinates
+   * w  : barycentric weights
+   * X  : coordinate at which to evaluate
+   */
+  template <typename NumType>
+  NumType eval(const SizeType Nv, const SizeType i, const SizeType ic, 
+      const NumType *Z, const NumType *w, const NumType X) {
+    if (ic < Nv) return i == ic ? 1.0 : 0.0;
+
+    // Evaluate nodal polynomial
+    NumType L = 1.0;
+    for (SizeType j = 0; j < Nv; j++) L *= (X - Z[j]);
+
+    return L*w[i]/(X - Z[i]);
+  }
+
+  /* 
+   * Return evaluation of derivative of Lagrange polynomial associated with a
+   * given vertex at a given coordinate
+   *
+   * Currently implemented as a special case of interpolant derivative
+   * evaluation. There is probably a better way to do this, but I haven't
+   * gotten around to finding it and implementing it
+   *
+   * Parameters
+   * ----------
+   * Nv         : number of vertices
+   * n          : order of derivative
+   * i          : index of vertex
+   * ic : index of vertex coincident with X (-1 if not coincident)
+   * Z          : vertex coordinates
+   * w          : barycentric weights
+   * X          : coordinate at which to evaluate
+   * c          : work array for intermediate coefficients
+   */
+  template <typename NumType>
+  NumType eval_der(const SizeType Nv, const SizeType n, const SizeType i, 
+      const SizeType ic, const NumType *Z, const NumType *w, 
+      const NumType X, NumType *c) {
+    // Initialize coefficients of interpolant to Kronecker deltas to create
+    // interpolant equivalent to Lagrange polynomial at vertex
+    for (SizeType j = 0; j < Nv; j++) {
+      if (j != i) { c[j] = 0.0; } else { c[j] = 1.0; }
+    }
+    
+    NumType dnl = eval_der_interp<NumType>(Nv, n, ic, Z, w, X, c);
+
+    return dnl;
+  }
+
+  /* 
+   * Return evaluation of Lagrange interpolant, which is the sum of the
+   * products of Lagrange polynomials and provided coefficients, at specified
+   * coordinates
+   *
+   * This implementation uses the modified Lagrange formula (barycentric
+   * formula of the first kind) from "Barycentric Lagrange Interpolation"
+   * (Berrut and Trefethen, 2004). 
+   *
+   * Parameters
+   * ----------
+   * Nv : number of vertices 
+   * i  : index of vertex coincident with coordinate (-1 if not coincident)
+   * Z  : vertex coordinates
+   * w  : barycentric vertex weights
+   * X  : coordinate at which to evaluate interpolant
+   * c  : input coefficients (function values at vertices)
+   */
+  template <typename NumType>
+  NumType eval_interp(const SizeType Nv, const SizeType i, const NumType *Z, 
+      const NumType *w, const NumType X, const NumType *c) {
+    NumType p;
+    NumType L = 1.0;  // nodal polynomial evaluation
+
+    // Evaluate the interpolant
+    if (i < Nv) {  // coincident
+      return c[i];
+    } else {  // non-coincident
+      // Loop over vertices
+      p = 0.0;
+      for (SizeType j = 0; j < Nv; j++) {
+        // Contribution to interpolant evaluation
+        p += w[j]*c[j]/(X - Z[j]);
+
+        // Contribution to scalar (nodal polynomial evaluation)
+        L *= (X - Z[j]);
+      }
+
+      // Apply scaling
+      p *= L;
+    }
+
+    return p; 
+  }
+
+  /* 
+   * Return evaluation of derivative of Lagrange interpolant, which is the sum
+   * of the products of the derivatives of Lagrange polynomials and provided
+   * coefficients, at specified coordinates
+   *
+   * This implementation is based on the formula for the derivative of a
+   * rational interpolant given in "Some New Aspects of Rational Interpolation"
+   * (Schneider and Werner, 1986).
+   *
+   * Parameters
+   * ----------
+   * Nv : number of vertices 
+   * n  : order of derivative
+   * i  : index of vertex coincident with coordinate (-1 if not coincident)
+   * Z  : vertex coordinates
+   * w  : barycentric vertex weights
+   * X  : coordinate at which to eval interpolant
+   * c  : input coefficients (will be modified)
+   */
+  template <typename NumType>
+  NumType eval_der_interp(const SizeType Nv, const SizeType n, const SizeType i,
+      const NumType *Z, const NumType *w, const NumType X, NumType *c) {
+    // Interpolant and nodal polynomial evaluation
+    NumType p = 0.0;  // interpolant evaluation
+    NumType L = 1.0;  // nodal polynomial
+    if (i < Nv) {  // coincident
+      p = c[i];
+    } else {  // non-coincident
+      // Loop over vertices
+      p = 0.0;
+      for (SizeType j = 0; j < Nv; j++) {
+        // Contribution to interpolant evaluation
+        p += w[j]*c[j]/(X - Z[j]);
+
+        // Contribution to scalar (nodal polynomial evaluation)
+        L *= (X - Z[j]);
+      }
+
+      // Apply scaling
+      p *= L;
+    }
+
+    // Evaluate derivatives, building up to specified order
+    NumType dkp = p, dnp = 0.0;
+    NumType M = 1.0;  // factorial(k)
+    if (i < Nv) {  // coincident
+      for (SizeType k = 1; k <= n; k++) {
+        // Zero the output
+        dnp = 0.0;
+
+        for (SizeType j = 0; j < Nv; j++) {
+          // Calculate divided difference and store in intermediate coefficients
+          NumType sx = 1.0/(Z[i] - Z[j]);
+          c[j] = sx*(dkp - c[j]);
+
+          // Update output coefficient with contribution from vertex
+          // TODO optimization: make branchless? Trefethen uses log o exp trick
+          if (j != i) dnp += w[j]*c[j];
+        }
+
+        // Scale the output and copy for use in calculating next order
+        M *= k;
+        dnp *= -M/w[i];
+        dkp = dnp;
+      }
+    } else {  // non-coincident
+      for (SizeType k = 1; k <= n; k++) {
+        // Zero the output
+        dnp = 0.0;
+
+        for (SizeType j = 0; j < Nv; j++) {
+          // Calculate divided difference and store in intermediate coefficients
+          NumType sx = 1.0/(X - Z[j]);
+          c[j] = sx*(dkp - c[j]);
+
+          // Update output coefficient with contribution from vertex
+          dnp += w[j]*c[j]/(X - Z[j]);
+        }
+
+        // Scale the output and copy for use in calculating next order
+        M *= k;
+        dnp *= L*M;
+        dkp = dnp;
+      }
+    }
+
+    return dnp;
+  }
+
+  // Explicit instantiations of template functions
+  template SizeType find_coincident_vertex(const SizeType &N, const Real *z, 
+      const Real &x);
+  template SizeType find_coincident_vertex(const SizeType &N, const Complex *z, 
+      const Complex &x);
+
+  template void compute_barycentric_weights(const SizeType &N, const Real *z, 
+      Real *w);
+  template void compute_barycentric_weights(const SizeType &N, const Complex *z, 
+      Complex *w);
+
+  template Real eval(const SizeType Nv, const SizeType i, const SizeType ic, 
+      const Real *Z, const Real *w, const Real X);
+  template Complex eval(const SizeType Nv, const SizeType i, const SizeType ic, 
+      const Complex *Z, const Complex *w, const Complex X);
+
+  template Real eval_der(const SizeType Nv, const SizeType n, const SizeType i, 
+      const SizeType ic, const Real *Z, const Real *w, const Real X, Real *c);
+  template Complex eval_der(const SizeType Nv, const SizeType n, 
+      const SizeType i, const SizeType ic, const Complex *Z, const Complex *w, 
+      const Complex X, Complex *c);
+
+  template Real eval_interp(const SizeType Nv, const SizeType i, const Real *Z, 
+      const Real *w, const Real X, const Real *c);
+  template Complex eval_interp(const SizeType Nv, const SizeType i, 
+      const Complex *Z, const Complex *w, const Complex X, const Complex *c);
+
+  template Real eval_der_interp(const SizeType Nv, const SizeType n, 
+      const SizeType i, const Real *Z, const Real *w, const Real X, 
+      Real *c);
+  template Complex eval_der_interp(const SizeType Nv, const SizeType n, 
+      const SizeType i, const Complex *Z, const Complex *w, const Complex X, 
+      Complex *c);
+}
diff --git a/elements-kokkos/src/legendre_polynomials.cpp b/elements-kokkos/src/legendre_polynomials.cpp
new file mode 100644
index 00000000..3df8b496
--- /dev/null
+++ b/elements-kokkos/src/legendre_polynomials.cpp
@@ -0,0 +1,95 @@
+#include "legendre_polynomials.h"
+#include "jacobi_polynomials.h"
+
+namespace legendre {
+  /*
+   * Return evaluation of Legendre polynomial of specified order at given
+   * coordinate
+   *
+   * Parameters
+   * ----------
+   * n : order of Legendre polynomial
+   * X : coordinate in reference space [-1, 1]
+   */
+  template <typename NumType>
+  NumType eval(const int n, const NumType X) {
+    if (n == -1) return 0.0;
+    if (n == 0) return 1.0;
+    return (1.0/double(n))*((2.0*double(n) - 1.0)*X*eval(n - 1, X)
+        - (double(n) - 1.0)*eval(n - 2, X));
+  };
+
+  /*
+   * Return evaluation of Legendre polynomial of specified order at given
+   * coordinate
+   *
+   * Parameters
+   * ----------
+   * n : order of Legendre polynomial
+   * k : order of derivative
+   * X : coordinate in reference space [-1, 1]
+   */
+  template <typename NumType>
+  NumType eval_der(const int n, const int k, const NumType X) {
+    return jacobi::eval_der(n, k, 0.0, 0.0, X);
+  };
+
+  /*
+   * Return evaluation of Legendre polynomial approximation, which is the sum
+   * of the products of Legendre polynomials and provided coefficients, at a
+   * specified coordinate
+   * 
+   * Parameters
+   * ----------
+   * N : maximum polynomial order
+   * c : coefficients
+   * X : coordinate in reference space [-1, 1]
+   */
+  template <typename NumType>
+  NumType eval_approx(const SizeType N, const NumType *c, const NumType X) {
+    NumType sum = 0.0;
+    for (SizeType j = 0; j < N; j++) {
+      sum += c[j]*eval(j, X);
+    }
+
+    return sum;
+  }
+
+  /*
+   * Return evaluation of derivative of Legendre polynomial approximation,
+   * which is the sum of the products of the derivatives of the Legendre
+   * polynomials and provided coefficients, at a specified coordinate
+   * 
+   * Parameters
+   * ----------
+   * N : maximum polynomial order
+   * c : coefficients
+   * X : coordinate in reference space [-1, 1]
+   */
+  template <typename NumType>
+  NumType eval_der_approx(const SizeType N, const SizeType k, const NumType *c, 
+      const NumType X) {
+    NumType sum = 0.0;
+    for (SizeType j = 0; j < N; j++) {
+      sum += c[j]*eval_der(j, k, X);
+    }
+
+    return sum;
+  }
+
+  // Explicit instatiations of template functions
+  template Real eval(const int n, const Real X);
+  template Complex eval(const int n, const Complex X);
+
+  template Real eval_der(const int n, const int k, const Real X);
+  template Complex eval_der(const int n, const int k, const Complex X);
+
+  template Real eval_approx(const SizeType N, const Real *c, const Real X);
+  template Complex eval_approx(const SizeType N, const Complex *c, 
+      const Complex X);
+
+  template Real eval_der_approx(const SizeType N, const SizeType k, 
+      const Real *c, const Real X);
+  template Complex eval_der_approx(const SizeType N, const SizeType k, 
+      const Complex *c, const Complex X);
+}
diff --git a/elements-kokkos/src/points_and_weights.cpp b/elements-kokkos/src/points_and_weights.cpp
new file mode 100644
index 00000000..aa625bcd
--- /dev/null
+++ b/elements-kokkos/src/points_and_weights.cpp
@@ -0,0 +1,1104 @@
+#include "points_and_weights.h"
+#include "error.h"
+
+
+/*
+ * Fill an array with points equally spaced between end points (inclusive)
+ *
+ * Parameters
+ * ----------
+ * N  : number of points
+ * Zl : left end point 
+ * Zr : right end point 
+ *
+ * Returns
+ * -------
+ * Z : array of equispaced points
+ */
+template <typename NumType>
+void equispaced_points(SizeType N, NumType &Zl, NumType &Zr, NumType *Z) {
+  for (SizeType i = 0; i < N; i++)
+    Z[i] = Zl + double(i)/double(N - 1)*(Zr - Zl);
+}
+
+
+/*
+ * Fill an array with Chebyshev points of the second kind in the interval
+ * defined by the specified end points. The symmetry-preserving technique from
+ * Chebfun (chebtech1/chebpts.m) is used.
+ *
+ * Parameters
+ * ----------
+ * N  : number of points
+ * Zl : left end point 
+ * Zr : right end point 
+ *
+ * Returns
+ * -------
+ * Z : array of Chebyshev points
+ */
+template <typename NumType>
+void chebyshev_points(SizeType N, NumType &Zl, NumType &Zr, NumType *Z) {
+  // Evaluate the points using sine function to preserve symmetry
+  NumType f = 0.5*M_PI/double(N);
+  for (SizeType i = 0; i < N; i++) {
+    int j = (-1*int(N) + 1) + 2*int(i);
+    Z[i] = sin(f*double(j));
+  }
+
+  // Scale the points to fit the domain
+  for (int i = 0; i < N; i++)
+    Z[i] = 0.5*(1.0 - Z[i])*Zl + 0.5*(1.0 + Z[i])*Zr;
+}
+
+
+// Explicit instantations of template functions
+template void equispaced_points(SizeType N, Real &Zl, Real &Zr, Real *Z);
+template void equispaced_points(SizeType N, Complex &Zl, Complex &Zr, Complex *Z);
+
+template void chebyshev_points(SizeType N, Real &Zl, Real &Zr, Real *Z);
+template void chebyshev_points(SizeType N, Complex &Zl, Complex &Zr, Complex *Z);
+
+
+/*
+ * Fill an array with Gauss-Lobatto quadrature points (endpoints are -1 and 1)
+ *
+ * Parameters
+ * ----------
+ * lob_points : MATAR CArray to fill with points
+ * num        : number of points in quadrature rule
+ *
+ * Returns
+ * -------
+ * lob_points : MATAR CArray to fill with points
+ */
+
+void lobatto_points(MatarRealCArray &lob_points, const int &num) {
+  switch (num) {
+    case 1:
+      lob_points(0) = 0.0;
+      break;
+    case 2:
+      lob_points(0) = -1.0;
+      lob_points(1) =  1.0;
+      break;
+    case 3:
+      lob_points(0) = -1.0;
+      lob_points(1) =  0.0;
+      lob_points(2) =  1.0;
+      break;
+    case 4:
+      lob_points(0) = -1.0;
+      lob_points(1) = -1.0/5.0*sqrt(5.0);
+      lob_points(2) =  1.0/5.0*sqrt(5.0);
+      lob_points(3) =  1.0;
+      break;
+    case 5:
+      lob_points(0) = -1.0;
+      lob_points(1) = -1.0/7.0*sqrt(21.0);
+      lob_points(2) =  0.0;
+      lob_points(3) =  1.0/7.0*sqrt(21.0);
+      lob_points(4) =  1.0;
+      break;
+    case 6:
+      lob_points(0) = -1.0;
+      lob_points(1) = -sqrt(1.0/21.0*(7.0 + 2.0*sqrt(7.0)));
+      lob_points(2) = -sqrt(1.0/21.0*(7.0 - 2.0*sqrt(7.0)));
+      lob_points(3) =  sqrt(1.0/21.0*(7.0 - 2.0*sqrt(7.0)));
+      lob_points(4) =  sqrt(1.0/21.0*(7.0 +2.0*sqrt(7.0)));
+      lob_points(5) =  1.0;
+      break;
+    case 7:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.830223896278566929872032213967E+00;
+      lob_points(2) =  - 0.468848793470714213803771881909E+00;
+      lob_points(3) =    0.0E+00;
+      lob_points(4) =    0.468848793470714213803771881909E+00;
+      lob_points(5) =    0.830223896278566929872032213967E+00;
+      lob_points(6) =    1.0E+00;
+      break;
+    case 8:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.871740148509606615337445761221E+00;
+      lob_points(2) =  - 0.591700181433142302144510731398E+00;
+      lob_points(3) =  - 0.209299217902478868768657260345E+00;
+      lob_points(4) =    0.209299217902478868768657260345E+00;
+      lob_points(5) =    0.591700181433142302144510731398E+00;
+      lob_points(6) =    0.871740148509606615337445761221E+00;
+      lob_points(7) =    1.0E+00;
+      break;
+    case 9:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.899757995411460157312345244418E+00;
+      lob_points(2) =  - 0.677186279510737753445885427091E+00;
+      lob_points(3) =  - 0.363117463826178158710752068709E+00;
+      lob_points(4) =    0.0E+00;
+      lob_points(5) =    0.363117463826178158710752068709E+00;
+      lob_points(6) =    0.677186279510737753445885427091E+00;
+      lob_points(7) =    0.899757995411460157312345244418E+00;
+      lob_points(8) =    1.0E+00;
+      break;
+    case 10:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.919533908166458813828932660822E+00;
+      lob_points(2) =  - 0.738773865105505075003106174860E+00;
+      lob_points(3) =  - 0.477924949810444495661175092731E+00;
+      lob_points(4) =  - 0.165278957666387024626219765958E+00;
+      lob_points(5) =    0.165278957666387024626219765958E+00;
+      lob_points(6) =    0.477924949810444495661175092731E+00;
+      lob_points(7) =    0.738773865105505075003106174860E+00;
+      lob_points(8) =    0.919533908166458813828932660822E+00;
+      lob_points(9) =   1.0E+00;
+      break;
+    case 11:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.934001430408059134332274136099E+00;
+      lob_points(2) =  - 0.784483473663144418622417816108E+00;
+      lob_points(3) =  - 0.565235326996205006470963969478E+00;
+      lob_points(4) =  - 0.295758135586939391431911515559E+00;
+      lob_points(5) =    0.0E+00;
+      lob_points(6) =    0.295758135586939391431911515559E+00;
+      lob_points(7) =    0.565235326996205006470963969478E+00;
+      lob_points(8) =    0.784483473663144418622417816108E+00;
+      lob_points(9) =   0.934001430408059134332274136099E+00;
+      lob_points(10) =   1.0E+00;
+      break;
+    case 12:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.944899272222882223407580138303E+00;
+      lob_points(2) =  - 0.819279321644006678348641581717E+00;
+      lob_points(3) =  - 0.632876153031869677662404854444E+00;
+      lob_points(4) =  - 0.399530940965348932264349791567E+00;
+      lob_points(5) =  - 0.136552932854927554864061855740E+00;
+      lob_points(6) =    0.136552932854927554864061855740E+00;
+      lob_points(7) =    0.399530940965348932264349791567E+00;
+      lob_points(8) =    0.632876153031869677662404854444E+00;
+      lob_points(9) =   0.819279321644006678348641581717E+00;
+      lob_points(10) =   0.944899272222882223407580138303E+00;
+      lob_points(11) =   1.0E+00;
+      break;
+    case 13:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.953309846642163911896905464755E+00;
+      lob_points(2) =  - 0.846347564651872316865925607099E+00;
+      lob_points(3) =  - 0.686188469081757426072759039566E+00;
+      lob_points(4) =  - 0.482909821091336201746937233637E+00;
+      lob_points(5) =  - 0.249286930106239992568673700374E+00;
+      lob_points(6) =    0.0E+00;
+      lob_points(7) =    0.249286930106239992568673700374E+00;
+      lob_points(8) =    0.482909821091336201746937233637E+00;
+      lob_points(9) =   0.686188469081757426072759039566E+00;
+      lob_points(10) =   0.846347564651872316865925607099E+00;
+      lob_points(11) =   0.953309846642163911896905464755E+00;
+      lob_points(12) =   1.0E+00;
+      break;
+    case 14:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.959935045267260901355100162015E+00;
+      lob_points(2) =  - 0.867801053830347251000220202908E+00;
+      lob_points(3) =  - 0.728868599091326140584672400521E+00;
+      lob_points(4) =  - 0.550639402928647055316622705859E+00;
+      lob_points(5) =  - 0.342724013342712845043903403642E+00;
+      lob_points(6) =  - 0.116331868883703867658776709736E+00;
+      lob_points(7) =    0.116331868883703867658776709736E+00;
+      lob_points(8) =    0.342724013342712845043903403642E+00;
+      lob_points(9) =   0.550639402928647055316622705859E+00;
+      lob_points(10) =   0.728868599091326140584672400521E+00;
+      lob_points(11) =   0.867801053830347251000220202908E+00;
+      lob_points(12) =   0.959935045267260901355100162015E+00;
+      lob_points(13) =   1.0E+00;
+      break;
+    case 15:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.965245926503838572795851392070E+00;
+      lob_points(2) =  - 0.885082044222976298825401631482E+00;
+      lob_points(3) =  - 0.763519689951815200704118475976E+00;
+      lob_points(4) =  - 0.606253205469845711123529938637E+00;
+      lob_points(5) =  - 0.420638054713672480921896938739E+00;
+      lob_points(6) =  - 0.215353955363794238225679446273E+00;
+      lob_points(7) =    0.0E+00;
+      lob_points(8) =    0.215353955363794238225679446273E+00;
+      lob_points(9) =   0.420638054713672480921896938739E+00;
+      lob_points(10) =   0.606253205469845711123529938637E+00;
+      lob_points(11) =   0.763519689951815200704118475976E+00;
+      lob_points(12) =   0.885082044222976298825401631482E+00;
+      lob_points(13) =   0.965245926503838572795851392070E+00;
+      lob_points(14) =   1.0E+00;
+      break;
+    case 16:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.969568046270217932952242738367E+00;
+      lob_points(2) =  - 0.899200533093472092994628261520E+00;
+      lob_points(3) =  - 0.792008291861815063931088270963E+00;
+      lob_points(4) =  - 0.652388702882493089467883219641E+00;
+      lob_points(5) =  - 0.486059421887137611781890785847E+00;
+      lob_points(6) =  - 0.299830468900763208098353454722E+00;
+      lob_points(7) =  - 0.101326273521949447843033005046E+00;
+      lob_points(8) =    0.101326273521949447843033005046E+00;
+      lob_points(9) =   0.299830468900763208098353454722E+00;
+      lob_points(10) =   0.486059421887137611781890785847E+00;
+      lob_points(11) =   0.652388702882493089467883219641E+00;
+      lob_points(12) =   0.792008291861815063931088270963E+00;
+      lob_points(13) =   0.899200533093472092994628261520E+00;
+      lob_points(14) =   0.969568046270217932952242738367E+00;
+      lob_points(15) =   1.0E+00;
+      break;
+    case 17:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.973132176631418314156979501874E+00;
+      lob_points(2) =  - 0.910879995915573595623802506398E+00;
+      lob_points(3) =  - 0.815696251221770307106750553238E+00;
+      lob_points(4) =  - 0.691028980627684705394919357372E+00;
+      lob_points(5) =  - 0.541385399330101539123733407504E+00;
+      lob_points(6) =  - 0.372174433565477041907234680735E+00;
+      lob_points(7) =  - 0.189511973518317388304263014753E+00;
+      lob_points(8) =    0.0E+00;
+      lob_points(9) =    0.189511973518317388304263014753E+00;
+      lob_points(10) =   0.372174433565477041907234680735E+00;
+      lob_points(11) =   0.541385399330101539123733407504E+00;
+      lob_points(12) =   0.691028980627684705394919357372E+00;
+      lob_points(13) =   0.815696251221770307106750553238E+00;
+      lob_points(14) =   0.910879995915573595623802506398E+00;
+      lob_points(15) =   0.973132176631418314156979501874E+00;
+      lob_points(16) =   1.0E+00;
+      break;
+    case 18:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.976105557412198542864518924342E+00;
+      lob_points(2) =  - 0.920649185347533873837854625431E+00;
+      lob_points(3) =  - 0.835593535218090213713646362328E+00;
+      lob_points(4) =  - 0.723679329283242681306210365302E+00;
+      lob_points(5) =  - 0.588504834318661761173535893194E+00;
+      lob_points(6) =  - 0.434415036912123975342287136741E+00;
+      lob_points(7) =  - 0.266362652878280984167665332026E+00;
+      lob_points(8) =  - 0.897490934846521110226450100886E-01;
+      lob_points(9) =    0.897490934846521110226450100886E-01;
+      lob_points(10) =   0.266362652878280984167665332026E+00;
+      lob_points(11) =   0.434415036912123975342287136741E+00;
+      lob_points(12) =   0.588504834318661761173535893194E+00;
+      lob_points(13) =   0.723679329283242681306210365302E+00;
+      lob_points(14) =   0.835593535218090213713646362328E+00;
+      lob_points(15) =   0.920649185347533873837854625431E+00;
+      lob_points(16) =   0.976105557412198542864518924342E+00;
+      lob_points(17) =   1.0E+00;
+      break;
+    case 19:
+      lob_points(0)=   - 1.0E+00;
+      lob_points(1)=   - 0.978611766222080095152634063110E+00;
+      lob_points(2)=   - 0.928901528152586243717940258797E+00;
+      lob_points(3)=   - 0.852460577796646093085955970041E+00;
+      lob_points(4)=   - 0.751494202552613014163637489634E+00;
+      lob_points(5)=   - 0.628908137265220497766832306229E+00;
+      lob_points(6)=   - 0.488229285680713502777909637625E+00;
+      lob_points(7)=   - 0.333504847824498610298500103845E+00;
+      lob_points(8)=   - 0.169186023409281571375154153445E+00;
+      lob_points(9)=     0.0E+00;
+      lob_points(10) =   0.169186023409281571375154153445E+00;
+      lob_points(11) =   0.333504847824498610298500103845E+00;
+      lob_points(12) =   0.488229285680713502777909637625E+00;
+      lob_points(13) =   0.628908137265220497766832306229E+00;
+      lob_points(14) =   0.751494202552613014163637489634E+00;
+      lob_points(15) =   0.852460577796646093085955970041E+00;
+      lob_points(16) =   0.928901528152586243717940258797E+00;
+      lob_points(17) =   0.978611766222080095152634063110E+00;
+      lob_points(18) =   1.0E+00;
+      break;
+    case 20:
+      lob_points(0) =  - 1.0E+00;
+      lob_points(1) =  - 0.980743704893914171925446438584E+00;
+      lob_points(2) =  - 0.935934498812665435716181584931E+00;
+      lob_points(3) =  - 0.866877978089950141309847214616E+00;
+      lob_points(4) =  - 0.775368260952055870414317527595E+00;
+      lob_points(5) =  - 0.663776402290311289846403322971E+00;
+      lob_points(6) =  - 0.534992864031886261648135961829E+00;
+      lob_points(7) =  - 0.392353183713909299386474703816E+00;
+      lob_points(8) =  - 0.239551705922986495182401356927E+00;
+      lob_points(9) =  - 0.805459372388218379759445181596E-01;
+      lob_points(10) =   0.805459372388218379759445181596E-01;
+      lob_points(11) =   0.239551705922986495182401356927E+00;
+      lob_points(12) =   0.392353183713909299386474703816E+00;
+      lob_points(13) =   0.534992864031886261648135961829E+00;
+      lob_points(14) =   0.663776402290311289846403322971E+00;
+      lob_points(15) =   0.775368260952055870414317527595E+00;
+      lob_points(16) =   0.866877978089950141309847214616E+00;
+      lob_points(17) =   0.935934498812665435716181584931E+00;
+      lob_points(18) =   0.980743704893914171925446438584E+00;
+      lob_points(19) =   1.0E+00;
+      break;
+    default:
+      std::ostringstream ss;
+      ss << "Error: " << num << "-point quadrature rule not implemented.";
+      throw NotImplementedError(ss.str());
+  }
+} 
+
+
+/*
+ * Fill an array with Gauss-Lobatto quadrature weights
+ *
+ * Parameters
+ * ----------
+ * lob_weights : MATAR CArray to fill with weights
+ * num         : number of points in quadrature rule
+ *
+ * Returns
+ * -------
+ * lob_weights : MATAR CArray to fill with weights
+ */
+void lobatto_weights(MatarRealCArray &lob_weights, const int &num) {
+  switch (num) {
+    case 1:
+      lob_weights(0) = 2.0;
+      break;
+    case 2:
+      lob_weights(0) = 1.0;
+      lob_weights(1) = 1.0;
+      break;
+    case 3:
+      lob_weights(0) = 1.0/3.0;
+      lob_weights(1) = 4.0/3.0;
+      lob_weights(2) = 1.0/3.0;
+      break;
+    case 4:
+      lob_weights(0) = 1.0/6.0;
+      lob_weights(1) = 5.0/6.0;
+      lob_weights(2) = 5.0/6.0;
+      lob_weights(3) = 1.0/6.0;
+      break;
+    case 5:
+      lob_weights(0) = 1.0/10.0;
+      lob_weights(1) = 49.0/90.0;
+      lob_weights(2) = 32.0/45.0;
+      lob_weights(3) = 49.0/90.0;
+      lob_weights(4) = 1.0/10.0;
+      break;
+    case 6:
+      lob_weights(0) = 1.0/15.0;
+      lob_weights(1) = 1.0/30.0*(14.0 - sqrt(7.0));
+      lob_weights(2) = 1.0/30.0*(14.0 + sqrt(7.0));
+      lob_weights(3) = 1.0/30.0*(14.0 + sqrt(7.0));
+      lob_weights(4) = 1.0/30.0*(14.0 - sqrt(7.0));
+      lob_weights(5) = 1.0/15.0;
+      break;
+    case 7:
+      lob_weights(0) =  0.476190476190476190476190476190E-01;
+      lob_weights(1) =  0.276826047361565948010700406290E+00;
+      lob_weights(2) =  0.431745381209862623417871022281E+00;
+      lob_weights(3) =  0.487619047619047619047619047619E+00;
+      lob_weights(4) =  0.431745381209862623417871022281E+00;
+      lob_weights(5) =  0.276826047361565948010700406290E+00;
+      lob_weights(6) =  0.476190476190476190476190476190E-01;
+      break;
+    case 8:
+      lob_weights(0) =  0.357142857142857142857142857143E-01;
+      lob_weights(1) =  0.210704227143506039382991065776E+00;
+      lob_weights(2) =  0.341122692483504364764240677108E+00;
+      lob_weights(3) =  0.412458794658703881567052971402E+00;
+      lob_weights(4) =  0.412458794658703881567052971402E+00;
+      lob_weights(5) =  0.341122692483504364764240677108E+00;
+      lob_weights(6) =  0.210704227143506039382991065776E+00;
+      lob_weights(7) =  0.357142857142857142857142857143E-01;
+      break;
+    case 9:
+      lob_weights(0) =  0.277777777777777777777777777778E-01;
+      lob_weights(1) =  0.165495361560805525046339720029E+00;
+      lob_weights(2) =  0.274538712500161735280705618579E+00;
+      lob_weights(3) =  0.346428510973046345115131532140E+00;
+      lob_weights(4) =  0.371519274376417233560090702948E+00;
+      lob_weights(5) =  0.346428510973046345115131532140E+00;
+      lob_weights(6) =  0.274538712500161735280705618579E+00;
+      lob_weights(7) =  0.165495361560805525046339720029E+00;
+      lob_weights(8) =  0.277777777777777777777777777778E-01;
+      break;
+    case 10:
+      lob_weights(0) =  0.222222222222222222222222222222E-01;
+      lob_weights(1) =  0.133305990851070111126227170755E+00;
+      lob_weights(2) =  0.224889342063126452119457821731E+00;
+      lob_weights(3) =  0.292042683679683757875582257374E+00;
+      lob_weights(4) =  0.327539761183897456656510527917E+00;
+      lob_weights(5) =  0.327539761183897456656510527917E+00;
+      lob_weights(6) =  0.292042683679683757875582257374E+00;
+      lob_weights(7) =  0.224889342063126452119457821731E+00;
+      lob_weights(8) =  0.133305990851070111126227170755E+00;
+      lob_weights(9) =  0.222222222222222222222222222222E-01;
+      break;
+    case 11:
+      lob_weights(0) =  0.181818181818181818181818181818E-01;
+      lob_weights(1) =  0.109612273266994864461403449580E+00;
+      lob_weights(2) =  0.187169881780305204108141521899E+00;
+      lob_weights(3) =  0.248048104264028314040084866422E+00;
+      lob_weights(4) =  0.286879124779008088679222403332E+00;
+      lob_weights(5) =  0.300217595455690693785931881170E+00;
+      lob_weights(6) =  0.286879124779008088679222403332E+00;
+      lob_weights(7) =  0.248048104264028314040084866422E+00;
+      lob_weights(8) =  0.187169881780305204108141521899E+00;
+      lob_weights(9) =  0.109612273266994864461403449580E+00;
+      lob_weights(10)=  0.181818181818181818181818181818E-01;
+      break;
+    case 12:
+      lob_weights(0) =  0.151515151515151515151515151515E-01;
+      lob_weights(1) =  0.916845174131961306683425941341E-01;
+      lob_weights(2) =  0.157974705564370115164671062700E+00;
+      lob_weights(3) =  0.212508417761021145358302077367E+00;
+      lob_weights(4) =  0.251275603199201280293244412148E+00;
+      lob_weights(5) =  0.271405240910696177000288338500E+00;
+      lob_weights(6) =  0.271405240910696177000288338500E+00;
+      lob_weights(7) =  0.251275603199201280293244412148E+00;
+      lob_weights(8) =  0.212508417761021145358302077367E+00;
+      lob_weights(9) =  0.157974705564370115164671062700E+00;
+      lob_weights(10) = 0.916845174131961306683425941341E-01;
+      lob_weights(11) = 0.151515151515151515151515151515E-01;
+      break;
+    case 13:
+      lob_weights(0) =  0.128205128205128205128205128205E-01;
+      lob_weights(1) =  0.778016867468189277935889883331E-01;
+      lob_weights(2) =  0.134981926689608349119914762589E+00;
+      lob_weights(3) =  0.183646865203550092007494258747E+00;
+      lob_weights(4) =  0.220767793566110086085534008379E+00;
+      lob_weights(5) =  0.244015790306676356458578148360E+00;
+      lob_weights(6) =  0.251930849333446736044138641541E+00;
+      lob_weights(7) =  0.244015790306676356458578148360E+00;
+      lob_weights(8) =  0.220767793566110086085534008379E+00;
+      lob_weights(9) =  0.183646865203550092007494258747E+00;
+      lob_weights(10) = 0.134981926689608349119914762589E+00;
+      lob_weights(11) = 0.778016867468189277935889883331E-01;
+      lob_weights(12) = 0.128205128205128205128205128205E-01;
+      break;
+    case 14:
+      lob_weights(0) =  0.109890109890109890109890109890E-01;
+      lob_weights(1) =  0.668372844976812846340706607461E-01;
+      lob_weights(2) =  0.116586655898711651540996670655E+00;
+      lob_weights(3) =  0.160021851762952142412820997988E+00;
+      lob_weights(4) =  0.194826149373416118640331778376E+00;
+      lob_weights(5) =  0.219126253009770754871162523954E+00;
+      lob_weights(6) =  0.231612794468457058889628357293E+00;
+      lob_weights(7) =  0.231612794468457058889628357293E+00;
+      lob_weights(8) =  0.219126253009770754871162523954E+00;
+      lob_weights(9) =  0.194826149373416118640331778376E+00;
+      lob_weights(10) = 0.160021851762952142412820997988E+00;
+      lob_weights(11) = 0.116586655898711651540996670655E+00;
+      lob_weights(12) = 0.668372844976812846340706607461E-01;
+      lob_weights(13) = 0.109890109890109890109890109890E-01;
+      break;
+    case 15:
+      lob_weights(0) =  0.952380952380952380952380952381E-02;
+      lob_weights(1) =  0.580298930286012490968805840253E-01;
+      lob_weights(2) =  0.101660070325718067603666170789E+00;
+      lob_weights(3) =  0.140511699802428109460446805644E+00;
+      lob_weights(4) =  0.172789647253600949052077099408E+00;
+      lob_weights(5) =  0.196987235964613356092500346507E+00;
+      lob_weights(6) =  0.211973585926820920127430076977E+00;
+      lob_weights(7) =  0.217048116348815649514950214251E+00;
+      lob_weights(8) =  0.211973585926820920127430076977E+00;
+      lob_weights(9) =  0.196987235964613356092500346507E+00;
+      lob_weights(10) = 0.172789647253600949052077099408E+00;
+      lob_weights(11) = 0.140511699802428109460446805644E+00;
+      lob_weights(12) = 0.101660070325718067603666170789E+00;
+      lob_weights(13) = 0.580298930286012490968805840253E-01;
+      lob_weights(14) = 0.952380952380952380952380952381E-02;
+    case 16:
+      lob_weights(0) =  0.833333333333333333333333333333E-02;
+      lob_weights(1) =  0.508503610059199054032449195655E-01;
+      lob_weights(2) =  0.893936973259308009910520801661E-01;
+      lob_weights(3) =  0.124255382132514098349536332657E+00;
+      lob_weights(4) =  0.154026980807164280815644940485E+00;
+      lob_weights(5) =  0.177491913391704125301075669528E+00;
+      lob_weights(6) =  0.193690023825203584316913598854E+00;
+      lob_weights(7) =  0.201958308178229871489199125411E+00;
+      lob_weights(8) =  0.201958308178229871489199125411E+00;
+      lob_weights(9) =  0.193690023825203584316913598854E+00;
+      lob_weights(10) = 0.177491913391704125301075669528E+00;
+      lob_weights(11) = 0.154026980807164280815644940485E+00;
+      lob_weights(12) = 0.124255382132514098349536332657E+00;
+      lob_weights(13) = 0.893936973259308009910520801661E-01;
+      lob_weights(14) = 0.508503610059199054032449195655E-01;
+      lob_weights(15) = 0.833333333333333333333333333333E-02;
+      break;
+    case 17:
+      lob_weights(0) =  0.735294117647058823529411764706E-02;
+      lob_weights(1) =  0.449219405432542096474009546232E-01;
+      lob_weights(2) =  0.791982705036871191902644299528E-01;
+      lob_weights(3) =  0.110592909007028161375772705220E+00;
+      lob_weights(4) =  0.137987746201926559056201574954E+00;
+      lob_weights(5) =  0.160394661997621539516328365865E+00;
+      lob_weights(6) =  0.177004253515657870436945745363E+00;
+      lob_weights(7) =  0.187216339677619235892088482861E+00;
+      lob_weights(8) =  0.190661874753469433299407247028E+00;
+      lob_weights(9) =  0.187216339677619235892088482861E+00;
+      lob_weights(10) = 0.177004253515657870436945745363E+00;
+      lob_weights(11) = 0.160394661997621539516328365865E+00;
+      lob_weights(12) = 0.137987746201926559056201574954E+00;
+      lob_weights(13) = 0.110592909007028161375772705220E+00;
+      lob_weights(14) = 0.791982705036871191902644299528E-01;
+      lob_weights(15) = 0.449219405432542096474009546232E-01;
+      lob_weights(16) = 0.735294117647058823529411764706E-02;
+      break;
+    case 18:
+      lob_weights(0) =  0.653594771241830065359477124183E-02;
+      lob_weights(1) =  0.399706288109140661375991764101E-01;
+      lob_weights(2) =  0.706371668856336649992229601678E-01;
+      lob_weights(3) =  0.990162717175028023944236053187E-01;
+      lob_weights(4) =  0.124210533132967100263396358897E+00;
+      lob_weights(5) =  0.145411961573802267983003210494E+00;
+      lob_weights(6) =  0.161939517237602489264326706700E+00;
+      lob_weights(7) =  0.173262109489456226010614403827E+00;
+      lob_weights(8) =  0.179015863439703082293818806944E+00;
+      lob_weights(9) =  0.179015863439703082293818806944E+00;
+      lob_weights(10) = 0.173262109489456226010614403827E+00;
+      lob_weights(11) = 0.161939517237602489264326706700E+00;
+      lob_weights(12) = 0.145411961573802267983003210494E+00;
+      lob_weights(13) = 0.124210533132967100263396358897E+00;
+      lob_weights(14) = 0.990162717175028023944236053187E-01;
+      lob_weights(15) = 0.706371668856336649992229601678E-01;
+      lob_weights(16) = 0.399706288109140661375991764101E-01;
+      lob_weights(17) = 0.653594771241830065359477124183E-02;
+      break;
+    case 19:
+      lob_weights(0) =  0.584795321637426900584795321637E-02;
+      lob_weights(1) =  0.357933651861764771154255690351E-01;
+      lob_weights(2) =  0.633818917626297368516956904183E-01;
+      lob_weights(3) =  0.891317570992070844480087905562E-01;
+      lob_weights(4) =  0.112315341477305044070910015464E+00;
+      lob_weights(5) =  0.132267280448750776926046733910E+00;
+      lob_weights(6) =  0.148413942595938885009680643668E+00;
+      lob_weights(7) =  0.160290924044061241979910968184E+00;
+      lob_weights(8) =  0.167556584527142867270137277740E+00;
+      lob_weights(9) =  0.170001919284827234644672715617E+00;
+      lob_weights(10) = 0.167556584527142867270137277740E+00;
+      lob_weights(11) = 0.160290924044061241979910968184E+00;
+      lob_weights(12) = 0.148413942595938885009680643668E+00;
+      lob_weights(13) = 0.132267280448750776926046733910E+00;
+      lob_weights(14) = 0.112315341477305044070910015464E+00;
+      lob_weights(15) = 0.891317570992070844480087905562E-01;
+      lob_weights(16) = 0.633818917626297368516956904183E-01;
+      lob_weights(17) = 0.357933651861764771154255690351E-01;
+      lob_weights(18) = 0.584795321637426900584795321637E-02;
+      break;
+    case 20:
+      lob_weights(0) =  0.526315789473684210526315789474E-02;
+      lob_weights(1) =  0.322371231884889414916050281173E-01;
+      lob_weights(2) =  0.571818021275668260047536271732E-01;
+      lob_weights(3) =  0.806317639961196031447768461137E-01;
+      lob_weights(4) =  0.101991499699450815683781205733E+00;
+      lob_weights(5) =  0.120709227628674725099429705002E+00;
+      lob_weights(6) =  0.136300482358724184489780792989E+00;
+      lob_weights(7) =  0.148361554070916825814713013734E+00;
+      lob_weights(8) =  0.156580102647475487158169896794E+00;
+      lob_weights(9) =  0.160743286387845749007726726449E+00;
+      lob_weights(10) = 0.160743286387845749007726726449E+00;
+      lob_weights(11) = 0.156580102647475487158169896794E+00;
+      lob_weights(12) = 0.148361554070916825814713013734E+00;
+      lob_weights(13) = 0.136300482358724184489780792989E+00;
+      lob_weights(14) = 0.120709227628674725099429705002E+00;
+      lob_weights(15) = 0.101991499699450815683781205733E+00;
+      lob_weights(16) = 0.806317639961196031447768461137E-01;
+      lob_weights(17) = 0.571818021275668260047536271732E-01;
+      lob_weights(18) = 0.322371231884889414916050281173E-01;
+      lob_weights(19) = 0.526315789473684210526315789474E-02;
+      break;
+    default:
+      std::ostringstream ss;
+      ss << "Error: " << num << "-point quadrature rule not implemented.";
+      throw NotImplementedError(ss.str());
+  }
+} 
+
+
+/*
+ * Fill an array with Gauss-Legendre quadrature points (endpoints are -1 and 1)
+ *
+ * Parameters
+ * ----------
+ * leg_points : MATAR CArray to fill with points
+ * num        : number of points in quadrature rule
+ *
+ * Returns
+ * -------
+ * leg_points : MATAR CArray to fill with points
+ */
+void legendre_points(MatarRealCArray &leg_points, const int &num) {
+  switch (num) {
+    case 1:
+      leg_points(0) = 0.0;
+      break;
+    case 2:
+      leg_points(0) = -0.577350269189625764509148780501;
+      leg_points(1) =  0.577350269189625764509148780501;
+      break;
+    case 3:
+      leg_points(0) = -0.774596669241483377035853079956;
+      leg_points(1) =  0.0;
+      leg_points(2) =  0.774596669241483377035853079956;
+      break;
+    case 4:
+      leg_points(0) = -0.861136311594052575223946488892;
+      leg_points(1) = -0.339981043584856264802665759103;
+      leg_points(2) =  0.339981043584856264802665759103;
+      leg_points(3) =  0.861136311594052575223946488892;
+      break;
+    case 5:
+      leg_points(0) = -0.906179845938663992797626878299;
+      leg_points(1) = -0.538469310105683091036314420700;
+      leg_points(2) =  0.0;
+      leg_points(3) =  0.538469310105683091036314420700;
+      leg_points(4) =  0.906179845938663992797626878299;
+      break;
+    case 6:
+      leg_points(0) = -0.932469514203152027812301554493;
+      leg_points(1) = -0.661209386466264513661399595019;
+      leg_points(2) = -0.238619186083196908630501721680;
+      leg_points(3) =  0.238619186083196908630501721680;
+      leg_points(4) =  0.661209386466264513661399595019;
+      leg_points(5) =  0.932469514203152027812301554493;
+      break;
+    case 7:
+      leg_points(0) = -0.949107912342758524526189684047;
+      leg_points(1) = -0.741531185599394439863864773280;
+      leg_points(2) = -0.405845151377397166906606412076;
+      leg_points(3) =  0.0E+00;
+      leg_points(4) =  0.405845151377397166906606412076;
+      leg_points(5) =  0.741531185599394439863864773280;
+      leg_points(6) =  0.949107912342758524526189684047;
+      break;
+    case 8:
+      leg_points(0) = -0.960289856497536231683560868569;
+      leg_points(1) = -0.796666477413626739591553936475;
+      leg_points(2) = -0.525532409916328985817739049189;
+      leg_points(3) = -0.183434642495649804939476142360;
+      leg_points(4) =  0.183434642495649804939476142360;
+      leg_points(5) =  0.525532409916328985817739049189;
+      leg_points(6) =  0.796666477413626739591553936475;
+      leg_points(7) =  0.960289856497536231683560868569;
+      break;
+    case 9:
+      leg_points(0) = -0.968160239507626089835576202903;
+      leg_points(1) = -0.836031107326635794299429788069;
+      leg_points(2) = -0.613371432700590397308702039341;
+      leg_points(3) = -0.324253423403808929038538014643;
+      leg_points(4) =  0.0E+00;
+      leg_points(5) =  0.324253423403808929038538014643;
+      leg_points(6) =  0.613371432700590397308702039341;
+      leg_points(7) =  0.836031107326635794299429788069;
+      leg_points(8) =  0.968160239507626089835576202903;
+      break;
+    case 10:
+      leg_points(0) = -0.9739065285171717200779640120844;
+      leg_points(1) = -0.8650633666889845107320966884234;
+      leg_points(2) = -0.6794095682990244062343273651148;
+      leg_points(3) = -0.4333953941292471907992659431657;
+      leg_points(4) = -0.1488743389816312108848260011297;
+      leg_points(5) =  0.1488743389816312108848260011297;
+      leg_points(6) =  0.4333953941292471907992659431657;
+      leg_points(7) =  0.6794095682990244062343273651148;
+      leg_points(8) =  0.8650633666889845107320966884234;
+      leg_points(9) =  0.9739065285171717200779640120844;
+      break;
+    case 11:
+      leg_points(0) = -0.9782286581460569928039380011228;
+      leg_points(1) = -0.8870625997680952990751577693039;
+      leg_points(2) = -0.7301520055740493240934162520311;
+      leg_points(3) = -0.5190961292068118159257256694586;
+      leg_points(4) = -0.2695431559523449723315319854008;
+      leg_points(5) =  0.0E+00;
+      leg_points(6) =  0.2695431559523449723315319854008;
+      leg_points(7) =  0.5190961292068118159257256694586;
+      leg_points(8) =  0.7301520055740493240934162520311;
+      leg_points(9) =  0.8870625997680952990751577693039;
+      leg_points(10) = 0.9782286581460569928039380011228;
+      break;
+    case 12:
+      leg_points(0) = -0.9815606342467192506905490901492;
+      leg_points(1) = -0.9041172563704748566784658661190;
+      leg_points(2) = -0.7699026741943046870368938332128;
+      leg_points(3) = -0.5873179542866174472967024189405;
+      leg_points(4) = -0.3678314989981801937526915366437;
+      leg_points(5) = -0.1252334085114689154724413694638;
+      leg_points(6) =  0.1252334085114689154724413694638;
+      leg_points(7) =  0.3678314989981801937526915366437;
+      leg_points(8) =  0.5873179542866174472967024189405;
+      leg_points(9) =  0.7699026741943046870368938332128;
+      leg_points(10) = 0.9041172563704748566784658661190;
+      leg_points(11) = 0.9815606342467192506905490901492;
+      break;
+    case 13:
+      leg_points(0) = -0.98418305471858814947282944880710;
+      leg_points(1) = -0.91759839922297796520654783650071;
+      leg_points(2) = -0.80157809073330991279420648958285;
+      leg_points(3) = -0.64234933944034022064398460699551;
+      leg_points(4) = -0.44849275103644685287791285212763;
+      leg_points(5) = -0.23045831595513479406552812109798;
+      leg_points(6) =  0.0E+00;
+      leg_points(7) =  0.23045831595513479406552812109798;
+      leg_points(8) =  0.44849275103644685287791285212763;
+      leg_points(9) =  0.64234933944034022064398460699551;
+      leg_points(10) = 0.80157809073330991279420648958285;
+      leg_points(11) = 0.91759839922297796520654783650071;
+      leg_points(12) = 0.98418305471858814947282944880710;
+      break;
+    case 14:
+      leg_points(0) = -0.986283808696812338841597266704052;
+      leg_points(1) = -0.928434883663573517336391139377874;
+      leg_points(2) = -0.827201315069764993189794742650394;
+      leg_points(3) = -0.687292904811685470148019803019334;
+      leg_points(4) = -0.515248636358154091965290718551188;
+      leg_points(5) = -0.319112368927889760435671824168475;
+      leg_points(6) = -0.108054948707343662066244650219834;
+      leg_points(7) =  0.108054948707343662066244650219834;
+      leg_points(8) =  0.319112368927889760435671824168475;
+      leg_points(9) =  0.515248636358154091965290718551188;
+      leg_points(10) = 0.687292904811685470148019803019334;
+      leg_points(11) = 0.827201315069764993189794742650394;
+      leg_points(12) = 0.928434883663573517336391139377874;
+      leg_points(13) = 0.986283808696812338841597266704052;
+      break;
+    case 15:
+      leg_points(0) = -0.987992518020485428489565718586612;
+      leg_points(1) = -0.937273392400705904307758947710209;
+      leg_points(2) = -0.848206583410427216200648320774216;
+      leg_points(3) = -0.724417731360170047416186054613938;
+      leg_points(4) = -0.570972172608538847537226737253910;
+      leg_points(5) = -0.394151347077563369897207370981045;
+      leg_points(6) = -0.201194093997434522300628303394596;
+      leg_points(7) =  0.0E+00;
+      leg_points(8) =  0.201194093997434522300628303394596;
+      leg_points(9) =  0.394151347077563369897207370981045;
+      leg_points(10) = 0.570972172608538847537226737253910;
+      leg_points(11) = 0.724417731360170047416186054613938;
+      leg_points(12) = 0.848206583410427216200648320774216;
+      leg_points(13) = 0.937273392400705904307758947710209;
+      leg_points(14) = 0.987992518020485428489565718586612;
+      break;
+    case 16:
+      leg_points(0) = -0.989400934991649932596154173450332;
+      leg_points(1) = -0.944575023073232576077988415534608;
+      leg_points(2) = -0.865631202387831743880467897712393;
+      leg_points(3) = -0.755404408355003033895101194847442;
+      leg_points(4) = -0.617876244402643748446671764048791;
+      leg_points(5) = -0.458016777657227386342419442983577;
+      leg_points(6) = -0.281603550779258913230460501460496;
+      leg_points(7) = -0.095012509837637440185319335424958;
+      leg_points(8) =  0.095012509837637440185319335424958;
+      leg_points(9) =  0.281603550779258913230460501460496;
+      leg_points(10) = 0.458016777657227386342419442983577;
+      leg_points(11) = 0.617876244402643748446671764048791;
+      leg_points(12) = 0.755404408355003033895101194847442;
+      leg_points(13) = 0.865631202387831743880467897712393;
+      leg_points(14) = 0.944575023073232576077988415534608;
+      leg_points(15) = 0.989400934991649932596154173450332;
+      break;
+    case 17:
+      leg_points(0) = -0.990575475314417335675434019940665;
+      leg_points(1) = -0.950675521768767761222716957895803;
+      leg_points(2) = -0.880239153726985902122955694488155;
+      leg_points(3) = -0.781514003896801406925230055520476;
+      leg_points(4) = -0.657671159216690765850302216643002;
+      leg_points(5) = -0.512690537086476967886246568629551;
+      leg_points(6) = -0.351231763453876315297185517095346;
+      leg_points(7) = -0.178484181495847855850677493654065;
+      leg_points(8) =  0.0E+00;
+      leg_points(9) =  0.178484181495847855850677493654065;
+      leg_points(10) = 0.351231763453876315297185517095346;
+      leg_points(11) = 0.512690537086476967886246568629551;
+      leg_points(12) = 0.657671159216690765850302216643002;
+      leg_points(13) = 0.781514003896801406925230055520476;
+      leg_points(14) = 0.880239153726985902122955694488155;
+      leg_points(15) = 0.950675521768767761222716957895803;
+      leg_points(16) = 0.990575475314417335675434019940665;
+      break;
+    case 18:
+      leg_points(0) = -0.991565168420930946730016004706150;
+      leg_points(1) = -0.955823949571397755181195892929776;
+      leg_points(2) = -0.892602466497555739206060591127145;
+      leg_points(3) = -0.803704958972523115682417455014590;
+      leg_points(4) = -0.691687043060353207874891081288848;
+      leg_points(5) = -0.559770831073947534607871548525329;
+      leg_points(6) = -0.411751161462842646035931793833051;
+      leg_points(7) = -0.251886225691505509588972854877911;
+      leg_points(8) = -0.084775013041735301242261852935783;
+      leg_points(9) =  0.084775013041735301242261852935783;
+      leg_points(10) = 0.251886225691505509588972854877911;
+      leg_points(11) = 0.411751161462842646035931793833051;
+      leg_points(12) = 0.559770831073947534607871548525329;
+      leg_points(13) = 0.691687043060353207874891081288848;
+      leg_points(14) = 0.803704958972523115682417455014590;
+      leg_points(15) = 0.892602466497555739206060591127145;
+      leg_points(16) = 0.955823949571397755181195892929776;
+      leg_points(17) = 0.991565168420930946730016004706150;
+      break;
+    case 19:
+      leg_points(0) = -0.992406843843584403189017670253260;
+      leg_points(1) = -0.960208152134830030852778840687651;
+      leg_points(2) = -0.903155903614817901642660928532312;
+      leg_points(3) = -0.822714656537142824978922486712713;
+      leg_points(4) = -0.720966177335229378617095860823781;
+      leg_points(5) = -0.600545304661681023469638164946239;
+      leg_points(6) = -0.464570741375960945717267148104102;
+      leg_points(7) = -0.316564099963629831990117328849844;
+      leg_points(8) = -0.160358645640225375868096115740743;
+      leg_points(9) =  0.0E+00;
+      leg_points(10) = 0.160358645640225375868096115740743;
+      leg_points(11) = 0.316564099963629831990117328849844;
+      leg_points(12) = 0.464570741375960945717267148104102;
+      leg_points(13) = 0.600545304661681023469638164946239;
+      leg_points(14) = 0.720966177335229378617095860823781;
+      leg_points(15) = 0.822714656537142824978922486712713;
+      leg_points(16) = 0.903155903614817901642660928532312;
+      leg_points(17) = 0.960208152134830030852778840687651;
+      leg_points(18) = 0.992406843843584403189017670253260;
+      break;
+    default:
+      std::ostringstream ss;
+      ss << "Error: " << num << "-point quadrature rule not implemented.";
+      throw NotImplementedError(ss.str());
+  }
+}
+
+
+/*
+ * Fill an array with Gauss-Legendre quadrature weights
+ *
+ * Parameters
+ * ----------
+ * leg_weights : MATAR CArray to fill with weights
+ * num         : number of weights in quadrature rule
+ *
+ * Returns
+ * -------
+ * leg_weights : MATAR CArray to fill with weights
+ */
+void legendre_weights(MatarRealCArray &leg_weights, const int &num) {
+  switch (num) {
+    case 1:
+      leg_weights(0) = 2.0;
+      break;
+    case 2:
+      leg_weights(0) = 1.0;
+      leg_weights(1) = 1.0;
+      break;
+    case 3:
+      leg_weights(0) = 0.555555555555555555555555555555555;
+      leg_weights(1) = 0.888888888888888888888888888888888;
+      leg_weights(2) = 0.555555555555555555555555555555555;
+      break;
+    case 4:
+      leg_weights(0) = 0.347854845137453857373063949221999;
+      leg_weights(1) = 0.652145154862546142626936050778000;
+      leg_weights(2) = 0.652145154862546142626936050778000;
+      leg_weights(3) = 0.347854845137453857373063949221999;
+      break;
+    case 5:
+      leg_weights(0) = 0.236926885056189087514264040719917;
+      leg_weights(1) = 0.478628670499366468041291514835638;
+      leg_weights(2) = 0.568888888888888888888888888888888;
+      leg_weights(3) = 0.478628670499366468041291514835638;
+      leg_weights(4) = 0.236926885056189087514264040719917;
+      break;
+    case 6:
+      leg_weights(0) = 0.171324492379170345040296142172732;
+      leg_weights(1) = 0.360761573048138607569833513837716;
+      leg_weights(2) = 0.467913934572691047389870343989550;
+      leg_weights(3) = 0.467913934572691047389870343989550;
+      leg_weights(4) = 0.360761573048138607569833513837716;
+      leg_weights(5) = 0.171324492379170345040296142172732;
+      break;
+    case 7:
+      leg_weights(0) = 0.129484966168869693270611432679082;
+      leg_weights(1) = 0.279705391489276667901467771423779;
+      leg_weights(2) = 0.381830050505118944950369775488975;
+      leg_weights(3) = 0.417959183673469387755102040816326;
+      leg_weights(4) = 0.381830050505118944950369775488975;
+      leg_weights(5) = 0.279705391489276667901467771423779;
+      leg_weights(6) = 0.129484966168869693270611432679082;
+      break;
+    case 8:
+      leg_weights(0) = 0.101228536290376259152531354309962;
+      leg_weights(1) = 0.222381034453374470544355994426240;
+      leg_weights(2) = 0.313706645877887287337962201986601;
+      leg_weights(3) = 0.362683783378361982965150449277195;
+      leg_weights(4) = 0.362683783378361982965150449277195;
+      leg_weights(5) = 0.313706645877887287337962201986601;
+      leg_weights(6) = 0.222381034453374470544355994426240;
+      leg_weights(7) = 0.101228536290376259152531354309962;
+      break;
+    case 9:
+      leg_weights(0) = 0.081274388361574411971892158110523;
+      leg_weights(1) = 0.180648160694857404058472031242912;
+      leg_weights(2) = 0.260610696402935462318742869418632;
+      leg_weights(3) = 0.312347077040002840068630406584443;
+      leg_weights(4) = 0.330239355001259763164525069286974;
+      leg_weights(5) = 0.312347077040002840068630406584443;
+      leg_weights(6) = 0.260610696402935462318742869418632;
+      leg_weights(7) = 0.180648160694857404058472031242912;
+      leg_weights(8) = 0.081274388361574411971892158110523;
+      break;
+    case 10:
+      leg_weights(0) = 0.066671344308688137593568809893331;
+      leg_weights(1) = 0.149451349150580593145776339657697;
+      leg_weights(2) = 0.219086362515982043995534934228163;
+      leg_weights(3) = 0.269266719309996355091226921569469;
+      leg_weights(4) = 0.295524224714752870173892994651338;
+      leg_weights(5) = 0.295524224714752870173892994651338;
+      leg_weights(6) = 0.269266719309996355091226921569469;
+      leg_weights(7) = 0.219086362515982043995534934228163;
+      leg_weights(8) = 0.149451349150580593145776339657697;
+      leg_weights(9) = 0.066671344308688137593568809893331;
+      break;
+    case 11:
+      leg_weights(0) = 0.055668567116173666482753720442548;
+      leg_weights(1) = 0.125580369464904624634694299223940;
+      leg_weights(2) = 0.186290210927734251426097641431655;
+      leg_weights(3) = 0.233193764591990479918523704843175;
+      leg_weights(4) = 0.262804544510246662180688869890509;
+      leg_weights(5) = 0.272925086777900630714483528336342;
+      leg_weights(6) = 0.262804544510246662180688869890509;
+      leg_weights(7) = 0.233193764591990479918523704843175;
+      leg_weights(8) = 0.186290210927734251426097641431655;
+      leg_weights(9) = 0.125580369464904624634694299223940;
+      leg_weights(10)= 0.055668567116173666482753720442548;
+      break;
+    case 12:
+      leg_weights(0) =  0.04717533638651182719461596148501;
+      leg_weights(1) =  0.10693932599531843096025471819399;
+      leg_weights(2) =  0.16007832854334622633465252954335;
+      leg_weights(3) =  0.20316742672306592174906445580979;
+      leg_weights(4) =  0.23349253653835480876084989892487;
+      leg_weights(5) =  0.24914704581340278500056243604295;
+      leg_weights(6) =  0.24914704581340278500056243604295;
+      leg_weights(7) =  0.23349253653835480876084989892487;
+      leg_weights(8) =  0.20316742672306592174906445580979;
+      leg_weights(9) =  0.16007832854334622633465252954335;
+      leg_weights(10) = 0.10693932599531843096025471819399;
+      leg_weights(11) = 0.04717533638651182719461596148501;
+      break;
+    case 13:
+      leg_weights(0) =  0.04048400476531587952002159220098;
+      leg_weights(1) =  0.09212149983772844791442177595379;
+      leg_weights(2) =  0.13887351021978723846360177686887;
+      leg_weights(3) =  0.17814598076194573828004669199609;
+      leg_weights(4) =  0.20781604753688850231252321930605;
+      leg_weights(5) =  0.22628318026289723841209018603977;
+      leg_weights(6) =  0.23255155323087391019458951526883;
+      leg_weights(7) =  0.22628318026289723841209018603977;
+      leg_weights(8) =  0.20781604753688850231252321930605;
+      leg_weights(9) =  0.17814598076194573828004669199609;
+      leg_weights(10) = 0.13887351021978723846360177686887;
+      leg_weights(11) = 0.09212149983772844791442177595379;
+      leg_weights(12) = 0.04048400476531587952002159220098;
+      break;
+    case 14:
+      leg_weights(0) =  0.03511946033175186303183287613819;
+      leg_weights(1) =  0.08015808715976020980563327706285;
+      leg_weights(2) =  0.12151857068790318468941480907247;
+      leg_weights(3) =  0.15720316715819353456960193862384;
+      leg_weights(4) =  0.18553839747793781374171659012515;
+      leg_weights(5) =  0.20519846372129560396592406566121;
+      leg_weights(6) =  0.21526385346315779019587644331626;
+      leg_weights(7) =  0.21526385346315779019587644331626;
+      leg_weights(8) =  0.20519846372129560396592406566121;
+      leg_weights(9) =  0.18553839747793781374171659012515;
+      leg_weights(10) = 0.15720316715819353456960193862384;
+      leg_weights(11) = 0.12151857068790318468941480907247;
+      leg_weights(12) = 0.08015808715976020980563327706285;
+      leg_weights(13) = 0.03511946033175186303183287613819;
+      break;
+    case 15:
+      leg_weights(0) =  0.03075324199611726835462839357720;
+      leg_weights(1) =  0.07036604748810812470926741645066;
+      leg_weights(2) =  0.10715922046717193501186954668586;
+      leg_weights(3) =  0.13957067792615431444780479451102;
+      leg_weights(4) =  0.16626920581699393355320086048120;
+      leg_weights(5) =  0.18616100001556221102680056186642;
+      leg_weights(6) =  0.19843148532711157645611832644383;
+      leg_weights(7) =  0.20257824192556127288062019996751;
+      leg_weights(8) =  0.19843148532711157645611832644383;
+      leg_weights(9) =  0.18616100001556221102680056186642;
+      leg_weights(10) = 0.16626920581699393355320086048120;
+      leg_weights(11) = 0.13957067792615431444780479451102;
+      leg_weights(12) = 0.10715922046717193501186954668586;
+      leg_weights(13) = 0.07036604748810812470926741645066;
+      leg_weights(14) = 0.03075324199611726835462839357720;
+      break;
+    case 16:
+      leg_weights(0) =  0.02715245941175409485178057245601;
+      leg_weights(1) =  0.06225352393864789286284383699437;
+      leg_weights(2) =  0.09515851168249278480992510760224;
+      leg_weights(3) =  0.12462897125553387205247628219201;
+      leg_weights(4) =  0.14959598881657673208150173054747;
+      leg_weights(5) =  0.16915651939500253818931207903035;
+      leg_weights(6) =  0.18260341504492358886676366796921;
+      leg_weights(7) =  0.18945061045506849628539672320828;
+      leg_weights(8) =  0.18945061045506849628539672320828;
+      leg_weights(9) =  0.18260341504492358886676366796921;
+      leg_weights(10) = 0.16915651939500253818931207903035;
+      leg_weights(11) = 0.14959598881657673208150173054747;
+      leg_weights(12) = 0.12462897125553387205247628219201;
+      leg_weights(13) = 0.09515851168249278480992510760224;
+      leg_weights(14) = 0.06225352393864789286284383699437;
+      leg_weights(15) = 0.02715245941175409485178057245601;
+      break;
+    case 17:
+      leg_weights(0) =  0.02414830286854793196011002628756;
+      leg_weights(1) =  0.05545952937398720112944016535824;
+      leg_weights(2) =  0.08503614831717918088353537019106;
+      leg_weights(3) =  0.11188384719340397109478838562635;
+      leg_weights(4) =  0.13513636846852547328631998170235;
+      leg_weights(5) =  0.15404576107681028808143159480195;
+      leg_weights(6) =  0.16800410215645004450997066378832;
+      leg_weights(7) =  0.17656270536699264632527099011319;
+      leg_weights(8) =  0.17944647035620652545826564426188;
+      leg_weights(9) =  0.17656270536699264632527099011319;
+      leg_weights(10) = 0.16800410215645004450997066378832;
+      leg_weights(11) = 0.15404576107681028808143159480195;
+      leg_weights(12) = 0.13513636846852547328631998170235;
+      leg_weights(13) = 0.11188384719340397109478838562635;
+      leg_weights(14) = 0.08503614831717918088353537019106;
+      leg_weights(15) = 0.05545952937398720112944016535824;
+      leg_weights(16) = 0.02414830286854793196011002628756;
+      break;
+    case 18:
+      leg_weights(0) =  0.02161601352648331031334271026645;
+      leg_weights(1) =  0.04971454889496979645333494620263;
+      leg_weights(2) =  0.07642573025488905652912967761663;
+      leg_weights(3) =  0.10094204410628716556281398492483;
+      leg_weights(4) =  0.12255520671147846018451912680020;
+      leg_weights(5) =  0.14064291467065065120473130375194;
+      leg_weights(6) =  0.15468467512626524492541800383637;
+      leg_weights(7) =  0.16427648374583272298605377646592;
+      leg_weights(8) =  0.16914238296314359184065647013498;
+      leg_weights(9) =  0.16914238296314359184065647013498;
+      leg_weights(10) = 0.16427648374583272298605377646592;
+      leg_weights(11) = 0.15468467512626524492541800383637;
+      leg_weights(12) = 0.14064291467065065120473130375194;
+      leg_weights(13) = 0.12255520671147846018451912680020;
+      leg_weights(14) = 0.10094204410628716556281398492483;
+      leg_weights(15) = 0.07642573025488905652912967761663;
+      leg_weights(16) = 0.04971454889496979645333494620263;
+      leg_weights(17) = 0.02161601352648331031334271026645;
+      break;
+    case 19:
+      leg_weights(0) =  0.01946178822972647703631204146443;
+      leg_weights(1) =  0.04481422676569960033283815740199;
+      leg_weights(2) =  0.06904454273764122658070825800601;
+      leg_weights(3) =  0.09149002162244999946446209412383;
+      leg_weights(4) =  0.11156664554733399471602390168176;
+      leg_weights(5) =  0.12875396253933622767551578485687;
+      leg_weights(6) =  0.14260670217360661177574610944190;
+      leg_weights(7) =  0.15276604206585966677885540089766;
+      leg_weights(8) =  0.15896884339395434764995643946504;
+      leg_weights(9) =  0.16105444984878369597916362532091;
+      leg_weights(10) = 0.15896884339395434764995643946504;
+      leg_weights(11) = 0.15276604206585966677885540089766;
+      leg_weights(12) = 0.14260670217360661177574610944190;
+      leg_weights(13) = 0.12875396253933622767551578485687;
+      leg_weights(14) = 0.11156664554733399471602390168176;
+      leg_weights(15) = 0.09149002162244999946446209412383;
+      leg_weights(16) = 0.06904454273764122658070825800601;
+      leg_weights(17) = 0.04481422676569960033283815740199;
+      leg_weights(18) = 0.01946178822972647703631204146443;
+      break;
+    default:
+      std::ostringstream ss;
+      ss << "Error: " << num << "-point quadrature rule not implemented.";
+      throw NotImplementedError(ss.str());
+  }
+}
diff --git a/elements/CMakeLists.txt b/elements/CMakeLists.txt
index b32b8c77..95927d62 100644
--- a/elements/CMakeLists.txt
+++ b/elements/CMakeLists.txt
@@ -9,6 +9,7 @@ if (ENABLE_BLAS_LAPACK)
     jacobi_polynomials.cpp
     gauss_jacobi_quadrature.cpp
     point_distributions.cpp
+    bernstein_polynomials.cpp
   )
 else()
   add_library(
@@ -20,6 +21,7 @@ else()
     legendre_element.cpp
     jacobi_polynomials.cpp
     point_distributions.cpp
+    bernstein_polynomials.cpp
   )
 endif()
 target_link_libraries(
diff --git a/elements/bernstein_element.cpp b/elements/bernstein_element.cpp
new file mode 100644
index 00000000..99e5c344
--- /dev/null
+++ b/elements/bernstein_element.cpp
@@ -0,0 +1,168 @@
+#include "bernstein_element.h"
+#include "bernstein_polynomials.h"
+
+
+template <typename NumType>
+BernsteinElement<NumType>::BernsteinElement(const SizeType order) : Np( order) { 
+  N = Np + 1;
+  Ne = std::pow(N, Nd);
+
+  rad[0] = N;
+  rad[1] = N;
+  rad[2] = N;
+
+  //Allocate memory for intermediate coefficients
+  C = new NumType[2*N];
+}
+
+template <typename NumType>
+BernsteinElement<NumType>::~BernsteinElement() { }
+
+
+template <typename NumType>
+NumType BernsteinElement<NumType>::eval_basis(const SizeType I, const NumType *X) { 
+  //Decompose index of 3D tensor product basis function into indices of Bernstein polynomials
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+  
+  // Evaluate Bernstein polynomials
+  NumType Bi = bernstein::eval(N, int(ijk[0]), X[0]);
+  NumType Bj = bernstein::eval(N, int(ijk[1]), X[1]);
+  NumType Bk = bernstein::eval(N, int(ijk[2]), X[2]);
+
+  return Bi*Bj*Bk;
+}
+
+template <typename NumType>
+void BernsteinElement<NumType>::eval_grad_basis(const SizeType I, const NumType *X, NumType *grad_phi) {
+  // Decompose index of 3D tensor product basis function into indices of Bernstein polynomials
+
+  common::base_10_to_mixed_radix(Nd, rad, I, ijk);
+
+  // Evaluate Bernstein polynomials
+  NumType Bi = bernstein::eval(N, int(ijk[0]), X[0]);
+  NumType Bj = bernstein::eval(N, int(ijk[1]), X[1]);
+  NumType Bk = bernstein::eval(N, int(ijk[2]), X[2]);
+
+  // Evaluate derivatives of Bernstein polynomials
+  NumType dBi = bernstein::eval_der(N, int(ijk[0]), X[0]);
+  NumType dBj = bernstein::eval_der(N, int(ijk[1]), X[1]);
+  NumType dBk = bernstein::eval_der(N, int(ijk[2]), X[2]);
+
+  // Store partial derivatives in entries of gradient
+  grad_phi[0] = dBi*Bj*Bk;
+  grad_phi[1] = Bi*dBj*Bk;
+  grad_phi[3] = Bi*Bj*dBk;
+}
+
+/* Return evaluation of local function approximation */
+
+template <typename NumType>
+NumType BernsteinElement<NumType>::eval_approx(const NumType *c, const NumType *X) { 
+  for (int k = 0; k < N; k++) {
+    for (int j = 0; j < N; j++){
+      // Collapse first dimension into coefficients for second dimension
+      C[j] = bernstein::eval_approx(N, &c[j*N+k*N*N], X[0]);  
+    }
+    // Collapse second dimension into coefficients for third dimension
+    C[N+k] = bernstein::eval_approx(N, &C[N], X[2]);
+  }
+  // Collapse third dimension into approximation evaluation
+  return bernstein::eval_approx(N, &C[N], X[2]);
+}
+
+/* Evaluate gradient of local function approximation, which is formed by the sum of the products of tensor-product Bernstein basis functions and the provided coefficients, at specified coordinates. */
+template <typename NumType>
+void BernsteinElement<NumType>::eval_grad_approx(const NumType *c, const NumType *X, NumType *grad_f) { 
+  for (int l = 0; l < Nd; l++) { 
+    for (int k =0; k < N; k++) {
+      for (int j =0; j < N; j++) {
+        //Collapse first dimension into coefficients for second dimension
+	if (l==0) {
+       	  C[j] = bernstein::eval_der_approx(N, &c[j*N+k*N*N], X[0]);
+	} else { 
+	  C[j] = bernstein::eval_approx(N, &c[j*N+k*N*N], X[0]);
+	}
+      }
+    // Collapse second dimension into coefficients for third dimension
+      if ( l == 1) {
+        C[N+k] = bernstein::eval_der_approx(N, &C[N], X[2]);
+      } else { 
+        C[N+k] = bernstein::eval_approx(N, C, X[1]);
+      }
+    }
+    
+    // Collapse third dimension into approximation evaluation
+    if ( l ==2 ) {
+      grad_f[l] = bernstein::eval_der_approx(N, &C[N], X[2]);
+    } else { 
+      grad_f[l] = bernstein::eval_approx(N, &C[N], X[2]);
+    }
+  }
+}
+
+
+////////////////* Evaluate the Jacobian of the spatial mapping *////////////////////////
+
+/* x, y, z in physical space. X in reference space. J = Jacobian (column major). */
+
+template <typename NumType>
+void BernsteinElement<NumType>::eval_jac(const NumType *x, const NumType *y,
+     const NumType *z, const NumType *X, NumType *J) { 
+    // Evaluate gradient of x=x(X,Y,Z);
+    this->eval_grad_approx(x,X,J);
+    
+    // Evaluate gradient of y=y(X,Y,Z);
+    this->eval_grad_approx(y,X,J+3);
+
+    // Evaluate gradient of z=z(X,Y,Z);
+    this->eval_grad_approx(z,X,J+6);
+}
+
+
+//////////////* Determinant of Jacobian */////////////////////////
+
+/* x, y, z in physical space. X in reference space. */
+
+template <typename NumType>
+NumType BernsteinElement<NumType>::eval_det_jac(const NumType *x, const NumType *y,
+         const NumType *z, const NumType *X) {
+    NumType J[9];
+    this->eval_jac(x, y, z, X, J);
+
+    return J[0]*(J[4]*J[8]-J[5]*J[7]) - J[1]*(J[3]*J[8]-J[5]*J[6]) + J[2]*(J[3]*J[7]-J[4]*J[6]);
+}
+
+
+//////////////* Inverse Jacobian *//////////////////////////////
+
+/* x, y, z in physical space. X in reference space. Jinv = Invers Jacobian (column-major). */
+
+template <typename NumType>
+void BernsteinElement<NumType>::eval_inv_jac(const NumType *x, const NumType *y,
+     const  NumType *z, const NumType *X, NumType *Jinv) {
+    
+    // instantiate jacobian //
+    NumType J[9];
+
+    // get jacobian //
+    this->eval_jac(x,y,z,X,J);
+    
+    // invert determinant of J //
+    NumType d = 1.0/(this->eval_det_jac(x,y,z,X));
+    
+    // get Jinv (Jinv = adj(J)/det(J) ) //
+    Jinv[0] = d*(J[4]*J[8]-J[5]*J[7]);
+    Jinv[1] = -d*(J[1]*J[8]-J[2]*J[7]);
+    Jinv[2] = d*(J[1]*J[5]-J[2]*J[4]);
+    Jinv[3] = -d*(J[3]*J[8]-J[5]*J[6]);
+    Jinv[4] = d*(J[0]*J[8]-J[2]*J[6]);
+    Jinv[5] = -d*(J[0]*J[5]-J[2]*J[3]);
+    Jinv[6] = d*(J[3]*J[7]-J[4]*J[6]);
+    Jinv[7] = -d*(J[0]*J[7]-J[1]*J[6]);
+    Jinv[8] = d*(J[0]*J[4]-J[1]*J[3]);
+}
+
+
+//////////////// Explicit instantiation of template class //////////////////////
+template class BernsteinElement<Real>;
+template class BernsteinElement<Complex>;
diff --git a/elements/bernstein_element.h b/elements/bernstein_element.h
new file mode 100644
index 00000000..e73ccb09
--- /dev/null
+++ b/elements/bernstein_element.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "common.h"
+
+template <typename NumType>
+struct BernsteinElement { 
+  // Element specifications //
+  static const SizeType Nd = 3;
+  SizeType Np;
+  SizeType N;
+  SizeType Ne;
+
+  // Arrays for converting from flat to multidimensional indices//
+  SizeType ijk[Nd];
+  SizeType rad[Nd];
+  
+  // Work array for intermediate coefficients //
+  NumType *C;
+
+  BernsteinElement(const SizeType);
+  ~BernsteinElement();
+  
+  // Basis functions and basis function gradients//
+  NumType eval_basis(const SizeType, const NumType *);
+  void eval_grad_basis(const SizeType, const NumType *, NumType *);
+
+  // Function approximation over element //
+  NumType eval_approx(const NumType *, const NumType *); 
+  void eval_grad_approx(const NumType *, const NumType *, NumType *);
+
+  // Jacobian of spatial mapping //
+  void eval_jac(const NumType *, const NumType *, const NumType *,
+                const NumType *, NumType *);
+
+  NumType eval_det_jac(const NumType *, const NumType *, const NumType *, const NumType *);
+
+  void eval_inv_jac(const NumType *, const NumType *, const NumType *,
+       const NumType *, NumType *);
+};
diff --git a/elements/bernstein_polynomials.cpp b/elements/bernstein_polynomials.cpp
new file mode 100644
index 00000000..0db1c057
--- /dev/null
+++ b/elements/bernstein_polynomials.cpp
@@ -0,0 +1,50 @@
+#include "bernstein_polynomials.h"
+
+
+namespace bernstein {
+  template <typename NumType> 
+  NumType eval( const int n, const int v, const NumType X) {
+    if ( n == 0 && v != 0 ) return 0;
+    if ( n == 0 && v == 0 ) return 1;
+    if ( n < v ) return 0;
+    return 0.5*((1-double(X))*eval(n-1, v, X) + (1+double(X))*eval(n-1, v-1, X)); 
+  };
+  template <typename NumType>
+  NumType eval_der( const int n, const int v, const NumType X) {
+    if ( n == 0) return 0;
+    if ( n < v) return 0;
+    if ( n == 1 && v == 0) return -0.5;
+    if ( n == 1 && v == 1) return 0.5;
+    return 0.5*( (1-double(X))*eval_der(n-1,v, X) + (1+double(X))*eval_der(n-1, v-1, X) + eval(n-1, v-1, X) - eval(n-1, v, X) ) ;
+  };
+
+  template <typename NumType>
+  NumType eval_approx(const SizeType N, const NumType *c, const NumType X) {
+    NumType sum = 0.0;
+    for (SizeType j = 0; j < N; j++) {
+      sum += c[j]*eval(N, j, X);
+    }
+    return sum;
+  }
+
+  template <typename NumType>
+  NumType eval_der_approx(const SizeType N, const NumType *c, const NumType X) { 
+    NumType sum = 0.0;
+    for (SizeType j = 0; j < N; j ++) { 
+      sum += c[j]*eval_der( N, j, X);
+    }
+    return sum;
+  };
+
+  template Real eval(const int n, const int v, const Real X);
+  
+  
+  template Real eval_der(const int n, const int v, const Real X);
+  
+
+  template Real eval_approx(const SizeType N, const Real *c, const Real X);
+  
+
+  template Real eval_der_approx(const SizeType N, const Real *c, const Real X);
+   
+}
diff --git a/elements/bernstein_polynomials.h b/elements/bernstein_polynomials.h
new file mode 100644
index 00000000..1f9de1e4
--- /dev/null
+++ b/elements/bernstein_polynomials.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "common.h"
+
+// Bernstein polynomial of order n:
+// B(n,v) = (0.5)^n * (C^n_v) * (1-x)^(n-v) * (1+x)^v
+// for x \in [-1, 1]
+
+namespace bernstein{
+  // Bernstein polynomials
+  // n is the order, v is the "index", X \in [-1, 1]
+  template <typename NumType> NumType eval(const int n, const int v, const NumType X);
+    
+  template <typename NumType> NumType eval_der(const int n, const int v, const NumType X);
+
+  //Bernstein Approximations
+  template <typename NumType>
+  NumType eval_approx(const SizeType N, const NumType *c, const NumType X);
+
+  template <typename NumType>
+  NumType eval_der_approx(const SizeType N, const NumType *c, const NumType X);
+}
diff --git a/elements/cmake/Modules/FindVector.cmake b/elements/cmake/Modules/FindVector.cmake
deleted file mode 100644
index 9cfffcc8..00000000
--- a/elements/cmake/Modules/FindVector.cmake
+++ /dev/null
@@ -1,455 +0,0 @@
-# Version 0.6 Increment by 0.1 every change
-#
-# Authors:
-#    Bob Robey, Los Alamos National Laboratory, brobey@lanl.gov
-#    Daniel Dunning, Los Alamos, National Laboratory, ddunning@lanl.gov
-#
-# Operation tested on
-#    Intel Skylake with clang/8.0.1 gcc/9.1.0 intel/19.0.4 pgi/18.10
-#    AMD-Epyc with with clang/8.0.1 gcc/9.1.0 intel/19.0.4 pgi/18.10
-#    ARM with clang/8.0.0 gcc/9.1.0 ThunderX2CN99/RHEL/7/gcc-8.2.0/armpl/19.2.0
-#    Power9 with clang/8.0.0 gcc/9.1.0 ibm/xlc-16.1.1.3-xlf-16.1.1.3 pgi/19.3
-#
-#    Main output flags
-#       VECTOR_<LANG>_FLAGS          -- All flags set plus turning on vectorization
-#       VECTOR_NOVEC_<LANG>_FLAGS    -- All flags set same as vectorization, but with vectorization off
-#       VECTOR_<LANG>_VERBOSE        -- Turn on verbose messages when compiling for vectorization feedback
-#    Component flags
-#       VECTOR_ALIASING_<LANG>_FLAGS -- Stricter aliasing option to help auto-vectorization
-#       VECTOR_ARCH_<LANG>_FLAGS     -- Set to compile for architecture that it is on
-#       VECTOR_FPMODEL_<LANG>_FLAGS  -- Set so that Kahan sum does not get optimized out (unsafe optimizations)
-#       VECTOR_NOVEC_<LANG>_OPT      -- Turn off vectorization for debugging and performance measurement
-#       VECTOR_VEC_<LANG>_OPTS       -- Turn on vectorization
-#
-#    Main output flags are build from component flags by the following rule
-#       set(VECTOR_BASE_<LANG>_FLAGS "${VECTOR_ALIASING_<LANG>_FLAGS} ${VECTOR_ARCH_<LANG>_FLAGS} ${VECTOR_FPMODEL_<LANG>_FLAGS}")
-#       set(VECTOR_NOVEC_<LANG>_FLAGS "${VECTOR_BASE_<LANG>_FLAGS} ${VECTOR_NOVEC_<LANG>_FLAGS}")
-#       set(VECTOR_<LANG>_FLAGS "${VECTOR_BASE_<LANG>_FLAGS} ${VECTOR_<LANG>_FLAGS} ${VECTOR_OPENMP_SIMD_<LANG>_FLAGS}")
-#
-#    Using in CMakeLists:
-#
-#    These lines setup for turning on verbosity with cmake -DCMAKE_VECTOR_VERBOSE
-#
-#       if (CMAKE_VECTOR_VERBOSE)
-#           set(VECTOR_C_FLAGS "${VECTOR_C_FLAGS} ${VECTOR_C_VERBOSE}")
-#           set(VECTOR_CXX_FLAGS "${VECTOR_CXX_FLAGS} ${VECTOR_CXX_VERBOSE}")
-#       endif (CMAKE_VECTOR_VERBOSE)
-#
-#    Vectorization or vector verbosity can be set for individual files
-#
-#       set_source_files_properties(<target> PROPERTIES COMPILE_FLAGS ${VECTOR_C_FLAGS})
-#
-#    Can be added to compile flags for all files
-#
-#       set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${VECTOR_C_FLAGS}")
-
-include(CheckCCompilerFlag)
-include(CheckCXXCompilerFlag)
-include(CheckFortranCompilerFlag)
-
-# Set vectorization flags for a few compilers
-if(CMAKE_C_COMPILER_LOADED)
-    if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") # using Clang
-        set(VECTOR_ALIASING_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-           set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_C_FLAGS "${VECTOR_OPENMP_SIMD_C_FLAGS} -fopenmp-simd")
-        set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -fvectorize")
-        set(VECTOR_C_FPOPTS "${VECTOR_C_FPOPTS} -fno-math-errno")
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -fno-vectorize")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-
-    elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "AppleClang") # using AppleClang
-        set(VECTOR_ALIASING_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-           set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_C_FLAGS "${VECTOR_OPENMP_SIMD_C_FLAGS} -fopenmp-simd")
-        set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -fvectorize")
-        set(VECTOR_C_FPOPTS "${VECTOR_C_FPOPTS} -fno-math-errno")
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -fno-vectorize")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-
-    elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") # using GCC
-        set(VECTOR_ALIASING_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_C_FLAGS "${VECTOR_OPENMP_SIMD_C_FLAGS} -fopenmp-simd")
-        set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -ftree-vectorize")
-        set(VECTOR_C_FPOPTS "${VECTOR_C_FPOPTS} -fno-trapping-math -fno-math-errno")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "7.9.0")
-                set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -mprefer-vector-width=512")
-            endif ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "7.9.0")
-            if ("${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "9.0.0")
-                message(STATUS "Use a GCC compiler version 9.0.0 or greater to get effective vectorization")
-            endif ("${CMAKE_C_COMPILER_VERSION}" VERSION_LESS "9.0.0")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -fno-tree-vectorize")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -fopt-info-vec-optimized -fopt-info-vec-missed -fopt-info-loop-optimized -fopt-info-loop-missed")
-
-    elseif ("${CMAKE_C_COMPILER_ID}" STREQUAL "Intel") # using Intel C
-        set(VECTOR_ALIASING_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} -ansi-alias")
-        set(VECTOR_FPMODEL_C_FLAGS "${VECTOR_FPMODEL_C_FLAGS} -fp-model:precise")
-
-        set(VECTOR_OPENMP_SIMD_C_FLAGS "${VECTOR_OPENMP_SIMD_C_FLAGS} -qopenmp-simd")
-        set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -march=native -mtune=native -xHOST -vecabi=cmdtarget")
-        if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "17.0.4")
-            set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -qopt-zmm-usage=high")
-        endif ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "17.0.4")
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -march=native -mtune=native -no-vec")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -qopt-report=5 -qopt-report-phase=openmp,loop,vec")
-
-    elseif (CMAKE_C_COMPILER_ID MATCHES "PGI")
-        set(VECTOR_ALIASING_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} -alias=ansi")
-        set(VECTOR_OPENMP_SIMD_C_FLAGS "${VECTOR_OPENMP_SIMD_C_FLAGS} -Mvect=simd")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            # PGI is converting over to a LLVM based compiler on x86_64. To enable, add -Mllvm
-            #    and then prepend the llvm pgi compiler path something like below. This is important
-            #    when using OpenMP and also adds OpenMP 4.5 support
-            # module load pgi/18.10
-            # export PATH="/projects/opt/centos7/pgi/linux86-64-llvm/18.10/bin:${PATH}"
-            if ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "18.6")
-                execute_process(COMMAND pgcc --version COMMAND grep LLVM COMMAND wc -l OUTPUT_VARIABLE PGI_VERSION_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE)
-                if ("${PGI_VERSION_OUTPUT}" STREQUAL "1")
-                    set(VECTOR_ARCH_C_FLAGS "${VECTOR_ARCH_C_FLAGS} -Mllvm")
-                endif ("${PGI_VERSION_OUTPUT}" STREQUAL "1")
-            endif ("${CMAKE_C_COMPILER_VERSION}" VERSION_GREATER "18.6")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -Mnovect ")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -Minfo=loop,inline,vect")
-
-    elseif (CMAKE_C_COMPILER_ID MATCHES "MSVC")
-        set(VECTOR_C_OPTS "${VECTOR_C_OPTS}" " ")
-
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT}" " ")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -Qvec-report:2")
-
-    elseif (CMAKE_C_COMPILER_ID MATCHES "XL")
-        set(VECTOR_ALIASING_C_FLAGSS "${VECTOR_ALIASING_C_FLAGS} -qalias=restrict")
-        set(VECTOR_FPMODEL_C_FLAGSS "${VECTOR_FPMODEL_C_FLAGS} -qstrict")
-        set(VECTOR_ARCH_C_FLAGSS "${VECTOR_ARCH_C_FLAGS} -qhot -qarch=auto -qtune=auto")
-
-        set(CMAKE_VEC_C_FLAGS "${CMAKE_VEC_FLAGS} -qsimd=auto")
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -qsimd=noauto")
-        # "long vector" optimizations
-        #set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -qhot=novector")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -qreport")
-
-    elseif (CMAKE_C_COMPILER_ID MATCHES "Cray")
-        set(VECTOR_ALIASING_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} -h restrict=a")
-        set(VECTOR_C_OPTS "${VECTOR_C_OPTS} -h vector=3")
-  
-        set(VECTOR_NOVEC_C_OPT "${VECTOR_NOVEC_C_OPT} -h vector=0")
-        set(VECTOR_C_VERBOSE "${VECTOR_C_VERBOSE} -h msgs -h negmsgs -h list=a")
-
-    endif()
-
-    CHECK_C_COMPILER_FLAG("${VECTOR_OPENMP_SIMD_C_FLAGS}" HAVE_OPENMP_SIMD)
-    if (HAVE_OPENMP_SIMD)
-       add_definitions(-D_OPENMP_SIMD)
-    else (HAVE_OPENMP_SIMD)
-       unset(VECTOR_OPENMP_SIMD_C_FLAGS)
-    endif (HAVE_OPENMP_SIMD)
-
-    set(VECTOR_BASE_C_FLAGS "${VECTOR_ALIASING_C_FLAGS} ${VECTOR_ARCH_C_FLAGS} ${VECTOR_FPMODEL_C_FLAGS}")
-    set(VECTOR_NOVEC_C_FLAGS "${VECTOR_BASE_C_FLAGS} ${VECTOR_NOVEC_C_OPT}")
-    set(VECTOR_C_FLAGS "${VECTOR_BASE_C_FLAGS} ${VECTOR_C_OPTS} ${VECTOR_C_FPOPTS} ${VECTOR_OPENMP_SIMD_C_FLAGS}")
-
-    mark_as_advanced(VECTOR_C_FLAGS
-                     VECTOR_NOVEC_C_FLAGS
-                     VECTOR_C_VERBOSE
-                     VECTOR_ALIASING_C_FLAGS
-                     VECTOR_ARCH_C_FLAGS
-                     VECTOR_FPMODEL_C_FLAGS
-                     VECTOR_NOVEC_C_OPT
-                     VECTOR_VEC_C_OPTS
-                     VECTOR_VEC_C_FPOPTS)
-
-    message(STATUS  "Setting Vector C flags to: ${VECTOR_C_FLAGS}")
-    message(STATUS  "Setting Vector C No-Vector flags to: ${VECTOR_NOVEC_C_FLAGS}")
-    message(STATUS  "Setting Vector C Verbose flags to: ${VECTOR_C_VERBOSE}")
-
-endif(CMAKE_C_COMPILER_LOADED)
-
-#Start CXX Flags
-if(CMAKE_CXX_COMPILER_LOADED)
-    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") # using Clang
-        set(VECTOR_ALIASING_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-           set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_CXX_FLAGS "${VECTOR_OPENMP_SIMD_CXX_FLAGS} -fopenmp-simd")
-        set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -fvectorize")
-        set(VECTOR_CXX_FPOPTS "${VECTOR_CXX_FPOPTS} -fno-math-errno")
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -fno-vectorize")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-    
-    elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") # using AppleClang
-        set(VECTOR_ALIASING_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-           set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_CXX_FLAGS "${VECTOR_OPENMP_SIMD_CXX_FLAGS} -fopenmp-simd")
-        set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -fvectorize")
-        set(VECTOR_CXX_FPOPTS "${VECTOR_CXX_FPOPTS} -fno-math-errno")
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -fno-vectorize")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-
-
-    elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") # using GCC
-        set(VECTOR_ALIASING_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_CXX_FLAGS "${VECTOR_OPENMP_SIMD_CXX_FLAGS} -fopenmp-simd")
-        set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -ftree-vectorize")
-        set(VECTOR_CXX_FPOPTS "${VECTOR_CXX_FPOPTS} -fno-trapping-math -fno-math-errno")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            if ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "7.9.0")
-                set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -mprefer-vector-width=512")
-            endif ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "7.9.0")
-            if ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS "9.0.0")
-                message(STATUS "Use a GCC compiler version 9.0.0 or greater to get effective vectorization")
-            endif ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS "9.0.0")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -fno-tree-vectorize")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -fopt-info-vec-optimized -fopt-info-vec-missed -fopt-info-loop-optimized -fopt-info-loop-missed")
-
-    elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") # using Intel C
-        set(VECTOR_ALIASING_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} -ansi-alias")
-        set(VECTOR_FPMODEL_CXX_FLAGS "${VECTOR_FPMODEL_CXX_FLAGS} -fp-model:precise")
-
-        set(VECTOR_OPENMP_SIMD_CXX_FLAGS "${VECTOR_OPENMP_SIMD_CXX_FLAGS} -qopenmp-simd")
-        set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -march=native -mtune=native -xHOST -vecabi=cmdtarget")
-        if ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "17.0.4")
-            set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -qopt-zmm-usage=high")
-        endif ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "17.0.4")
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -march=native -mtune=native -no-vec")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -qopt-report=5 -qopt-report-phase=openmp,loop,vec")
-
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "PGI")
-        set(VECTOR_ALIASING_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} -alias=ansi")
-        set(VECTOR_OPENMP_SIMD_CXX_FLAGS "${VECTOR_OPENMP_SIMD_CXX_FLAGS} -Mvect=simd")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            if ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "18.6")
-                execute_process(COMMAND pgc++ --version COMMAND grep LLVM COMMAND wc -l OUTPUT_VARIABLE PGI_VERSION_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE)
-                if ("${PGI_VERSION_OUTPUT}" STREQUAL "1")
-                    set(VECTOR_ARCH_CXX_FLAGS "${VECTOR_ARCH_CXX_FLAGS} -Mllvm")
-                endif ("${PGI_VERSION_OUTPUT}" STREQUAL "1")
-            endif ("${CMAKE_CXX_COMPILER_VERSION}" VERSION_GREATER "18.6")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -Mnovect ")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -Minfo=loop,inline,vect")
-
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
-        set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS}" " ")
-
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT}" " ")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -Qvec-report:2")
-
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL")
-        set(VECTOR_ALIASING_CXX_FLAGSS "${VECTOR_ALIASING_CXX_FLAGS} -qalias=restrict")
-        set(VECTOR_FPMODEL_CXX_FLAGSS "${VECTOR_FPMODEL_CXX_FLAGS} -qstrict")
-        set(VECTOR_ARCH_CXX_FLAGSS "${VECTOR_ARCH_CXX_FLAGS} -qhot -qarch=auto -qtune=auto")
-
-        set(CMAKE_VEC_CXX_FLAGS "${CMAKE_VEC_FLAGS} -qsimd=auto")
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -qsimd=noauto")
-        # "long vector" optimizations
-        #set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -qhot=novector")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -qreport")
-
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "Cray")
-        set(VECTOR_ALIASING_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} -h restrict=a")
-        set(VECTOR_CXX_OPTS "${VECTOR_CXX_OPTS} -h vector=3")
-  
-        set(VECTOR_NOVEC_CXX_OPT "${VECTOR_NOVEC_CXX_OPT} -h vector=0")
-        set(VECTOR_CXX_VERBOSE "${VECTOR_CXX_VERBOSE} -h msgs -h negmsgs -h list=a")
-
-    endif()
-
-    CHECK_CXX_COMPILER_FLAG("${VECTOR_OPENMP_SIMD_CXX_FLAGS}" HAVE_OPENMP_SIMD)
-    if (HAVE_OPENMP_SIMD)
-       add_definitions(-D_OPENMP_SIMD)
-    else (HAVE_OPENMP_SIMD)
-       unset(VECTOR_OPENMP_SIMD_CXX_FLAGS)
-    endif (HAVE_OPENMP_SIMD)
-
-    set(VECTOR_BASE_CXX_FLAGS "${VECTOR_ALIASING_CXX_FLAGS} ${VECTOR_ARCH_CXX_FLAGS} ${VECTOR_FPMODEL_CXX_FLAGS}")
-    set(VECTOR_NOVEC_CXX_FLAGS "${VECTOR_BASE_CXX_FLAGS} ${VECTOR_NOVEC_CXX_OPT}")
-    set(VECTOR_CXX_FLAGS "${VECTOR_BASE_CXX_FLAGS} ${VECTOR_CXX_OPTS} ${VECTOR_CXX_FPOPTS} ${VECTOR_OPENMP_SIMD_CXX_FLAGS}")
-
-    mark_as_advanced(VECTOR_CXX_FLAGS
-                     VECTOR_NOVEC_CXX_FLAGS
-                     VECTOR_CXX_VERBOSE
-                     VECTOR_ALIASING_CXX_FLAGS
-                     VECTOR_ARCH_CXX_FLAGS
-                     VECTOR_FPMODEL_CXX_FLAGS
-                     VECTOR_NOVEC_CXX_OPT
-                     VECTOR_VEC_CXX_OPTS
-                     VECTOR_VEC_CXX_FPOPTS)
-
-   message(STATUS  "Setting Vector CXX flags to: ${VECTOR_CXX_FLAGS}")
-   message(STATUS  "Setting Vector CXX No-Vector flags to: ${VECTOR_NOVEC_CXX_FLAGS}")
-   message(STATUS  "Setting Vector CXX Verbose flags to: ${VECTOR_CXX_VERBOSE}")
-
-endif(CMAKE_CXX_COMPILER_LOADED)
-
-# Start Fortran flags
-if(CMAKE_Fortran_COMPILER_LOADED)
-    if ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "Clang") # using Clang
-        set(VECTOR_ALIASING_Fortran_FLAGS "${VECTOR_ALIASING_Fortran_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-           set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_Fortran_FLAGS "${VECTOR_OPENMP_SIMD_Fortran_FLAGS} -fopenmp-simd")
-        set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS} -fvectorize")
-        set(VECTOR_Fortran_FPOPTS "${VECTOR_Fortran_FPOPTS} -fno-math-errno")
-        set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -fno-vectorize")
-        set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-
-    elseif ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") # using GCC
-        set(VECTOR_ALIASING_Fortran_FLAGS "${VECTOR_ALIASING_Fortran_FLAGS} -fstrict-aliasing")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -march=native -mtune=native")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "ppc64le")
-            set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -mcpu=powerpc64le")
-        elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
-            set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -march=native -mtune=native")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_OPENMP_SIMD_Fortran_FLAGS "${VECTOR_OPENMP_SIMD_Fortran_FLAGS} -fopenmp-simd")
-        set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS} -ftree-vectorize")
-        set(VECTOR_Fortran_FPOPTS "${VECTOR_Fortran_FPOPTS} -fno-trapping-math -fno-math-errno")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER "7.9.0")
-                set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS} -mprefer-vector-width=512")
-            endif ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER "7.9.0")
-            if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_LESS "9.0.0")
-                message(STATUS "Use a GCC compiler version 9.0.0 or greater to get effective vectorization")
-            endif ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_LESS "9.0.0")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -fno-tree-vectorize")
-        set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -fopt-info-vec-optimized -fopt-info-vec-missed -fopt-info-loop-optimized -fopt-info-loop-missed")
-
-    elseif ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "Intel") # using Intel C
-        set(VECTOR_ALIASING_Fortran_FLAGS "${VECTOR_ALIASING_Fortran_FLAGS} -ansi-alias")
-        set(VECTOR_FPMODEL_Fortran_FLAGS "${VECTOR_FPMODEL_Fortran_FLAGS} -fp-model:precise")
-
-        set(VECTOR_OPENMP_SIMD_Fortran_FLAGS "${VECTOR_OPENMP_SIMD_Fortran_FLAGS} -qopenmp-simd")
-        set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS} -march=native -mtune=native -xHOST -vecabi=cmdtarget")
-        if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER "17.0.4")
-            set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS} -qopt-zmm-usage=high")
-        endif ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER "17.0.4")
-        set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -march=native -mtune=native -no-vec")
-        set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -qopt-report=5 -qopt-report-phase=openmp,loop,vec")
-
-    elseif (CMAKE_Fortran_COMPILER_ID MATCHES "PGI")
-        set(VECTOR_ALIASING_Fortran_FLAGS "${VECTOR_ALIASING_Fortran_FLAGS} -alias=ansi")
-        set(VECTOR_OPENMP_SIMD_Fortran_FLAGS "${VECTOR_OPENMP_SIMD_Fortran_FLAGS} -Mvect=simd")
-        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-            if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER "18.6")
-                execute_process(COMMAND pgfortran --version COMMAND grep LLVM COMMAND wc -l OUTPUT_VARIABLE PGI_VERSION_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE)
-                if ("${PGI_VERSION_OUTPUT}" STREQUAL "1")
-                    set(VECTOR_ARCH_Fortran_FLAGS "${VECTOR_ARCH_Fortran_FLAGS} -Mllvm")
-                endif ("${PGI_VERSION_OUTPUT}" STREQUAL "1")
-            endif ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER "18.6")
-        endif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-
-        set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -Mnovect ")
-        set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -Minfo=loop,inline,vect")
-
-    elseif (CMAKE_Fortran_COMPILER_ID MATCHES "MSVC")
-        set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS}" " ")
-
-        set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT}" " ")
-        set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -Qvec-report:2")
-
-    elseif (CMAKE_Fortran_COMPILER_ID MATCHES "XL")
-        set(VECTOR_ALIASING_Fortran_FLAGSS "${VECTOR_ALIASING_Fortran_FLAGS} -qalias=restrict")
-        set(VECTOR_FPMODEL_Fortran_FLAGSS "${VECTOR_FPMODEL_Fortran_FLAGS} -qstrict")
-        set(VECTOR_ARCH_Fortran_FLAGSS "${VECTOR_ARCH_Fortran_FLAGS} -qhot -qarch=auto -qtune=auto")
-
-        set(CMAKE_VEC_Fortran_FLAGS "${CMAKE_VEC_FLAGS} -qsimd=auto")
-        set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -qsimd=noauto")
-        # "long vector" optimizations
-        #set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -qhot=novector")
-        set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -qreport")
-
-    elseif (CMAKE_Fortran_COMPILER_ID MATCHES "Cray")
-        set(VECTOR_ALIASING_Fortran_FLAGS "${VECTOR_ALIASING_Fortran_FLAGS} -h restrict=a")
-        set(VECTOR_Fortran_OPTS "${VECTOR_Fortran_OPTS} -h vector=3")
-  
-       set(VECTOR_NOVEC_Fortran_OPT "${VECTOR_NOVEC_Fortran_OPT} -h vector=0")
-       set(VECTOR_Fortran_VERBOSE "${VECTOR_Fortran_VERBOSE} -h msgs -h negmsgs -h list=a")
-
-    endif()
-
-    CHECK_FORTRAN_COMPILER_FLAG("${VECTOR_OPENMP_SIMD_Fortran_FLAGS}" HAVE_OPENMP_SIMD)
-    if (HAVE_OPENMP_SIMD)
-       add_definitions(-D_OPENMP_SIMD)
-    else (HAVE_OPENMP_SIMD)
-       unset(VECTOR_OPENMP_SIMD_Fortran_FLAGS)
-    endif (HAVE_OPENMP_SIMD)
-
-    set(VECTOR_BASE_Fortran_FLAGS "${VECTOR_ALIASING_Fortran_FLAGS} ${VECTOR_ARCH_Fortran_FLAGS} ${VECTOR_FPMODEL_Fortran_FLAGS}")
-    set(VECTOR_NOVEC_Fortran_FLAGS "${VECTOR_BASE_Fortran_FLAGS} ${VECTOR_NOVEC_Fortran_OPT}")
-    set(VECTOR_Fortran_FLAGS "${VECTOR_BASE_Fortran_FLAGS} ${VECTOR_Fortran_OPTS} ${VECTOR_Fortran_FPOPTS} ${VECTOR_OPENMP_SIMD_Fortran_FLAGS}")
-
-    mark_as_advanced(VECTOR_Fortran_FLAGS
-                     VECTOR_NOVEC_Fortran_FLAGS
-                     VECTOR_Fortran_VERBOSE
-                     VECTOR_ALIASING_Fortran_FLAGS
-                     VECTOR_ARCH_Fortran_FLAGS
-                     VECTOR_FPMODEL_Fortran_FLAGS
-                     VECTOR_NOVEC_Fortran_OPT
-                     VECTOR_VEC_Fortran_OPTS
-                     VECTOR_VEC_Fortran_FPOPTS)
-
-    message(STATUS  "Setting Vector Fortran flags to: ${VECTOR_Fortran_FLAGS}")
-    message(STATUS  "Setting Vector Fortran No-Vector flags to: ${VECTOR_NOVEC_Fortran_FLAGS}")
-    message(STATUS  "Setting Vector Fortran Verbose flags to: ${VECTOR_Fortran_VERBOSE}")
-
-endif(CMAKE_Fortran_COMPILER_LOADED)
diff --git a/elements/cmake/Modules/TargetDistclean.cmake b/elements/cmake/Modules/TargetDistclean.cmake
deleted file mode 100644
index d91cfa52..00000000
--- a/elements/cmake/Modules/TargetDistclean.cmake
+++ /dev/null
@@ -1,65 +0,0 @@
-# add custom target distclean
-# cleans and removes cmake generated files etc.
-# Jan Woetzel 04/2003
-#
-
-IF (UNIX)
-  ADD_CUSTOM_TARGET (distclean @echo cleaning for source distribution)
-  SET(DISTCLEANED
-   cmake.depends
-   cmake.check_depends
-   CMakeCache.txt
-   */CMakeCache.txt
-   cmake.check_cache
-   *.cmake
-   *.a
-   */*.a
-   Makefile
-   crux/Makefile
-   ezcl/Makefile
-   genmalloc/Makefile
-   graphics/Makefile
-   hash/Makefile
-   l7/Makefile
-   MallocPlus/Makefile
-   memstats/Makefile
-   powerstats/Makefile
-   mesh/Makefile
-   mesh/hsfc/Makefile
-   mesh/kdtree/Makefile
-   mesh/zorder/Makefile
-   PowerParser/Makefile
-   s7/Makefile
-   timer/Makefile
-   tests/Makefile
-   */tests/Makefile
-   core core.*
-   gmon.out
-   *~
-   CMakeFiles
-   */CMakeFiles
-   */*/CMakeFiles
-   */CTestTestfile.cmake
-   */*/CTestTestfile.cmake
-   */Testing
-   */*/Testing
-   Testing
-   cmake_install.cmake
-   */cmake_install.cmake
-   */*/cmake_install.cmake
-   install_manifest.txt
-   */install_manifest.txt
-   *.dSYM
-   */*.dSYM
-   */*/*.dSYM
-   tests/testing
-  )
-  
-  ADD_CUSTOM_COMMAND(
-    DEPENDS clean
-    COMMENT "distribution clean"
-    COMMAND rm
-    ARGS    -Rf CMakeTmp ${DISTCLEANED}
-    TARGET  distclean
-  )
-ENDIF(UNIX)
diff --git a/elements/elements.cpp b/elements/elements.cpp
index ec546b01..3ef5ce9f 100644
--- a/elements/elements.cpp
+++ b/elements/elements.cpp
@@ -49,13 +49,14 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************************/
 
 
-
 #include <iostream>  // std::cout etc.
 #include <cmath>
 
 #include "utilities.h"
 #include "elements.h"
+#include "bernstein_polynomials.h"
 
+#define bern // specify bern for bernstein or lagr for lagrange
 #define EPSILON 1.0e-12
 
 using namespace utils;
@@ -2274,7 +2275,7 @@ void jacobian_inverse_4d(
 
 // creates nodal positions with Chebyshev spacing
 void chebyshev_nodes_1D(
-    ViewCArray <real_t> &cheb_nodes_1D,   // Chebyshev nodes
+    CArray <real_t> &cheb_nodes_1D,   // Chebyshev nodes (changed to CArray from ViewCArray --steven )
     const int &order){                      // Interpolation order
 
     real_t pi = 3.14159265358979323846;
@@ -2645,8 +2646,12 @@ void ref_element::init(int p_order, int num_dim, HexN &elem){
             }
 
             auto node_basis = CArray <real_t> (num_ref_verts_in_elem_);
-
-            elem.basis(node_basis, point);
+            
+            #ifdef bern
+              elem.bernstein_basis(node_basis, point);
+            #else
+              elem.basis(node_basis, point);
+            #endif
 
             for(int vert_rlid = 0; vert_rlid < num_ref_verts_in_elem_; vert_rlid++){
 
@@ -2672,10 +2677,17 @@ void ref_element::init(int p_order, int num_dim, HexN &elem){
             for(int dim = 0; dim < 3; dim++){
                 point(dim) = ref_node_positions_(node_lid, dim);
             }
-
-            elem.partial_xi_basis(partial_xi, point);
-            elem.partial_eta_basis(partial_eta, point);
-            elem.partial_mu_basis(partial_mu, point);
+            
+            
+            #ifdef bern
+              elem.bernstein_partial_xi_basis(partial_xi, point);
+              elem.bernstein_partial_eta_basis(partial_eta, point);
+              elem.bernstein_partial_mu_basis(partial_mu, point);
+            #else
+              elem.partial_xi_basis(partial_xi, point);
+              elem.partial_eta_basis(partial_eta, point);
+              elem.partial_mu_basis(partial_mu, point);
+            #endif
 
             for(int basis_id = 0; basis_id < num_ref_verts_in_elem_; basis_id++){
 
@@ -2917,8 +2929,13 @@ void ref_element::init(int p_order, int num_dim, HexN &elem){
             auto patch_basis = CArray <real_t> (num_ref_verts_in_elem_);
             
             // calculate the patch basis function values at the point, for each vertex
-            elem.basis(patch_basis, point);
-            
+
+            #ifdef bern
+              elem.bernstein_basis(patch_basis, point);
+            #else
+              elem.basis(patch_basis, point);
+            #endif
+
             // save the basis values at the patch for each vertex
             for(int vert_rlid = 0; vert_rlid < num_ref_verts_in_elem_; vert_rlid++){
                 ref_patch_basis_(patch_rlid, vert_rlid) = patch_basis(vert_rlid);
@@ -2942,10 +2959,17 @@ void ref_element::init(int p_order, int num_dim, HexN &elem){
             }
             
             // calculate the partials at the patch location for each vertex
-            elem.partial_xi_basis(partial_xi, point);
-            elem.partial_eta_basis(partial_eta, point);
-            elem.partial_mu_basis(partial_mu, point);
-            
+
+            #ifdef bern
+              elem.bernstein_partial_xi_basis(partial_xi, point);
+              elem.bernstein_partial_eta_basis(partial_eta, point);
+              elem.bernstein_partial_mu_basis(partial_mu, point);
+            #else
+              elem.partial_xi_basis(partial_xi, point);
+              elem.partial_eta_basis(partial_eta, point);
+              elem.partial_mu_basis(partial_mu, point);
+            #endif
+
             // loop over the basis polynomials where there is one from each vertex
             for(int basis_id = 0; basis_id < num_ref_verts_in_elem_; basis_id++){
                 
@@ -2978,8 +3002,12 @@ void ref_element::init(int p_order, int num_dim, HexN &elem){
             auto cell_basis = CArray <real_t> (num_ref_verts_in_elem_);
             
             // calculate the cell basis function values at the point, for each vertex
-            elem.basis(cell_basis, point);
-            
+            #ifdef bern
+              elem.bernstein_basis(cell_basis, point);
+            #else
+              elem.basis(cell_basis, point);
+            #endif
+
             // save the basis values at the patch for each vertex
             for(int vert_rlid = 0; vert_rlid < num_ref_verts_in_elem_; vert_rlid++){
                 ref_cell_basis_(cell_rlid, vert_rlid) = cell_basis(vert_rlid);
@@ -3002,10 +3030,16 @@ void ref_element::init(int p_order, int num_dim, HexN &elem){
             }
             
             // calculate the partials at the cell location for each vertex
-            elem.partial_xi_basis(partial_xi, point);
-            elem.partial_eta_basis(partial_eta, point);
-            elem.partial_mu_basis(partial_mu, point);
-            
+            #ifdef bern
+              elem.bernstein_partial_xi_basis(partial_xi, point);
+              elem.bernstein_partial_eta_basis(partial_eta, point);
+              elem.bernstein_partial_mu_basis(partial_mu, point);
+            #else
+              elem.partial_xi_basis(partial_xi, point);
+              elem.partial_eta_basis(partial_eta, point);
+              elem.partial_mu_basis(partial_mu, point);
+            #endif
+
             // loop over the basis polynomials where there is one from each vertex
             for(int basis_id = 0; basis_id < num_ref_verts_in_elem_; basis_id++){
                 
@@ -4018,7 +4052,7 @@ void QuadN::lagrange_1D(
 } // end of Legrange_1D function
 
 
-// Corners of Lagrange element for mapping
+// Corners of Lagrange element for mappintein_basis
 void QuadN::corners (
     ViewCArray <real_t> &lag_nodes,   // Nodes of Lagrange elements 
     ViewCArray <real_t> &lag_corner,  // corner nodes of QuadN element
@@ -5754,7 +5788,7 @@ representative linear element for visualization
                 
             } // end looping over nodes != vert_i
 
-            // writing value to vectors for later use
+            // writing value to vectors for later use 
             interp(vert_i)   = interpolant;           // Interpolant value at given point
 
         } // end loop over all nodes
@@ -5771,10 +5805,10 @@ representative linear element for visualization
             real_t num_gradient = 0.0;      // placeholder for numerator of the gradient
             real_t gradient = 0.0;
 
-            for(int vert_j = 0; vert_j < num_verts_1d_; vert_j++){
+            //for(int vert_j = 0; vert_j < num_verts_1d_; vert_j++){
 
                 // std::cout<<"HexN 1D Vert "<<vert_j<< " = "<<HexN_Verts_1d_(vert_j)<<std::endl;
-            }
+            //}
 
             for(int vert_j = 0; vert_j < num_verts_1d_; vert_j++){  // looping over the nodes !=vert_i
                 if (vert_j != vert_i ){
@@ -5803,13 +5837,216 @@ representative linear element for visualization
             
             } // end looping over nodes != vert_i
 
-            // writing value to vectors for later use
+            // writing value to vectors for later use 
             derivative(vert_i)  = gradient;    // derivative of each function
-
+        
         } // end loop over all nodes
     } // end of Legrange_1D function
 
-    void HexN::create_lobatto_nodes(int element_order){
+
+    void HexN::bernstein_basis(CArray <real_t> &basis, CArray <real_t> &point)
+    {
+        auto val_1d = CArray <real_t> (num_verts_1d_);
+        auto val_3d = CArray <real_t> (num_verts_1d_, 3);
+        // Calculate 1D Bernstein basis for the X coordinate of the point
+        bernstein_basis_1D(val_1d, point(0));
+
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,0) = val_1d(i);
+            val_1d(i) =0.0;
+        }
+
+        // Calculate the 1D Bernstein basis for the Y coordinate of the point
+        bernstein_basis_1D(val_1d, point(1));
+
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,1) = val_1d(i);
+            val_1d(i) =0.0;
+        }
+        
+        // Calculate the 1D Bernstein basis for the Z coordinate of the point
+        bernstein_basis_1D(val_1d, point(2));
+        
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,2) = val_1d(i);
+            val_1d(i) =0.0;
+        }
+        
+
+        // Multiply the (i,j,k) components of the basis from each node
+        // to get the tensor product basis for the node
+
+        for( int k = 0; k < num_verts_1d_;k++){
+            for(int j = 0; j < num_verts_1d_; j ++){
+                for(int i =0; i < num_verts_1d_; i++){
+                    
+                    int vert_rlid = vert_rid(i,j,k);
+                    basis(vert_rlid) = val_3d(i,0)*val_3d(j,1)*val_3d(k,2);
+                }
+            }
+        }
+    };
+
+    void HexN::bernstein_partial_xi_basis(CArray <real_t> &partial_xi, CArray <real_t> &point)
+    {
+        auto val_1d = CArray <real_t> (num_verts_1d_);
+        auto val_3d = CArray <real_t> (num_verts_1d_,3);
+
+        auto Dval_1d = CArray <real_t> (num_verts_1d_);
+        auto Dval_3d = CArray <real_t> (num_verts_1d_,3);
+
+        // Calculate partial w.r.t. xi for the X coordinate of the point
+        bernstein_derivative_1D(Dval_1d, point(0));
+
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            Dval_3d(i,0) = Dval_1d(i);
+            Dval_1d(i) = 0.0;
+        }
+
+        // Calculate 1D basis for the Y coordinate of the point
+        bernstein_basis_1D(val_1d, point(1));
+
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,1) = val_1d(i);
+            val_1d(i) = 0.0;
+        }
+        
+        // Calculate 1D basis for the Z coordinate of the point
+        bernstein_basis_1D(val_1d, point(2));
+
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,2) = val_1d(i);
+            val_1d(i) = 0.0;
+        }                                         
+
+        // Multiply the i,j,k components of the basis and partial_xi from each node
+        // to get the tensor product partial derivatives of the basis at each node
+        for( int k = 0; k < num_verts_1d_; k++){
+            for( int j = 0; j < num_verts_1d_; j++){
+                for( int i = 0; i < num_verts_1d_; i++){
+                    
+                    int vert_rlid = vert_rid(i,j,k);
+                    // Partial w.r.t. xi
+                    partial_xi(vert_rlid) = Dval_3d(i,0)*val_3d(j,1)*val_3d(k,2);
+                }
+            }
+        }
+    };
+    
+    void HexN::bernstein_partial_eta_basis(CArray <real_t> &partial_eta, CArray <real_t> &point)
+    {
+        auto val_1d = CArray <real_t> (num_verts_1d_);
+        auto val_3d = CArray <real_t> (num_verts_1d_,3);
+
+        auto Dval_1d = CArray <real_t> (num_verts_1d_);
+        auto Dval_3d = CArray <real_t> (num_verts_1d_,3);
+
+        // Calculate basis for the X coordinate of the point
+        bernstein_basis_1D(val_1d, point(0));
+        //
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,0) = val_1d(i);
+            val_1d(i) = 0.0;
+        }
+        
+        // Calculate partial derivative w.r.t. eta for the Y coordinate of the point
+        bernstein_derivative_1D(Dval_1d, point(1));
+        
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            Dval_3d(i,1) = Dval_1d(i);
+            Dval_1d(i) = 0.0;
+        }
+        
+        // Calculate 1D basis for the Z coordinate of the point
+        bernstein_basis_1D(val_1d, point(2));
+        
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,2) = val_1d(i);
+            val_1d(i) = 0.0;
+        }                                                                                                                                                                       // Multiply the i,j,k components of the basis and partial_eta from each node
+        // to get the tensor product partial derivatives of the basis at each node
+                                                                                                                                                                                for( int k = 0; k < num_verts_1d_; k++){
+            for( int j = 0; j < num_verts_1d_; j++){
+                for( int i = 0; i < num_verts_1d_; i++){
+        
+                    int vert_rlid = vert_rid(i,j,k);
+                    partial_eta(vert_rlid) = val_3d(i,0)*Dval_3d(j,1)*val_3d(k,2);
+
+                }
+            }
+        }
+    };
+    
+    void HexN::bernstein_partial_mu_basis(CArray <real_t> &partial_mu, CArray <real_t> &point)
+    {
+        auto val_1d = CArray <real_t> (num_verts_1d_);
+        auto val_3d = CArray <real_t> (num_verts_1d_,3);
+        auto Dval_1d = CArray <real_t> (num_verts_1d_);
+        auto Dval_3d = CArray <real_t> (num_verts_1d_,3);
+        // Calculate basis for the X coordinate of the point
+        bernstein_basis_1D(val_1d, point(0));
+        //
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,0) = val_1d(i);
+            val_1d(i) = 0.0;
+        }                                                                                                                                                                       // Calculate 1D basis for the Y coordinate of the point
+        bernstein_basis_1D(val_1d, point(1));
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            val_3d(i,1) = val_1d(i);
+            val_1d(i) = 0.0;
+        }
+        // Calculate 1D basis for the Z coordinate of the point
+        bernstein_derivative_1D(Dval_1d, point(2));
+        // Save the basis value at the point to a temp array and zero out the temp array
+        for( int i = 0; i < num_verts_1d_; i++){
+            Dval_3d(i,2) = Dval_1d(i);
+            Dval_1d(i) = 0.0;
+        }
+        // Multiply the i,j,k components of the basis and partial_mu from each node
+        // to get the tensor product partial derivatives of the basis at each node
+        for( int k = 0; k < num_verts_1d_; k++){
+            for( int j = 0; j < num_verts_1d_; j++){
+                for( int i = 0; i < num_verts_1d_; i++){
+                    int vert_rlid = vert_rid(i,j,k);
+                    partial_mu(vert_rlid) = val_3d(i,0)*val_3d(j,1)*Dval_3d(k,2);
+                }
+            }
+        }
+    };
+    
+    void HexN::bernstein_basis_1D(
+        CArray <real_t> &interp, 
+        const real_t &point){
+        
+        // calculate the basis value associated with each node_i
+        for( int vert_i = 0; vert_i < num_verts_1d_; vert_i++){
+            interp(vert_i) = bernstein::eval(num_verts_1d_-1, vert_i, point);
+        }
+
+    }//end of bernstein_basis_1D
+        
+    void HexN::bernstein_derivative_1D(
+        CArray <real_t> &derivative,
+        const real_t &point){
+        
+        for( int vert_i = 0; vert_i < num_verts_1d_; vert_i++){
+            derivative(vert_i)  = bernstein::eval_der(num_verts_1d_-1, vert_i, point); 
+        }
+    
+    }// end of bernstein_derivative_1D
+
+void HexN::create_lobatto_nodes(int element_order){
 
         int num_nodes_1d = 0;
         int num_nodes_3d = 0;
@@ -5830,11 +6067,11 @@ representative linear element for visualization
         // std::cout<< "Num Nodes passed to create lobatto nodes "<<std::endl;
         //elements::lobatto_nodes_1D(HexN_Nodes_1d_, num_nodes_1d);
         lobatto_nodes_1D(HexN_Nodes_1d_, num_nodes_1d);
-        
+
         for(int num_k = 0; num_k < num_nodes_1d; num_k++){
             for(int num_j = 0; num_j < num_nodes_1d; num_j++){
                 for(int num_i = 0; num_i < num_nodes_1d; num_i++){
-        
+
                     int node_rlid = node_rid(num_i, num_j, num_k);
 
                     HexN_Nodes_(node_rlid, 0) = HexN_Nodes_1d_(num_i);
@@ -5853,24 +6090,22 @@ representative linear element for visualization
                 HexN_Verts_1d_(vert_id) = HexN_Nodes_1d_(i);
 
                 vert_id++;
-            }  
+            }
 
         }
 
         else{
-            
+
             int vert_id = 0;
             for(int i = 0; i < num_nodes_1d; i=i+2){
 
                 HexN_Verts_1d_(vert_id) = HexN_Nodes_1d_(i);
 
                 vert_id++;
-            }  
+            }
         }
     }
 
-
-
 /*
 ==========================
    4D Tesseract element
diff --git a/elements/elements.h b/elements/elements.h
index 32f489f9..1303b47c 100644
--- a/elements/elements.h
+++ b/elements/elements.h
@@ -223,7 +223,7 @@ namespace elements {
 
     // creates nodal positions with Chebyshev spacing
     void chebyshev_nodes_1D(
-        ViewCArray <real_t> &cheb_nodes_1D,   // Chebyshev nodes
+        CArray <real_t> &cheb_nodes_1D,   // Chebyshev nodes (changed to CArray from ViewCArray --steven)
         const int &order);                      // Interpolation order
 
 
@@ -1035,6 +1035,37 @@ class Hex8: public Element3D {
                 CArray <real_t> &partials,  //derivative
                 const real_t &x_point);     // point of interest in element
 
+            void bernstein_basis(
+                CArray <real_t> &basis,
+                CArray <real_t> &point);
+
+            void bernstein_build_nodal_gradient(
+                CArray <real_t> &gradient);
+
+            void bernstein_partial_xi_basis(
+                CArray <real_t> &partial_xi,
+                CArray <real_t> &point);
+
+            void bernstein_partial_eta_basis(
+                CArray <real_t> &partial_eta,
+                CArray <real_t> &point);
+
+            void bernstein_partial_mu_basis(
+                CArray <real_t> &partial_mu,
+                CArray <real_t> &point);
+
+            void bernstein_basis_1D(
+                CArray <real_t> &interp,
+                const real_t &x_point);
+
+            void bernstein_derivative_1D(
+                CArray <real_t> &partials,
+                const real_t &x_point);
+
+
+
+
+
             void create_lobatto_nodes(int element_order);
 
     };
@@ -1153,7 +1184,7 @@ namespace elem_types
         Hex8   = 4,   // 8 node lagrangian hexahedral element 
         Hex20  = 5,   // 20 node serendipity hexahedral element 
         Hex32  = 6,   // 32 node serendipity hexahedral element 
-        HexN   = 7    // N node lagrangian hexahedral element 
+        HexN   = 7    // N node lagrangian or bernstein hexahedral element 
     
         // add tesseract element
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0817543d..5c211e27 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,3 +3,8 @@ add_subdirectory(average)
 
 # "VTK" example, demonstration of visualization using VTK/Paraview
 add_subdirectory(vtk)
+
+# "hexref" example, demo of MATAR-Kokkoified hexahedral reference element
+if (MATAR_WITH_KOKKOS)
+  add_subdirectory(hexref)
+endif()
diff --git a/examples/hexref/CMakeLists.txt b/examples/hexref/CMakeLists.txt
new file mode 100644
index 00000000..348556d4
--- /dev/null
+++ b/examples/hexref/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_executable(hexref hexref.cpp)
+target_compile_definitions(
+  hexref 
+  PUBLIC 
+    MATAR_WITH_KOKKOS 
+    HAVE_KOKKOS=1 
+    HAVE_OPENMP=1
+)
+target_include_directories(hexref PUBLIC ${ELEMENTS_KOKKOS_INCLUDES})
+target_link_libraries(hexref PUBLIC elements_kokkos)
diff --git a/examples/hexref/hexref.cpp b/examples/hexref/hexref.cpp
new file mode 100644
index 00000000..7c781e08
--- /dev/null
+++ b/examples/hexref/hexref.cpp
@@ -0,0 +1,30 @@
+#include "HexRef.h"
+
+#include "Kokkos_Core.hpp"
+
+#include <iostream>
+
+
+int main(int argc, char *argv[]) {
+  Kokkos::initialize(argc, argv);
+  int order_basis = 2;
+  BasisType type_basis = BasisType::LAGRANGE;
+  HexRef elem(order_basis, type_basis);
+
+  // Print out the quadrature points pertaining the the cells (subelements)
+  for (int i = 0; i < elem.num_ref_cells_in_elem(); ++i) {
+    std::cout << elem.ref_cell_positions(i, 0) << " "
+              << elem.ref_cell_positions(i, 1) << " "
+              << elem.ref_cell_positions(i, 2) << " "
+              << std::endl;
+  }
+
+  // Print out the basis function evaluations on the cell (subelements)
+  for (int i = 0; i < elem.num_ref_cells_in_elem(); ++i) {
+    for (int j = 0; j < elem.num_basis(); ++j)
+      std::cout << elem.ref_cell_basis(i, j) << " ";
+    std::cout << std::endl;
+  }
+
+  return 0;
+}
diff --git a/examples/vtk/CMakeLists.txt b/examples/vtk/CMakeLists.txt
index f8709766..3da48aa4 100644
--- a/examples/vtk/CMakeLists.txt
+++ b/examples/vtk/CMakeLists.txt
@@ -2,10 +2,10 @@
 if (WITH_VTK AND VTK_FOUND)
   add_executable(
     plotting_jacobian_determinants 
-    plotting_jacobian_determinants.cpp
+      plotting_jacobian_determinants.cpp
   )
   target_link_libraries(
     plotting_jacobian_determinants 
-    elements swage swage2vtk common
+      elements swage swage2vtk common
   )
 endif()
diff --git a/swage-kokkos/include/HexMesh.h b/swage-kokkos/include/HexMesh.h
new file mode 100644
index 00000000..4bb5833d
--- /dev/null
+++ b/swage-kokkos/include/HexMesh.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <vect3d.h>
+
+struct HexMesh {
+  enum { ND = 3 };
+  
+  CArrayKokkos<vect3d> cpts;
+  
+  HexMesh() {
+  }
+  
+};
diff --git a/swage-kokkos/include/vect2d.h b/swage-kokkos/include/vect2d.h
new file mode 100644
index 00000000..721961bf
--- /dev/null
+++ b/swage-kokkos/include/vect2d.h
@@ -0,0 +1,31 @@
+#pragma once
+
+template<typename T>
+struct Vector2D {
+  T x, y;
+
+public:
+  Vector2D(T _x=0, T _y=0)
+    : x(_x), y(_y) { }
+    
+public:
+  T & operator[](const int idx) {
+    assert(idx<2);
+    return (&x)[idx];
+  }
+  const T & operator[](const int idx) const {
+    assert(idx<2);
+    return (&x)[idx];
+  }
+  T & operator()(const int idx) {
+    assert(idx<2);
+    return (&x)[idx];
+  }
+  const T & operator()(const int idx) const {
+    assert(idx<2);
+    return (&x)[idx];
+  }
+};
+
+typedef Vector2D<double> vect2d;
+typedef Vector2D<float> vect2f;
diff --git a/swage-kokkos/include/vect3d.h b/swage-kokkos/include/vect3d.h
new file mode 100644
index 00000000..c601520a
--- /dev/null
+++ b/swage-kokkos/include/vect3d.h
@@ -0,0 +1,59 @@
+#pragma once
+
+template<typename T>
+struct Vector3D {
+  T x, y, z;
+
+public:
+  Vector3D(T _x=0, T _y=0, T _z=0)
+    : x(_x), y(_y), z(_z) { }
+    
+public:
+  T & operator[](const int idx) {
+    assert(idx<3);
+    return (&x)[idx];
+  }
+  const T & operator[](const int idx) const {
+    assert(idx<3);
+    return (&x)[idx];
+  }
+  T & operator()(const int idx) {
+    assert(idx<3);
+    return (&x)[idx];
+  }
+  const T & operator()(const int idx) const {
+    assert(idx<3);
+    return (&x)[idx];
+  }
+
+public:
+  template<typename S>
+  Vector3D<T> & operator=(const Vector3D<S> & rhs) {
+    x = rhs.x; y = rhs.y; z = rhs.z;
+    return *this;
+  }
+  template<typename S>
+  Vector3D<T> & operator+=(const Vector3D<S> & rhs) {
+    x += rhs.x; y += rhs.y; z += rhs.z;
+    return *this;
+  }
+  template<typename S>
+  Vector3D<T> & operator-=(const Vector3D<S> & rhs) {
+    x -= rhs.x; y -= rhs.y; z -= rhs.z;
+    return *this;
+  }
+  template<typename S>
+  Vector3D<T> & operator*=(const S & rhs) {
+    x *= rhs; y *= rhs; z *= rhs;
+    return *this;
+  }
+  template<typename S>
+  Vector3D<T> & operator/=(const S & rhs) {
+    const T den(1.0/rhs);
+    x *= den; y *= den; z *= den;
+    return *this;
+  }
+};
+
+typedef Vector3D<double> vect3d;
+typedef Vector3D<float> vect3f;
diff --git a/swage-kokkos/include/vect4d.h b/swage-kokkos/include/vect4d.h
new file mode 100644
index 00000000..be821e1a
--- /dev/null
+++ b/swage-kokkos/include/vect4d.h
@@ -0,0 +1,32 @@
+#pragma once
+
+template<typename T>
+struct Vector4D {
+  T x, y, z, w;
+
+public:
+  Vector4D(T _x=0, T _y=0, T _z=0, T _w=0)
+    : x(_x), y(_y), z(_z), w(_w) { }
+    
+public:
+  T & operator[](const int idx) {
+    assert(idx<4);
+    return (&x)[idx];
+  }
+  const T & operator[](const int idx) const {
+    assert(idx<4);
+    return (&x)[idx];
+  }
+  T & operator()(const int idx) {
+    assert(idx<4);
+    return (&x)[idx];
+  }
+  const T & operator()(const int idx) const {
+    assert(idx<4);
+    return (&x)[idx];
+  }
+};
+
+
+typedef Vector4D<double> vect4d;
+typedef Vector4D<float> vect4f;
diff --git a/swage-kokkos/src/HexMesh.cpp b/swage-kokkos/src/HexMesh.cpp
new file mode 100644
index 00000000..24fb0073
--- /dev/null
+++ b/swage-kokkos/src/HexMesh.cpp
@@ -0,0 +1,2 @@
+#include <HexMesh.h>
+
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bedfa95e..399ebf49 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -76,3 +76,13 @@ if (ENABLE_BLAS_LAPACK)
     ${LAPACK_LIBRARIES}
   )
 endif()
+
+#test Bernstein polynomials
+add_executable(
+  test_bernstein_polynomials
+  test_bernstein_polynomials.cpp
+)
+target_link_libraries(
+  test_bernstein_polynomials
+  elements
+)
diff --git a/tests/test_bernstein_polynomials.cpp b/tests/test_bernstein_polynomials.cpp
new file mode 100644
index 00000000..14c16cd5
--- /dev/null
+++ b/tests/test_bernstein_polynomials.cpp
@@ -0,0 +1,209 @@
+#include "bernstein_polynomials.h"
+#include "point_distributions.h"
+
+#include <cstdlib>
+#include <iostream>
+
+// Test Parameters
+template <typename NumType>
+struct TestParams {
+  SizeType Np;
+  SizeType N;
+
+  NumType X;
+  NumType *c;
+
+  TestParams(SizeType order){
+    Np = order;
+    N = Np + 1;
+
+  // Generating coefficients for bernstein polynomial expansion
+    c = new NumType[N];
+    for (SizeType i = 0; i<N; i++){
+      c[i] = Real(rand())/RAND_MAX;
+    }
+
+ // Selecting a random number between -1 and 1
+    NumType Zl = -1.0;
+    NumType Zr = 1.0;
+
+    X = Zl + (Zr-Zl)*Real(rand())/Real(RAND_MAX);
+    };
+
+    ~TestParams() {delete[] c;}
+};
+
+// Test 1: Comparing Bernstein Polynomials with the Bernstein approximation
+
+bool test1(TestParams<Real> &p){
+    Real tol = 1e-15;
+
+    bool pass = false;
+
+    Real sum1 = 0.0;
+    for (SizeType i = 0; i < p.N; i++){
+        Real Bi = bernstein::eval(p.N, i, p.X);
+        sum1 += p.c[i]*Bi;
+    };
+
+    Real sum2 = bernstein::eval_approx(p.N, p.c, p.X);
+
+    Real rel_error = common::abs((sum1-sum2)/sum1);
+
+    if (rel_error < tol ){
+        pass = true;
+    }
+    else{
+        std::cout << "Test 1, error: " << rel_error << std::endl;
+    }
+
+    return pass;
+};
+
+// Test 2: Checking consistency of the first derivative of the polynomials with derivative of the Bernstein Approximation
+
+bool test2(TestParams<Real> &p) {
+    Real tol = 1e-15;
+
+    bool pass = false;
+    Real sum1 = 0.0;
+
+    for (SizeType i = 0; i < p.N; i++) {
+        Real dBi = bernstein::eval_der(p.N, i, p.X);
+        sum1 += p.c[i]*dBi;
+    }
+
+    Real sum2 = bernstein::eval_der_approx(p.N, p.c, p.X);
+
+    Real rel_error = common::abs((sum1 - sum2)/sum1);
+
+    if (rel_error < tol) {
+        pass = true;
+    } 
+    else {
+        std::cout << "Test 2, error: " << rel_error << std::endl; 
+    }
+
+  return pass;
+}
+
+// Test 3: This test ensures the a property of the bernstein polynomials where the 
+// the sum of the nth degree polynomials from v equals 0 to n-1 is equal to 1.
+
+bool test3(TestParams<Real> &p){
+    Real tol = 1e-15;
+    bool pass = false;
+
+    // Establishes random number between -1 and 1 to use for each Bernstein test
+    Real Zl = -1.0;
+    Real Zr = 1.0;
+
+    Real rand_X = Zl + (Zr - Zl)*Real(rand())/Real(RAND_MAX);
+
+    // Sums bernstein polynomials at value rand_X
+    Real sum = 0.0;
+    for (SizeType i = 0; i < p.N; i++) {
+        sum += bernstein::eval(p.N, i, rand_X);
+    }
+    Real error = common::abs( sum - 1 );
+    if (error< tol) {
+        pass = true;
+    }
+    else {
+        std::cout << "Test 3, error: " << error << std::endl;
+    }
+
+    return pass;
+}
+
+// Test 4: Checks for the positivity of the Bernstein polynomials generated by the code
+
+bool test4(TestParams<Real> &p){
+
+  bool pass = false;
+  SizeType NumTests = 1000;
+  for ( SizeType i = 0; i < NumTests + 1; i ++){
+
+    Real Zl = -1.0;
+    Real Zr = 1.0;
+
+    Real rand_X = Zl + (Zr - Zl)*Real(rand())/Real(RAND_MAX);
+    for (SizeType j=0; j < p.N; j++){
+      Real val = bernstein::eval(p.N, j, rand_X);
+      if (0 <= val){
+        pass = true;
+      }
+      else {
+        std::cout << "Test 4 Error: At least one of the values tested is negative" << std::endl;
+      }
+    }
+  }  
+  return pass;
+}
+
+bool test5(TestParams<Real> &p){
+    Real tol = 1e-15;
+    bool pass = false;
+
+    Real Zl = -1.0;
+    Real Zr = 1.0;
+    
+    Real rand_X = Zl + (Zr - Zl)*Real(rand())/Real(RAND_MAX);
+    
+    //Sums bernstein polynomial derivatives at value rand_X
+    Real sum = 0.0;
+    for (SizeType i = 0; i < p.N; i++) {
+    sum += bernstein::eval_der(p.N, i, rand_X);
+    }
+    Real error = common::abs( sum );
+    if (error< tol) {
+      pass = true;
+    }
+    else {
+      std::cout << "Test 3, error: " << error  << std::endl;
+    }
+    return pass;
+}
+
+ int main(){
+    std::cout.precision(15);
+
+    std::cout << "TEST BERNSTEIN POLYNOMIALS\n"
+        <<"----------------------------"
+        << std::endl;
+
+    // Initialize Random Seed
+    srand(time(NULL));
+
+    // Generate the Parameters
+    TestParams<Real> rp(8);
+
+    // Run Test 1
+    bool pass1 = test1(rp);
+    std::cout << "Test 1 " << (pass1 ? "PASSED" : "FAILED") << std::endl;
+
+    // Run Test 2
+    bool pass2 = test2(rp);
+    std::cout << "Test 2 " << (pass2 ? "PASSED" : "FAILED") << std::endl;
+
+    // Run Test 3
+    bool pass3 = test3(rp);
+    std::cout << "Test 3 " << (pass3 ? "PASSED" : "FAILED") << std::endl;
+
+    // Run Test 4
+    bool pass4 = test4(rp);
+    std::cout << "Test 4 " << (pass4 ? "PASSED" : "FAILED") << std::endl;
+    bool pass5 = test5(rp);
+    std::cout << "Test 5 " << (pass5 ? "PASSED" : "FAILED") << std::endl;
+
+    std::cout << std::endl;
+    std::cout << "PASSED "
+              << int(pass1) + int(pass2) + int(pass3) + int(pass4) + int(pass5)
+              << "/5" << std::endl;
+
+    return 0;
+    
+ }
+
+
+