diff --git a/.gitignore b/.gitignore
index bb1c2c0..6084b60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
 build/*
 profile/*.[0-9]*
 profile/*/
+.*/*
+release*
+pmcl3d*
+!pmcl3d*h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd29ed6..d01d69a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,22 +1,28 @@
 cmake_minimum_required(VERSION 3.10)
-cmake_policy(SET CMP0074 NEW)
-project(AWP_MINI VERSION 1.0 LANGUAGES C CUDA)
+project(AWP VERSION 1.0 LANGUAGES C CUDA)
+# Default the CUDA architecture list to 90 unless the caller already defined it (e.g. -DCMAKE_CUDA_ARCHITECTURES=80).
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 90)
+endif()
+include(CMakePrintHelpers)
 
 include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-set(GCC_COMPILE_FLAGS "-std=c99 -Wall -Werror\
+set(GCC_COMPILE_FLAGS "-std=c99 -Wall\
    -Wextra -Wmissing-prototypes -Wstrict-prototypes \
-          -Wold-style-definition -Wno-unused-parameter")
-if (DEFINED ENV{ARCH})
-        set(ARCH $ENV{ARCH})
-else()
-        set(ARCH sm_70)
-endif()
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=${ARCH} -Xptxas=-v -lineinfo -use_fast_math")
-
-set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS} -D${ARCH}")
-set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -std=gnu11 -D${ARCH}")
+   -Wno-unused-parameter")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O4 -Xcompiler -std=c++17 -use_fast_math -Xptxas=-v -g -lineinfo --allow-unsupported-compiler")
+set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS} ${GCC_COMPILE_FLAGS}")
+set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -std=c++17")
+#add_compile_definitions(MPICH_SKIP_MPICXX=1)
 
+#### MPI
 find_package(MPI REQUIRED)
+if(MPI_FOUND)  # always true here: find_package(MPI REQUIRED) aborts configuration otherwise
+    cmake_print_variables(MPI_C_INCLUDE_DIRS MY_INCLUDE_PATH)
+    include_directories(${MPI_C_INCLUDE_DIRS} ${MY_INCLUDE_PATH})
+else()
+    message(FATAL_ERROR "This application cannot compile without MPI")
+endif()
 
 
 include(CTest)
diff --git a/compile_ls6_impi.sh b/compile_ls6_impi.sh
new file mode 100644
index 0000000..d515350
--- /dev/null
+++ b/compile_ls6_impi.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Configure and build AWP in ./release on LS6 with Intel MPI (GCC host compiler + CUDA).
+module unload intel
+module load cmake gcc impi cuda
+
+# -f: a missing release/ directory (first build) is not an error.
+rm -rf release
+mkdir -p release
+# Never run cmake/make from the source tree if the build directory is unavailable.
+cd release || exit 1
+export CC=$(which mpigcc)
+export CXX=$(which mpigxx)
+export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/lib:$LD_LIBRARY_PATH
+export CPATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/include:$CPATH
+#export PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/bin:$PATH
+module list
+env | grep "PATH"
+# Only build when configuration succeeded.
+cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. && make
diff --git a/compile_ls6_mv2.sh b/compile_ls6_mv2.sh
new file mode 100644
index 0000000..df48d7c
--- /dev/null
+++ b/compile_ls6_mv2.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Configure and build AWP in ./release on LS6 with MVAPICH2 (GCC host compiler + CUDA).
+module unload intel impi
+module load cmake gcc mvapich2 cuda
+module list
+
+# -f: a missing release/ directory (first build) is not an error.
+rm -rf release
+mkdir -p release
+# Never run cmake/make from the source tree if the build directory is unavailable.
+cd release || exit 1
+#export CC=$(which mpicc)
+#export CXX=$(which mpicxx)
+export LD_LIBRARY_PATH=/opt/apps/gcc11_2/mvapich2/2.3.7/lib:$LD_LIBRARY_PATH
+export CPATH=/opt/apps/gcc11_2/mvapich2/2.3.7/include:$CPATH
+#export LD_LIBRARY_PATH=/.../mvapich2/lib:$LD_LIBRARY_PATH
+#export CPATH=/.../mvapich2/include:$CPATH
+cmake -DCMAKE_VERBOSE_MAKEFILE=ON .. && make
diff --git a/compile_summit.sh b/compile_summit.sh
new file mode 100644
index 0000000..1ef5487
--- /dev/null
+++ b/compile_summit.sh
@@ -0,0 +1,9 @@
+#!/bin/tcsh
+# Configure and build AWP in ./release on Summit.
+# -f: a missing release/ directory (first build) is not an error.
+rm -rf release
+mkdir -p release
+module load cmake gcc cuda
+cd release
+# Only build when configuration succeeded.
+cmake .. && make
diff --git a/compile_vista_mv2.sh b/compile_vista_mv2.sh
new file mode 100644
index 0000000..3cdab07
--- /dev/null
+++ b/compile_vista_mv2.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+
+module unload cmake
+#module unload gcc 
+#ml reset
+#ml nvhpc-hpcx-cuda11/23.7
+#module load intel cmake impi cuda
+module unload openmpi
+module use /scratch/00494/tg457572/packages/modulefiles
+
+module load e4s
+module load mvapich
+module load tau 
+ml cuda/12.4
+
+rm -rf release
+mkdir -p release
+# Never run cmake/make from the source tree if the build directory is unavailable.
+cd release || exit 1
+#export CXX
+
+#export MPI_HOME="/home1/07936/tg872351/mvp-pre-rc-ofi-cuda12.5"
+export MPI_HOME="/home1/07936/tg872351/mvp-pre-rc-zfp-cuda12.4"
+
+export PATH=${MPI_HOME}/bin:$PATH
+export LD_LIBRARY_PATH=${MPI_HOME}/lib:$LD_LIBRARY_PATH
+export CPATH=${MPI_HOME}/include:$CPATH
+export C_INCLUDE_PATH=${MPI_HOME}/include:$C_INCLUDE_PATH
+
+export LD_LIBRARY_PATH=/home1/apps/nvidia/Linux_aarch64/24.7/cuda/12.5/lib64:$LD_LIBRARY_PATH
+
+export CC=$(which mpicc)
+export CXX=$(which mpicxx)
+export FC=$(which mpifort)
+export MPI_C_COMPILER=$(which mpicc)
+export MPI_CXX_COMPILER=$(which mpicxx)
+export MPI_INCLUDE_PATH=${MPI_HOME}/include
+
+
+#export CXX
+#export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/lib:$LD_LIBRARY_PATH
+#export CPATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/include:$CPATH
+#export C_INCLUDE_PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/include:$C_INCLUDE_PATH
+#export PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/bin:$PATH
+
+echo -e "\n"
+echo "======== PATH=============="
+echo $PATH | tr : '\n'
+
+echo -e "\n"
+echo "======== INCLUDE =============="
+echo $INCLUDE | tr : '\n'
+
+echo -e "\n"
+echo "======== LD_LIBRARY_PATH=============="
+echo $LD_LIBRARY_PATH | tr : '\n'
+
+module list
+echo -e "mpicc = `which mpicc`"
+echo ""
+
+echo "LD_PRELOAD=$LD_PRELOAD"
+
+#export MPI_HOME=${TACC_MPI_DIR}
+
+echo "TACC_IMPI_INC=$TACC_IMPI_INC"
+echo "MPI_HOME=$MPI_HOME"
+
+# Quote expansions so empty or space-containing paths stay single arguments; build only if configure succeeds.
+cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_C_COMPILER="$(which mpicc)" -DCMAKE_CXX_COMPILER="$(which mpicxx)" -DMY_INCLUDE_PATH="$MPI_INCLUDE_PATH" .. \
+    && make
diff --git a/compile_vista_openmpi.sh b/compile_vista_openmpi.sh
new file mode 100644
index 0000000..220e255
--- /dev/null
+++ b/compile_vista_openmpi.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+
+module unload cmake
+#module unload gcc 
+#ml reset
+#ml nvhpc-hpcx-cuda11/23.7
+#module load intel cmake impi cuda
+
+rm -rf release
+mkdir -p release
+# Never run cmake/make from the source tree if the build directory is unavailable.
+cd release || exit 1
+export CC=$(which mpicc)
+export CXX=$(which mpicxx)
+export FC=$(which mpifort)
+export MPI_C_COMPILER=$(which mpicc)
+export MPI_INCLUDE_PATH=${TACC_MPI_DIR}/include
+#export CXX
+#export LD_LIBRARY_PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/lib:$LD_LIBRARY_PATH
+#export CPATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/include:$CPATH
+#export C_INCLUDE_PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/include:$C_INCLUDE_PATH
+#export PATH=/opt/intel/compilers_and_libraries_2020.4.304/linux/mpi/intel64/bin:$PATH
+
+echo -e "\n"
+echo "======== PATH=============="
+echo $PATH | tr : '\n'
+
+echo -e "\n"
+echo "======== INCLUDE =============="
+echo $INCLUDE | tr : '\n'
+
+echo -e "\n"
+echo "======== LD_LIBRARY_PATH=============="
+echo $LD_LIBRARY_PATH | tr : '\n'
+
+module list
+echo -e "mpicc = `which mpicc`"
+echo ""
+
+echo "LD_PRELOAD=$LD_PRELOAD"
+
+export MPI_HOME=${TACC_MPI_DIR}
+
+echo "TACC_IMPI_INC=$TACC_IMPI_INC"
+echo "MPI_HOME=$MPI_HOME"
+
+# Quote expansions so empty or space-containing paths stay single arguments; build only if configure succeeds.
+cmake -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_C_COMPILER="$(which mpicc)" -DCMAKE_CXX_COMPILER="$(which mpicxx)" -DMY_INCLUDE_PATH="$MPI_INCLUDE_PATH" .. \
+    && make
diff --git a/include/awp/definitions.h b/include/awp/definitions.h
index 5a530d7..99774f6 100644
--- a/include/awp/definitions.h
+++ b/include/awp/definitions.h
@@ -1,10 +1,6 @@
 #ifndef DEFINITIONS_H
 #define DEFINITIONS_H
 
-#define BLOCK_SIZE_X 2
-#define BLOCK_SIZE_Y 2
-#define BLOCK_SIZE_Z 4
-
 #include <mpi.h>
 
 #ifndef _prec
@@ -29,11 +25,11 @@ typedef float prec;
 #endif
 
 #ifndef ngsl
-#define ngsl 8
+#define ngsl 4
 #endif
 
 #ifndef ngsl2
-#define ngsl2 16
+#define ngsl2 8
 #endif
 
 #ifndef align
@@ -47,6 +43,7 @@ typedef float prec;
 #define STR_LEN 2048
 
 
+
 typedef struct
 {
         _prec x, y, z;
diff --git a/include/awp/error.h b/include/awp/error.h
index 6b6d8a6..b0632f0 100644
--- a/include/awp/error.h
+++ b/include/awp/error.h
@@ -1,8 +1,6 @@
 #ifndef ERROR_H
 #define ERROR_H
 
-int _last_error;
-
 enum error_codes {SUCCESS, 
                   ERR_FILE_OPEN = 100, 
                   ERR_FILE_READ = 101, 
diff --git a/include/awp/pmcl3d.h b/include/awp/pmcl3d.h
index 46f3c38..38467c0 100644
--- a/include/awp/pmcl3d.h
+++ b/include/awp/pmcl3d.h
@@ -36,7 +36,9 @@ void command(int argc, char **argv, _prec *TMAX, _prec *DH, _prec *DT,
              int *USETOPO, char *SOURCEFILE,
              int *USESOURCEFILE, char *RECVFILE, int *USERECVFILE,
              char *FORCEFILE, int *USEFORCEFILE,
-             char *SGTFILE, int *USESGTFILE);
+             char *SGTFILE, int *USESGTFILE, char *MMSFILE, int *USEMMSFILE, float *DHB, float *DHT,
+             char *ENERGYFILE, int *USEENERGYFILE, 
+             _prec *QSI, _prec *QPQSR, _prec *MAXVPVSR, _prec *VMIN, _prec *VMAX, _prec *DMIN);
 
 int read_src_ifault_2(int rank, int READ_STEP, 
     char *INSRC, char *INSRC_I2, 
@@ -72,6 +74,7 @@ void inimesh(int rank, int MEDIASTART, Grid3D d1, Grid3D mu, Grid3D lam, Grid3D
 	     Grid3D tau, Grid3D weights,Grid1D coeff,
 	     int nvar, _prec FP,  _prec FAC, _prec Q0, _prec EX, int nxt, int nyt, int nzt, int PX, int PY, int NX, int NY,
              int NZ, int *coords, MPI_Comm MCW, int IDYNA, int NVE, int SoCalQ, char *INVEL,
+            _prec qsi, _prec qpqsr, _prec maxvpvsr, _prec vmin, _prec vmax, _prec dmin,
              _prec *vse, _prec *vpe, _prec *dde);
 
 int checkmesh(int nxtl, int nytl, int nztl, int nxth, int nyth, int nzth, Grid3D varl, Grid3D varh,
@@ -109,7 +112,7 @@ void init_texture(int nxt,  int nyt,  int nzt,  Grid3D tau1,  Grid3D tau2,  Grid
 
 Grid3D Alloc3D(int nx, int ny, int nz);
 Grid3Dww Alloc3Dww(int nx, int ny, int nz); 
-Grid1D Alloc1D(int nx);    
+Grid1D Alloc1D(long nx);    
 PosInf Alloc1P(int nx);
 
 void Delloc3D(Grid3D U);
diff --git a/include/awp/pmcl3d_cons.h b/include/awp/pmcl3d_cons.h
index 8808c48..059fbc3 100644
--- a/include/awp/pmcl3d_cons.h
+++ b/include/awp/pmcl3d_cons.h
@@ -1,7 +1,7 @@
 #ifndef DEFINITIONS_H
 #define BLOCK_SIZE_X 2
 #define BLOCK_SIZE_Y 2
-#define BLOCK_SIZE_Z 4
+#define BLOCK_SIZE_Z 32
 #endif
 // Set floating-point precision. Make sure to configure both `_prec` and
 // `_mpi_prec`.
@@ -15,12 +15,9 @@
 #endif
 #define align 32
 #define loop  1 
-// Number of ghost cells is increased from 4 to 8 for topography kernels.
-// In the future, it should be possible to keep this number at four, but modify
-// the vertical velocity exchange so that 6 points is exchanged instead of 4.
-// No modifications necessary to the other velocity components. 
-#define ngsl 8     /* number of ghost cells x loop */
-#define ngsl2 16  /* ngsl * 2 */
+// Do not change the number of ghost cells.
+#define ngsl 4     /* number of ghost cells x loop */
+#define ngsl2 8  /* ngsl * 2 */
 
 #define Both  0
 #define Left  1
diff --git a/include/buffers/buffer.h b/include/buffers/buffer.h
index 5d826b7..a4c2f32 100644
--- a/include/buffers/buffer.h
+++ b/include/buffers/buffer.h
@@ -214,7 +214,7 @@ void buffer_copy_to_device(buffer_t *buffer, size_t step);
  *      buffer: Buffer data structure.
  *      step: Time step to query buffer at.
  */
-void buffer_copy_to_host(buffer_t *buffer, int step);
+void buffer_copy_to_host(buffer_t *buffer, size_t step);
 
 #ifdef __cplusplus
 }
diff --git a/include/grid/grid_3d.h b/include/grid/grid_3d.h
index 521e24e..fc002fe 100644
--- a/include/grid/grid_3d.h
+++ b/include/grid/grid_3d.h
@@ -131,6 +131,11 @@ grid3_t grid_init_metric_grid(const int3_t size, const int3_t shift,
                          const int3_t boundary2,
                          const _prec gridspacing);
 
+grid3_t grid_init_full_grid(const int3_t size, const int3_t shift,
+                         const int3_t coordinate, const int3_t boundary1,
+                         const int3_t boundary2,
+                         const _prec gridspacing);
+
 /* Initialize grid
  * 
  * Input arguments:
@@ -184,11 +189,29 @@ grid1_t grid_grid1_z(const grid3_t grid);
  *      out: Array to fill
  *      n: Array size. Must be greater than the grid size.
  *      grid: 1D grid data structure.
+ *      isxdir: Set to `1` if the grid should be filled in the x-direction.
+ *          Adjusts for the particular internal coordinate system used by AWP, i.e., fields stored
+ *          in the (-,+,+) octant
+ *
+ * Return value:
+ *      Number of elements written.
+ */ 
+int grid_fill1(prec *out, const grid1_t grid, const int isxdir);
+/*
+ *
+ * Fill the array `out` with the grid point values in the y-direction for a given DM block
+ * ('blocknum') in one dimension.
+ *
+ * Arguments:
+ *      out: Array to fill. Its length is not passed in; it must be greater than
+ *          the grid size.
+ *      grid: 1D grid data structure.
+ *      blocknum: Block number. Must be a non-negative integer.
  *
  * Return value:
  *      Number of elements written.
  */ 
-int grid_fill1(prec *out, const grid1_t grid);
+int grid_fill_y_dm(prec *out, const grid1_t grid, const int blocknum);
 
 /*
  * Check if a query point is in bounds or not. The query point is in bounds if
@@ -204,8 +227,8 @@ int grid_fill1(prec *out, const grid1_t grid);
  */
 int grid_in_bounds1(const _prec *x, const _prec q, const grid1_t grid);
 
-int grid_in_bounds_ext1(const _prec *x, const _prec q, const grid1_t grid);
-
+int grid_in_bounds_sgt(const _prec *x, const _prec q, const grid1_t grid);
+int grid_in_bounds_moment_tensor(const _prec *x, const _prec q, const grid1_t grid);
 
 /*
  * Fill the array `out` with the grid point values in the x-direction of a grid
@@ -308,6 +331,13 @@ int grid_pow3(_prec *out, const _prec p, const grid3_t grid);
  */
 double grid_reduce3(const _prec *in, const grid3_t grid);
 
+_prec grid_overlap(const _prec h);
+_prec grid_height(const int nz, const _prec h, const int istopo);
+
+void global_to_local(_prec *zloc, int *block_index, const _prec z,
+                     const _prec h, const int *nz, const int num_grids,
+                     const int istopo);
+
 
 #ifdef __cplusplus
 }
diff --git a/include/interpolation/interpolation.h b/include/interpolation/interpolation.h
index c4b5ef9..e7dae04 100644
--- a/include/interpolation/interpolation.h
+++ b/include/interpolation/interpolation.h
@@ -4,7 +4,7 @@
 extern "C" {
 #endif
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <grid/grid_3d.h>
 
 /* 
@@ -55,9 +55,8 @@ int interp_grid_argnearest(int *nearest, const prec *x, const prec q,
  *       last: and last index + 1 of grid points in stencil. (output)
  *       lower: Number of points to the left.
  *       upper: Number of points to the right.
- *       nearest: Index of grid point nearest to query point. (output)
+ *       nearest: Index of grid point nearest to query point. 
  *       n: Number of grid points.
- *       q: Query point.
  *
  *   Returns:
  *       Error code (SUCCESS, ERR_OUT_OF_BOUNDS_LOWER, ERR_OUT_OF_BOUNDS_UPPER)
@@ -66,7 +65,7 @@ int interp_grid_argnearest(int *nearest, const prec *x, const prec q,
 int interp_argnearest_range(int *first, int *last,
                             const int lower, const int upper,
                             const int nearest,
-                            const int n, const prec query);
+                            const int n);
 
 /* Perform 1D Lagrange interpolation by interpolating in the neighborhood of
  * each query point. 
@@ -114,6 +113,11 @@ int interp_lagrange3(prec *out, const prec *in, const prec *x, const prec *y,
                      const int deg);
 
 
+// Get number of points to the left (see interp_argnearest_range) 
+int interp_get_lower(const prec xnearest, const prec query, const int deg);
+// Get number of points to the right (see interp_argnearest_range) 
+int interp_get_upper(const prec xnearest, const prec query, const int deg);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/mpi/distribute.h b/include/mpi/distribute.h
index a2957bb..7197562 100644
--- a/include/mpi/distribute.h
+++ b/include/mpi/distribute.h
@@ -3,14 +3,24 @@
 
 #include <awp/definitions.h>
 #include <grid/grid_3d.h>
+#include <topography/grids.h>
+#include <topography/sources/source.h>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
+#
+enum dist_options {DIST_COUNT, DIST_INSERT_INDICES};
 
 int dist_indices(int **indices, size_t *nidx, const prec *qx, const prec *qy,
-                 const size_t n, grid3_t grid);
+                 const size_t n, const grid3_t grid, const int *grid_numbers,
+                 const int grid_number, const enum source_type st, const enum dist_options mode);
 
+int dist_indices_in_bounds(const prec qx, const prec qy,
+                           const prec *x, const size_t mx, 
+                           const prec *y, const size_t my,
+                           const prec hx, const prec hy,
+                           const enum source_type st);
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/readers/error.h b/include/readers/error.h
deleted file mode 100644
index 74c22d3..0000000
--- a/include/readers/error.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef READERS_ERROR_H
-#define READERS_ERROR_H
-
-int _last_error;
-
-enum error_codes {SUCCESS, 
-                  ERR_FILE_OPEN = 100, 
-                  ERR_FILE_READ = 101, 
-                  ERR_FILE_WRITE = 102, 
-                  ERR_GET_VERSION = 200, 
-                  ERR_WRONG_VERSION = 201, 
-                  ERR_BROADCAST_VERSION = 202, 
-                  ERR_CONFIG_PARSE_SIZES = 300,
-                  ERR_CONFIG_SIZE_OVERFLOW = 301,
-                  ERR_CONFIG_DATA_FILENAME = 302,
-                  ERR_CONFIG_DATA_WRITEABLE = 303,
-                  ERR_CONFIG_DATA_MALLOC = 304,
-                  ERR_CONFIG_DATA_READ_ELEMENT = 305,
-                  ERR_CONFIG_BROADCAST = 306,
-                  ERR_CONFIG_DATA_SIZE = 307,
-                  ERR_CONFIG_PARSE_ARG = 308,
-                  ERR_CONFIG_PARSE_UNKNOWN_ARG = 309,
-                  ERR_CONFIG_PARSE_WRONG_DIMENSION = 310,
-                  ERR_CONFIG_PARSE_NOT_DIVISIBLE = 311,
-};
-// Display the error message associated with an error code.
-const char* error_message(const int err);
-void error_print(const int err);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/include/topography/energy.cuh b/include/topography/energy.cuh
new file mode 100644
index 0000000..d206c4e
--- /dev/null
+++ b/include/topography/energy.cuh
@@ -0,0 +1,187 @@
+#ifndef ENERGY_CUH
+#define ENERGY_CUH
+
+
+#include <mpi.h>
+#include <stdio.h>
+#include <awp/pmcl3d_cons.h>
+#include <topography/metrics/metrics.h>
+
+typedef struct {  // Bookkeeping for energy-rate output (filled in by energy_init)
+    int use;        // 1 when energy output is enabled, 0 otherwise
+    int rank;       // MPI rank passed to energy_init; -1 when disabled
+    MPI_Comm comm;  // communicator passed to energy_init
+    double *time;                 // host array; first column written by energy_output
+    double *kinetic_energy_rate;  // host array; second column written by energy_output
+    double *strain_energy_rate;   // host array; third column written by energy_output
+    double *kinetic_rate;         // single double allocated with cudaMalloc
+    double *strain_rate;          // single double allocated with cudaMalloc
+    int num_steps;     // (num_steps - 1) / stride + 1, as computed by energy_init
+    double dt;         // time step passed to energy_init
+    size_t num_bytes;  // size in bytes of each d_*p buffer below
+    // Copies of velocity components at previous time step
+    float *d_vxp;
+    float *d_vyp;
+    float *d_vzp;
+
+    // Copies of stress components at previous time step
+    float *d_xxp;
+    float *d_yyp;
+    float *d_zzp;
+    float *d_xyp;
+    float *d_xzp;
+    float *d_yzp;
+
+    // current output index
+    int index;
+    // How often to write to buffer
+    int stride;
+
+} energy_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // only the linkage specifier is C++-specific; the prototype below stays visible to C as well
+void energy_rate(energy_t *e, const int step, const float *d_vx, const float *d_vy,
+                 const float *d_vz, const float *d_xx, const float *d_yy,
+                 const float *d_zz, const float *d_xy, const float *d_xz,
+                 const float *d_yz, const float *d_rho, const float *d_mui,
+                 const float *d_lami, const f_grid_t *metrics_f, const g_grid_t *metrics_g,
+                 const int nx, const int ny, const int nz);
+#ifdef __cplusplus
+}
+#endif
+
+// Set up energy-rate output. Returns a disabled object (use = 0, rank = -1) when `useenergy` is 0;
+// otherwise allocates the host time series and the device copies of the previous-step fields.
+energy_t energy_init(int useenergy, const int rank, const MPI_Comm comm, const int num_steps, const float dt, const int nx, const int ny, const int nz, const int stride) {
+    energy_t energy;
+    energy.use = 0;
+    energy.rank = -1;
+    if (!useenergy) return energy;
+    energy.use = 1;
+
+    if (rank == 0)
+    printf("Energy output:: enabled\n");
+
+    energy.rank = rank;
+    energy.comm = comm;
+    energy.num_steps = (num_steps - 1) / stride + 1;
+    energy.index = 0;
+    energy.stride = stride;
+    energy.dt = dt;
+    energy.time = (double*)malloc(sizeof (double) * num_steps);
+    energy.kinetic_energy_rate = (double*)malloc(sizeof (double) * num_steps);
+    energy.strain_energy_rate = (double*)malloc(sizeof (double) * num_steps);
+    // Widen to size_t before multiplying: the int product of the three extents overflows on large grids.
+    size_t num_bytes = (size_t)(nx + 2 * ngsl + 4) * (size_t)(ny + 2 * ngsl + 4) * (size_t)(nz + 2 * align) * sizeof(float);
+    energy.num_bytes = num_bytes;
+    CUCHK(cudaMalloc((void**)&energy.d_vxp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_vyp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_vzp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_xxp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_yyp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_zzp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_xyp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_xzp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.d_yzp, num_bytes));
+    CUCHK(cudaMalloc((void**)&energy.kinetic_rate, sizeof(double)));
+    CUCHK(cudaMalloc((void**)&energy.strain_rate, sizeof(double)));
+
+    return energy;
+}
+
+// Copy the current velocity and stress fields (device to device, e->num_bytes each) into the
+// previous-step buffers of `e`. Does nothing when energy output is disabled.
+void energy_update_previous_solutions(energy_t *e, float *d_vx, float *d_vy, float *d_vz, float *d_xx, float *d_yy, float *d_zz, float *d_xy, float *d_xz, float *d_yz) {
+    if (!e->use) return;
+    CUCHK(cudaMemcpy(e->d_vxp, d_vx, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_vyp, d_vy, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_vzp, d_vz, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_xxp, d_xx, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_yyp, d_yy, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_zzp, d_zz, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_xyp, d_xy, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_xzp, d_xz, e->num_bytes, cudaMemcpyDeviceToDevice));
+    CUCHK(cudaMemcpy(e->d_yzp, d_yz, e->num_bytes, cudaMemcpyDeviceToDevice));
+}
+
+// Currently a no-op: every cudaMemset below is commented out, so none of the arguments is touched.
+void energy_zero(energy_t *e, float *d_vx, float *d_vy, float *d_vz, float *d_xx, float *d_yy, float *d_zz, float *d_xy, float *d_xz, float *d_yz, int mode) {
+        //cudaMemset(d_vx, 0, e->num_bytes);
+        //cudaMemset(d_vy, 0, e->num_bytes);
+        //cudaMemset(d_vz, 0, e->num_bytes);
+        //cudaMemset(d_xx, 0, e->num_bytes);
+        //cudaMemset(d_yy, 0, e->num_bytes);
+        //cudaMemset(d_zz, 0, e->num_bytes);
+        //cudaMemset(d_xy, 0, e->num_bytes);
+        //cudaMemset(d_xz, 0, e->num_bytes);
+        //cudaMemset(d_yz, 0, e->num_bytes);
+
+    //if (mode == 0) {
+    //    cudaMemset(d_vx, 0, e->num_bytes);
+    //    cudaMemset(d_vy, 0, e->num_bytes);
+    //    cudaMemset(d_vz, 0, e->num_bytes);
+    //    cudaMemset(d_xx, 0, e->num_bytes);
+    //    cudaMemset(d_yy, 0, e->num_bytes);
+    //    cudaMemset(d_zz, 0, e->num_bytes);
+    //    cudaMemset(d_xy, 0, e->num_bytes);
+    //    cudaMemset(d_xz, 0, e->num_bytes);
+    //    cudaMemset(d_yz, 0, e->num_bytes);
+    //}
+
+    //if (mode == 1) {
+    //    cudaMemset(d_xx, 0, e->num_bytes);
+    //    cudaMemset(d_yy, 0, e->num_bytes);
+    //    cudaMemset(d_zz, 0, e->num_bytes);
+    //    cudaMemset(d_xy, 0, e->num_bytes);
+    //    cudaMemset(d_xz, 0, e->num_bytes);
+    //    cudaMemset(d_yz, 0, e->num_bytes);
+    //}
+}
+
+// Store (double)step in both rate arrays at index `step`; no-op when disabled or step >= e->num_steps.
+// NOTE(review): writing the step number itself looks like a placeholder - confirm the intended value.
+void energy_kinetic_rate(energy_t *e, int step) {
+    if (!e->use || step >= e->num_steps) return;
+    e->kinetic_energy_rate[step] = (double)step;
+    e->strain_energy_rate[step] = (double)step;
+}
+
+// Write the recorded samples to `filename` (rank 0 only, no-op when disabled). Each of the
+// e->index rows holds: time, kinetic rate, strain rate, and the sum of the two rates.
+void energy_output(energy_t *e, const char *filename) {
+    if (!e->use || e->rank != 0) return;
+    FILE *fh = fopen(filename, "w");
+    if (fh == NULL) {  // writing to or closing a NULL stream is undefined behavior
+        fprintf(stderr, "Unable to open energy output file: %s\n", filename);
+        return;
+    }
+    printf("Writing energy output\n");
+    for (int i = 0; i < e->index; ++i)
+        fprintf(fh, "%g %g %g %g \n", e->time[i],
+                e->kinetic_energy_rate[i], e->strain_energy_rate[i],
+                e->kinetic_energy_rate[i] + e->strain_energy_rate[i]);
+    fclose(fh);
+    printf("Energy output written to: %s number of steps written: %d \n", filename, e->index);
+}
+
+void energy_free(energy_t *e) {  // Release everything energy_init allocated
+    if (!e->use) return;  // nothing was allocated when energy output is disabled
+    free(e->time);  // host arrays obtained with malloc
+    free(e->kinetic_energy_rate);
+    free(e->strain_energy_rate);
+    CUCHK(cudaFree(e->d_vxp));  // buffers obtained with cudaMalloc
+    CUCHK(cudaFree(e->d_vyp));
+    CUCHK(cudaFree(e->d_vzp));
+    CUCHK(cudaFree(e->d_xxp));
+    CUCHK(cudaFree(e->d_yyp));
+    CUCHK(cudaFree(e->d_zzp));
+    CUCHK(cudaFree(e->d_xyp));
+    CUCHK(cudaFree(e->d_xzp));
+    CUCHK(cudaFree(e->d_yzp));
+    CUCHK(cudaFree(e->strain_rate));
+    CUCHK(cudaFree(e->kinetic_rate));
+}
+
+#endif
diff --git a/include/topography/geometry/geometry.h b/include/topography/geometry/geometry.h
index e16a1e8..70dd421 100644
--- a/include/topography/geometry/geometry.h
+++ b/include/topography/geometry/geometry.h
@@ -6,10 +6,12 @@
 #define GEOMETRY_H
 
 #include <topography/metrics/metrics.h>
+#include <topography/mapping.h>
 #include <grid/grid_3d.h>
 
 void geom_cartesian_topography(f_grid_t *metrics_f);
 void geom_no_grid_stretching(g_grid_t *metrics_g);
+void geom_grid_stretching(g_grid_t *metrics_g, const struct mapping *map, const _prec block_height);
 
 void geom_gaussian(f_grid_t *metrics_f, const _prec *x, const _prec *y,
                    const fcn_grid_t grid, const _prec amplitude,
diff --git a/include/topography/grids.h b/include/topography/grids.h
index 89b1303..e082a07 100644
--- a/include/topography/grids.h
+++ b/include/topography/grids.h
@@ -27,7 +27,7 @@ typedef struct
         grid3_t node;
 } grids_t;
 
-enum grid_types {X, Y, Z, XX, YY, ZZ, XY, XZ, YZ, NODE};
+enum grid_types {X, Y, Z, SX, SY, SZ, XX, YY, ZZ, XY, XZ, YZ, NODE};
 
 grids_t grids_init(const int nx, const int ny, const int nz, const int coord_x,
                    const int coord_y, const int coord_z,
@@ -36,10 +36,12 @@ grids_t grids_init(const int nx, const int ny, const int nz, const int coord_x,
 
 void grids_finalize(grids_t *grids);
 
-void grid_data_init(grid_data_t *grid_data, const grid3_t grid);
+void grid_data_init(grid_data_t *grid_data, const grid3_t grid, const int block_number);
 void grid_data_free(grid_data_t *grid_data);
 grid3_t grids_select(const enum grid_types grid_type, const grids_t *grids);
 
+const char *grid_typename(const enum grid_types gt);
+
 
 
 #endif
diff --git a/include/topography/kernels/optimized_launch_config.cuh b/include/topography/kernels/optimized_launch_config.cuh
deleted file mode 100644
index b10ab18..0000000
--- a/include/topography/kernels/optimized_launch_config.cuh
+++ /dev/null
@@ -1,192 +0,0 @@
-#ifndef _OPT_TOPOGRAPHY_LAUNCH_CONFIG_H
-#define _OPT_TOPOGRAPHY_LAUNCH_CONFIG_H
-
-// Number of threads per block to use for interior velocity kernel
-#ifndef VEL_INT_X
-#define VEL_INT_X 64
-#endif
-#ifndef VEL_INT_Y
-#define VEL_INT_Y 4
-#endif
-#ifndef VEL_INT_Z
-#define VEL_INT_Z 4
-#endif
-
-// Number of threads per block to use for boundary velocity kernel
-#ifndef VEL_BND_X
-#define VEL_BND_X 7
-#endif
-#ifndef VEL_BND_Y
-#define VEL_BND_Y 8
-#endif
-#ifndef VEL_BND_Z
-#define VEL_BND_Z 1
-#endif
-
-// Number of threads per block to use for interior stress kernel
-#ifndef STR_INT_X
-#define STR_INT_X 32
-#endif
-#ifndef STR_INT_Y
-#define STR_INT_Y 4
-#endif
-#ifndef STR_INT_Z
-#define STR_INT_Z 1
-#endif
-
-// Number of threads per block to use for boundary stress kernel
-#ifndef STR_BND_X
-#define STR_BND_X 7
-#endif
-#ifndef STR_BND_Y
-#define STR_BND_Y 8
-#endif
-#ifndef STR_BND_Z
-#define STR_BND_Z 1
-#endif
-
-// Kernel naming convention
-// 110: Bottom boundary (only used in debug mode)
-// 111: Interior
-// 112: Top boundary
-
-
-// Number of threads per block
-// grid dimension (X, Y, Z) refers to CUDA grid indices
-#ifndef DTOPO_VEL_110_X
-#define DTOPO_VEL_110_X VEL_BND_X
-#endif
-#ifndef DTOPO_VEL_110_Y
-#define DTOPO_VEL_110_Y VEL_BND_Y
-#endif
-#ifndef DTOPO_VEL_110_Z
-#define DTOPO_VEL_110_Z VEL_BND_Z
-#endif
-
-#ifndef DTOPO_VEL_111_X
-#define DTOPO_VEL_111_X VEL_INT_X
-#endif
-#ifndef DTOPO_VEL_111_Y
-#define DTOPO_VEL_111_Y VEL_INT_Y
-#endif
-#ifndef DTOPO_VEL_111_Z
-#define DTOPO_VEL_111_Z VEL_INT_Z
-#endif
-
-#ifndef DTOPO_VEL_112_X
-#define DTOPO_VEL_112_X VEL_BND_X
-#endif
-#ifndef DTOPO_VEL_112_Y
-#define DTOPO_VEL_112_Y VEL_BND_Y
-#endif
-#ifndef DTOPO_VEL_112_Z
-#define DTOPO_VEL_112_Z VEL_BND_Z
-#endif
-
-#ifndef DTOPO_BUF_VEL_111_X
-#define DTOPO_BUF_VEL_111_X VEL_INT_X
-#endif
-#ifndef DTOPO_BUF_VEL_111_Y
-#define DTOPO_BUF_VEL_111_Y VEL_INT_Y
-#endif
-#ifndef DTOPO_BUF_VEL_111_Z
-#define DTOPO_BUF_VEL_111_Z VEL_INT_Z
-#endif
-
-#ifndef DTOPO_BUF_VEL_112_X
-#define DTOPO_BUF_VEL_112_X VEL_BND_X
-#endif
-#ifndef DTOPO_BUF_VEL_112_Y
-#define DTOPO_BUF_VEL_112_Y VEL_BND_Y
-#endif
-#ifndef DTOPO_BUF_VEL_112_Z
-#define DTOPO_BUF_VEL_112_Z VEL_BND_Z
-#endif
-
-#ifndef DTOPO_BUF_VEL_110_X
-#define DTOPO_BUF_VEL_110_X VEL_BND_X
-#endif
-#ifndef DTOPO_BUF_VEL_110_Y
-#define DTOPO_BUF_VEL_110_Y VEL_BND_Y
-#endif
-#ifndef DTOPO_BUF_VEL_110_Z
-#define DTOPO_BUF_VEL_110_Z VEL_BND_Z
-#endif
-
-#ifndef DTOPO_STR_110_X
-#define DTOPO_STR_110_X STR_INT_X
-#endif
-
-#ifndef DTOPO_STR_110_Y
-#define DTOPO_STR_110_Y STR_INT_Y
-#endif
-
-#ifndef DTOPO_STR_110_Z
-#define DTOPO_STR_110_Z STR_INT_Z
-#endif
-
-#ifndef DTOPO_STR_111_X
-#define DTOPO_STR_111_X STR_INT_X
-#endif
-
-#ifndef DTOPO_STR_111_Y
-#define DTOPO_STR_111_Y STR_INT_Y
-#endif
-
-#ifndef DTOPO_STR_111_Z
-#define DTOPO_STR_111_Z STR_INT_Z
-#endif
-
-#ifndef DTOPO_STR_112_X
-#define DTOPO_STR_112_X STR_INT_X
-#endif
-
-#ifndef DTOPO_STR_112_Y
-#define DTOPO_STR_112_Y STR_INT_Y
-#endif
-
-#ifndef DTOPO_STR_112_Z
-#define DTOPO_STR_112_Z STR_INT_Z
-#endif
-
-
-// Launch bounds
- 
-#ifndef DTOPO_VEL_110_MAX_THREADS_PER_BLOCK
-#define DTOPO_VEL_110_MAX_THREADS_PER_BLOCK 32
-#endif
-
-#ifndef DTOPO_VEL_111_MAX_THREADS_PER_BLOCK
-#define DTOPO_VEL_111_MAX_THREADS_PER_BLOCK 1024
-#endif
-
-#ifndef DTOPO_VEL_112_MAX_THREADS_PER_BLOCK
-#define DTOPO_VEL_112_MAX_THREADS_PER_BLOCK 64
-#endif
-
-#ifndef DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK
-#define DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK 1024
-#endif
-
-#ifndef DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK
-#define DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK 1024
-#endif
-
-#ifndef DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK
-#define DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK 1024
-#endif
-
-// Apply loop in kernel
-// This option must be compatible with the kernel. If there is no loop in the
-// kernel, turn off this option, and vice versa.
-#define DTOPO_VEL_110_LOOP_Z 1
-#define DTOPO_VEL_111_LOOP_Z 0
-#define DTOPO_VEL_112_LOOP_Z 0
-#define DTOPO_BUF_VEL_110_LOOP_Z 1
-#define DTOPO_BUF_VEL_111_LOOP_Z 0
-#define DTOPO_BUF_VEL_112_LOOP_Z 0
-#define DTOPO_STR_110_LOOP_Z 1
-#define DTOPO_STR_111_LOOP_Z 1
-#define DTOPO_STR_112_LOOP_Z 1
-
-#endif
diff --git a/include/topography/kernels/optimized_stress.cuh b/include/topography/kernels/optimized_stress.cuh
deleted file mode 100644
index 9059a68..0000000
--- a/include/topography/kernels/optimized_stress.cuh
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef OPTIMIZED_STRESS_H
-#define OPTIMIZED_STRESS_H
-#include <awp/definitions.h>
-#include <math.h>
-
-__global__ void dtopo_str_110(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej);
-__global__ void dtopo_str_111(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej);
-__global__ void dtopo_str_112(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej);
-__global__ void dtopo_init_material_111(float *__restrict__ lami,
-                                        float *__restrict__ mui,
-                                        float *__restrict__ rho, const int nx,
-                                        const int ny, const int nz);
-#endif
\ No newline at end of file
diff --git a/include/topography/kernels/optimized_velocity.cuh b/include/topography/kernels/optimized_velocity.cuh
deleted file mode 100644
index efe3a5e..0000000
--- a/include/topography/kernels/optimized_velocity.cuh
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef OPTIMIZED_VELOCITY_H
-#define OPTIMIZED_VELOCITY_H
-#include <awp/definitions.h>
-#include <math.h>
-
-__global__ void
-dtopo_vel_110(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f, const float *__restrict__ f1_1,
-              const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-              const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-              const float *__restrict__ f2_c, const float *__restrict__ f_1,
-              const float *__restrict__ f_2, const float *__restrict__ f_c,
-              const float *__restrict__ g, const float *__restrict__ g3,
-              const float *__restrict__ g3_c, const float *__restrict__ g_c,
-              const float *__restrict__ rho, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej);
-__global__ void
-dtopo_vel_111(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f, const float *__restrict__ f1_1,
-              const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-              const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-              const float *__restrict__ f2_c, const float *__restrict__ f_1,
-              const float *__restrict__ f_2, const float *__restrict__ f_c,
-              const float *__restrict__ g, const float *__restrict__ g3,
-              const float *__restrict__ g3_c, const float *__restrict__ g_c,
-              const float *__restrict__ rho, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej);
-__global__ void
-dtopo_vel_112(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f, const float *__restrict__ f1_1,
-              const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-              const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-              const float *__restrict__ f2_c, const float *__restrict__ f_1,
-              const float *__restrict__ f_2, const float *__restrict__ f_c,
-              const float *__restrict__ g, const float *__restrict__ g3,
-              const float *__restrict__ g3_c, const float *__restrict__ g_c,
-              const float *__restrict__ rho, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej);
-__global__ void dtopo_buf_vel_110(
-    float *__restrict__ buf_u1, float *__restrict__ buf_u2,
-    float *__restrict__ buf_u3, const float *__restrict__ dcrjx,
-    const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-    const float *__restrict__ f, const float *__restrict__ f1_1,
-    const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-    const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-    const float *__restrict__ f2_c, const float *__restrict__ f_1,
-    const float *__restrict__ f_2, const float *__restrict__ f_c,
-    const float *__restrict__ g, const float *__restrict__ g3,
-    const float *__restrict__ g3_c, const float *__restrict__ g_c,
-    const float *__restrict__ rho, const float *__restrict__ s11,
-    const float *__restrict__ s12, const float *__restrict__ s13,
-    const float *__restrict__ s22, const float *__restrict__ s23,
-    const float *__restrict__ s33, const float *__restrict__ u1,
-    const float *__restrict__ u2, const float *__restrict__ u3, const float a,
-    const float nu, const int nx, const int ny, const int nz, const int bj,
-    const int ej, const int rj0);
-__global__ void dtopo_buf_vel_111(
-    float *__restrict__ buf_u1, float *__restrict__ buf_u2,
-    float *__restrict__ buf_u3, const float *__restrict__ dcrjx,
-    const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-    const float *__restrict__ f, const float *__restrict__ f1_1,
-    const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-    const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-    const float *__restrict__ f2_c, const float *__restrict__ f_1,
-    const float *__restrict__ f_2, const float *__restrict__ f_c,
-    const float *__restrict__ g, const float *__restrict__ g3,
-    const float *__restrict__ g3_c, const float *__restrict__ g_c,
-    const float *__restrict__ rho, const float *__restrict__ s11,
-    const float *__restrict__ s12, const float *__restrict__ s13,
-    const float *__restrict__ s22, const float *__restrict__ s23,
-    const float *__restrict__ s33, const float *__restrict__ u1,
-    const float *__restrict__ u2, const float *__restrict__ u3, const float a,
-    const float nu, const int nx, const int ny, const int nz, const int bj,
-    const int ej, const int rj0);
-__global__ void dtopo_buf_vel_112(
-    float *__restrict__ buf_u1, float *__restrict__ buf_u2,
-    float *__restrict__ buf_u3, const float *__restrict__ dcrjx,
-    const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-    const float *__restrict__ f, const float *__restrict__ f1_1,
-    const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-    const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-    const float *__restrict__ f2_c, const float *__restrict__ f_1,
-    const float *__restrict__ f_2, const float *__restrict__ f_c,
-    const float *__restrict__ g, const float *__restrict__ g3,
-    const float *__restrict__ g3_c, const float *__restrict__ g_c,
-    const float *__restrict__ rho, const float *__restrict__ s11,
-    const float *__restrict__ s12, const float *__restrict__ s13,
-    const float *__restrict__ s22, const float *__restrict__ s23,
-    const float *__restrict__ s33, const float *__restrict__ u1,
-    const float *__restrict__ u2, const float *__restrict__ u3, const float a,
-    const float nu, const int nx, const int ny, const int nz, const int bj,
-    const int ej, const int rj0);
-#endif
\ No newline at end of file
diff --git a/include/topography/kernels/stress_attenuation.cuh b/include/topography/kernels/stress_attenuation.cuh
deleted file mode 100644
index 174e5f2..0000000
--- a/include/topography/kernels/stress_attenuation.cuh
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef TOPO_STRESS_H
-#define TOPO_STRESS_H
-#include <awp/definitions.h>
-#include <math.h>
-
-void set_constants(const _prec dh, const _prec dt, const int nxt, const int
-                nyt, const int nzt);
-
-__global__ void dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict__ zz,
-           _prec*  __restrict__ xy, _prec*  __restrict__ xz, _prec*  __restrict__ yz,
-       _prec*  __restrict__ r1, _prec*  __restrict__ r2,  _prec*  __restrict__ r3, 
-       _prec*  __restrict__ r4, _prec*  __restrict__ r5,  _prec*  __restrict__ r6,
-       _prec*  __restrict__ u1, 
-       _prec*  __restrict__ v1,    
-       _prec*  __restrict__ w1,    
-       const float *__restrict__ f,
-       const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-       const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-       const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-       const float *__restrict__ f_1, const float *__restrict__ f_2,
-       const float *__restrict__ f_c, const float *__restrict__ g,
-       const float *__restrict__ g3, const float *__restrict__ g3_c,
-       const float *__restrict__ g_c,
-       const _prec *__restrict__  lam,   
-       const _prec *__restrict__  mu,     
-       const _prec *__restrict__  qp,
-       const _prec *__restrict__  coeff, 
-       const _prec *__restrict__  qs, 
-       const _prec *__restrict__  dcrjx, 
-       const _prec *__restrict__  dcrjy, 
-       const _prec *__restrict__  dcrjz, 
-       const _prec *__restrict__ d_vx1, 
-       const _prec *__restrict__ d_vx2, 
-       const int *__restrict__ d_ww, 
-       const _prec *__restrict__ d_wwo,
-       int NX, int ny, int nz, int rankx, int ranky, 
-       int nzt, int s_i, int e_i, int s_j, int e_j);
-
-__global__ void dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict__ zz,
-           _prec*  __restrict__ xy, _prec*  __restrict__ xz, _prec*  __restrict__ yz,
-       _prec*  __restrict__ r1, _prec*  __restrict__ r2,  _prec*  __restrict__ r3, 
-       _prec*  __restrict__ r4, _prec*  __restrict__ r5,  _prec*  __restrict__ r6,
-       _prec*  __restrict__ u1, 
-       _prec*  __restrict__ v1,    
-       _prec*  __restrict__ w1,    
-       const float *__restrict__ f,
-       const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-       const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-       const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-       const float *__restrict__ f_1, const float *__restrict__ f_2,
-       const float *__restrict__ f_c, const float *__restrict__ g,
-       const float *__restrict__ g3, const float *__restrict__ g3_c,
-       const float *__restrict__ g_c,
-       const _prec *__restrict__  lam,   
-       const _prec *__restrict__  mu,     
-       const _prec *__restrict__  qp,
-       const _prec *__restrict__  coeff, 
-       const _prec *__restrict__  qs, 
-       const _prec *__restrict__  dcrjx, 
-       const _prec *__restrict__  dcrjy, 
-       const _prec *__restrict__  dcrjz, 
-       const _prec *__restrict__ d_vx1, 
-       const _prec *__restrict__ d_vx2, 
-       const int *__restrict__ d_ww, 
-       const _prec *__restrict__ d_wwo,
-       int NX, int ny, int nz, int rankx, int ranky, 
-       int nzt, int s_i, int e_i, int s_j, int e_j);
-
-
-#endif
diff --git a/include/topography/kernels/unoptimized.cuh b/include/topography/kernels/unoptimized.cuh
deleted file mode 100644
index 30545e5..0000000
--- a/include/topography/kernels/unoptimized.cuh
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CUTOPOGRAPHY_KERNEL_H
-#define CUTOPOGRAPHY_KERNEL_H
-#include <math.h>
-#include <awp/definitions.h>
-
-__global__ void dtopo_vel_110(float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *rho, const float *s11, const float *s12, const float *s13, const float *s22, const float *s23, const float *s33, const float a, const float nu, const int nx, const int ny, const int nz, const int bi, const int bj, const int ei, const int ej);
-__global__ void dtopo_vel_111(float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *rho, const float *s11, const float *s12, const float *s13, const float *s22, const float *s23, const float *s33, const float a, const float nu, const int nx, const int ny, const int nz, const int bi, const int bj, const int ei, const int ej);
-__global__ void dtopo_vel_112(float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *rho, const float *s11, const float *s12, const float *s13, const float *s22, const float *s23, const float *s33, const float a, const float nu, const int nx, const int ny, const int nz, const int bi, const int bj, const int ei, const int ej);
-__global__ void dtopo_buf_vel_110(float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *rho, const float *s11, const float *s12, const float *s13, const float *s22, const float *s23, const float *s33, const float *u1, const float *u2, const float *u3, const float a, const float nu, const int nx, const int ny, const int nz, const int bj, const int ej, const int rj0);
-__global__ void dtopo_buf_vel_111(float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *rho, const float *s11, const float *s12, const float *s13, const float *s22, const float *s23, const float *s33, const float *u1, const float *u2, const float *u3, const float a, const float nu, const int nx, const int ny, const int nz, const int bj, const int ej, const int rj0);
-__global__ void dtopo_buf_vel_112(float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *rho, const float *s11, const float *s12, const float *s13, const float *s22, const float *s23, const float *s33, const float *u1, const float *u2, const float *u3, const float a, const float nu, const int nx, const int ny, const int nz, const int bj, const int ej, const int rj0);
-__global__ void dtopo_str_110(float *s11, float *s12, float *s13, float *s22, float *s23, float *s33, float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *lami, const float *mui, const float a, const float nu, const int nx, const int ny, const int nz, const int bi, const int bj, const int ei, const int ej);
-__global__ void dtopo_str_111(float *s11, float *s12, float *s13, float *s22, float *s23, float *s33, float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *lami, const float *mui, const float a, const float nu, const int nx, const int ny, const int nz, const int bi, const int bj, const int ei, const int ej);
-__global__ void dtopo_str_112(float *s11, float *s12, float *s13, float *s22, float *s23, float *s33, float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1, const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c, const float *f_1, const float *f_2, const float *f_c, const float *g, const float *g3, const float *g3_c, const float *g_c, const float *lami, const float *mui, const float a, const float nu, const int nx, const int ny, const int nz, const int bi, const int bj, const int ei, const int ej);
-__global__ void dtopo_init_material_111(float *lami, float *mui, float *rho, const int nx, const int ny, const int nz);
-#endif
-
diff --git a/include/topography/mapping.cuh b/include/topography/mapping.cuh
new file mode 100644
index 0000000..fad642b
--- /dev/null
+++ b/include/topography/mapping.cuh
@@ -0,0 +1,42 @@
+#ifndef _TOPOGRAPHY_MAPPING_CUH
+#define _TOPOGRAPHY_MAPPING_CUH
+#include <topography/mapping.h>
+#define TOL 1e-4
+
+
+__device__ __host__ __inline__ float topo_mapping0(const float f, const float r,
+                                                  const float h, const int n) {
+        // Identity for r below h * MAPPING_START_POINT; beyond it, the offset
+        // from that point is scaled by f. n is unused but kept for callers.
+        const float start = h * MAPPING_START_POINT;
+
+        if (r < start) return r;
+        return f * (r - start) + start;
+}
+
+
+// Differentiate mapping with respect to r1, r2: f_1 times the offset of r from
+// h * MAPPING_START_POINT. n is unused but kept so callers need not change.
+__device__ __host__ __inline__ float topo_mapping(const float f_1, const float r,
+                                                  const float h, const int n) {
+        // Disabled variant, with d1 = 6h and l = (n - 2)h:
+        //   r * (d1 - r) * f_1 / (d1 - l)
+        // NOTE(review): no r < h * MAPPING_START_POINT guard here, unlike
+        // topo_mapping0 - confirm callers only pass r past the start point.
+        return f_1 * (r - h * MAPPING_START_POINT);
+}
+
+// Differentiate mapping with respect to r3. Currently returns f unchanged;
+// r, h and n are unused but kept so callers need not change.
+__device__ __host__ __inline__ float topo_diff_mapping(const float f,
+                                                       const float r,
+                                                       const float h,
+                                                       const int n) {
+        // Disabled variant, with d1 = 6h and l = (n - 2)h:
+        //   (-d1*f + l + 2*r*f - 2*r) / (l - d1)
+        return f;
+}
+
+#endif
+
+
diff --git a/include/topography/mapping.h b/include/topography/mapping.h
new file mode 100644
index 0000000..4d08f37
--- /dev/null
+++ b/include/topography/mapping.h
@@ -0,0 +1,32 @@
+#ifndef _TOPOGRAPHY_MAPPING_H
+#define _TOPOGRAPHY_MAPPING_H
+#define OVERLAP 7.0
+#define MAPPING_START_POINT 7
+#define MAPPING_INVERSION_TOL 1e-2
+#define MAPPING_MAX_ITER 1000
+
+struct mapping {
+    double dzb;
+    double dzt;
+    double h;
+    double r[4];
+    double z[4];
+    double m[4];
+};
+
+
+double map_height(const int nz, const double dz);
+struct mapping map_init(const double dzb, const double dzt, const double h);
+int map_find_cell_r(const double r, const struct mapping *map);
+int map_find_cell_z(const double z, const struct mapping *map);
+double map_eval(const double r, const struct mapping *map);
+double map_eval_derivative(const double r, const struct mapping *map);
+double map_invert(const double z, const struct mapping *map, const double eps, const int maxiter);
+
+// Error handling 
+enum map_err_codes {MAP_SUCCESS, MAP_NON_MONOTONIC, MAP_OUTSIDE};
+
+const char* map_error_string(const enum map_err_codes err_code);
+enum map_err_codes map_get_last_error(void);
+
+#endif
diff --git a/include/topography/metrics/metrics.h b/include/topography/metrics/metrics.h
index 52c4881..401ada4 100644
--- a/include/topography/metrics/metrics.h
+++ b/include/topography/metrics/metrics.h
@@ -22,12 +22,14 @@
 #include <awp/definitions.h>
 #include <grid/grid_3d.h>
 
-#define pmetrics_f_index(g,i,j) ((g)->offset[1] + (g)->bounds_y[0] + j) + \
+// metrics_padding (defined below) pads the compute region. It is needed for the
+// computation of derivative and interpolation stencils. Do not change its value.
+
+#define metrics_f_index(g,i,j) ((g)->offset[1] + (g)->bounds_y[0] + j) + \
                                ((g)->offset[0] + (g)->bounds_x[0] + i) * \
                                (g)->slice
-#define metrics_f_index(g,i,j) ((g).offset[1] + (g).bounds_y[0] + j) + \
-                               ((g).offset[0] + (g).bounds_x[0] + i) * \
-                               (g).slice
+
+static const int metrics_padding = 8;
 
 /*
  * Topography function `f(x1, x2)`. 
@@ -128,7 +130,8 @@ typedef struct
 
 } g_grid_t;
 
-f_grid_t metrics_init_f(const int *size, const _prec gridspacing);
+f_grid_t metrics_init_f(const int *size, const _prec gridspacing,
+                            const int pad);
 void metrics_build_f(f_grid_t *f);
 void metrics_free_f(f_grid_t *f);
 void metrics_print_info_f(const f_grid_t *f);
@@ -142,11 +145,17 @@ void metrics_h_free_f(f_grid_t *f);
 void metrics_d_free_f(f_grid_t *f);
 void metrics_interpolate_f(f_grid_t *f);
 void metrics_differentiate_f(f_grid_t *f);
+void metrics_shift_f(f_grid_t *fout, const f_grid_t *fin);
 int metrics_interpolate_f_point(const f_grid_t *f, prec *out, const prec *in,
                                 const prec *x, const prec *y,
                                 const grid3_t grid, const prec *qx,
                                 const prec *qy, const int m, const int deg);
 
+int metrics_interpolate_jacobian(const f_grid_t *fgrid, float *out, const float *f, const float *g,
+                        const float *x, const float *y, const float *z,
+                        grid3_t grid, const float *qx,
+                        const float *qy, const float *qz, const int m, const int deg);
+
 g_grid_t metrics_init_g(const int *size, const _prec gridspacing);
 void metrics_build_g(g_grid_t *g);
 void metrics_free_g(g_grid_t *g);
diff --git a/include/topography/metrics/shift.h b/include/topography/metrics/shift.h
new file mode 100644
index 0000000..7288fd3
--- /dev/null
+++ b/include/topography/metrics/shift.h
@@ -0,0 +1,9 @@
+#ifndef METRICS_SHIFT_H
+#define METRICS_SHIFT_H
+
+#include <topography/metrics/metrics.h>
+
+void metrics_shift_f_apply(float *fout, const float *fin, const int nx,
+                           const int ny);
+
+#endif
diff --git a/include/topography/mms.cuh b/include/topography/mms.cuh
new file mode 100644
index 0000000..397ff74
--- /dev/null
+++ b/include/topography/mms.cuh
@@ -0,0 +1,50 @@
+#ifndef _TOPOGRAPHY_MMS_H
+#define _TOPOGRAPHY_MMS_H
+
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+
+#include <awp/error.h>
+#endif
+void mms_init(const char *MMSFILE, const int *nxt,
+              const int *nyt, const int *nzt, const int ngrids, float **d_d1,
+              float **d_lam, float **d_mu, float **d_qp, float **d_qs,
+              float **d_vx, float **d_vy, float **d_vz, float **d_xx,
+              float **d_yy, float **d_zz, float **d_xy, float **d_xz,
+              float **d_yz, int px, int py, int rank, const MPI_Comm comm, const float *h, const float dt);
+
+void mms_exact_velocity(
+              float *d_vx, float *d_vy, float *d_vz,
+              const int nx, const int ny, const int nz, 
+              const int px, const int py, const int pz, 
+              const int bi, const int bj, const int bk, 
+              const int ei, const int ej, const int ek, 
+              const float h, const float t, const int apply_in_interior);
+
+void mms_exact_stress(
+              float *d_xx, float *d_yy, float *d_zz, 
+              float *d_xy, float *d_xz, float *d_yz, 
+              const int nx, const int ny, const int nz, 
+              const int px, const int py, const int pz, 
+              const int bi, const int bj, const int bk, 
+              const int ei, const int ej, const int ek, 
+              const float h, const float t, const int apply_in_interior);
+
+
+void mms_force_velocity(float *d_vx, float *d_vy, float *d_vz, const int nx,
+                        const int ny, const int nz, const float h, const int px,
+                        const int py, const int pz, const float t, const float dt);
+
+void mms_force_stress(float *d_xx, float *d_yy, float *d_zz, float *d_xy,
+                      float *d_xz, float *d_yz, const int nx, const int ny, const int nz,
+                      const float h, const int px, const int py, const int pz, const float t, const float dt);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TOPOGRAPHY_MMS_H
+
+
diff --git a/include/topography/receivers/receiver.h b/include/topography/receivers/receiver.h
index 1ec9fa4..aa17908 100644
--- a/include/topography/receivers/receiver.h
+++ b/include/topography/receivers/receiver.h
@@ -6,8 +6,10 @@
 typedef source_t recv_t;
 recv_t receiver_init(const char *filename, 
                      const enum grid_types grid_type,
+                     const enum source_type st,
                      const input_t *input,
                      const grids_t *grids, 
+                     const struct mapping *map, 
                      const int ngrids,
                      const f_grid_t *f, 
                      const int rank,
diff --git a/include/topography/receivers/receivers.h b/include/topography/receivers/receivers.h
index 1ad49c2..8f6f468 100644
--- a/include/topography/receivers/receivers.h
+++ b/include/topography/receivers/receivers.h
@@ -7,8 +7,9 @@
 
 #include <topography/grids.h>
 #include <topography/metrics/metrics.h>
+#include <topography/receivers/receiver.h>
 
-void receivers_init(const char *filename, const grids_t *grids, int ngrids,
+void receivers_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
                     const f_grid_t *f, const MPI_Comm comm, const int rank,
                     const int size);
 void receivers_finalize(void);
@@ -16,7 +17,9 @@ void receivers_write(const prec *d_vx, const prec *d_vy, const prec *d_vz,
                      const size_t step, const size_t num_steps,
                      const int grid_num);
 size_t receivers_last_step(void);
+recv_t receivers_get_receiver(enum grid_types grid_type);
 void receivers_step_format(char *out, size_t step, const char *base);
 
+
 #endif
 
diff --git a/include/topography/receivers/sgt.h b/include/topography/receivers/sgt.h
index 867aab6..791a833 100644
--- a/include/topography/receivers/sgt.h
+++ b/include/topography/receivers/sgt.h
@@ -6,14 +6,15 @@
 #include <mpi.h>
 
 #include <topography/grids.h>
+#include <topography/mapping.h>
 #include <topography/metrics/metrics.h>
 
-void sgt_init(const char *filename, const grids_t *grids, int ngrids,
+void sgt_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
                     const f_grid_t *f, const MPI_Comm comm, const int rank,
                     const int size);
 void sgt_finalize(void);
 void sgt_write_material_properties(const prec *d_d1, const prec *d_lami,
-                                   const prec *d_mui, const int grid_num);
+                                   const prec *d_mui, const int grid_num, const int rank);
 void sgt_write(const prec *d_xx, const prec *d_yy, const prec *d_zz,
                const prec *d_xy, const prec *d_xz, const prec *d_yz,
                      const size_t step, const size_t num_steps,
diff --git a/include/topography/sources/forces.h b/include/topography/sources/forces.h
index 0bef642..484a01a 100644
--- a/include/topography/sources/forces.h
+++ b/include/topography/sources/forces.h
@@ -6,17 +6,22 @@
 #include <mpi.h>
 
 #include <topography/grids.h>
+#include <topography/mapping.h>
 #include <topography/sources/source.h>
 #include <topography/metrics/metrics.h>
 
-void forces_init(const char *filename, const grids_t *grids, int ngrids,
-                  const f_grid_t *f, const MPI_Comm comm, const int rank,
-                  const int size);
+void forces_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
+                  const f_grid_t *f, const g_grid_t *g, const MPI_Comm comm, const int rank,
+                  const int size, const float *d_rho, const int istopo);
 int forces_boundary_check(const source_t *Fx);
 void forces_read(const size_t step);
 void forces_add(prec *d_u1, prec *d_v1, prec *d_w1, const prec *d_d1,
                 const size_t step, const prec h, const prec dt,
                 const f_grid_t *f, const g_grid_t *g, const int grid_num);
+void forces_add_cartesian(prec *d_xz, prec *d_yz, prec *d_zz, const size_t step,
+                const int nx, const int ny, const int nz, const prec h, const prec dt, const int grid_num);
+void forces_add_cartesian_velocity(prec *d_vx, prec *d_vy, prec *d_vz, const size_t step,
+                const int nx, const int ny, const int nz, const prec h, const prec dt, const int grid_num);
 void forces_finalize(void);
 
 #endif
diff --git a/include/topography/sources/source.cuh b/include/topography/sources/source.cuh
index 54e867f..c7d4e54 100644
--- a/include/topography/sources/source.cuh
+++ b/include/topography/sources/source.cuh
@@ -21,7 +21,7 @@ __global__ void cusource_add_cartesian(prec *out, const prec *in,
                                  const int num_query, const grid3_t grid);
 void cusource_add_curvilinear_H(const cu_interp_t *I, prec *out, const prec *in,
                                 const prec h, const prec dt, const prec *f,
-                                const int ny, const prec *dg);
+                                const int ny, const prec *dg, const int zhat);
 __global__ void cusource_add_curvilinear(prec *out, const prec *in,
                                  const prec *lx, const prec *ly, const prec *lz,
                                  const int num_basis, const int *ix,
@@ -29,12 +29,12 @@ __global__ void cusource_add_curvilinear(prec *out, const prec *in,
                                  const int *lidx,
                                  const prec h, const prec dt,
                                  const int num_query, const grid3_t grid, 
-                                 const prec *f, const int ny, const prec *dg);
+                                 const prec *f, const int ny, const prec *dg, const int zhat);
 void cusource_add_force_H(const cu_interp_t *I, prec *out, const prec *in,
                           const prec *d1, const prec h, const prec dt,
                           const prec quad_weight,
                           const prec *f, const int nx, const int ny,
-                          const int nz, const prec *dg);
+                          const int nz, const prec *dg, const int sourcetype, const int dir);
 __global__ void cusource_add_force(prec *out, const prec *in, const prec *d1,
                                    const prec *lx, const prec *ly,
                                    const prec *lz, const int num_basis,
@@ -44,6 +44,26 @@ __global__ void cusource_add_force(prec *out, const prec *in, const prec *d1,
                                    const int num_query, const grid3_t grid,
                                    const prec *f, const int nx, const int ny,
                                    const int nz, const prec *dg);
+
+__global__ void cusource_add_force_stress(prec *out, const prec *in, const prec *d1,
+                                   const prec *lx, const prec *ly,
+                                   const prec *lz, const int num_basis,
+                                   const int *ix, const int *iy, const int *iz,
+                                   const int *lidx, const prec h, const prec dt,
+                                   const prec quad_weight,
+                                   const int num_query, const grid3_t grid,
+                                   const prec *f, const int nx, const int ny,
+                                   const int nz, const prec *dg, const int dir);
+
+__global__ void cusource_add_force_velocity(prec *out, const prec *in, const prec *d1,
+                                   const prec *lx, const prec *ly,
+                                   const prec *lz, const int num_basis,
+                                   const int *ix, const int *iy, const int *iz,
+                                   const int *lidx, const prec h, const prec dt,
+                                   const prec quad_weight,
+                                   const int num_query, const grid3_t grid,
+                                   const prec *f, const int nx, const int ny,
+                                   const int nz, const prec *dg, const int dir);
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/topography/sources/source.h b/include/topography/sources/source.h
index 810d164..4923374 100644
--- a/include/topography/sources/source.h
+++ b/include/topography/sources/source.h
@@ -14,16 +14,12 @@
 #include <topography/metrics/metrics.h>
 #include <mpi/io.h>
 #include <interpolation/interpolation.cuh>
+#include <topography/mapping.h>
 
 // Offsets in grid spacings factor with respect to the previous grid
 #define SOURCE_DM_OFFSET_X 0
 #define SOURCE_DM_OFFSET_Y -1
 
-// Shift due to inconsistency with the user coordinate (0, 0, 0) defined at a
-// material grid point, but (0, 0, 0) defined at the shear stress xz in the
-// internal coordinate system (see shift.c)
-#define SOURCE_OFFSET_X -0.5
-
 typedef struct {
         int *indices;
         int *offsets;
@@ -52,18 +48,25 @@ typedef struct {
         int use;
         char filename[STR_LEN*2];
         int ngrids;
+        size_t steps;
 
 } source_t;
 
 
+// Source type determines how to partition velocity and stress input/output types across
+// an MPI subdomain. 
+enum source_type {MOMENT_TENSOR, FORCE, RECEIVER, SGT};
+
 source_t source_init(const char *file_end, 
                      const enum grid_types grid_type,
                      const input_t *input,
                      const grids_t *grids, 
+                     const struct mapping *map, 
                      const int ngrids,
                      const f_grid_t *f, 
                      const int rank,
-                     const MPI_Comm comm);
+                     const MPI_Comm comm,
+                     const enum source_type st);
 
 void source_finalize(source_t *src);
 
@@ -71,31 +74,36 @@ void source_find_grid_number(const input_t *input, const
                              grids_t *grids, int *grid_number, 
                              const int *indices,
                              const int length,
-                             const int num_grids);
+                             const int num_grids,
+                             const int is_topo);
 void source_init_common(source_t *src, const char *filename,
                         const enum grid_types grid_type, 
                         const input_t *input, 
                         const grids_t *grids, 
+                        const struct mapping *map,
                         const int ngrids,
                         const f_grid_t *f,
                         const int rank, 
-                        const MPI_Comm comm);
+                        const MPI_Comm comm,
+                        const enum source_type st);
 MPI_Comm source_communicator(source_t *src, const int rank,
                              const MPI_Comm comm);
 void source_read(source_t *src, size_t step);
 void source_add_cartesian(prec *out, source_t *src, const size_t step,
                           const prec h, const prec dt, const int grid_num);
 
+// zhat: indicates if the source should be applied on the cell-centered grid in
+// the z-direction or not
 void source_add_curvilinear(prec *out, source_t *src, const size_t step,
                             const prec h, const prec dt, const prec *f,
-                            const int ny, const prec *dg, const int grid_num);
+                            const int ny, const prec *dg, const int grid_num, const int zhat);
 
 void source_add_force(prec *out, const prec *d1, source_t *src,
                       const size_t step, const prec h, const prec dt,
                       const prec quad_weight,
                       const prec *f, const int nx, const int ny, const int nz, 
                       const prec *dg,
-                      const int grid_num);
+                      const int grid_num, const int sourcetype, const int dir);
 
 #endif
 
diff --git a/include/topography/sources/sources.h b/include/topography/sources/sources.h
index 0fa1609..b09ff4f 100644
--- a/include/topography/sources/sources.h
+++ b/include/topography/sources/sources.h
@@ -6,11 +6,12 @@
 #include <mpi.h>
 
 #include <topography/grids.h>
+#include <topography/mapping.h>
 #include <topography/metrics/metrics.h>
 #include <topography/sources/source.h>
 
-void sources_init(const char *filename, const grids_t *grids, int ngrids,
-                  const f_grid_t *f, const MPI_Comm comm, const int rank,
+void sources_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
+                  const f_grid_t *f, const g_grid_t *g, const MPI_Comm comm, const int rank,
                   const int size);
 void sources_read(const size_t step);
 void sources_add_cartesian(prec *d_xx, prec *d_yy, prec *d_zz, prec *d_xy,
diff --git a/include/topography/stress.cuh b/include/topography/stress.cuh
index 5e15934..f5758fe 100644
--- a/include/topography/stress.cuh
+++ b/include/topography/stress.cuh
@@ -1,24 +1,25 @@
 #ifndef _TOPOGRAPHY_STRESS_H
 #define _TOPOGRAPHY_STRESS_H
-#include <cuda.h>
-#include <nvToolsExt.h>
-#include <stdio.h>
 
 #include <awp/definitions.h>
 #include <topography/topography.h>
 #include <topography/opt_topography.cuh>
-#include <topography/kernels/optimized_stress.cuh>
-#include <topography/kernels/optimized_velocity.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
+#include <topography/stress.cuh>
 
 #ifdef __cplusplus
 extern "C" {
 #endif
+void topo_set_constants(topo_t *T);
 void topo_stress_interior_H(topo_t *T);
 void topo_stress_left_H(topo_t *T);
 void topo_stress_right_H(topo_t *T);
 #ifdef __cplusplus
 }
+#else
+void topo_set_constants(topo_t *T);
+void topo_stress_interior_H(topo_t *T);
+void topo_stress_left_H(topo_t *T);
+void topo_stress_right_H(topo_t *T);
 #endif
 
 #endif
diff --git a/include/topography/stress_attenuation.cuh b/include/topography/stress_attenuation.cuh
deleted file mode 100644
index 1d9a719..0000000
--- a/include/topography/stress_attenuation.cuh
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _TOPOGRAPHY_STRESS_ATTENUATION_H
-#define _TOPOGRAPHY_STRESS_ATTENUATION_H
-
-#include <topography/stress.cuh>
-
-#ifdef __cplusplus
-extern "C" {
-void topo_set_constants(topo_t *T);
-#endif
-#ifdef __cplusplus
-}
-#else
-void topo_set_constants(topo_t *T);
-#endif
-
-#endif
diff --git a/include/topography/topography.cuh b/include/topography/topography.cuh
index 07913cc..d8b0457 100644
--- a/include/topography/topography.cuh
+++ b/include/topography/topography.cuh
@@ -152,7 +152,7 @@ void topo_stress_right_H(topo_t *T);
 }
 #endif
 
-// Number of threads per block to use
+// Min. block dimensions
 #ifndef TBX
 #define TBX 1
 #endif
diff --git a/include/topography/topography.h b/include/topography/topography.h
index 43de102..2118bb3 100644
--- a/include/topography/topography.h
+++ b/include/topography/topography.h
@@ -11,6 +11,7 @@
 #include <awp/definitions.h>
 #include <vtk/vtk.h>
 #include <test/test.h>
+#include <topography/mapping.h>
 
 
 // TOPO: Enable topography calls. If disabled, then no topography function calls
@@ -204,6 +205,9 @@ typedef struct
         _prec dth;
         _prec timestep;
         _prec gridspacing;
+        _prec block_height;
+        _prec gridspacing_bot;
+        _prec gridspacing_top;
         // Material properties
         _prec*  __restrict__ rho;
         _prec*  __restrict__ lami;
@@ -248,6 +252,7 @@ typedef struct
 
         // Topography function
         f_grid_t metrics_f;
+        f_grid_t metrics_f_init;
         // Grid stretching function
         g_grid_t metrics_g;
         grid3_t topography_grid;
@@ -270,6 +275,8 @@ typedef struct
         cudaStream_t stream_2;
         cudaStream_t stream_i;
 
+        struct mapping map;
+
 } topo_t;                 
                    
 topo_t topo_init(const int USETOPO, 
@@ -287,6 +294,8 @@ topo_t topo_init(const int USETOPO,
                  int nzt,
                  const _prec dt,
                  const _prec h,
+                 const _prec hb,
+                 const _prec ht,
                  cudaStream_t stream_1,
                  cudaStream_t stream_2,
                  cudaStream_t stream_i
diff --git a/include/topography/velocity.cuh b/include/topography/velocity.cuh
index 02cdd93..a6de6c7 100644
--- a/include/topography/velocity.cuh
+++ b/include/topography/velocity.cuh
@@ -6,9 +6,6 @@
 
 #include <awp/definitions.h>
 #include <topography/topography.h>
-#include <topography/opt_topography.cuh>
-#include <topography/kernels/optimized_velocity.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0b0e955..b07218a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,13 +1,12 @@
 add_subdirectory(awp)
-add_subdirectory(grid)
 add_subdirectory(topography)
 add_subdirectory(argparse)
 add_subdirectory(mpi)
 add_subdirectory(interpolation)
-add_subdirectory(functions)
 add_subdirectory(buffers)
 add_subdirectory(readers)
 add_subdirectory(vtk)
 add_subdirectory(test)
 add_subdirectory(checksum)
+add_subdirectory(grid)
 
diff --git a/src/argparse/CMakeLists.txt b/src/argparse/CMakeLists.txt
index ebfacce..d13c1dd 100644
--- a/src/argparse/CMakeLists.txt
+++ b/src/argparse/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/argparse/argparse.h
+    ${AWP_SOURCE_DIR}/include/argparse/argparse.h
     )
 
 add_library(argparse
@@ -8,6 +8,6 @@ add_library(argparse
 
 target_include_directories(argparse
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
diff --git a/src/awp/CMakeLists.txt b/src/awp/CMakeLists.txt
index cf493eb..c26ca9e 100644
--- a/src/awp/CMakeLists.txt
+++ b/src/awp/CMakeLists.txt
@@ -1,6 +1,5 @@
 set(HEADERS 
-   ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h 
-   ${AWP_MINI_SOURCE_DIR}/include/awp/pmcl3d_cons.h 
+   ${AWP_SOURCE_DIR}/include/awp/pmcl3d_cons.h 
    )
 
 add_library(awp
@@ -9,7 +8,7 @@ add_library(awp
 
 target_include_directories(awp
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 
@@ -19,7 +18,7 @@ add_library(error
 
 target_include_directories(error
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_library(lpmcl3d
@@ -37,11 +36,11 @@ add_library(lpmcl3d
         utils.c
         ${HEADERS})
 
-target_link_libraries(lpmcl3d opt_topography_attenuation buffers mpi checksum)
+target_link_libraries(lpmcl3d topography buffers mpi checksum)
 #
 target_include_directories(lpmcl3d
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_executable(pmcl3d pmcl3d.c)
diff --git a/src/awp/cerjan.c b/src/awp/cerjan.c
index ac907d8..dc82e63 100644
--- a/src/awp/cerjan.c
+++ b/src/awp/cerjan.c
@@ -4,61 +4,64 @@
 
 void inicrj(_prec ARBC, int *coords, int nxt, int nyt, int nzt, int NX, int NY, int ND, Grid1D dcrjx, Grid1D dcrjy, Grid1D dcrjz, int islowest, int NPC)
 {
-  int nxp, nyp, nzp;
-  int i,   j,   k;
+ 
+  int i,   j,   k, ix, iy, iz;
   _prec alpha;
-  alpha = sqrt(-log(ARBC))/ND;
+  alpha = sqrt(-log(ARBC))/(ND-1);
+
+    if (NPC < 2)
+    {
+        for(i=0;i<4+nxt;i++)
+        {
+            ix = nxt*coords[0] + i + 1 - 2; //ix is one-indexing
+            if((ix>=1) && (ix<=ND))
+            {
+                dcrjx[ngsl+i] = dcrjx[ngsl+i]*(exp(-((alpha*(ND-ix))*(alpha*(ND-ix)))));
+            }
+
+            if( (ix>=(NX-ND+1)) && (ix<=NX))
+            {
+                dcrjx[ngsl+i] = dcrjx[ngsl+i]*(exp(-((alpha*(ix-(NX-ND)-1))*(alpha*(ix-(NX-ND)-1)))));
+            }
+
+            if((ix<1) || (ix>NX))
+            {
+                dcrjx[ngsl+i] = ARBC;
+            }
+        }
+
+        for(j=0;j<4+nyt;j++)
+        {
+            iy = nyt*coords[1] + j + 1 - 2; //iy is one-indexed
+            if((iy>=1) && (iy<=ND))
+            {
+                dcrjy[ngsl+j] = dcrjy[ngsl+j]*(exp(-((alpha*(ND-iy))*(alpha*(ND-iy)))));
+            }
+
+            if((iy>=(NY-ND+1)) && (iy<=NY)) //upper bound is inclusive, same as the x-direction test on NX
+            {
+                dcrjy[ngsl+j] = dcrjy[ngsl+j]*(exp(-((alpha*(iy-(NY-ND)-1))*(alpha*(iy-(NY-ND)-1)))));
+            }
+
+            if((iy<1) || (iy>NY))
+            {
+                dcrjy[ngsl+j] = ARBC;
+            }
+        }
+    }
 
-  nxp   = nxt*coords[0] + 1;
-  if ((nxp <= ND) && (NPC < 2))  /* added by Daniel for periodic BCs */
-  {
-     for(i=0;i<ND;i++)
-     {
-        nxp        = i + 1;
-        dcrjx[i+2+ngsl] = dcrjx[i+2+ngsl]*(exp(-((alpha*(ND-nxp+1))*(alpha*(ND-nxp+1)))));
-     } 
-  }
-  nxp   = nxt*coords[0] + 1;
-  if( ((nxp+nxt-1) >= (NX-ND+1)) && (NPC < 2)) /* added by Daniel for periodic BCs */
-  {
-     for(i=nxt-ND;i<nxt;i++)
-     {
-        nxp        = i + NX - nxt + 1;
-        dcrjx[i+2+ngsl] = dcrjx[i+2+ngsl]*(exp(-((alpha*(ND-(NX-nxp)))*(alpha*(ND-(NX-nxp))))));
-     }
-  }
 
-  nyp   = nyt*coords[1] + 1;
-  if((nyp <= ND) && (NPC < 2)) /* added by Daniel for periodic BCs */
-  {
-     for(j=0;j<ND;j++)
-     {
-        nyp        = j + 1;
-        dcrjy[j+2+ngsl] = dcrjy[j+2+ngsl]*(exp(-((alpha*(ND-nyp+1))*(alpha*(ND-nyp+1)))));
-     }
-  }
-  nyp   = nyt*coords[1] + 1;
-  if(((nyp+nyt-1) >= (NY-ND+1)) && (NPC < 2))
-  {
-     for(j=nyt-ND;j<nyt;j++)
-     {
-        nyp        = j + NY - nyt + 1;
-        dcrjy[j+2+ngsl] = dcrjy[j+2+ngsl]*(exp(-((alpha*(ND-(NY-nyp)))*((alpha*(ND-(NY-nyp)))))));
-     }
-  }
 
   /* in the vertical direction, the Cerjan ABCs are only set for the lowest grid */
-  if (islowest){
-     nzp = 1;
-     if(nzp <= ND)
-     {
-	for(k=0;k<ND;k++)
-	{
-	   nzp            = k + 1;
-	   dcrjz[k+align] = dcrjz[k+align]*(exp(-((alpha*(ND-nzp+1))*((alpha*(ND-nzp+1))))));
-	}
-     }
-  }
+    if (islowest)
+    {
+        for(k=0;k<ND;k++)
+        {
+            iz=k+1; //iz is one-indexing
+            dcrjz[k+align] = dcrjz[k+align]*(exp(-((alpha*(ND-iz))*(alpha*(ND-iz)))));
+        }
+    }
+
   return;
 }  
 
diff --git a/src/awp/command.c b/src/awp/command.c
index 05e1fe4..50a1b83 100644
--- a/src/awp/command.c
+++ b/src/awp/command.c
@@ -55,6 +55,16 @@
 *  RECVFILE     <STRING>                      Receiver output file
 *  FORCEFILE    <STRING>                      Boundary point force input file
 *  SGTFILE      <STRING>                      Strain Green's tensor output file
+*  MMSFILE      <STRING>                      MMS input file
+*  DHB          <FLOAT>                       Grid spacing at the bottom of the curvilinear block  
+*  DHT          <FLOAT>                       Grid spacing at the top of the curvilinear block  
+*  ENERGYFILE   <STRING>                      File to which energy information is written at each time step
+*  QSI          <FLOAT>                       Qs setting: if QSI >= 1, Qs is constant (Qs = QSI); if QSI < 1, Qs depends on the local Vs (Qs = QSI*Vs)
+*  QPQSR        <FLOAT>                       Qp/Qs ratio
+*  MAXVPVSR     <FLOAT>                       Cap of Vp/Vs ratio
+*  VMIN         <FLOAT>                       Minimum allowable Vs
+*  VMAX         <FLOAT>                       Maximum allowable Vp
+*  DMIN         <FLOAT>                       Minimum allowable density
 ****************************************************************************************************************
 */
 
@@ -134,6 +144,18 @@ const char def_SOURCEFILE[IN_FILE_LEN] = "";
 const char def_RECVFILE[IN_FILE_LEN] = "";
 const char def_FORCEFILE[IN_FILE_LEN] = "";
 const char def_SGTFILE[IN_FILE_LEN] = "";
+const char def_MMSFILE[IN_FILE_LEN] = "";
+const char def_ENERGYFILE[IN_FILE_LEN] = "";
+
+const _prec def_QSI = 0.1;
+const _prec def_QPQSR = 2.;
+const _prec def_MAXVPVSR = 10.;
+const _prec def_VMIN = 0.;
+const _prec def_VMAX = 9900.;
+const _prec def_DMIN = 1500.;
+
+const _prec def_DHB = -1.0;
+const _prec def_DHT = -1.0;
 
 void parsemultiple(char *optarg, int *val);
 
@@ -161,7 +183,9 @@ void command(int argc, char **argv, _prec *TMAX, _prec *DH, _prec *DT,
              int *USETOPO, char *SOURCEFILE,
              int *USESOURCEFILE, char *RECVFILE, int *USERECVFILE,
              char *FORCEFILE, int *USEFORCEFILE,
-             char *SGTFILE, int *USESGTFILE)
+             char *SGTFILE, int *USESGTFILE, char *MMSFILE, int *USEMMSFILE, float *DHB, float *DHT,
+             char *ENERGYFILE, int *USEENERGYFILE, 
+             _prec *QSI, _prec *QPQSR, _prec *MAXVPVSR, _prec *VMIN, _prec *VMAX, _prec *DMIN)
 {
         int p;
 
@@ -222,12 +246,21 @@ void command(int argc, char **argv, _prec *TMAX, _prec *DH, _prec *DT,
         strcpy(INTOPO, def_INTOPO);
         strcpy(SOURCEFILE, def_SOURCEFILE);
         strcpy(RECVFILE, def_RECVFILE);
+        strcpy(MMSFILE, def_MMSFILE);
+        strcpy(ENERGYFILE, def_ENERGYFILE);
+        *DHB = def_DHB; *DHT = def_DHT; /* both defaults are -1.0 */
+        *QSI = def_QSI;
+        *QPQSR = def_QPQSR;
+        *MAXVPVSR = def_MAXVPVSR;
+        *VMIN = def_VMIN;
+        *VMAX = def_VMAX;
+        *DMIN = def_DMIN;
 
 
         extern char *optarg;
         static const char *optstring =
             "-T:H:t:A:P:M:D:S:N:V:B:n:I:R:Q:X:Y:Z:x:y:G:z:i:l:h:30:p:s:r:W:1:2:"
-            "3:11:12:13:21:22:23:100:101:102:103:106:107:109:9:o:c:";
+            "3:11:12:13:21:22:23:100:101:102:103:106:107:109:9:14:o:c:15:16:17:40:41:42:43:44:45:";
         static struct option long_options[] = {
             {"TMAX", required_argument, NULL, 'T'},
             {"DH", required_argument, NULL, 'H'},
@@ -278,6 +311,16 @@ void command(int argc, char **argv, _prec *TMAX, _prec *DH, _prec *DT,
             {"RECVFILE", required_argument, NULL, 109},
             {"FORCEFILE", required_argument, NULL, 9},
             {"SGTFILE", required_argument, NULL, 10},
+            {"MMSFILE", required_argument, NULL, 14},
+            {"DHB", required_argument, NULL, 15},
+            {"DHT", required_argument, NULL, 16},
+            {"ENERGYFILE", required_argument, NULL, 17},
+            {"QSI", required_argument, NULL, 40},
+            {"QPQSR", required_argument, NULL, 41},
+            {"MAXVPVSR", required_argument, NULL, 42},
+            {"VMIN", required_argument, NULL, 43},
+            {"VMAX", required_argument, NULL, 44},
+            {"DMIN", required_argument, NULL, 45},
         };
 
 
@@ -446,6 +489,38 @@ void command(int argc, char **argv, _prec *TMAX, _prec *DH, _prec *DT,
                                 strcpy(SGTFILE, optarg);
                                 *USESGTFILE = 1;
                                 break;
+                        case 14:
+                                strcpy(MMSFILE, optarg);
+                                *USEMMSFILE = 1;
+                                break;
+                        case 15:
+                                *DHB = atof(optarg);
+                                break;
+                        case 16:
+                                *DHT = atof(optarg);
+                                break;
+                        case 17:
+                                strcpy(ENERGYFILE, optarg);
+                                *USEENERGYFILE = 1;
+                                break;
+                        case 40:
+                                *QSI = atof(optarg);
+                                break;
+                        case 41:
+                                *QPQSR = atof(optarg);
+                                break;
+                        case 42:
+                                *MAXVPVSR = atof(optarg);
+                                break;
+                        case 43:
+                                *VMIN = atof(optarg);
+                                break;
+                        case 44:
+                                *VMAX = atof(optarg);
+                                break;
+                        case 45:
+                                *DMIN = atof(optarg);
+                                break;
                         default:
                                 printf(
                                     "Usage: %s \nOptions:\n\t[(-T | --TMAX) "
@@ -520,6 +595,15 @@ void command(int argc, char **argv, _prec *TMAX, _prec *DH, _prec *DT,
                                 printf(
                                     "\n\t[(-10 | --SGTFILE) <SGT "
                                     "file>]\n\n");
+                                printf(
+                                    "\n\t[(-14 | --MMSFILE) <MMS "
+                                    "file>]\n\n");
+                                printf(
+                                    "\n\t[(-15 | --DHB) <Bottom grid spacing> ]\n\n");
+                                printf(
+                                    "\n\t[(-16 | --DHT) <Top grid spacing> ]\n\n");
+                                printf(
+                                    "\n\t[(-17 | --ENERGYFILE) <File to output energy information to> ]\n\n");
                                 exit(-1);
                 }
         }
diff --git a/src/awp/cuPrintf.cu b/src/awp/cuPrintf.cu
new file mode 100644
index 0000000..7742f9c
--- /dev/null
+++ b/src/awp/cuPrintf.cu
@@ -0,0 +1,879 @@
+/*
+	Copyright 2009 NVIDIA Corporation.  All rights reserved.
+
+	NOTICE TO LICENSEE:   
+
+	This source code and/or documentation ("Licensed Deliverables") are subject 
+	to NVIDIA intellectual property rights under U.S. and international Copyright 
+	laws.  
+
+	These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL 
+	to NVIDIA and is being provided under the terms and conditions of a form of 
+	NVIDIA software license agreement by and between NVIDIA and Licensee ("License 
+	Agreement") or electronically accepted by Licensee.  Notwithstanding any terms 
+	or conditions to the contrary in the License Agreement, reproduction or 
+	disclosure of the Licensed Deliverables to any third party without the express 
+	written consent of NVIDIA is prohibited.     
+
+	NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, 
+	NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED 
+	DELIVERABLES FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED 
+	WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE 
+	LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 
+	NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   NOTWITHSTANDING ANY 
+	TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL 
+	NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, 
+	OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,	WHETHER 
+	IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,  ARISING OUT OF 
+	OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES.  
+
+	U.S. Government End Users. These Licensed Deliverables are a "commercial item" 
+	as that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
+	"commercial computer  software"  and "commercial computer software documentation" 
+	as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the 
+	U.S. Government only as a commercial end item.  Consistent with 48 C.F.R.12.212 
+	and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government 
+	End Users acquire the Licensed Deliverables with only those rights set forth 
+	herein. 
+
+	Any use of the Licensed Deliverables in individual and commercial software must 
+	include, in the user documentation and internal comments to the code, the above 
+	Disclaimer and U.S. Government End Users Notice.
+ */
+
+/*
+ *	cuPrintf.cu
+ *
+ *	This is a printf command callable from within a kernel. It is set
+ *	up so that output is sent to a memory buffer, which is emptied from
+ *	the host side - but only after a cudaThreadSynchronize() on the host.
+ *
+ *	Currently, there is a limitation of around 200 characters of output
+ *	and no more than 10 arguments to a single cuPrintf() call. Issue
+ *	multiple calls if longer format strings are required.
+ *
+ *	It requires minimal setup, and is *NOT* optimised for performance.
+ *	For example, writes are not coalesced - this is because there is an
+ *	assumption that people will not want to printf from every single one
+ *	of thousands of threads, but only from individual threads at a time.
+ *
+ *	Using this is simple - it requires one host-side call to initialise
+ *	everything, and then kernels can call cuPrintf at will. Sample code
+ *	is the easiest way to demonstrate:
+ *
+	#include "cuPrintf.cu"
+ 	
+	__global__ void testKernel(int val)
+	{
+		cuPrintf("Value is: %d\n", val);
+	}
+
+	int main()
+	{
+		cudaPrintfInit();
+		testKernel<<< 2, 3 >>>(10);
+		cudaPrintfDisplay(stdout, true);
+		cudaPrintfEnd();
+        return 0;
+	}
+ *
+ *	See the header file, "cuPrintf.cuh" for more info, especially
+ *	arguments to cudaPrintfInit() and cudaPrintfDisplay();
+ */
+
+#ifndef CUPRINTF_CU
+#define CUPRINTF_CU
+
+#include "cuPrintf.cuh"
+#if __CUDA_ARCH__ > 100      // Atomics only used with > sm_10 architecture
+//#include <sm_11_atomic_functions.h>
+#endif
+
+// This is the smallest amount of memory, per-thread, which is allowed, and the size of
+// each record handed out by getNextPrintfBufPtr(). It is also the largest amount of space a single printf() can take up
+const static int CUPRINTF_MAX_LEN = 256;
+
+// Output filter checked by getNextPrintfBufPtr(): a thread gets a record only if each field is CUPRINTF_UNRESTRICTED or equals its own linear ID.
+typedef struct __align__(8) {
+	int threadid;				// Linear thread ID within the block that may print, or CUPRINTF_UNRESTRICTED
+	int blockid;				// Linear block ID (x + gridDim.x*y) that may print, or CUPRINTF_UNRESTRICTED
+} cuPrintfRestriction;
+
+// The main storage is in a global print buffer, which has a known
+// start and length. Only the write cursor is advanced atomically, which
+// is what makes it work as a circular buffer.
+// Since the only control primitive that can be used is atomicAdd(),
+// we cannot wrap the pointer as such. The actual address must be
+// calculated from printfBufferPtr by mod-ing with printfBufferLength.
+// For sm_10 architecture, we must subdivide the buffer per-thread
+// since we do not even have an atomic primitive.
+__constant__ static char *globalPrintfBuffer = NULL;         // Start of circular buffer (set up by host)
+__constant__ static int printfBufferLength = 0;              // Size of circular buffer in bytes (set up by host)
+__device__ static cuPrintfRestriction restrictRules;         // Output restrictions
+__device__ volatile static char *printfBufferPtr = NULL;     // Non-wrapped write cursor: an address starting at the buffer start; NULL until cudaPrintfInit()
+
+// This is the header preceding all printf entries.
+// NOTE: It *must* be size-aligned to the maximum entity size (size_t)
+typedef struct __align__(8) {
+    unsigned short magic;                   // Magic number says we're valid
+    unsigned short fmtoffset;               // Offset of the fmt record from the start of the entry; the host reports 0 as an overflow
+    unsigned short blockid;                 // Linear block ID of author (stored in 16 bits)
+    unsigned short threadid;                // Linear thread ID of author (stored in 16 bits)
+} cuPrintfHeader;
+
+// Special header for sm_10 architecture: one sits at the start of each per-thread section of the buffer
+#define CUPRINTF_SM10_MAGIC   0xC810        // Not a valid ascii character
+typedef struct __align__(16) {
+    unsigned short magic;                   // sm_10 specific magic number (CUPRINTF_SM10_MAGIC once initialised)
+    unsigned short unused;                  // Never read or written in this file
+    unsigned int thread_index;              // thread ID for this buffer
+    unsigned int thread_buf_len;            // per-thread buffer length in bytes
+    unsigned int offset;                    // most recent printf's offset within the section (0 = nothing printed yet)
+} cuPrintfHeaderSM10;
+
+
+// Because we can't write an element which is not aligned to its bit-size,
+// we have to align all sizes and variables on maximum-size boundaries.
+// That means sizeof(double) in this case, but we'll use (long long) for
+// better arch<1.3 support. Length prefixes and scalar arguments each take one slot of this size.
+#define CUPRINTF_ALIGN_SIZE      sizeof(long long)
+
+// All our headers are prefixed with a magic number so we know they're ready
+#define CUPRINTF_SM11_MAGIC  (unsigned short)0xC811        // Not a valid ascii character
+
+
+//
+//  getNextPrintfBufPtr
+//
+//  Grabs a block of space in the general circular buffer, using an
+//  atomic function to ensure that it's ours. We handle wrapping
+//  around the circular buffer and return a pointer to a place which
+//  can be written to.
+//
+//  Important notes:
+//      1. We always grab CUPRINTF_MAX_LEN bytes
+//      2. Because of 1, we never worry about wrapping around the end
+//      3. Because of 1, printfBufferLength *must* be a multiple of CUPRINTF_MAX_LEN
+//
+//  This returns a pointer to the place where we own.
+//
+__device__ static char *getNextPrintfBufPtr()
+{
+    // Initialisation check: the cursor stays NULL until the host has set it up
+    if(!printfBufferPtr)
+        return NULL;
+
+	// Thread/block restriction check
+	if((restrictRules.blockid != CUPRINTF_UNRESTRICTED) && (restrictRules.blockid != (blockIdx.x + gridDim.x*blockIdx.y)))
+		return NULL;
+	if((restrictRules.threadid != CUPRINTF_UNRESTRICTED) && (restrictRules.threadid != (threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z)))
+		return NULL;
+
+	// Conditional section, dependent on architecture
+#if __CUDA_ARCH__ == 100
+    // For sm_10 architectures, we have no atomic add - this means we must split the
+    // entire available buffer into per-thread blocks. Inefficient, but what can you do.
+    int thread_count = (gridDim.x * gridDim.y) * (blockDim.x * blockDim.y * blockDim.z);
+    int thread_index = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z +
+                       (blockIdx.x + gridDim.x*blockIdx.y) * (blockDim.x * blockDim.y * blockDim.z);
+
+    // Find our own block of data and go to it. Make sure the per-thread length
+	// is a precise multiple of CUPRINTF_MAX_LEN, otherwise we risk size and
+	// alignment issues! We must round down, of course.
+    unsigned int thread_buf_len = printfBufferLength / thread_count;
+	thread_buf_len &= ~(CUPRINTF_MAX_LEN-1);
+
+	// We *must* have a thread buffer length able to fit at least two printfs (one header, one real)
+	if(thread_buf_len < (CUPRINTF_MAX_LEN * 2))
+		return NULL;
+
+	// Now address our section of the buffer. The first item is a header.
+    char *myPrintfBuffer = globalPrintfBuffer + (thread_buf_len * thread_index);
+    cuPrintfHeaderSM10 hdr = *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer;
+    if(hdr.magic != CUPRINTF_SM10_MAGIC)
+    {
+        // If our header is not set up, initialise it
+        hdr.magic = CUPRINTF_SM10_MAGIC;
+        hdr.thread_index = thread_index;
+        hdr.thread_buf_len = thread_buf_len;
+        hdr.offset = 0;         // Note we start at 0! We pre-increment below.
+        *(cuPrintfHeaderSM10 *)(void *)myPrintfBuffer = hdr;       // Write back the header
+
+        // For initial setup purposes, we might need to init thread0's header too
+        // (so that cudaPrintfDisplay() below will work). This is only run once.
+        cuPrintfHeaderSM10 *tophdr = (cuPrintfHeaderSM10 *)(void *)globalPrintfBuffer;
+        tophdr->thread_buf_len = thread_buf_len;
+    }
+
+    // Adjust the offset by the right amount, and wrap it if need be
+    unsigned int offset = hdr.offset + CUPRINTF_MAX_LEN;
+    if(offset >= hdr.thread_buf_len)
+        offset = CUPRINTF_MAX_LEN;
+
+    // Write back the new offset for next time and return a pointer to it
+    ((cuPrintfHeaderSM10 *)(void *)myPrintfBuffer)->offset = offset;
+    return myPrintfBuffer + offset;
+#else
+    // Much easier with an atomic operation! The cursor is a 64-bit device pointer, so add on the whole word: a 32-bit add returns only its low half.
+    size_t offset = (size_t)atomicAdd((unsigned long long *)&printfBufferPtr, (unsigned long long)CUPRINTF_MAX_LEN) - (size_t)globalPrintfBuffer;
+    offset %= printfBufferLength;
+    return globalPrintfBuffer + offset;
+#endif
+}
+
+
+//
+//  writePrintfHeader
+//
+//  Inserts the header for containing our UID, fmt position and
+//  block/thread number. We generate it dynamically to avoid
+//	issues arising from requiring pre-initialisation.
+//
+__device__ static void writePrintfHeader(char *ptr, char *fmtptr)
+{
+    // ptr: start of the record (NULL = nothing to do); fmtptr: start of its format section, or NULL if the record overflowed
+    if(!ptr)
+        return;
+    cuPrintfHeader header;
+    header.magic = CUPRINTF_SM11_MAGIC;
+    header.fmtoffset = fmtptr ? (unsigned short)(fmtptr - ptr) : 0;    // 0 is what the host reports as "printf buffer overflow"
+    header.blockid = blockIdx.x + gridDim.x*blockIdx.y;
+    header.threadid = threadIdx.x + blockDim.x*threadIdx.y + blockDim.x*blockDim.y*threadIdx.z;
+    *(cuPrintfHeader *)(void *)ptr = header;
+}
+
+
+//
+//  cuPrintfStrncpy
+//
+//  This special strncpy outputs an aligned length value, followed by the
+//  string. It then zero-pads the rest of the string until a 64-bit-aligned
+//  boundary. The length *includes* the padding. A pointer to the byte
+//  just after the \0 is returned.
+//
+//  This function could overflow CUPRINTF_MAX_LEN characters in our buffer.
+//  To avoid it, we must count as we output and truncate where necessary.
+//
+__device__ static char *cuPrintfStrncpy(char *dest, const char *src, int n, char *end)
+{
+    // Refuse missing pointers and a destination that is already at the limit
+    if((dest == NULL) || (src == NULL) || (dest >= end))
+        return NULL;
+
+    // The first CUPRINTF_ALIGN_SIZE bytes are set aside for the length word,
+    // which is filled in last once the copied-plus-padding byte count is known.
+    // Output only ever advances in CUPRINTF_ALIGN_SIZE chunks, so that room exists.
+    int *lengthSlot = (int *)(void *)dest;
+    int written = 0;
+    dest += CUPRINTF_ALIGN_SIZE;
+
+    // Copy up to n bytes; stop after the terminator or on reaching the limit
+    for( ; n != 0; n--)
+    {
+        if(dest >= end)     // No room left
+            break;
+
+        const char ch = *src++;
+        *dest++ = ch;
+        written++;
+        if(ch == '\0')
+            break;
+    }
+
+    // Zero-pad to the next CUPRINTF_ALIGN_SIZE boundary; padding is counted too
+    for( ; (dest < end) && (((long)dest & (CUPRINTF_ALIGN_SIZE-1)) != 0); written++)
+        *dest++ = 0;
+
+    // Record the length, and report reaching the limit as overflow
+    *lengthSlot = written;
+    return (dest >= end) ? NULL : dest;
+}
+
+
+//
+//  copyArg
+//
+//  This copies a length specifier and then the argument out to the
+//  data buffer. Templates let the compiler figure all this out at
+//  compile-time, making life much simpler from the programming
+//  point of view. I'm assuming all (const char *) is a string, and
+//  everything else is the variable it points at. I'd love to see
+//  a better way of doing it, but aside from parsing the format
+//  string I can't think of one.
+//
+//  The length of the data type is inserted at the beginning (so that
+//  the display can distinguish between float and double), and the
+//  pointer to the end of the entry is returned.
+//
+__device__ static char *copyArg(char *ptr, const char *arg, char *end)
+{
+    // Both the write cursor and the string have to be present
+    if((ptr == NULL) || (arg == NULL))
+        return NULL;
+
+    // cuPrintfStrncpy writes the length and the padded string; we zero the byte after it
+    char *next = cuPrintfStrncpy(ptr, arg, CUPRINTF_MAX_LEN, end);
+    if(next != NULL)
+        *next = 0;
+    return next;
+}
+
+template <typename T>
+__device__ static char *copyArg(char *ptr, T &arg, char *end)
+{
+    // Initialisation and overflow check. Alignment rules mean that
+    // we're at least CUPRINTF_ALIGN_SIZE away from "end", so we only need
+    // to check that one offset.
+    if(!ptr || ((ptr+CUPRINTF_ALIGN_SIZE) >= end))
+        return NULL;
+
+    // Write the length and argument, one CUPRINTF_ALIGN_SIZE slot each
+    *(int *)(void *)ptr = sizeof(arg);      // Lets the host tell a float from a double
+    ptr += CUPRINTF_ALIGN_SIZE;
+    *(T *)(void *)ptr = arg;                // NOTE(review): presumably sizeof(T) <= CUPRINTF_ALIGN_SIZE; nothing here enforces it - confirm
+    ptr += CUPRINTF_ALIGN_SIZE;
+    *ptr = 0;
+
+    return ptr;
+}
+
+
+//
+//  cuPrintf
+//
+//  Templated printf functions to handle multiple arguments.
+//  Note we return the total amount of data copied, not the number
+//  of characters output. But then again, who ever looks at the
+//  return from printf() anyway?
+//
+//  The format is to grab a block of circular buffer space, the
+//  start of which will hold a header and a pointer to the format
+//  string. We then write in all the arguments, and finally the
+//  format string itself. This is to make it easy to prevent
+//  overflow of our buffer (we support up to 10 arguments, each of
+//  which can be 12 bytes in length - that means that only the
+//  format string (or a %s) can actually overflow; so the overflow
+//  check need only be in the strcpy function.
+//
+//  The header is written at the very last because that's what
+//  makes it look like we're done.
+//
+//  Errors, which are basically lack-of-initialisation, are ignored
+//  in the called functions because NULL pointers are passed around
+//
+
+// All printf variants basically do the same thing, setting up the
+// buffer, writing all arguments, then finalising the header. For clarity, we'll pack the code into some big macros.
+// PREAMBLE reserves one CUPRINTF_MAX_LEN record (returning 0 if none is available) and points bufptr past the header space.
+#define CUPRINTF_PREAMBLE \
+    char *start, *end, *bufptr, *fmtstart; \
+    if((start = getNextPrintfBufPtr()) == NULL) return 0; \
+    end = start + CUPRINTF_MAX_LEN; \
+    bufptr = start + sizeof(cuPrintfHeader);
+
+// Posting an argument is easy: copyArg advances bufptr, or makes it NULL once the record is full
+#define CUPRINTF_ARG(argname) \
+	bufptr = copyArg(bufptr, argname, end);
+
+// After args are done, record start-of-fmt and write the fmt and header; returns bytes used, or 0 on overflow
+#define CUPRINTF_POSTAMBLE \
+    fmtstart = bufptr; \
+    end = cuPrintfStrncpy(bufptr, fmt, CUPRINTF_MAX_LEN, end); \
+    writePrintfHeader(start, end ? fmtstart : NULL); \
+    return end ? (int)(end - start) : 0;
+
+__device__ int cuPrintf(const char *fmt)
+{
+	CUPRINTF_PREAMBLE;
+	// No arguments: the record holds only the header and the format string
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1> __device__ int cuPrintf(const char *fmt, T1 arg1)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4, typename T5> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+	CUPRINTF_ARG(arg5);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+	CUPRINTF_ARG(arg5);
+	CUPRINTF_ARG(arg6);
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+	CUPRINTF_ARG(arg5);
+	CUPRINTF_ARG(arg6);
+	CUPRINTF_ARG(arg7);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+	CUPRINTF_ARG(arg5);
+	CUPRINTF_ARG(arg6);
+	CUPRINTF_ARG(arg7);
+	CUPRINTF_ARG(arg8);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+	CUPRINTF_ARG(arg5);
+	CUPRINTF_ARG(arg6);
+	CUPRINTF_ARG(arg7);
+	CUPRINTF_ARG(arg8);
+	CUPRINTF_ARG(arg9);
+
+	CUPRINTF_POSTAMBLE;
+}
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, typename T10> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10)
+{
+	CUPRINTF_PREAMBLE;
+	// Serialise the arguments in call order (copyArg passes a NULL bufptr straight through)
+	CUPRINTF_ARG(arg1);
+	CUPRINTF_ARG(arg2);
+	CUPRINTF_ARG(arg3);
+	CUPRINTF_ARG(arg4);
+	CUPRINTF_ARG(arg5);
+	CUPRINTF_ARG(arg6);
+	CUPRINTF_ARG(arg7);
+	CUPRINTF_ARG(arg8);
+	CUPRINTF_ARG(arg9);
+	CUPRINTF_ARG(arg10);
+
+	CUPRINTF_POSTAMBLE;
+}
+#undef CUPRINTF_PREAMBLE
+#undef CUPRINTF_ARG
+#undef CUPRINTF_POSTAMBLE
+
+
+//
+//	cuPrintfRestrict
+//
+//	Called to restrict output to a given thread/block.
+//	We store the info in "restrictRules", which is set up at
+//	init time by the host. It's not the cleanest way to do this
+//	because it means restrictions will last between
+//	invocations, but given the output-pointer continuity,
+//	I feel this is reasonable.
+//
+__device__ void cuPrintfRestrict(int threadid, int blockid)
+{
+    // An ID is stored if it is the wildcard or lies inside the current launch; anything else is ignored
+    const int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
+    const int blocksInGrid = gridDim.x * gridDim.y;
+    const bool threadOk = (threadid == CUPRINTF_UNRESTRICTED) || ((threadid >= 0) && (threadid < threadsPerBlock));
+    const bool blockOk = (blockid == CUPRINTF_UNRESTRICTED) || ((blockid >= 0) && (blockid < blocksInGrid));
+    if(threadOk) restrictRules.threadid = threadid;
+    if(blockOk) restrictRules.blockid = blockid;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// HOST SIDE
+
+#include <stdio.h>
+static FILE *printf_fp;                 // Output stream, chosen afresh on every cudaPrintfDisplay() call
+
+static char *printfbuf_start=NULL;      // Device address where the next display pass starts reading
+static char *printfbuf_device=NULL;     // Device print buffer allocated by cudaPrintfInit()
+static int printfbuf_len=0;             // Size of that buffer in bytes
+
+
+//
+//  outputPrintfData
+//
+//  Our own internal function, which takes a pointer to a data buffer
+//  and passes it through libc's printf for output.
+//
+//  We receive the format string and a pointer to where the data is
+//  held. We then run through and print it out.
+//
+//  Returns 0 on failure, 1 on success
+//
+static int outputPrintfData(char *fmt, char *data)
+{
+    // Arguments are laid out before the format record, so they end where it begins
+    char *argend = fmt;
+    fmt += CUPRINTF_ALIGN_SIZE;     // Skip the length prefix, which we don't need
+
+    // Now run through it, printing everything we can. We must
+    // run to every % character, extract only that, and use printf
+    // to format it.
+    char *p = strchr(fmt, '%');
+    while(p != NULL)
+    {
+        // Print up to the % character
+        *p = '\0';
+        fputs(fmt, printf_fp);
+        *p = '%';           // Put back the %
+
+        // Now handle the format specifier
+        char *format = p++;         // Points to the '%'
+        p += strcspn(p, "%cdiouxXeEfgGaAnps");
+        if(*p == '\0')              // If no format specifier, print the whole thing
+        {
+            fmt = format;
+            break;
+        }
+
+        // Cut the conversion out of the format so it can be printed on its own
+        char specifier = *p++;
+        char c = *p;        // Store for later
+        *p = '\0';
+        // "%%" stores no argument and "%n" is unsupported, so neither may consume "data".
+        // A conversion with no stored argument left is echoed as text instead of reading past the arguments.
+        if(specifier == '%')
+            fputs("%", printf_fp);
+        else if((specifier == 'n') || (data >= argend))
+            fputs(format, printf_fp);
+        else
+        {
+            int arglen = *(int *)data;      // Every argument is prefixed by its length
+            if((arglen < 0) || (arglen > CUPRINTF_MAX_LEN))
+            {
+                fputs("Corrupt printf buffer data - aborting\n", printf_fp);
+                return 0;
+            }
+            data += CUPRINTF_ALIGN_SIZE;
+            int consumed = CUPRINTF_ALIGN_SIZE;     // Scalars fill exactly one slot
+            switch(specifier)
+            {
+                // These all take integer arguments
+                case 'c':
+                case 'd':
+                case 'i':
+                case 'o':
+                case 'u':
+                case 'x':
+                case 'X':
+                    fprintf(printf_fp, format, *((int *)data));
+                    break;
+                // Pointers need their full stored width, not a truncated int
+                case 'p':
+                    fprintf(printf_fp, format, *((void **)data));
+                    break;
+                // These all take double arguments
+                case 'e':
+                case 'E':
+                case 'f':
+                case 'g':
+                case 'G':
+                case 'a':
+                case 'A':
+                    if(arglen == 4)     // Float vs. Double thing
+                        fprintf(printf_fp, format, *((float *)data));
+                    else
+                        fprintf(printf_fp, format, *((double *)data));
+                    break;
+                // Strings span their whole padded length, which is what arglen holds
+                case 's':
+                    fprintf(printf_fp, format, (char *)data);
+                    consumed = arglen;
+                    break;
+            }
+            data += consumed;           // Move on to next argument
+        }
+        *p = c;                     // Restore what we removed
+        fmt = p;                    // Adjust fmt string to be past the specifier
+        p = strchr(fmt, '%');       // and get the next specifier
+    }
+
+    // Print out the last of the string
+    fputs(fmt, printf_fp);
+    return 1;
+}
+
+
+//
+//  doPrintfDisplay
+//
+//  This runs through the blocks of CUPRINTF_MAX_LEN-sized data, calling the
+//  print function above to display them. We've got this separate from
+//  cudaPrintfDisplay() below so we can handle the SM_10 architecture
+//  partitioning.
+//
+static int doPrintfDisplay(int headings, int clear, char *bufstart, char *bufend, char *bufptr, char *endptr)
+{
+    // Print each record from bufptr up to endptr, wrapping from bufend back to
+    // bufstart, and return how many were printed. Stops early at an unfinished record.
+    int printf_count=0;
+    char printfbuf_local[CUPRINTF_MAX_LEN+1];
+    printfbuf_local[CUPRINTF_MAX_LEN] = '\0';
+
+    while(bufptr != endptr)
+    {
+        // Wrap ourselves at the end-of-buffer
+        if(bufptr == bufend)
+            bufptr = bufstart;
+
+        // Copy one record to the host; a failed copy would leave printfbuf_local unset, so stop rather than parse it
+        if(cudaMemcpy(printfbuf_local, bufptr, CUPRINTF_MAX_LEN, cudaMemcpyDeviceToHost) != cudaSuccess)
+            break;
+
+        // If the magic number isn't valid, then this write hasn't gone through
+        // yet and we'll wait until it does (or we're past the end for non-async printfs).
+        cuPrintfHeader *hdr = (cuPrintfHeader *)printfbuf_local;
+        if((hdr->magic != CUPRINTF_SM11_MAGIC) || (hdr->fmtoffset >= CUPRINTF_MAX_LEN))
+        {
+            break;
+        }
+
+        // Extract all the info and get this printf done
+        if(headings)
+            fprintf(printf_fp, "[%d, %d]: ", hdr->blockid, hdr->threadid);
+        if(hdr->fmtoffset == 0)
+            fprintf(printf_fp, "printf buffer overflow\n");
+        else if(!outputPrintfData(printfbuf_local+hdr->fmtoffset, printfbuf_local+sizeof(cuPrintfHeader)))
+            break;
+        printf_count++;
+
+        // Clear if asked
+        if(clear)
+            cudaMemset(bufptr, 0, CUPRINTF_MAX_LEN);
+
+        // Now advance our start location, because we're done, and keep copying
+        bufptr += CUPRINTF_MAX_LEN;
+    }
+
+    return printf_count;
+}
+
+
+//
+//  cudaPrintfInit
+//
+//  Takes a buffer length to allocate, creates the memory on the device and
+//  publishes its address to the device-side symbols for when a kernel is called.
+//  Returns a CUDA status code. It's up to the caller to free the buffer.
+//
+extern "C" cudaError_t cudaPrintfInit(size_t bufferLen)
+{
+    // Fix up bufferlen to be a multiple of CUPRINTF_MAX_LEN
+    bufferLen = (bufferLen < CUPRINTF_MAX_LEN) ? CUPRINTF_MAX_LEN : bufferLen;
+    if((bufferLen % CUPRINTF_MAX_LEN) > 0)
+        bufferLen += (CUPRINTF_MAX_LEN - (bufferLen % CUPRINTF_MAX_LEN));
+    printfbuf_len = (int)bufferLen;
+
+    // Allocate a print buffer on the device and zero it; from here on the first failure is kept in err
+    if(cudaMalloc((void **)&printfbuf_device, printfbuf_len) != cudaSuccess)
+		return cudaErrorInitializationError;
+    cudaError_t err = cudaMemset(printfbuf_device, 0, printfbuf_len);
+    printfbuf_start = printfbuf_device;         // Where we start reading from
+
+	// No restrictions to begin with
+	cuPrintfRestriction restrict;
+	restrict.threadid = restrict.blockid = CUPRINTF_UNRESTRICTED;
+	if(err == cudaSuccess) err = cudaMemcpyToSymbol(restrictRules, &restrict, sizeof(restrict));
+
+    // Publish the buffer and its length, then the cursor last: a non-NULL cursor is what enables device-side output
+    if(err == cudaSuccess) err = cudaMemcpyToSymbol(globalPrintfBuffer, &printfbuf_device, sizeof(char *));
+    if(err == cudaSuccess) err = cudaMemcpyToSymbol(printfBufferLength, &printfbuf_len, sizeof(printfbuf_len));
+    if(err == cudaSuccess) err = cudaMemcpyToSymbol(printfBufferPtr, &printfbuf_device, sizeof(char *));
+    if(err != cudaSuccess) { cudaFree(printfbuf_device); printfbuf_start = printfbuf_device = NULL; }   // Don't keep a half-configured buffer
+    return err;
+}
+
+
+//
+//  cudaPrintfEnd
+//
+//  Clears the device-side cursor (so later cuPrintf calls do nothing) and frees the memory which we allocated
+extern "C" void cudaPrintfEnd()
+{
+    if(!printfbuf_start || !printfbuf_device)
+        return;
+    printfbuf_start = NULL;     // Doubles as the NULL source for the copy below
+    cudaMemcpyToSymbol(printfBufferPtr, &printfbuf_start, sizeof(char *));     // getNextPrintfBufPtr() treats a NULL cursor as "not initialised"
+    cudaFree(printfbuf_device);
+    printfbuf_device = NULL;
+}
+
+
+//
+//  cudaPrintfDisplay
+//
+//  Each call to this function dumps the entire current contents
+//	of the printf buffer to the pre-specified FILE pointer. The
+//	circular "start" pointer is advanced so that subsequent calls
+//	dumps only new stuff.
+//
+//  In the case of async memory access (via streams), call this
+//  repeatedly to keep trying to empty the buffer. If it's a sync
+//  access, then the whole buffer should empty in one go.
+//
+//	Arguments:
+//		outputFP     - FILE pointer (passed as void *) to output to (NULL => stdout)
+//		showThreadID - If true, prints [block,thread] before each line
+//
+extern "C" cudaError_t cudaPrintfDisplay(void *outputFP, bool showThreadID)
+{
+	printf_fp = (FILE *)((outputFP == NULL) ? stdout : outputFP);
+
+    // For now, we force "synchronous" mode which means we're not concurrent
+	// with kernel execution. This also means we don't need clearOnPrint.
+	// If you're patching it for async operation, here's where you want it.
+    bool sync_printfs = true;
+	bool clearOnPrint = false;
+
+    // Initialisation check
+    if(!printfbuf_start || !printfbuf_device || !printf_fp)
+        return cudaErrorMissingConfiguration;
+
+    // To determine which architecture we're using, we read the first short from the
+    // buffer - it'll be the magic number relating to the version. If even that copy
+    // fails (for example after a kernel fault), report the error rather than claim success.
+    unsigned short magic = 0;
+    cudaError_t err = cudaMemcpy(&magic, printfbuf_device, sizeof(unsigned short), cudaMemcpyDeviceToHost);
+    if(err != cudaSuccess) return err;
+    // For SM_10 architecture, we've split our buffer into one-per-thread.
+    // That means we must do each thread block separately. It'll require
+    // extra reading. We also, for now, don't support async printfs because
+    // that requires tracking one start pointer per thread.
+    if(magic == CUPRINTF_SM10_MAGIC)
+    {
+        sync_printfs = true;
+	    clearOnPrint = false;
+        int blocklen = 0;
+        char *blockptr = printfbuf_device;
+        while(blockptr < (printfbuf_device + printfbuf_len))
+        {
+            cuPrintfHeaderSM10 hdr;
+            cudaMemcpy(&hdr, blockptr, sizeof(hdr), cudaMemcpyDeviceToHost);
+
+            // We get our block-size-step from the very first header
+            if(hdr.thread_buf_len != 0)
+                blocklen = hdr.thread_buf_len;
+
+            // No magic number means no printfs from this thread
+            if(hdr.magic != CUPRINTF_SM10_MAGIC)
+            {
+                if(blocklen == 0)
+                {
+                    fprintf(printf_fp, "No printf headers found at all!\n");
+                    break;                              // No valid headers!
+                }
+                blockptr += blocklen;
+                continue;
+            }
+
+            // "offset" is non-zero then we can print the block contents
+            if(hdr.offset > 0)
+            {
+                // For synchronous printfs, we must print from endptr->bufend, then from start->end
+                if(sync_printfs)
+                    doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+hdr.offset+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len);
+                doPrintfDisplay(showThreadID, clearOnPrint, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.thread_buf_len, blockptr+CUPRINTF_MAX_LEN, blockptr+hdr.offset+CUPRINTF_MAX_LEN);
+            }
+
+            // Move on to the next block and loop again
+            blockptr += hdr.thread_buf_len;
+        }
+    }
+    // For SM_11 and up, everything is a single buffer and it's simple
+    else if(magic == CUPRINTF_SM11_MAGIC)
+    {
+	    // Grab the current "end of circular buffer" pointer; without it the range below cannot be computed.
+        char *printfbuf_end = NULL;
+        if((err = cudaMemcpyFromSymbol(&printfbuf_end, printfBufferPtr, sizeof(char *))) != cudaSuccess)
+            return err;
+        // Adjust our starting and ending pointers to within the block
+        char *bufptr = ((printfbuf_start - printfbuf_device) % printfbuf_len) + printfbuf_device;
+        char *endptr = ((printfbuf_end - printfbuf_device) % printfbuf_len) + printfbuf_device;
+
+        // For synchronous (i.e. after-kernel-exit) printf display, we have to handle circular
+        // buffer wrap carefully because we could miss those past "end".
+        if(sync_printfs)
+            doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, endptr, printfbuf_device+printfbuf_len);
+        doPrintfDisplay(showThreadID, clearOnPrint, printfbuf_device, printfbuf_device+printfbuf_len, bufptr, endptr);
+
+        printfbuf_start = printfbuf_end;
+    }
+    else
+        ;//printf("Bad magic number in cuPrintf buffer header\n");
+
+    // If we were synchronous, then we must ensure that the memory is cleared on exit
+    // otherwise another kernel launch with a different grid size could conflict.
+    if(sync_printfs)
+        cudaMemset(printfbuf_device, 0, printfbuf_len);
+
+    return cudaSuccess;
+}
+
+// Cleanup
+#undef CUPRINTF_MAX_LEN
+#undef CUPRINTF_ALIGN_SIZE
+#undef CUPRINTF_SM10_MAGIC
+#undef CUPRINTF_SM11_MAGIC
+
+#endif
diff --git a/src/awp/cuPrintf.cuh b/src/awp/cuPrintf.cuh
new file mode 100644
index 0000000..cdf8613
--- /dev/null
+++ b/src/awp/cuPrintf.cuh
@@ -0,0 +1,162 @@
+/*
+	Copyright 2009 NVIDIA Corporation.  All rights reserved.
+
+	NOTICE TO LICENSEE:   
+
+	This source code and/or documentation ("Licensed Deliverables") are subject 
+	to NVIDIA intellectual property rights under U.S. and international Copyright 
+	laws.  
+
+	These Licensed Deliverables contained herein is PROPRIETARY and CONFIDENTIAL 
+	to NVIDIA and is being provided under the terms and conditions of a form of 
+	NVIDIA software license agreement by and between NVIDIA and Licensee ("License 
+	Agreement") or electronically accepted by Licensee.  Notwithstanding any terms 
+	or conditions to the contrary in the License Agreement, reproduction or 
+	disclosure of the Licensed Deliverables to any third party without the express 
+	written consent of NVIDIA is prohibited.     
+
+	NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, 
+	NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THESE LICENSED 
+	DELIVERABLES FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED 
+	WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE 
+	LICENSED DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 
+	NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.   NOTWITHSTANDING ANY 
+	TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE AGREEMENT, IN NO EVENT SHALL 
+	NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, 
+	OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,	WHETHER 
+	IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,  ARISING OUT OF 
+	OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THESE LICENSED DELIVERABLES.  
+
+	U.S. Government End Users. These Licensed Deliverables are a "commercial item" 
+	as that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of 
+	"commercial computer  software"  and "commercial computer software documentation" 
+	as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) and is provided to the 
+	U.S. Government only as a commercial end item.  Consistent with 48 C.F.R.12.212 
+	and 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all U.S. Government 
+	End Users acquire the Licensed Deliverables with only those rights set forth 
+	herein. 
+
+	Any use of the Licensed Deliverables in individual and commercial software must 
+	include, in the user documentation and internal comments to the code, the above 
+	Disclaimer and U.S. Government End Users Notice.
+ */
+
+#ifndef CUPRINTF_H
+#define CUPRINTF_H
+
+/*
+ *	This is the header file supporting cuPrintf.cu and defining both
+ *	the host and device-side interfaces. See that file for some more
+ *	explanation and sample use code. See also below for details of the
+ *	host-side interfaces.
+ *
+ *  Quick sample code:
+ *
+	#include "cuPrintf.cu"
+ 	
+	__global__ void testKernel(int val)
+	{
+		cuPrintf("Value is: %d\n", val);
+	}
+
+	int main()
+	{
+		cudaPrintfInit();
+		testKernel<<< 2, 3 >>>(10);
+		cudaPrintfDisplay(stdout, true);
+		cudaPrintfEnd();
+        return 0;
+	}
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+// DEVICE SIDE
+// External function definitions for device-side code
+
+// Abuse of templates to simulate varargs
+__device__ int cuPrintf(const char *fmt);
+template <typename T1> __device__ int cuPrintf(const char *fmt, T1 arg1);
+template <typename T1, typename T2> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2);
+template <typename T1, typename T2, typename T3> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3);
+template <typename T1, typename T2, typename T3, typename T4> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4);
+template <typename T1, typename T2, typename T3, typename T4, typename T5> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5);
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6);
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7);
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8);
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9);
+template <typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9, typename T10> __device__ int cuPrintf(const char *fmt, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7, T8 arg8, T9 arg9, T10 arg10);
+
+
+//
+//	cuPrintfRestrict
+//
+//	Called to restrict output to a given thread/block. Pass
+//	the constant CUPRINTF_UNRESTRICTED to unrestrict output
+//	for thread/block IDs. Note you can therefore allow
+//	"all printfs from block 3" or "printfs from thread 2
+//	on all blocks", or "printfs only from block 1, thread 5".
+//
+//	Arguments:
+//		threadid - Thread ID to allow printfs from
+//		blockid - Block ID to allow printfs from
+//
+//	NOTE: Restrictions last between invocations of
+//	kernels unless cudaPrintfInit() is called again.
+//
+#define CUPRINTF_UNRESTRICTED	-1
+__device__ void cuPrintfRestrict(int threadid, int blockid);
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+// HOST SIDE
+// External function definitions for host-side code
+
+//
+//	cudaPrintfInit
+//
+//	Call this once to initialise the printf system. If the output
+//	file or buffer size needs to be changed, call cudaPrintfEnd()
+//	before re-calling cudaPrintfInit().
+//
+//	The default size for the buffer is 1 megabyte. For CUDA
+//	architecture 1.1 and above, the buffer is filled linearly and
+//	is completely used;	however for architecture 1.0, the buffer
+//	is divided into as many segments as there are threads, even
+//	if some threads do not call cuPrintf().
+//
+//	Arguments:
+//		bufferLen - Length, in bytes, of total space to reserve
+//		            (in device global memory) for output.
+//
+//	Returns:
+//		cudaSuccess if all is well.
+//
+extern "C" cudaError_t cudaPrintfInit(size_t bufferLen=1048576);   // 1-meg - that's enough for 4096 printfs by all threads put together
+
+//
+//	cudaPrintfEnd
+//
+//	Cleans up all memories allocated by cudaPrintfInit().
+//	Call this at exit, or before calling cudaPrintfInit() again.
+//
+extern "C" void cudaPrintfEnd();
+
+//
+//	cudaPrintfDisplay
+//
+//	Dumps the contents of the output buffer to the specified
+//	file pointer. If the output pointer is not specified,
+//	the default "stdout" is used.
+//
+//	Arguments:
+//		outputFP     - A file pointer to an output stream.
+//		showThreadID - If "true", output strings are prefixed
+//		               by "[blockid, threadid] " at output.
+//
+//	Returns:
+//		cudaSuccess if all is well.
+//
+extern "C" cudaError_t cudaPrintfDisplay(void *outputFP=NULL, bool showThreadID=false);
+
+#endif  // CUPRINTF_H
diff --git a/src/awp/grid.c b/src/awp/grid.c
index b346df6..2406bf9 100644
--- a/src/awp/grid.c
+++ b/src/awp/grid.c
@@ -62,11 +62,10 @@ Grid3Dww Alloc3Dww(int nx, int ny, int nz)
 
 
 
-Grid1D Alloc1D(int nx)
+Grid1D Alloc1D(long nx)
 {
-   int i;
+   long i;
    Grid1D U = (Grid1D)malloc(sizeof(_prec)*nx);
-
    if (!U){
        printf("Cannot allocate 2D _prec array\n");
        exit(-1);
diff --git a/src/awp/kernel.cu b/src/awp/kernel.cu
index 7f94abf..72b89fd 100644
--- a/src/awp/kernel.cu
+++ b/src/awp/kernel.cu
@@ -2,6 +2,7 @@
 #include <math.h>
 #include "awp/kernel.h"
 #include "awp/pmcl3d_cons.h"
+#include "cuPrintf.cu"
 #include <cuda.h>
 
 __constant__ _prec d_c1;
@@ -19,6 +20,12 @@ __constant__ int   d_slice_2[MAXGRIDS];
 __constant__ int   d_yline_1[MAXGRIDS];
 __constant__ int   d_yline_2[MAXGRIDS];
 
+/*
+texture<float, 1, cudaReadModeElementType> p_vx1;
+texture<float, 1, cudaReadModeElementType> p_vx2;
+texture<int, 1, cudaReadModeElementType> p_ww;
+texture<float, 1, cudaReadModeElementType> p_wwo;
+*/
 //Parameters used for STF filtering (Daniel)
 __constant__ int d_filtorder;
 __constant__ double d_srcfilt_b[MAXFILT], d_srcfilt_a[MAXFILT];
@@ -65,6 +72,7 @@ __device__ void rotate_principal(register _prec sigma2, register _prec pfluid, r
 
 //end of routines for on-GPU initial stress computation (Daniel)
 
+extern "C"
 void SetDeviceConstValue(_prec *DH, _prec DT, int *nxt, int *nyt, int *nzt, int ngrids,
    _prec fmajor, _prec fminor, _prec *Rz, _prec *RzT)
 {
@@ -77,12 +85,12 @@ void SetDeviceConstValue(_prec *DH, _prec DT, int *nxt, int *nyt, int *nzt, int
     h_c2  = -1.0/24.0;
     h_dt1 = 1.0/DT;
 
-    h_dth=(_prec* ) calloc(ngrids, sizeof(_prec));
-    h_dh1=(_prec* ) calloc(ngrids, sizeof(_prec));
-    slice_1=(int*) calloc(ngrids, sizeof(_prec));
-    slice_2=(int*) calloc(ngrids, sizeof(_prec));
-    yline_1=(int*) calloc(ngrids, sizeof(_prec));
-    yline_2=(int*) calloc(ngrids, sizeof(_prec));
+    h_dth=(float*) calloc(ngrids, sizeof(float));
+    h_dh1=(float*) calloc(ngrids, sizeof(float));
+    slice_1=(int*) calloc(ngrids, sizeof(int));   /* int arrays: sized by int, matching the sizeof(int)*ngrids copies below */
+    slice_2=(int*) calloc(ngrids, sizeof(int));
+    yline_1=(int*) calloc(ngrids, sizeof(int));
+    yline_2=(int*) calloc(ngrids, sizeof(int));
 
     for (k=0; k<ngrids; k++){
        h_dth[k] = DT/DH[k];
@@ -93,13 +101,13 @@ void SetDeviceConstValue(_prec *DH, _prec DT, int *nxt, int *nyt, int *nzt, int
        yline_2[k]  = (nzt[k]+2*align)*2;
     }
 
-    CUCHK(cudaMemcpyToSymbol(d_c1,      &h_c1,    sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_c2,      &h_c2,    sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_dt1,     &h_dt1,   sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_DT,      &DT,      sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_dth,     h_dth,   sizeof(_prec) * ngrids));
-    CUCHK(cudaMemcpyToSymbol(d_dh1,     h_dh1,   sizeof(_prec) * ngrids));
-    CUCHK(cudaMemcpyToSymbol(d_DH,      DH,      sizeof(_prec) * ngrids));
+    CUCHK(cudaMemcpyToSymbol(d_c1,      &h_c1,    sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_c2,      &h_c2,    sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_dt1,     &h_dt1,   sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_DT,      &DT,      sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_dth,     h_dth,   sizeof(float) * ngrids));
+    CUCHK(cudaMemcpyToSymbol(d_dh1,     h_dh1,   sizeof(float) * ngrids));
+    CUCHK(cudaMemcpyToSymbol(d_DH,      DH,      sizeof(float) * ngrids));
     CUCHK(cudaMemcpyToSymbol(d_nxt,     nxt,     sizeof(int) * ngrids));
     CUCHK(cudaMemcpyToSymbol(d_nyt,     nyt,     sizeof(int) * ngrids));
     CUCHK(cudaMemcpyToSymbol(d_nzt,     nzt,     sizeof(int) * ngrids));
@@ -109,13 +117,35 @@ void SetDeviceConstValue(_prec *DH, _prec DT, int *nxt, int *nyt, int *nzt, int
     CUCHK(cudaMemcpyToSymbol(d_yline_2, yline_2, sizeof(int) * ngrids));
 
     //Compute initial stress on GPU (Daniel)
-    CUCHK(cudaMemcpyToSymbol(d_fmajor, &fmajor, sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_fminor, &fminor, sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_Rz, Rz, 9*sizeof(_prec)));
-    CUCHK(cudaMemcpyToSymbol(d_RzT, RzT, 9*sizeof(_prec)));
+    CUCHK(cudaMemcpyToSymbol(d_fmajor, &fmajor, sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_fminor, &fminor, sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_Rz, Rz, 9*sizeof(float)));
+    CUCHK(cudaMemcpyToSymbol(d_RzT, RzT, 9*sizeof(float)));
     return;
 }
+/*
+extern "C"
+void BindArrayToTexture(float* vx1, float* vx2,int* ww, float* wwo, int memsize)   
+{
+   cudaBindTexture(0, p_vx1,  vx1,  memsize);
+   cudaBindTexture(0, p_vx2,  vx2,  memsize);
+   cudaBindTexture(0, p_ww,   ww,   memsize);
+   cudaBindTexture(0, p_wwo,   wwo,   memsize);
+   cudaDeviceSynchronize ();
+   return;
+}
 
+extern "C"
+void UnBindArrayFromTexture()
+{
+   cudaUnbindTexture(p_vx1);
+   cudaUnbindTexture(p_vx2);
+   cudaUnbindTexture(p_ww);
+   cudaUnbindTexture(p_wwo);
+   return;
+}
+*/
+extern "C"
 void SetDeviceFilterParameters(int filtorder, double *srcfilt_b, double *srcfilt_a){
     CUCHK(cudaMemcpyToSymbol(d_filtorder, &filtorder, sizeof(int)));
     CUCHK(cudaMemcpyToSymbol(d_srcfilt_b, srcfilt_b, (filtorder+1)*sizeof(double)));
@@ -222,11 +252,6 @@ __global__ void dvelcx_opt(_prec * __restrict__ u1,
     	u1[pos]  = (u1[pos] + f_d1*( d_c1*(f_xx        - xx_im1)      + d_c2*(xx_ip1      - xx_im2) 
                                    + d_c1*(f_xy        - xy[pos_jm1]) + d_c2*(xy[pos_jp1] - xy[pos_jm2])
                                    + d_c1*(f_xz        - xz[pos_km1]) + d_c2*(xz[pos_kp1] - xz[pos_km2]) ))*f_dcrj; 
-
-        //if ((d_i==0) && (k==80) && (i==69) && (j==69)) {
-        //   printf("velocities: %e %e %e \n", 
-        //      u1[pos], v1[pos], w1[pos]);
-        //}
         /*if ((d_i==0) && (k==32) && (i==94) && (j==97)) {
            cuPrintf("after update: u1[%d]=%e, f_d1=%e, xx=%.20g, %20g, %20g, %20g\n", 
               pos, u1[pos], f_d1, f_xx, xx_im1, xx_ip1, xx_im2);
@@ -271,9 +296,9 @@ __global__ void print_const(int ngrids)
 {
     int p;
     for (p=0; p<ngrids; p++){
-       //cuPrintf("device constants[%d]:\nd_yline_=%d,%d, d_slice=%d,%d,nxt,nyt,nzt=%d,%d,%d\n",
-       //      p, d_yline_1[p], d_yline_2[p], d_slice_1[p], d_slice_2[p], d_nxt[p], d_nyt[p], d_nzt[p]);
-       //cuPrintf("d_DH=%e, d_dth=%e, d_dh1=%e\n", d_DH[p], d_dth[p], d_dh1[p]); 
+       cuPrintf("device constants[%d]:\nd_yline_=%d,%d, d_slice=%d,%d,nxt,nyt,nzt=%d,%d,%d\n",
+	     p, d_yline_1[p], d_yline_2[p], d_slice_1[p], d_slice_2[p], d_nxt[p], d_nyt[p], d_nzt[p]);
+       cuPrintf("d_DH=%e, d_dth=%e, d_dh1=%e\n", d_DH[p], d_dth[p], d_dh1[p]); 
     }
     /*cuPrintf("d_filtorder=%d\n", d_filtorder);
     if (d_filtorder > 0){
@@ -284,78 +309,102 @@ __global__ void print_const(int ngrids)
     }*/
 }
 
+extern "C"   // C linkage for this host-side wrapper around the print_const kernel
 void print_const_H(int ngrids)
 {
     dim3 block (1, 1, 1);
     dim3 grid (1, 1, 1);
-    //cudaPrintfInit();
-    //print_const<<<grid, block, 0>>>(ngrids);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    CUCHK(cudaPrintfInit());   // initialise the cuPrintf buffer (default bufferLen) before launching
+    print_const<<<grid, block, 0>>>(ngrids);   // single block, single thread
+    CUCHK(cudaPrintfDisplay(stdout, 1));   // dump the buffer to stdout; second argument is showThreadID
+    cudaPrintfEnd();   // free what cudaPrintfInit() allocated
     return;
 }
 
-void dvelcx_H_opt(_prec*  u1,    _prec*  v1,    _prec*  w1,    
-                  _prec*  xx,  _prec*  yy, _prec*  zz, _prec*  xy,      _prec*  xz, _prec*  yz,
-                  _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz,
-                  _prec*  d_1, int nyt,   int nzt,  
+extern "C"
+void dvelcx_H_opt(float* u1,    float* v1,    float* w1,    
+                  float* xx,  float* yy, float* zz, float* xy,      float* xz, float* yz,
+                  float* dcrjx, float* dcrjy, float* dcrjz,
+                  float* d_1, int nyt,   int nzt,  
                   cudaStream_t St, int s_i,   int e_i, int d_i, int ngrids)
 {
     dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, 1);
     dim3 grid ((nzt+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, (nyt+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
     CUCHK(cudaFuncSetCacheConfig(dvelcx_opt<BLOCK_SIZE_Z, BLOCK_SIZE_Y>, cudaFuncCachePreferL1));
     CUCHK(cudaGetLastError());
-    //cudaPrintfInit();
+    /*CUCHK(cudaPrintfInit());*/
     //fprintf(stdout, "launching dvelcx_opt\n");
     dvelcx_opt<BLOCK_SIZE_Z, BLOCK_SIZE_Y><<<grid, block, 0, St>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, dcrjx, dcrjy, dcrjz, d_1, 
          s_i, e_i, d_i, ngrids);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    /*CUCHK(cudaPrintfDisplay(stdout, 1));
+    cudaPrintfEnd();*/
     CUCHK(cudaGetLastError());
     return;
 }
-void dvelcy_H(_prec*  u1,       _prec*  v1,    _prec*  w1,    _prec*  xx,  _prec*  yy, _prec*  zz, _prec*  xy,   _prec*  xz,   _prec*  yz,
-              _prec*  dcrjx,    _prec*  dcrjy, _prec*  dcrjz, _prec*  d_1, int nxt,   int nzt,   _prec*  s_u1, _prec*  s_v1, _prec*  s_w1,  
+extern "C"
+void dvelcy_H(float* u1,       float* v1,    float* w1,    float* xx,  float* yy, float* zz, float* xy,   float* xz,   float* yz,
+              float* dcrjx,    float* dcrjy, float* dcrjz, float* d_1, int nxt,   int nzt,   float* s_u1, float* s_v1, float* s_w1,  
               cudaStream_t St, int s_j,      int e_j,      int rank, int d_i)
 {
     if(rank==-1) return;
     dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, 1);
     dim3 grid ((nzt+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, (nxt+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(dvelcy, cudaFuncCachePreferL1));
+    cudaFuncSetCacheConfig(dvelcy, cudaFuncCachePreferL1);
     CUCHK(cudaGetLastError());
     dvelcy<<<grid, block, 0, St>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, dcrjx, dcrjy, dcrjz, d_1, s_u1, s_v1, s_w1, s_j, e_j, d_i);
     CUCHK(cudaGetLastError());
     return;
 }
 
-void update_bound_y_H(_prec*  u1,   _prec*  v1, _prec*  w1, _prec*  f_u1,      _prec*  f_v1,      _prec*  f_w1,  _prec*  b_u1, _prec*  b_v1, 
-                      _prec*  b_w1, int nxt,   int nzt,   cudaStream_t St1, cudaStream_t St2, int rank_f,  int rank_b, int d_i)
+extern "C"
+void update_bound_y_H(float* u1,   float* v1, float* w1, float* f_u1,      float* f_v1,      float* f_w1,  float* b_u1, float* b_v1, 
+                      float* b_w1, int nxt,   int nzt,   cudaStream_t St1, cudaStream_t St2, int rank_f,  int rank_b, int d_i)
 {
      if(rank_f==-1 && rank_b==-1) return;
      dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, 1);
      dim3 grid ((nzt+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, (nxt+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-     CUCHK(cudaFuncSetCacheConfig(update_boundary_y, cudaFuncCachePreferL1));
+     cudaFuncSetCacheConfig(update_boundary_y, cudaFuncCachePreferL1);
      update_boundary_y<<<grid, block, 0, St1>>>(u1, v1, w1, f_u1, f_v1, f_w1, rank_f, Front, d_i);
      update_boundary_y<<<grid, block, 0, St2>>>(u1, v1, w1, b_u1, b_v1, b_w1, rank_b, Back, d_i);
      return;
 }
 
+extern "C"   // C-linkage host wrapper: configures and launches the dstrqc kernel on stream St
+void dstrqc_H(float* xx,       float* yy,     float* zz,    float* xy,    float* xz, float* yz,
+              float* r1,       float* r2,     float* r3,    float* r4,    float* r5, float* r6,
+              float* u1,       float* v1,     float* w1,    float* lam,   float* mu, float* qp,float* coeff, 
+              float* qs,       float* dcrjx,  float* dcrjy, float* dcrjz, int nyt,   int nzt, 
+              cudaStream_t St, float* lam_mu, 
+              _prec *vx1, _prec *vx2, int *ww, _prec *wwo,
+              int NX,       int NPC, int rankx,    int ranky, int  s_i,  
+              int e_i,         int s_j,       int e_j, int d_i)
+{
+    dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, 1);
+    dim3 grid ((nzt+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, (e_j-s_j+1+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);   // ceil-divide: x covers nzt, y covers e_j-s_j+1; nyt is not used
+    cudaFuncSetCacheConfig(dstrqc, cudaFuncCachePreferL1);   // NOTE(review): return value is not checked here
+    dstrqc<<<grid, block, 0, St>>>(xx,    yy,    zz,  xy,  xz, yz, r1, r2,    r3,    r4,    r5,     r6, 
+                                   u1,    v1,    w1,  lam, mu, qp,coeff, qs, dcrjx, dcrjy, dcrjz, lam_mu, 
+                                   vx1, vx2, ww, wwo, 
+                                   NX, NPC, rankx, ranky, nzt, s_i, e_i, s_j, e_j, d_i);   // NOTE(review): no cudaGetLastError() after launch, unlike dvelcx_H_opt
+    return;
+}
+
 template<int BLOCKX, int BLOCKY>
 __global__ void 
 __launch_bounds__(512,2)
-dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict__ zz,
-           _prec*  __restrict__ xy, _prec*  __restrict__ xz, _prec*  __restrict__ yz,
-       _prec*  __restrict__ r1, _prec*  __restrict__ r2,  _prec*  __restrict__ r3, 
-       _prec*  __restrict__ r4, _prec*  __restrict__ r5,  _prec*  __restrict__ r6,
-       _prec*  __restrict__ u1, 
-       _prec*  __restrict__ v1,    
-       _prec*  __restrict__ w1,    
-       _prec*  lam,   
-       _prec*  mu,     
-       _prec*  qp,
-       _prec*  coeff, 
-       _prec*  qs, 
-       _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz, _prec*  lam_mu, 
+dstrqc_new(float* __restrict__ xx, float* __restrict__ yy, float* __restrict__ zz,
+           float* __restrict__ xy, float* __restrict__ xz, float* __restrict__ yz,
+       float* __restrict__ r1, float* __restrict__ r2,  float* __restrict__ r3, 
+       float* __restrict__ r4, float* __restrict__ r5,  float* __restrict__ r6,
+       float* __restrict__ u1, 
+       float* __restrict__ v1,    
+       float* __restrict__ w1,    
+       float* lam,   
+       float* mu,     
+       float* qp,
+       float* coeff, 
+       float* qs, 
+       float* dcrjx, float* dcrjy, float* dcrjz, float* lam_mu, 
        //_prec *d_vx1, _prec *d_vx2, _prec *d_ww, _prec *d_wwo, //pengs version
        _prec *d_vx1, _prec *d_vx2, int *d_ww, _prec *d_wwo,
        int NX, int NPC, int rankx, int ranky, int nzt, int s_i, int e_i, int s_j, int e_j, int d_i) 
@@ -537,9 +586,7 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
 //                qpaw=qpaw/2.;
     }
     else {
-        //suggested by Kyle
-	qpaw  = 2.0f*f_wwo*qpa;
-        // qpaw  = f_wwo*qpa;
+      qpaw  = f_wwo*qpa;
     }
 //                 printf("qpaw %f\n",qpaw);
 //              printf("qpaw1 %g\n",qpaw);
@@ -560,9 +607,7 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
       //                  hw=hw/2.0f;
     }
     else {
-      //suggested by Kyle
-      hw  = 2.0f*f_wwo*h;
-      // hw  = f_wwo*h;
+      hw  = f_wwo*h;
     }
     hw=hw/f_wwo;
 
@@ -575,9 +620,7 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
       //                  h1w=h1w/2.0f;
     }
     else {
-        //suggested by Kyle
-	h1w  = 2.0f*f_wwo*h1;
-        // h1w  = f_wwo*h1;
+      h1w  = f_wwo*h1;
     }
     h1w=h1w/f_wwo;
 
@@ -590,9 +633,7 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
       //                  h2w=h2w/2.;
     }
     else {
-        //suggested by Kyle
-        //h2w  = f_wwo*h2;
-	h2w  = 2.0f*f_wwo*h2;
+      h2w  = f_wwo*h2;
     }
     h2w=h2w/f_wwo;
 
@@ -604,9 +645,7 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
       //                  h3w=h3w/2.0f;
     }
     else {
-      //suggested by Kyle
-      h3w  = 2.0f*f_wwo*h3;
-      //h3w  = f_wwo*h3;
+      h3w  = f_wwo*h3;
     }
     h3w=h3w/f_wwo;
 
@@ -734,12 +773,12 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
 
     tmp      = xl*(vs1+vs2+vs3);
     #ifdef ELA
-    if (k==41 && i==102 && j==102) printf("before update xx=%.20g\n", xx[pos]);
+    if (k==41 && i==102 && j==102) cuPrintf("before update xx=%.20g\n", xx[pos]);
     xx[pos]  = (xx[pos] + tmp - xm*(vs2+vs3))*f_dcrj;
     yy[pos]  = (yy[pos] + tmp - xm*(vs1+vs3))*f_dcrj;
     zz[pos]  = (zz[pos] + tmp - xm*(vs1+vs2))*f_dcrj;
     if (k==41 && i==102 && j==102)
-       printf("after update xx=%.30g, xm=%.30g, vs1=%.30g, vs2=%.30g, vs3=%.30g, f_drj=%.30g\n", 
+       cuPrintf("after update xx=%.30g, xm=%.30g, vs1=%.30g, vs2=%.30g, vs3=%.30g, f_drj=%.30g\n", 
 	     xx[pos], xm, vs1, vs2, vs3, f_dcrj);
     #else
     a1       = qpa*(vs1+vs2+vs3);
@@ -858,464 +897,128 @@ dstrqc_new(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restrict_
   return;
 }
 
-void dstrqc_H(float* xx,       float* yy,     float* zz,    float* xy,    float* xz, float* yz,
-              float* r1,       float* r2,     float* r3,    float* r4,    float* r5, float* r6,
-              float* u1,       float* v1,     float* w1,    float* lam,   float* mu, float* qp,float* coeff, 
-              float* qs,       float* dcrjx,  float* dcrjy, float* dcrjz, int nyt,   int nzt, 
-              cudaStream_t St, float* lam_mu, 
-              float *vx1, float *vx2, int *ww, float *wwo,
-              int NX,       int NPC, int rankx,    int ranky, int  s_i,  
-              int e_i,         int s_j,       int e_j, int d_i)
-{
-    dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, 1);
-    dim3 grid ((nzt+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, (e_j-s_j+1+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-    cudaFuncSetCacheConfig(dstrqc, cudaFuncCachePreferL1);
-    dstrqc<<<grid, block, 0, St>>>(xx,    yy,    zz,  xy,  xz, yz, r1, r2,    r3,    r4,    r5,     r6, 
-                                   u1,    v1,    w1,  lam, mu, qp,coeff, qs, dcrjx, dcrjy, dcrjz, lam_mu, 
-                                   vx1, vx2, ww, wwo, 
-                                   NX, NPC, rankx, ranky, nzt, s_i, e_i, s_j, e_j, d_i);
-    return;
-}
 
-__global__ void dstrqc(float* xx, float* yy,    float* zz,    float* xy,    float* xz,     float* yz,
-                       float* r1, float* r2,    float* r3,    float* r4,    float* r5,     float* r6,
-                       float* u1, float* v1,    float* w1,    float* lam,   float* mu,     float* qp,float* coeff, 
-                       float* qs, float* dcrjx, float* dcrjy, float* dcrjz, float* lam_mu, 
-                       float *d_vx1, float *d_vx2, int *d_ww, float *d_wwo,
-                       int NX, int NPC, int rankx, int ranky, int nzt, int s_i, int e_i, int s_j, int e_j, int d_i)
+
+extern "C"
+void dstrqc_H_new(float* xx,       float* yy,     float* zz,    float* xy,    float* xz, float* yz,
+                  float* r1,       float* r2,     float* r3,    float* r4,    float* r5, float* r6,
+                  float* u1,       float* v1,     float* w1,    float* lam,   float* mu, float* qp,float* coeff, 
+                  float* qs,       float* dcrjx,  float* dcrjy, float* dcrjz, int nyt,   int nzt, 
+                  cudaStream_t St, float* lam_mu, 
+                  //_prec *vx1, _prec *vx2, _prec *ww, _prec *wwo, //peng's version
+                  _prec *vx1, _prec *vx2, int *ww, _prec *wwo,
+                  int NX,          int NPC,       int rankx,    int ranky, int  s_i,  
+                  int e_i,         int s_j,       int e_j, int d_i)
 {
-    register int   i,  j,  k,  g_i;
-    register int   pos,     pos_ip1, pos_im2, pos_im1;
-    register int   pos_km2, pos_km1, pos_kp1, pos_kp2;
-    register int   pos_jm2, pos_jm1, pos_jp1, pos_jp2;
-    register int   pos_ik1, pos_jk1, pos_ijk, pos_ijk1,f_ww;
-    register float vs1, vs2, vs3, a1, tmp, vx1,f_wwo;
-    register float xl,  xm,  xmu1, xmu2, xmu3;
-    register float qpa, h,   h1,   h2,   h3;
-     register float qpaw,hw,h1w,h2w,h3w; 
-    register float f_vx1, f_vx2,  f_dcrj, f_r,  f_dcrjy, f_dcrjz;
-      register float f_rtmp;
-    register float f_u1, u1_ip1, u1_ip2, u1_im1;
-    register float f_v1, v1_im1, v1_ip1, v1_im2;
-    register float f_w1, w1_im1, w1_im2, w1_ip1;
-    int maxk, mink = align+3;
-    
-    k    = blockIdx.x*BLOCK_SIZE_Z+threadIdx.x+align;
-    j    = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+s_j;
+    //fprintf(stderr, "nzt=%d, e_j=%d, s_j=%d\n", nzt, e_j, s_j);
+    /*cudaPrintfInit();*/
+    if (0 == (nzt % 64) && 0 == (( e_j-s_j+1) % 8)) {
+      const int blockx = 64, blocky = 8;
+      dim3 block(blockx, blocky, 1);
+      dim3 grid ((nzt+block.x-1)/block.x, (e_j-s_j+1+block.y-1)/block.y,1);
+      CUCHK( cudaFuncSetCacheConfig(dstrqc_new<blockx,blocky>, cudaFuncCachePreferShared) );
+      dstrqc_new<blockx,blocky><<<grid, block, 0, St>>>(xx, yy, zz, xy,  xz, yz, r1, r2,    r3,    r4,    r5,     r6, 
+                                     u1, v1, w1, lam, mu, qp,coeff, qs, dcrjx, dcrjy, dcrjz, lam_mu, 
+                                     vx1, vx2, ww, wwo,
+                                     NX, NPC, rankx, ranky, nzt, s_i, e_i, s_j, e_j, d_i);
+    } else {
+      const int blockx = BLOCK_SIZE_Z, blocky = BLOCK_SIZE_Y;
+      dim3 block(blockx, blocky, 1);
+      dim3 grid ((nzt+block.x-1)/block.x, (e_j-s_j+1+block.y-1)/block.y,1);
+      CUCHK( cudaFuncSetCacheConfig(dstrqc_new<blockx,blocky>, cudaFuncCachePreferShared) );
+      dstrqc_new<blockx,blocky><<<grid, block, 0, St>>>(xx, yy, zz, xy,  xz, yz, r1, r2,    r3,    r4,    r5,     r6, 
+                                     u1, v1, w1, lam, mu, qp,coeff, qs, dcrjx, dcrjy, dcrjz, lam_mu, 
+                                     vx1, vx2, ww, wwo,
+                                     NX, NPC, rankx, ranky, nzt, s_i, e_i, s_j, e_j, d_i);
 
-    if (d_i == 0) {
-       maxk = nzt + align -1;
     }
-    else maxk = nzt + align -3;
+    cudaError_t cerr;
+    CUCHK(cerr=cudaGetLastError());
+    if(cerr!=cudaSuccess) printf("CUDA ERROR: dstrqc_H_new after kernel: %s\n",cudaGetErrorString(cerr));
+    /*cudaPrintfDisplay(stdout, 1);
+    cudaPrintfEnd();*/
+    return;
+}
 
-    if (k < mink || k > maxk || j > e_j) return;
- 
-    i    = e_i;
-    pos  = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
 
-    u1_ip1 = u1[pos+d_slice_2[d_i]];
-    f_u1   = u1[pos+d_slice_1[d_i]];
-    u1_im1 = u1[pos];    
-    f_v1   = v1[pos+d_slice_1[d_i]];
-    v1_im1 = v1[pos];
-    v1_im2 = v1[pos-d_slice_1[d_i]];
-    f_w1   = w1[pos+d_slice_1[d_i]];
-    w1_im1 = w1[pos];
-    w1_im2 = w1[pos-d_slice_1[d_i]];
-    f_dcrjz = dcrjz[k];
-    f_dcrjy = dcrjy[j];
-    for(i=e_i;i>=s_i;i--)
-    {
-        /*f_vx1    = tex1Dfetch(p_vx1, pos);
-        f_vx2    = tex1Dfetch(p_vx2, pos);
-        f_ww     = tex1Dfetch(p_ww, pos);
-        f_wwo     = tex1Dfetch(p_wwo, pos);*/
-        f_vx1 = d_vx1[pos];
-        f_vx2 = d_vx2[pos];
-        f_ww  = d_ww[pos];
-        f_wwo = d_wwo[pos];
-        /*
-        if(f_wwo!=f_wwo){
-          xx[pos] = yy[pos] = zz[pos] = xy[pos] = xz[pos] = yz[pos] = 1.0;
-          r1[pos] = r2[pos] = r3[pos] = r4[pos] = r5[pos] = r6[pos] = 1.0;
-          return;
-        }
-*/
-        f_dcrj   = dcrjx[i]*f_dcrjy*f_dcrjz;
+/* host-side launcher for the fstr kernel, which applies free-surface B.C. to stresses - (Daniel) */
+extern "C"
+void fstr_H(float* zz, float* xz, float* yz, cudaStream_t St, int s_i, int e_i, int s_j, int e_j)
+{
+    dim3 block (2, BLOCK_SIZE_Y, 1);
+    dim3 grid (1,(e_j-s_j+1+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);   // ceil((e_j-s_j+1)/BLOCK_SIZE_Y) blocks in y
+    cudaFuncSetCacheConfig(fstr, cudaFuncCachePreferL1);   // NOTE(review): return value is not checked here
+    fstr<<<grid, block, 0, St>>>(zz, xz, yz, s_i, e_i, s_j);   // launched on stream St; e_j only sizes the grid
+    return;
+}
 
-        pos_km2  = pos-2;
-        pos_km1  = pos-1;
-        pos_kp1  = pos+1;
-        pos_kp2  = pos+2;
-        pos_jm2  = pos-d_yline_2[d_i];
-        pos_jm1  = pos-d_yline_1[d_i];
-        pos_jp1  = pos+d_yline_1[d_i];
-        pos_jp2  = pos+d_yline_2[d_i];
-        pos_im2  = pos-d_slice_2[d_i];
-        pos_im1  = pos-d_slice_1[d_i];
-        pos_ip1  = pos+d_slice_1[d_i];
-        pos_jk1  = pos-d_yline_1[d_i]-1;
-        pos_ik1  = pos+d_slice_1[d_i]-1;
-        pos_ijk  = pos+d_slice_1[d_i]-d_yline_1[d_i];
-        pos_ijk1 = pos+d_slice_1[d_i]-d_yline_1[d_i]-1;
 
-        xl       = 8.0/(  lam[pos]      + lam[pos_ip1] + lam[pos_jm1] + lam[pos_ijk]
-                        + lam[pos_km1]  + lam[pos_ik1] + lam[pos_jk1] + lam[pos_ijk1] );
-        xm       = 16.0/( mu[pos]       + mu[pos_ip1]  + mu[pos_jm1]  + mu[pos_ijk]
-                        + mu[pos_km1]   + mu[pos_ik1]  + mu[pos_jk1]  + mu[pos_ijk1] );
-        xmu1     = 2.0/(  mu[pos]       + mu[pos_km1] );
-        xmu2     = 2.0/(  mu[pos]       + mu[pos_jm1] );
-        xmu3     = 2.0/(  mu[pos]       + mu[pos_ip1] );
-        xl       = xl  +  xm;
-        qpa      = 0.0625*( qp[pos]     + qp[pos_ip1] + qp[pos_jm1] + qp[pos_ijk]
-                          + qp[pos_km1] + qp[pos_ik1] + qp[pos_jk1] + qp[pos_ijk1] );
+__global__ void 
+__launch_bounds__(512,2)
+drprecpc_calc_opt(_prec *xx, _prec *yy, _prec *zz, 
+                  const float* __restrict__ xy, 
+                  const float* __restrict__ xz, 
+                  const float* __restrict__ yz, 
+                  _prec *mu, _prec *d1, 
+                  _prec *sigma2, 
+                  _prec *yldfac,_prec *cohes, _prec *phi,
+                  _prec *neta,
+                  int nzt, int s_i, int e_i, int s_j, int e_j,  int d_i) { 
+  register int i,j,k,pos;
+  register int pos_im1,pos_ip1,pos_jm1,pos_km1;
+  register int pos_ip1jm1;
+  register int pos_ip1km1,pos_jm1km1;
+  register _prec Sxx, Syy, Szz, Sxy, Sxz, Syz;
+  register _prec Sxxp, Syyp, Szzp, Sxyp, Sxzp, Syzp;
+  register _prec depxx, depyy, depzz, depxy, depxz, depyz;
+  register _prec SDxx, SDyy, SDzz;
+  register _prec iyldfac, Tv, sigma_m, taulim, taulim2, rphi;
+  register _prec xm, iixx, iiyy, iizz;
+  register _prec mu_, secinv, sqrtSecinv;
+  register int   jj,kk;
 
-//                        www=f_ww;
-        if(1./(qpa*2.0)<=200.0)
-        {
-//      printf("coeff[f_ww*2-2] %g\n",coeff[f_ww*2-2]);
-                  qpaw=coeff[f_ww*2-2]*(2.*qpa)*(2.*qpa)+coeff[f_ww*2-1]*(2.*qpa);
-//              qpaw=coeff[www*2-2]*(2.*qpa)*(2.*qpa)+coeff[www*2-1]*(2.*qpa);
-//                qpaw=qpaw/2.;
-                  }
-               else {
-                  qpaw  = 2.0f*f_wwo*qpa;  //Fix for Q(f) suggested by Kyle
-		  	}
-//                 printf("qpaw %f\n",qpaw);
-//              printf("qpaw1 %g\n",qpaw);
-        qpaw=qpaw/f_wwo;
-//      printf("qpaw2 %g\n",qpaw);
+  // Compute initial stress on GPU (Daniel)
+  register _prec ini[9], ini_ip1[9];
+  register _prec depth, pfluid;
+  register int srfpos;
 
+  k    = blockIdx.x*blockDim.x+threadIdx.x+align;
+  j    = blockIdx.y*blockDim.y+threadIdx.y+s_j;
+  
+  //if (k >= nzt+align || j > e_j) return;
+  if (k > nzt+align+1 || j > e_j) return;
 
+  i    = e_i;
+  pos  = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
 
-        h        = 0.0625*( qs[pos]     + qs[pos_ip1] + qs[pos_jm1] + qs[pos_ijk]
-                          + qs[pos_km1] + qs[pos_ik1] + qs[pos_jk1] + qs[pos_ijk1] );
+  kk   = k - align;
+  jj   = j - (2+ngsl);
 
-       if(1./(h*2.0)<=200.0)
-        {
-                  hw=coeff[f_ww*2-2]*(2.*h)*(2.*h)+coeff[f_ww*2-1]*(2.*h);
-                  //                  hw=hw/2.;
-                  }
-               else {
-                  hw  = 2.0f*f_wwo*h;  //Fix for Q(f) suggested by Kyle
-                }
-        hw=hw/f_wwo;
+  srfpos = d_nzt[d_i] + align - 1;
+  depth = (float) (srfpos - k) * d_DH[d_i];
 
+  if (depth > 0) pfluid = (depth + d_DH[d_i]*0.5) * 9.81e3;
+  else pfluid = d_DH[d_i] / 2. * 9.81e3;
+ 
+  //cuPrintf("k=%d, depth=%f, pfluid=%e\n", k, depth, pfluid);
 
-        h1       = 0.250*(  qs[pos]     + qs[pos_km1] );
-
-        if(1./(h1*2.0)<=200.0)
-        {
-                  h1w=coeff[f_ww*2-2]*(2.*h1)*(2.*h1)+coeff[f_ww*2-1]*(2.*h1);
-                  //                  h1w=h1w/2.;
-                  }
-                         else {
-                  h1w  = 2.0f*f_wwo*h1; //Fix for Q(f) suggested by Kyle
-                }
-        h1w=h1w/f_wwo;
-
-
-
-        h2       = 0.250*(  qs[pos]     + qs[pos_jm1] );
-        if(1./(h2*2.0)<=200.0)
-        {
-                  h2w=coeff[f_ww*2-2]*(2.*h2)*(2.*h2)+coeff[f_ww*2-1]*(2.*h2);
-                  //                  h2w=h2w/2.;
-                  }
-                         else {
-                  h2w  = 2.0f*f_wwo*h2; //Fix for Q(f) suggested by Kyle
-                }
-        h2w=h2w/f_wwo;
-
-
-        h3       = 0.250*(  qs[pos]     + qs[pos_ip1] );
-        if(1./(h3*2.0)<=200.0)
-        {
-                  h3w=coeff[f_ww*2-2]*(2.*h3)*(2.*h3)+coeff[f_ww*2-1]*(2.*h3);
-                  //                  h3w=h3w/2.;
-                  }
-                         else {
-                  h3w  = 2.0f*f_wwo*h3; //Fix for Q(f) suggested by Kyle
-                }
-        h3w=h3w/f_wwo;
-
-	h        = -xm*hw*d_dh1[d_i];
-        h1       = -xmu1*h1w*d_dh1[d_i];
-        h2       = -xmu2*h2w*d_dh1[d_i];
-        h3       = -xmu3*h3w*d_dh1[d_i];
-
-
-        //        h1       = -xmu1*hw1*d_dh1[d_i];
-        //h2       = -xmu2*hw2*d_dh1[d_i];
-        //h3       = -xmu3*hw3*d_dh1[d_i];
-
-
-        qpa      = -qpaw*xl*d_dh1[d_i];
-        //        qpa      = -qpaw*xl*d_dh1[d_i];
-
-        xm       = xm*d_dth[d_i];
-        xmu1     = xmu1*d_dth[d_i];
-        xmu2     = xmu2*d_dth[d_i];
-        xmu3     = xmu3*d_dth[d_i];
-        xl       = xl*d_dth[d_i];
-      //  f_vx2    = f_vx2*f_vx1;
-        h        = h*f_vx1;
-        h1       = h1*f_vx1;
-        h2       = h2*f_vx1;
-        h3       = h3*f_vx1;
-        qpa      = qpa*f_vx1;
-
-        xm       = xm+d_DT*h;
-        xmu1     = xmu1+d_DT*h1;
-        xmu2     = xmu2+d_DT*h2;
-        xmu3     = xmu3+d_DT*h3;
-        vx1      = d_DT*(1+f_vx2*f_vx1);
-        
-        u1_ip2   = u1_ip1;
-        u1_ip1   = f_u1;
-        f_u1     = u1_im1;
-        u1_im1   = u1[pos_im1];
-        v1_ip1   = f_v1;
-        f_v1     = v1_im1;
-        v1_im1   = v1_im2;
-        v1_im2   = v1[pos_im2];
-        w1_ip1   = f_w1;
-        f_w1     = w1_im1;
-        w1_im1   = w1_im2;
-        w1_im2   = w1[pos_im2];
-
-        if (d_i == 0){ /*Apply FS condition on uppermost grid only*/
-	  if(k == d_nzt[d_i]+align-1) {
-	      u1[pos_kp1] = f_u1 - (f_w1 - w1_im1);
-	      v1[pos_kp1] = f_v1 - (w1[pos_jp1] - f_w1);
-
-	      g_i  = d_nxt[d_i]*rankx + i - ngsl - 1;
-
-	      if(g_i<NX)
-		      vs1 = u1_ip1 - (w1_ip1 - f_w1);
-	      else
-		      vs1 = 0.0;
-
-	      g_i  = d_nyt[d_i]*ranky + j - ngsl - 1;
-	      if(g_i>1 || NPC == 2) //periodic BCs
-		      vs2 = v1[pos_jm1] - (f_w1 - w1[pos_jm1]);
-	      else
-		      vs2 = 0.0;
-
-	      w1[pos_kp1] = w1[pos_km1] - lam_mu[i*(d_nyt[d_i]+4+ngsl2) + j]*((vs1 - u1[pos_kp1]) + (u1_ip1 - f_u1)
-                           + (v1[pos_kp1] - vs2) + (f_v1   - v1[pos_jm1]) );
-	  }
-	  else if(k == d_nzt[d_i]+align-2) {
-		  u1[pos_kp2] = u1[pos_kp1] - (w1[pos_kp1]   - w1[pos_im1+1]);
-		  v1[pos_kp2] = v1[pos_kp1] - (w1[pos_jp1+1] - w1[pos_kp1]);
-	  }
-        }
- 
-    	vs1      = d_c1*(u1_ip1 - f_u1)        + d_c2*(u1_ip2      - u1_im1);
-        vs2      = d_c1*(f_v1   - v1[pos_jm1]) + d_c2*(v1[pos_jp1] - v1[pos_jm2]);
-        vs3      = d_c1*(f_w1   - w1[pos_km1]) + d_c2*(w1[pos_kp1] - w1[pos_km2]);
- 
-        tmp      = xl*(vs1+vs2+vs3);
-        a1       = qpa*(vs1+vs2+vs3);
-        tmp      = tmp+d_DT*a1;
-
-        f_r      = r1[pos];
-	 f_rtmp   = -h*(vs2+vs3) + a1; 
-	 xx[pos]  = xx[pos]  + tmp - xm*(vs2+vs3) + vx1*f_r;  
-	 r1[pos]  = f_vx2*f_r + f_wwo*f_rtmp;
-	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
-	  xx[pos]  = (xx[pos] + d_DT*f_rtmp)*f_dcrj;
-
-        f_r      = r2[pos];
-	 f_rtmp   = -h*(vs1+vs3) + a1;  
-        yy[pos]  = (yy[pos]  + tmp - xm*(vs1+vs3) + vx1*f_r)*f_dcrj;
-
-	 r2[pos]  = f_vx2*f_r + f_wwo*f_rtmp; 
-	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
-	  yy[pos]  = (yy[pos] + d_DT*f_rtmp)*f_dcrj;
-	
-        f_r      = r3[pos];
-	f_rtmp   = -h*(vs1+vs2) + a1;
-        zz[pos]  = (zz[pos]  + tmp - xm*(vs1+vs2) + vx1*f_r)*f_dcrj;
-	 r3[pos]  = f_vx2*f_r + f_wwo*f_rtmp;
-	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1);  
-	 zz[pos]  = (zz[pos] + d_DT*f_rtmp)*f_dcrj;
-
-        vs1      = d_c1*(u1[pos_jp1] - f_u1)   + d_c2*(u1[pos_jp2] - u1[pos_jm1]);
-        vs2      = d_c1*(f_v1        - v1_im1) + d_c2*(v1_ip1      - v1_im2);
-        f_r      = r4[pos];
- 	f_rtmp   = h1*(vs1+vs2); 
-	 xy[pos]  = xy[pos]  + xmu1*(vs1+vs2) + vx1*f_r;
-	 r4[pos]  = f_vx2*f_r + f_wwo*f_rtmp; 
-	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1);
-	 xy[pos]  = (xy[pos] + d_DT*f_rtmp)*f_dcrj;
- 
-        //moved to separate subroutine fstr, to be executed after plasticity (Daniel)
-        /*if(k == d_nzt+align-1)
-        {
-                zz[pos+1] = -zz[pos];
-        	xz[pos]   = 0.0;
-                yz[pos]   = 0.0;
-        }
-        else
-        {*/
-        	vs1     = d_c1*(u1[pos_kp1] - f_u1)   + d_c2*(u1[pos_kp2] - u1[pos_km1]);
-        	vs2     = d_c1*(f_w1        - w1_im1) + d_c2*(w1_ip1      - w1_im2);
-        	f_r     = r5[pos];
-		 f_rtmp  = h2*(vs1+vs2);
-		  xz[pos] = xz[pos]  + xmu2*(vs1+vs2) + vx1*f_r; 
-		   r5[pos] = f_vx2*f_r + f_wwo*f_rtmp; 
-		   f_rtmp  = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
-		   xz[pos] = (xz[pos] + d_DT*f_rtmp)*f_dcrj;
-	 
-
-        	vs1     = d_c1*(v1[pos_kp1] - f_v1) + d_c2*(v1[pos_kp2] - v1[pos_km1]);
-        	vs2     = d_c1*(w1[pos_jp1] - f_w1) + d_c2*(w1[pos_jp2] - w1[pos_jm1]);
-        	f_r     = r6[pos];
-		f_rtmp  = h3*(vs1+vs2);
-		yz[pos] = yz[pos]  + xmu3*(vs1+vs2) + vx1*f_r;
-		 r6[pos] = f_vx2*f_r + f_wwo*f_rtmp;
-		  f_rtmp  = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
-		  yz[pos] = (yz[pos] + d_DT*f_rtmp)*f_dcrj; 
-
-                // also moved to fstr (Daniel)
-                /*if(k == d_nzt+align-2)
-                {
-                    zz[pos+3] = -zz[pos];
-                    xz[pos+2] = -xz[pos];
-                    yz[pos+2] = -yz[pos];                                               
-		}
-		else if(k == d_nzt+align-3)
-		{
-                    xz[pos+4] = -xz[pos];
-                    yz[pos+4] = -yz[pos];
-		}*/
- 	/*}*/
-        pos     = pos_im1;
-    }
-    return;
-}
-
-
-
-void dstrqc_H_new(_prec*  xx,       _prec*  yy,     _prec*  zz,    _prec*  xy,    _prec*  xz, _prec*  yz,
-                  _prec*  r1,       _prec*  r2,     _prec*  r3,    _prec*  r4,    _prec*  r5, _prec*  r6,
-                  _prec*  u1,       _prec*  v1,     _prec*  w1,    _prec*  lam,   _prec*  mu, _prec*  qp,_prec*  coeff, 
-                  _prec*  qs,       _prec*  dcrjx,  _prec*  dcrjy, _prec*  dcrjz, int nyt,   int nzt, 
-                  cudaStream_t St, _prec*  lam_mu, 
-                  //_prec *vx1, _prec *vx2, _prec *ww, _prec *wwo, //peng's version
-                  _prec *vx1, _prec *vx2, int *ww, _prec *wwo,
-                  int NX,          int NPC,       int rankx,    int ranky, int  s_i,  
-                  int e_i,         int s_j,       int e_j, int d_i)
-{
-    //fprintf(stderr, "nzt=%d, e_j=%d, s_j=%d\n", nzt, e_j, s_j);
-    //cudaPrintfInit();
-    if (0 == (nzt % 64) && 0 == (( e_j-s_j+1) % 8)) {
-      const int blockx = 64, blocky = 8;
-      dim3 block(blockx, blocky, 1);
-      dim3 grid ((nzt+block.x-1)/block.x, (e_j-s_j+1+block.y-1)/block.y,1);
-       CUCHK(cudaFuncSetCacheConfig(dstrqc_new<blockx,blocky>, cudaFuncCachePreferShared)) ;
-      dstrqc_new<blockx,blocky><<<grid, block, 0, St>>>(xx, yy, zz, xy,  xz, yz, r1, r2,    r3,    r4,    r5,     r6, 
-                                     u1, v1, w1, lam, mu, qp,coeff, qs, dcrjx, dcrjy, dcrjz, lam_mu, 
-                                     vx1, vx2, ww, wwo,
-                                     NX, NPC, rankx, ranky, nzt, s_i, e_i, s_j, e_j, d_i);
-    } else {
-      const int blockx = BLOCK_SIZE_Z, blocky = BLOCK_SIZE_Y;
-      dim3 block(blockx, blocky, 1);
-      dim3 grid ((nzt+block.x-1)/block.x, (e_j-s_j+1+block.y-1)/block.y,1);
-       CUCHK(cudaFuncSetCacheConfig(dstrqc_new<blockx,blocky>, cudaFuncCachePreferShared)) ;
-      dstrqc_new<blockx,blocky><<<grid, block, 0, St>>>(xx, yy, zz, xy,  xz, yz, r1, r2,    r3,    r4,    r5,     r6, 
-                                     u1, v1, w1, lam, mu, qp,coeff, qs, dcrjx, dcrjy, dcrjz, lam_mu, 
-                                     vx1, vx2, ww, wwo,
-                                     NX, NPC, rankx, ranky, nzt, s_i, e_i, s_j, e_j, d_i);
-
-    }
-    cudaError_t cerr;
-    cerr=cudaGetLastError();
-    if(cerr!=cudaSuccess) printf("CUDA ERROR: dstrqc_H_new after kernel: %s\n",cudaGetErrorString(cerr));
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
-    return;
-}
-
-
-/* kernel function to apply free-surface B.C. to stresses - (Daniel) */
-void fstr_H(_prec*  zz, _prec*  xz, _prec*  yz, cudaStream_t St, int s_i, int e_i, int s_j, int e_j)
-{
-    dim3 block (2, BLOCK_SIZE_Y, 1);
-    dim3 grid (1,(e_j-s_j+1+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(fstr, cudaFuncCachePreferL1));
-    fstr<<<grid, block, 0, St>>>(zz, xz, yz, s_i, e_i, s_j);
-    return;
-}
-
-
-__global__ void 
-__launch_bounds__(512,2)
-drprecpc_calc_opt(_prec *xx, _prec *yy, _prec *zz, 
-                  const _prec*  __restrict__ xy, 
-                  const _prec*  __restrict__ xz, 
-                  const _prec*  __restrict__ yz, 
-                  _prec *mu, _prec *d1, 
-                  _prec *sigma2, 
-                  _prec *yldfac,_prec *cohes, _prec *phi,
-                  _prec *neta,
-                  int nzt, int s_i, int e_i, int s_j, int e_j,  int d_i) { 
-  register int i,j,k,pos;
-  register int pos_im1,pos_ip1,pos_jm1,pos_km1;
-  register int pos_ip1jm1;
-  register int pos_ip1km1,pos_jm1km1;
-  register _prec Sxx, Syy, Szz, Sxy, Sxz, Syz;
-  register _prec Sxxp, Syyp, Szzp, Sxyp, Sxzp, Syzp;
-  register _prec depxx, depyy, depzz, depxy, depxz, depyz;
-  register _prec SDxx, SDyy, SDzz;
-  register _prec iyldfac, Tv, sigma_m, taulim, taulim2, rphi;
-  register _prec xm, iixx, iiyy, iizz;
-  register _prec mu_, secinv, sqrtSecinv;
-  register int   jj,kk;
-
-  // Compute initial stress on GPU (Daniel)
-  register _prec ini[9], ini_ip1[9];
-  register _prec depth, pfluid;
-  register int srfpos;
-
-  k    = blockIdx.x*blockDim.x+threadIdx.x+align;
-  j    = blockIdx.y*blockDim.y+threadIdx.y+s_j;
-  
-  //if (k >= nzt+align || j > e_j) return;
-  if (k > nzt+align+1 || j > e_j) return;
-
-  i    = e_i;
-  pos  = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
-
-  kk   = k - align;
-  jj   = j - (2+ngsl);
-
-  srfpos = d_nzt[d_i] + align - 1;
-  depth = (_prec) (srfpos - k) * d_DH[d_i];
-
-  if (depth > 0) pfluid = (depth + d_DH[d_i]*0.5) * 9.81e3;
-  else pfluid = d_DH[d_i] / 2. * 9.81e3;
- 
-  //cuPrintf("k=%d, depth=%f, pfluid=%e\n", k, depth, pfluid);
-
-  _prec sigma2_ip1, sigma2_i;
-  _prec xy_ip1, xy_i, xz_ip1, xz_i, yz_ip1, yz_i;
-  _prec mu_ip1, mu_i;
-  _prec xz_km1, xz_ip1km1, xy_jm1, xy_ip1jm1;
-  sigma2_i = sigma2[pos + d_slice_1[d_i]];
-  xy_i    = xy   [pos + d_slice_1[d_i]];
-  xz_i    = xz   [pos + d_slice_1[d_i]];
-  mu_i    = mu   [pos + d_slice_1[d_i]];
-  xz_km1  = xz   [pos + d_slice_1[d_i] - 1];
-  xy_jm1  = xy   [pos + d_slice_1[d_i] - d_yline_1[d_i]];
-  for(i=e_i;i>=s_i;--i){
-    sigma2_ip1 = sigma2_i;
-    xy_ip1    = xy_i;
-    xz_ip1    = xz_i;
-    mu_ip1    = mu_i;
-    xz_ip1km1 = xz_km1;
-    xy_ip1jm1 = xy_jm1;
+  _prec sigma2_ip1, sigma2_i;
+  _prec xy_ip1, xy_i, xz_ip1, xz_i, yz_ip1, yz_i;
+  _prec mu_ip1, mu_i;
+  _prec xz_km1, xz_ip1km1, xy_jm1, xy_ip1jm1;
+  sigma2_i = sigma2[pos + d_slice_1[d_i]];
+  xy_i    = xy   [pos + d_slice_1[d_i]];
+  xz_i    = xz   [pos + d_slice_1[d_i]];
+  mu_i    = mu   [pos + d_slice_1[d_i]];
+  xz_km1  = xz   [pos + d_slice_1[d_i] - 1];
+  xy_jm1  = xy   [pos + d_slice_1[d_i] - d_yline_1[d_i]];
+  for(i=e_i;i>=s_i;--i){
+    sigma2_ip1 = sigma2_i;
+    xy_ip1    = xy_i;
+    xz_ip1    = xz_i;
+    mu_ip1    = mu_i;
+    xz_ip1km1 = xz_km1;
+    xy_ip1jm1 = xy_jm1;
 
     pos_im1 = pos - d_slice_1[d_i];
     pos_ip1 = pos + d_slice_1[d_i];
@@ -1422,6 +1125,7 @@ drprecpc_calc_opt(_prec *xx, _prec *yy, _prec *zz,
 }
 
 // drprecpc is for plasticity computation for cerjan and wave propagation
+extern "C"
 void drprecpc_calc_H_opt(_prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
         _prec *mu, _prec *d1, _prec *sigma2,
         _prec *yldfac,_prec *cohes, _prec *phi,
@@ -1431,7 +1135,7 @@ void drprecpc_calc_H_opt(_prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz,
 
     dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, 1);
     dim3 grid ((nzt+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, ((yre-yls+1)+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(drprecpc_calc_opt, cudaFuncCachePreferL1));
+    cudaFuncSetCacheConfig(drprecpc_calc_opt, cudaFuncCachePreferL1);
 
     //split into tho routines, one for the normal, one for shear stress components (Daniel)
     drprecpc_calc_opt<<<grid, block, 0, St>>>(xx,yy,zz,xy,xz,yz,mu,d1,
@@ -1442,6 +1146,7 @@ void drprecpc_calc_H_opt(_prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz,
 return;
 }
 
+extern "C"
 void drprecpc_app_H(_prec *xx, _prec *yy, _prec *zz, 
         _prec *xy, _prec *xz, _prec *yz,
         _prec *mu, _prec *sigma2, _prec *yldfac, 
@@ -1453,7 +1158,7 @@ void drprecpc_app_H(_prec *xx, _prec *yy, _prec *zz,
 
     cerr=cudaGetLastError();
     if(cerr!=cudaSuccess) printf("CUDA ERROR: drprecpc_app before kernel: %s\n",cudaGetErrorString(cerr));
-    CUCHK(cudaFuncSetCacheConfig(drprecpc_app, cudaFuncCachePreferL1));
+    cudaFuncSetCacheConfig(drprecpc_app, cudaFuncCachePreferL1);
     drprecpc_app<<<grid, block, 0, St>>>(xx,yy,zz,xy,xz,yz,mu,
         sigma2,yldfac,xls,xre,yls,d_i);
     cerr=cudaGetLastError();
@@ -1462,188 +1167,509 @@ void drprecpc_app_H(_prec *xx, _prec *yy, _prec *zz,
 return;
 }
 
-void addsrc_H(int i,      int READ_STEP, int dim,    int* psrc,  int npsrc,  cudaStream_t St,
-              _prec*  axx, _prec*  ayy,    _prec*  azz, _prec*  axz, _prec*  ayz, _prec*  axy,
-              _prec*  xx,  _prec*  yy,     _prec*  zz,  _prec*  xy,  _prec*  yz,  _prec*  xz, int d_i)
-{
-    dim3 grid, block;
-    if(npsrc < 256)
-    {
-       block.x = npsrc;
-       grid.x = 1;
-    }
-    else
-    {
-       block.x = 256;
-       grid.x  = int((npsrc+255)/256);
+/* Host launcher for addsrc_cu on stream St: one thread per source (npsrc in total), at most 256 threads per block. */
+/* i, READ_STEP, dim, psrc, the a?? arrays, the stress arrays and d_i are forwarded to the kernel unchanged. */
+/* CUDA errors pending before the launch, and errors reported right after it, are only printed via printf. */
+extern "C"
+void addsrc_H(int i,      int READ_STEP, int dim,    int* psrc,  int npsrc,  cudaStream_t St,
+              float* axx, float* ayy,    float* azz, float* axz, float* ayz, float* axy,
+              float* xx,  float* yy,     float* zz,  float* xy,  float* yz,  float* xz, int d_i)
+{
+    dim3 grid, block;
+    cudaError_t cerr;
+
+    cerr=cudaGetLastError();
+    if(cerr!=cudaSuccess) printf("CUDA ERROR: addsrc before kernel: %s\n",cudaGetErrorString(cerr));
+
+    /* Nothing to add: block.x = 0 (or a negative count) would be rejected as an invalid launch configuration. */
+    if(npsrc <= 0) return;
+
+    /* Same split as before: one block of npsrc threads below 256, otherwise ceil(npsrc/256) blocks of 256. */
+    block.x = (npsrc < 256) ? npsrc : 256;
+    grid.x  = (npsrc+255)/256;
+
+    addsrc_cu<<<grid, block, 0, St>>>(i,  READ_STEP, dim, psrc, npsrc, axx, ayy, azz, axz, ayz, axy,
+                                      xx, yy,        zz,  xy,   yz,  xz, d_i);
+
+    cerr=cudaGetLastError();
+    if(cerr!=cudaSuccess) printf("CUDA ERROR: addsrc after kernel: %s\n",cudaGetErrorString(cerr));
+    return;
+}
+
+__global__ void dvelcy(float* u1,    float* v1,    float* w1,    float* xx,  float* yy,   float* zz,   float* xy, float* xz, float* yz,
+                       float* dcrjx, float* dcrjy, float* dcrjz, float* d_1, float* s_u1, float* s_v1, float* s_w1, int s_j, int e_j,
+                       int d_i)
+{
+    register int   i, j, k, pos,     j2,      pos2, pos_jm1, pos_jm2;
+    register int   pos_km2, pos_km1, pos_kp1, pos_kp2;
+    register int   pos_im2, pos_im1, pos_ip1, pos_ip2;
+    register int   pos_jk1, pos_ik1, pos_ijk;
+    register _prec f_xy,    xy_jp1,  xy_jm1,  xy_jm2;
+    register _prec f_yy,    yy_jp2,  yy_jp1,  yy_jm1;
+    register _prec f_yz,    yz_jp1,  yz_jm1,  yz_jm2;
+    register _prec f_d1,    f_d2,    f_d3,    f_dcrj, f_dcrjx, f_dcrjz, f_xz;
+    /* Writes updated velocities for rows j = e_j down to s_j into the buffers s_u1/s_v1/s_w1 (u1/v1/w1 are only read). */
+    k     = blockIdx.x*BLOCK_SIZE_Z+threadIdx.x+align;
+    i     = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+2+ngsl;
+    /* k must be assigned before this test (it used to be read uninitialized); on grids d_i > 0 threads above nzt+align-3 exit */
+    if (k > d_nzt[d_i]+align-3 && d_i > 0) return;
+    j     = e_j;
+    j2    = ngsl-1;
+    pos   = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
+    pos2  = i*ngsl*d_yline_1[d_i]+j2*d_yline_1[d_i]+k; 
+    /* Preload the rolling window along j one row ahead, so the first shift inside the loop centres it on pos. */
+    f_xy    = xy[pos+d_yline_1[d_i]];
+    xy_jm1  = xy[pos];
+    xy_jm2  = xy[pos-d_yline_1[d_i]];
+    yy_jp1  = yy[pos+d_yline_2[d_i]];
+    f_yy    = yy[pos+d_yline_1[d_i]];
+    yy_jm1  = yy[pos];
+    f_yz    = yz[pos+d_yline_1[d_i]];
+    yz_jm1  = yz[pos];
+    yz_jm2  = yz[pos-d_yline_1[d_i]];
+    f_dcrjz = dcrjz[k];
+    f_dcrjx = dcrjx[i];
+    for(j=e_j; j>=s_j; j--)
+    {
+        pos_km2  = pos-2;
+        pos_km1  = pos-1;
+        pos_kp1  = pos+1;
+        pos_kp2  = pos+2;
+        pos_jm2  = pos-d_yline_2[d_i];
+        pos_jm1  = pos-d_yline_1[d_i];
+        pos_im1  = pos-d_slice_1[d_i];
+        pos_im2  = pos-d_slice_2[d_i];
+        pos_ip1  = pos+d_slice_1[d_i];
+        pos_ip2  = pos+d_slice_2[d_i];
+        pos_jk1  = pos-d_yline_1[d_i]-1;
+        pos_ik1  = pos+d_slice_1[d_i]-1;
+        pos_ijk  = pos+d_slice_1[d_i]-d_yline_1[d_i];
+        /* Shift the window one row toward lower j; only the trailing entry of each stress is fetched from memory. */
+        xy_jp1   = f_xy;
+        f_xy     = xy_jm1;
+        xy_jm1   = xy_jm2;
+        xy_jm2   = xy[pos_jm2];
+        yy_jp2   = yy_jp1;
+        yy_jp1   = f_yy;
+        f_yy     = yy_jm1;
+        yy_jm1   = yy[pos_jm1];
+        yz_jp1   = f_yz;
+        f_yz     = yz_jm1;
+        yz_jm1   = yz_jm2;
+        yz_jm2   = yz[pos_jm2];
+        f_xz     = xz[pos];
+        /* f_dcrj is the product of the x, y and z entries of dcrjx/dcrjy/dcrjz; f_d1..f_d3 are 4-point averages of d_1. */
+        f_dcrj   = f_dcrjx*dcrjy[j]*f_dcrjz;
+        f_d1     = 0.25*(d_1[pos] + d_1[pos_jm1] + d_1[pos_km1] + d_1[pos_jk1]);
+        f_d2     = 0.25*(d_1[pos] + d_1[pos_ip1] + d_1[pos_km1] + d_1[pos_ik1]);
+        f_d3     = 0.25*(d_1[pos] + d_1[pos_ip1] + d_1[pos_jm1] + d_1[pos_ijk]);
+
+        f_d1     = d_dth[d_i]/f_d1;
+        f_d2     = d_dth[d_i]/f_d2;
+        f_d3     = d_dth[d_i]/f_d3;
+        /* Each component: old velocity plus f_d* times the d_c1/d_c2-weighted stress differences, scaled by f_dcrj. */
+        s_u1[pos2] = (u1[pos] + f_d1*( d_c1*(xx[pos]     - xx[pos_im1]) + d_c2*(xx[pos_ip1] - xx[pos_im2])
+                                     + d_c1*(f_xy        - xy_jm1)      + d_c2*(xy_jp1      - xy_jm2)
+                                     + d_c1*(f_xz        - xz[pos_km1]) + d_c2*(xz[pos_kp1] - xz[pos_km2]) ))*f_dcrj;
+        s_v1[pos2] = (v1[pos] + f_d2*( d_c1*(xy[pos_ip1] - f_xy)        + d_c2*(xy[pos_ip2] - xy[pos_im1])
+                                     + d_c1*(yy_jp1      - f_yy)        + d_c2*(yy_jp2      - yy_jm1)
+                                     + d_c1*(f_yz        - yz[pos_km1]) + d_c2*(yz[pos_kp1] - yz[pos_km2]) ))*f_dcrj;
+        s_w1[pos2] = (w1[pos] + f_d3*( d_c1*(xz[pos_ip1] - f_xz)        + d_c2*(xz[pos_ip2] - xz[pos_im1])
+                                     + d_c1*(f_yz        - yz_jm1)      + d_c2*(yz_jp1      - yz_jm2)
+                                     + d_c1*(zz[pos_kp1] - zz[pos])     + d_c2*(zz[pos_kp2] - zz[pos_km1]) ))*f_dcrj;
+        /* Step both the field index and the buffer index one row toward lower j. */
+        pos        = pos_jm1;
+        pos2       = pos2 - d_yline_1[d_i];
+    }
+    return;
+}
+
+__global__ void update_boundary_y(float* u1, float* v1, float* w1, float* s_u1, float* s_v1, float* s_w1, int rank, int flag, int d_i)
+{
+    /* Copies rows of the buffers s_u1/s_v1/s_w1 into y-rows of u1/v1/w1; nothing is copied when rank == -1. */
+    /* Front fills j = 2 .. 1+ngsl, Back fills j = d_nyt+ngsl+2 .. d_nyt+ngsl2+1; both read the buffer from row 0. */
+    const int k = blockIdx.x*BLOCK_SIZE_Z+threadIdx.x+align;
+    const int i = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+2+ngsl;
+    int j, dst, src;
+    if(rank == -1) return;
+
+    if(flag==Front){
+        j   = 2;
+        dst = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
+        src = i*ngsl*d_yline_1[d_i]+k;
+        for(j=2;j<2+ngsl;j++){
+            u1[dst] = s_u1[src];
+            v1[dst] = s_v1[src];
+            w1[dst] = s_w1[src];
+            dst += d_yline_1[d_i]; src += d_yline_1[d_i];
+        }
+    }
+
+    if(flag==Back){
+        j   = d_nyt[d_i]+ngsl+2;
+        dst = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
+        src = i*ngsl*d_yline_1[d_i]+k;
+        for(j=d_nyt[d_i]+ngsl+2;j<d_nyt[d_i]+ngsl2+2;j++){
+            u1[dst] = s_u1[src];
+            v1[dst] = s_v1[src];
+            w1[dst] = s_w1[src];
+            dst += d_yline_1[d_i]; src += d_yline_1[d_i];
+        }
+    }
+}
+
+/* Kernel applying the free-surface B.C. to the stresses zz/xz/yz: one j row per thread, sweeping i from e_i down to s_i. */
+__global__ void fstr (float* zz, float* xz, float* yz, int s_i, int e_i, int s_j)
+{
+    register int i, j, k;
+    register int pos, pos_im1; 
+
+    k    = d_nzt[0]+align-1;   // fixed k plane; grid index 0 is hard-coded for d_nzt/d_slice_1/d_yline_1
+    j    = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+s_j;   // NOTE(review): no upper bound on j is checked here - confirm against the launch grid
+    i    = e_i;
+    pos  = i*d_slice_1[0]+j*d_yline_1[0]+k;   // threadIdx.x is not used, so threads differing only in x repeat the same writes
+
+    for(i=e_i;i>=s_i;i--)
+    {
+        pos_im1  = pos-d_slice_1[0];
+
+        // antisymmetric mirror above pos: zz about the midpoint of pos and pos+1, xz/yz about pos itself
+        zz[pos+1] = -zz[pos];
+        zz[pos+2] = -zz[pos-1];
+
+        xz[pos+1] = -xz[pos-1];
+        xz[pos+2] = -xz[pos-2];
+
+        yz[pos+1] = -yz[pos-1];                                               
+        yz[pos+2] = -yz[pos-2];
+
+        // horizontal shear stresses are set to zero at pos (after the mirror above has read pos-1 and pos-2)
+        xz[pos]   = 0.0;
+        yz[pos]   = 0.0;
+
+        pos     = pos_im1;   // step one slice toward lower i
     }
-    cudaError_t cerr;
-    cerr=cudaGetLastError();
-    if(cerr!=cudaSuccess) printf("CUDA ERROR: addsrc before kernel: %s\n",cudaGetErrorString(cerr));
-    //cudaPrintfInit();
-    addsrc_cu<<<grid, block, 0, St>>>(i,  READ_STEP, dim, psrc, npsrc, axx, ayy, azz, axz, ayz, axy,
-                                      xx, yy,        zz,  xy,   yz,  xz, d_i);
-    cerr=cudaGetLastError();
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
-    if(cerr!=cudaSuccess) printf("CUDA ERROR: addsrc after kernel: %s\n",cudaGetErrorString(cerr));
-    return;
+
 }
 
-__global__ void dvelcy(_prec*  u1,    _prec*  v1,    _prec*  w1,    _prec*  xx,  _prec*  yy,   _prec*  zz,   _prec*  xy, _prec*  xz, _prec*  yz,
-                       _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz, _prec*  d_1, _prec*  s_u1, _prec*  s_v1, _prec*  s_w1, int s_j, int e_j,
-                       int d_i)
+/* Old dstrqc routine */
+__global__ void dstrqc(float* xx, float* yy,    float* zz,    float* xy,    float* xz,     float* yz,
+                       float* r1, float* r2,    float* r3,    float* r4,    float* r5,     float* r6,
+                       float* u1, float* v1,    float* w1,    float* lam,   float* mu,     float* qp,float* coeff, 
+                       float* qs, float* dcrjx, float* dcrjy, float* dcrjz, float* lam_mu, 
+                       _prec *d_vx1, _prec *d_vx2, int *d_ww, _prec *d_wwo,
+                       int NX, int NPC, int rankx, int ranky, int nzt, int s_i, int e_i, int s_j, int e_j, int d_i)
 {
-    register int   i, j, k, pos,     j2,      pos2, pos_jm1, pos_jm2;
+    register int   i,  j,  k,  g_i;
+    register int   pos,     pos_ip1, pos_im2, pos_im1;
     register int   pos_km2, pos_km1, pos_kp1, pos_kp2;
-    register int   pos_im2, pos_im1, pos_ip1, pos_ip2;
-    register int   pos_jk1, pos_ik1, pos_ijk;
-    register _prec f_xy,    xy_jp1,  xy_jm1,  xy_jm2;
-    register _prec f_yy,    yy_jp2,  yy_jp1,  yy_jm1;
-    register _prec f_yz,    yz_jp1,  yz_jm1,  yz_jm2;
-    register _prec f_d1,    f_d2,    f_d3,    f_dcrj, f_dcrjx, f_dcrjz, f_xz;
+    register int   pos_jm2, pos_jm1, pos_jp1, pos_jp2;
+    register int   pos_ik1, pos_jk1, pos_ijk, pos_ijk1,f_ww;
+    register _prec vs1, vs2, vs3, a1, tmp, vx1,f_wwo;
+    register _prec xl,  xm,  xmu1, xmu2, xmu3;
+    register _prec qpa, h,   h1,   h2,   h3;
+     register _prec qpaw,hw,h1w,h2w,h3w; 
+    register _prec f_vx1, f_vx2,  f_dcrj, f_r,  f_dcrjy, f_dcrjz;
+      register _prec f_rtmp;
+    register _prec f_u1, u1_ip1, u1_ip2, u1_im1;
+    register _prec f_v1, v1_im1, v1_ip1, v1_im2;
+    register _prec f_w1, w1_im1, w1_im2, w1_ip1;
+    int maxk, mink = align+3;
+    
+    k    = blockIdx.x*BLOCK_SIZE_Z+threadIdx.x+align;
+    j    = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+s_j;
 
-    if (k > d_nzt[d_i]+align-3 && d_i > 0) return;
+    if (d_i == 0) {
+       maxk = nzt + align -1;
+    }
+    else maxk = nzt + align -3;
 
-    k     = blockIdx.x*BLOCK_SIZE_Z+threadIdx.x+align;
-    i     = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+2+ngsl;
-    j     = e_j;
-    j2    = ngsl-1;
-    pos   = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
-    pos2  = i*ngsl*d_yline_1[d_i]+j2*d_yline_1[d_i]+k; 
+    if (k < mink || k > maxk || j > e_j) return;
+ 
+    i    = e_i;
+    pos  = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
 
-    f_xy    = xy[pos+d_yline_1[d_i]];
-    xy_jm1  = xy[pos];
-    xy_jm2  = xy[pos-d_yline_1[d_i]];
-    yy_jp1  = yy[pos+d_yline_2[d_i]];
-    f_yy    = yy[pos+d_yline_1[d_i]];
-    yy_jm1  = yy[pos];
-    f_yz    = yz[pos+d_yline_1[d_i]];
-    yz_jm1  = yz[pos];
-    yz_jm2  = yz[pos-d_yline_1[d_i]];
+    u1_ip1 = u1[pos+d_slice_2[d_i]];
+    f_u1   = u1[pos+d_slice_1[d_i]];
+    u1_im1 = u1[pos];    
+    f_v1   = v1[pos+d_slice_1[d_i]];
+    v1_im1 = v1[pos];
+    v1_im2 = v1[pos-d_slice_1[d_i]];
+    f_w1   = w1[pos+d_slice_1[d_i]];
+    w1_im1 = w1[pos];
+    w1_im2 = w1[pos-d_slice_1[d_i]];
     f_dcrjz = dcrjz[k];
-    f_dcrjx = dcrjx[i];
-    for(j=e_j; j>=s_j; j--)
+    f_dcrjy = dcrjy[j];
+    for(i=e_i;i>=s_i;i--)
     {
+        /*f_vx1    = tex1Dfetch(p_vx1, pos);
+        f_vx2    = tex1Dfetch(p_vx2, pos);
+        f_ww     = tex1Dfetch(p_ww, pos);
+        f_wwo     = tex1Dfetch(p_wwo, pos);*/
+        f_vx1 = d_vx1[pos];
+        f_vx2 = d_vx2[pos];
+        f_ww  = d_ww[pos];
+        f_wwo = d_wwo[pos];
+        /*
+        if(f_wwo!=f_wwo){
+          xx[pos] = yy[pos] = zz[pos] = xy[pos] = xz[pos] = yz[pos] = 1.0;
+          r1[pos] = r2[pos] = r3[pos] = r4[pos] = r5[pos] = r6[pos] = 1.0;
+          return;
+        }
+*/
+        f_dcrj   = dcrjx[i]*f_dcrjy*f_dcrjz;
+
         pos_km2  = pos-2;
         pos_km1  = pos-1;
         pos_kp1  = pos+1;
         pos_kp2  = pos+2;
         pos_jm2  = pos-d_yline_2[d_i];
         pos_jm1  = pos-d_yline_1[d_i];
-        pos_im1  = pos-d_slice_1[d_i];
+        pos_jp1  = pos+d_yline_1[d_i];
+        pos_jp2  = pos+d_yline_2[d_i];
         pos_im2  = pos-d_slice_2[d_i];
+        pos_im1  = pos-d_slice_1[d_i];
         pos_ip1  = pos+d_slice_1[d_i];
-        pos_ip2  = pos+d_slice_2[d_i];
         pos_jk1  = pos-d_yline_1[d_i]-1;
         pos_ik1  = pos+d_slice_1[d_i]-1;
         pos_ijk  = pos+d_slice_1[d_i]-d_yline_1[d_i];
+        pos_ijk1 = pos+d_slice_1[d_i]-d_yline_1[d_i]-1;
 
-        xy_jp1   = f_xy;
-        f_xy     = xy_jm1;
-        xy_jm1   = xy_jm2;
-        xy_jm2   = xy[pos_jm2];
-        yy_jp2   = yy_jp1;
-        yy_jp1   = f_yy;
-        f_yy     = yy_jm1;
-        yy_jm1   = yy[pos_jm1];
-        yz_jp1   = f_yz;
-        f_yz     = yz_jm1;
-        yz_jm1   = yz_jm2;
-        yz_jm2   = yz[pos_jm2];
-        f_xz     = xz[pos];
+        xl       = 8.0/(  lam[pos]      + lam[pos_ip1] + lam[pos_jm1] + lam[pos_ijk]
+                        + lam[pos_km1]  + lam[pos_ik1] + lam[pos_jk1] + lam[pos_ijk1] );
+        xm       = 16.0/( mu[pos]       + mu[pos_ip1]  + mu[pos_jm1]  + mu[pos_ijk]
+                        + mu[pos_km1]   + mu[pos_ik1]  + mu[pos_jk1]  + mu[pos_ijk1] );
+        xmu1     = 2.0/(  mu[pos]       + mu[pos_km1] );
+        xmu2     = 2.0/(  mu[pos]       + mu[pos_jm1] );
+        xmu3     = 2.0/(  mu[pos]       + mu[pos_ip1] );
+        xl       = xl  +  xm;
+        qpa      = 0.0625*( qp[pos]     + qp[pos_ip1] + qp[pos_jm1] + qp[pos_ijk]
+                          + qp[pos_km1] + qp[pos_ik1] + qp[pos_jk1] + qp[pos_ijk1] );
 
-        f_dcrj   = f_dcrjx*dcrjy[j]*f_dcrjz;
-        f_d1     = 0.25*(d_1[pos] + d_1[pos_jm1] + d_1[pos_km1] + d_1[pos_jk1]);
-        f_d2     = 0.25*(d_1[pos] + d_1[pos_ip1] + d_1[pos_km1] + d_1[pos_ik1]);
-        f_d3     = 0.25*(d_1[pos] + d_1[pos_ip1] + d_1[pos_jm1] + d_1[pos_ijk]);
+//                        www=f_ww;
+        if(1./(qpa*2.0)<=200.0)
+        {
+//      printf("coeff[f_ww*2-2] %g\n",coeff[f_ww*2-2]);
+                  qpaw=coeff[f_ww*2-2]*(2.*qpa)*(2.*qpa)+coeff[f_ww*2-1]*(2.*qpa);
+//              qpaw=coeff[www*2-2]*(2.*qpa)*(2.*qpa)+coeff[www*2-1]*(2.*qpa);
+//                qpaw=qpaw/2.;
+                  }
+               else {
+                  qpaw  = 2.0f*f_wwo*qpa;  //Fix for Q(f) suggested by Kyle
+		  	}
+//                 printf("qpaw %f\n",qpaw);
+//              printf("qpaw1 %g\n",qpaw);
+        qpaw=qpaw/f_wwo;
+//      printf("qpaw2 %g\n",qpaw);
 
-        f_d1     = d_dth[d_i]/f_d1;
-        f_d2     = d_dth[d_i]/f_d2;
-        f_d3     = d_dth[d_i]/f_d3;
 
-        s_u1[pos2] = (u1[pos] + f_d1*( d_c1*(xx[pos]     - xx[pos_im1]) + d_c2*(xx[pos_ip1] - xx[pos_im2])
-                                     + d_c1*(f_xy        - xy_jm1)      + d_c2*(xy_jp1      - xy_jm2)
-                                     + d_c1*(f_xz        - xz[pos_km1]) + d_c2*(xz[pos_kp1] - xz[pos_km2]) ))*f_dcrj;
-        s_v1[pos2] = (v1[pos] + f_d2*( d_c1*(xy[pos_ip1] - f_xy)        + d_c2*(xy[pos_ip2] - xy[pos_im1])
-                                     + d_c1*(yy_jp1      - f_yy)        + d_c2*(yy_jp2      - yy_jm1)
-                                     + d_c1*(f_yz        - yz[pos_km1]) + d_c2*(yz[pos_kp1] - yz[pos_km2]) ))*f_dcrj;
-        s_w1[pos2] = (w1[pos] + f_d3*( d_c1*(xz[pos_ip1] - f_xz)        + d_c2*(xz[pos_ip2] - xz[pos_im1])
-                                     + d_c1*(f_yz        - yz_jm1)      + d_c2*(yz_jp1      - yz_jm2)
-                                     + d_c1*(zz[pos_kp1] - zz[pos])     + d_c2*(zz[pos_kp2] - zz[pos_km1]) ))*f_dcrj;
 
-        pos        = pos_jm1;
-        pos2       = pos2 - d_yline_1[d_i];
-    }
-    return;
-}
+        h        = 0.0625*( qs[pos]     + qs[pos_ip1] + qs[pos_jm1] + qs[pos_ijk]
+                          + qs[pos_km1] + qs[pos_ik1] + qs[pos_jk1] + qs[pos_ijk1] );
+
+       if(1./(h*2.0)<=200.0)
+        {
+                  hw=coeff[f_ww*2-2]*(2.*h)*(2.*h)+coeff[f_ww*2-1]*(2.*h);
+                  //                  hw=hw/2.;
+                  }
+               else {
+                  hw  = 2.0f*f_wwo*h;  //Fix for Q(f) suggested by Kyle
+                }
+        hw=hw/f_wwo;
+
+
+        h1       = 0.250*(  qs[pos]     + qs[pos_km1] );
+
+        if(1./(h1*2.0)<=200.0)
+        {
+                  h1w=coeff[f_ww*2-2]*(2.*h1)*(2.*h1)+coeff[f_ww*2-1]*(2.*h1);
+                  //                  h1w=h1w/2.;
+                  }
+                         else {
+                  h1w  = 2.0f*f_wwo*h1; //Fix for Q(f) suggested by Kyle
+                }
+        h1w=h1w/f_wwo;
+
+
+
+        h2       = 0.250*(  qs[pos]     + qs[pos_jm1] );
+        if(1./(h2*2.0)<=200.0)
+        {
+                  h2w=coeff[f_ww*2-2]*(2.*h2)*(2.*h2)+coeff[f_ww*2-1]*(2.*h2);
+                  //                  h2w=h2w/2.;
+                  }
+                         else {
+                  h2w  = 2.0f*f_wwo*h2; //Fix for Q(f) suggested by Kyle
+                }
+        h2w=h2w/f_wwo;
+
+
+        h3       = 0.250*(  qs[pos]     + qs[pos_ip1] );
+        if(1./(h3*2.0)<=200.0)
+        {
+                  h3w=coeff[f_ww*2-2]*(2.*h3)*(2.*h3)+coeff[f_ww*2-1]*(2.*h3);
+                  //                  h3w=h3w/2.;
+                  }
+                         else {
+                  h3w  = 2.0f*f_wwo*h3; //Fix for Q(f) suggested by Kyle
+                }
+        h3w=h3w/f_wwo;
+
+	h        = -xm*hw*d_dh1[d_i];
+        h1       = -xmu1*h1w*d_dh1[d_i];
+        h2       = -xmu2*h2w*d_dh1[d_i];
+        h3       = -xmu3*h3w*d_dh1[d_i];
+
+
+        //        h1       = -xmu1*hw1*d_dh1[d_i];
+        //h2       = -xmu2*hw2*d_dh1[d_i];
+        //h3       = -xmu3*hw3*d_dh1[d_i];
+
+
+        qpa      = -qpaw*xl*d_dh1[d_i];
+        //        qpa      = -qpaw*xl*d_dh1[d_i];
+
+        xm       = xm*d_dth[d_i];
+        xmu1     = xmu1*d_dth[d_i];
+        xmu2     = xmu2*d_dth[d_i];
+        xmu3     = xmu3*d_dth[d_i];
+        xl       = xl*d_dth[d_i];
+      //  f_vx2    = f_vx2*f_vx1;
+        h        = h*f_vx1;
+        h1       = h1*f_vx1;
+        h2       = h2*f_vx1;
+        h3       = h3*f_vx1;
+        qpa      = qpa*f_vx1;
+
+        xm       = xm+d_DT*h;
+        xmu1     = xmu1+d_DT*h1;
+        xmu2     = xmu2+d_DT*h2;
+        xmu3     = xmu3+d_DT*h3;
+        vx1      = d_DT*(1+f_vx2*f_vx1);
+        
+        u1_ip2   = u1_ip1;
+        u1_ip1   = f_u1;
+        f_u1     = u1_im1;
+        u1_im1   = u1[pos_im1];
+        v1_ip1   = f_v1;
+        f_v1     = v1_im1;
+        v1_im1   = v1_im2;
+        v1_im2   = v1[pos_im2];
+        w1_ip1   = f_w1;
+        f_w1     = w1_im1;
+        w1_im1   = w1_im2;
+        w1_im2   = w1[pos_im2];
 
-__global__ void update_boundary_y(_prec*  u1, _prec*  v1, _prec*  w1, _prec*  s_u1, _prec*  s_v1, _prec*  s_w1, int rank, int flag, int d_i)
-{
-    register int i, j, k, pos, posj;
-    k     = blockIdx.x*BLOCK_SIZE_Z+threadIdx.x+align;
-    i     = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+2+ngsl;
+        if (d_i == 0){ /*Apply FS condition on uppermost grid only*/
+	  if(k == d_nzt[d_i]+align-1) {
+	      u1[pos_kp1] = f_u1 - (f_w1 - w1_im1);
+	      v1[pos_kp1] = f_v1 - (w1[pos_jp1] - f_w1);
 
-    if(flag==Front && rank!=-1){
-	j     = 2;
-    	pos   = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
-        posj  = i*ngsl*d_yline_1[d_i]+k;
-	for(j=2;j<2+ngsl;j++){
-		u1[pos] = s_u1[posj];
-		v1[pos] = s_v1[posj];
-		w1[pos] = s_w1[posj];
-		pos	= pos  + d_yline_1[d_i];
-  		posj	= posj + d_yline_1[d_i];	
-	}
-    }
+	      g_i  = d_nxt[d_i]*rankx + i - ngsl - 1;
 
-    if(flag==Back && rank!=-1){
-    	j     = d_nyt[d_i]+ngsl+2;
-    	pos   = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
-        posj  = i*ngsl*d_yline_1[d_i]+k;
-	for(j=d_nyt[d_i]+ngsl+2;j<d_nyt[d_i]+ngsl2+2;j++){
-	        u1[pos] = s_u1[posj];
-                v1[pos] = s_v1[posj];
-                w1[pos] = s_w1[posj];
-                pos     = pos  + d_yline_1[d_i];
-                posj    = posj + d_yline_1[d_i];
-	}
-    }
-    return;
-}
+	      if(g_i<NX || NPC == 2)
+		      vs1 = u1_ip1 - (w1_ip1 - f_w1);
+	      else
+		      vs1 = 0.0;
 
-/* kernel functions to apply free-surface B.C.s to stress */
-__global__ void fstr (_prec*  zz, _prec*  xz, _prec*  yz, int s_i, int e_i, int s_j)
-{
-    register int i, j, k;
-    register int pos, pos_im1; 
+	      g_i  = d_nyt[d_i]*ranky + j - ngsl - 1;
+	      if(g_i>1 || NPC == 2) //periodic BCs
+		      vs2 = v1[pos_jm1] - (f_w1 - w1[pos_jm1]);
+	      else
+		      vs2 = 0.0;
 
-    k    = d_nzt[0]+align-1;
-    j    = blockIdx.y*BLOCK_SIZE_Y+threadIdx.y+s_j;
-    i    = e_i;
-    pos  = i*d_slice_1[0]+j*d_yline_1[0]+k;
+	      w1[pos_kp1] = w1[pos_km1] - lam_mu[i*(d_nyt[d_i]+4+ngsl2) + j]*((vs1 - u1[pos_kp1]) + (u1_ip1 - f_u1)
+                           + (v1[pos_kp1] - vs2) + (f_v1   - v1[pos_jm1]) );
+	  }
+	  else if(k == d_nzt[d_i]+align-2) {
+		  u1[pos_kp2] = u1[pos_kp1] - (w1[pos_kp1]   - w1[pos_im1+1]);
+		  v1[pos_kp2] = v1[pos_kp1] - (w1[pos_jp1+1] - w1[pos_kp1]);
+	  }
+        }
+ 
+    	vs1      = d_c1*(u1_ip1 - f_u1)        + d_c2*(u1_ip2      - u1_im1);
+        vs2      = d_c1*(f_v1   - v1[pos_jm1]) + d_c2*(v1[pos_jp1] - v1[pos_jm2]);
+        vs3      = d_c1*(f_w1   - w1[pos_km1]) + d_c2*(w1[pos_kp1] - w1[pos_km2]);
+ 
+        tmp      = xl*(vs1+vs2+vs3);
+        a1       = qpa*(vs1+vs2+vs3);
+        tmp      = tmp+d_DT*a1;
 
-    for(i=e_i;i>=s_i;i--)
-    {
-        pos_im1  = pos-d_slice_1[0];
+        f_r      = r1[pos];
+	 f_rtmp   = -h*(vs2+vs3) + a1; 
+	 xx[pos]  = xx[pos]  + tmp - xm*(vs2+vs3) + vx1*f_r;  
+	 r1[pos]  = f_vx2*f_r + f_wwo*f_rtmp;
+	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
+	  xx[pos]  = (xx[pos] + d_DT*f_rtmp)*f_dcrj;
 
-        // asymmetry reflection above free surface
-        zz[pos+1] = -zz[pos];
-        zz[pos+2] = -zz[pos-1];
+        f_r      = r2[pos];
+	 f_rtmp   = -h*(vs1+vs3) + a1;  
+        yy[pos]  = (yy[pos]  + tmp - xm*(vs1+vs3) + vx1*f_r)*f_dcrj;
 
-        xz[pos+1] = -xz[pos-1];
-        xz[pos+2] = -xz[pos-2];
+	 r2[pos]  = f_vx2*f_r + f_wwo*f_rtmp; 
+	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
+	  yy[pos]  = (yy[pos] + d_DT*f_rtmp)*f_dcrj;
+	
+        f_r      = r3[pos];
+	f_rtmp   = -h*(vs1+vs2) + a1;
+        zz[pos]  = (zz[pos]  + tmp - xm*(vs1+vs2) + vx1*f_r)*f_dcrj;
+	 r3[pos]  = f_vx2*f_r + f_wwo*f_rtmp;
+	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1);  
+	 zz[pos]  = (zz[pos] + d_DT*f_rtmp)*f_dcrj;
 
-        yz[pos+1] = -yz[pos-1];                                               
-        yz[pos+2] = -yz[pos-2];
+        vs1      = d_c1*(u1[pos_jp1] - f_u1)   + d_c2*(u1[pos_jp2] - u1[pos_jm1]);
+        vs2      = d_c1*(f_v1        - v1_im1) + d_c2*(v1_ip1      - v1_im2);
+        f_r      = r4[pos];
+ 	f_rtmp   = h1*(vs1+vs2); 
+	 xy[pos]  = xy[pos]  + xmu1*(vs1+vs2) + vx1*f_r;
+	 r4[pos]  = f_vx2*f_r + f_wwo*f_rtmp; 
+	 f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1);
+	 xy[pos]  = (xy[pos] + d_DT*f_rtmp)*f_dcrj;
+ 
+        //moved to separate subroutine fstr, to be executed after plasticity (Daniel)
+        /*if(k == d_nzt+align-1)
+        {
+                zz[pos+1] = -zz[pos];
+        	xz[pos]   = 0.0;
+                yz[pos]   = 0.0;
+        }
+        else
+        {*/
+        	vs1     = d_c1*(u1[pos_kp1] - f_u1)   + d_c2*(u1[pos_kp2] - u1[pos_km1]);
+        	vs2     = d_c1*(f_w1        - w1_im1) + d_c2*(w1_ip1      - w1_im2);
+        	f_r     = r5[pos];
+		 f_rtmp  = h2*(vs1+vs2);
+		  xz[pos] = xz[pos]  + xmu2*(vs1+vs2) + vx1*f_r; 
+		   r5[pos] = f_vx2*f_r + f_wwo*f_rtmp; 
+		   f_rtmp  = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
+		   xz[pos] = (xz[pos] + d_DT*f_rtmp)*f_dcrj;
+	 
 
-        // horizontal shear stresses on free surface
-        xz[pos]   = 0.0;
-        yz[pos]   = 0.0;
+        	vs1     = d_c1*(v1[pos_kp1] - f_v1) + d_c2*(v1[pos_kp2] - v1[pos_km1]);
+        	vs2     = d_c1*(w1[pos_jp1] - f_w1) + d_c2*(w1[pos_jp2] - w1[pos_jm1]);
+        	f_r     = r6[pos];
+		f_rtmp  = h3*(vs1+vs2);
+		yz[pos] = yz[pos]  + xmu3*(vs1+vs2) + vx1*f_r;
+		 r6[pos] = f_vx2*f_r + f_wwo*f_rtmp;
+		  f_rtmp  = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
+		  yz[pos] = (yz[pos] + d_DT*f_rtmp)*f_dcrj; 
 
+                // also moved to fstr (Daniel)
+                /*if(k == d_nzt+align-2)
+                {
+                    zz[pos+3] = -zz[pos];
+                    xz[pos+2] = -xz[pos];
+                    yz[pos+2] = -yz[pos];                                               
+		}
+		else if(k == d_nzt+align-3)
+		{
+                    xz[pos+4] = -xz[pos];
+                    yz[pos+4] = -yz[pos];
+		}*/
+ 	/*}*/
         pos     = pos_im1;
     }
-
+    return;
 }
 
 // treatment of shear stress components moved to separate kernel code (Daniel)
@@ -1672,8 +1698,8 @@ __global__ void drprecpc_app(_prec *xx, _prec *yy, _prec *zz,
     pos  = i*d_slice_1[d_i]+j*d_yline_1[d_i]+k;
 
     srfpos = d_nzt[d_i] + align - 1;
-    depth = (_prec) (srfpos - k) * d_DH[d_i];
-    depth_kp1 = (_prec) (srfpos - k + 1) * d_DH[d_i];
+    depth = (float) (srfpos - k) * d_DH[d_i];
+    depth_kp1 = (float) (srfpos - k + 1) * d_DH[d_i];
 
     if (depth > 0) pfluid = (depth + d_DH[d_i]/2.) * 9.81e3;
     else pfluid = d_DH[d_i] / 2. * 9.81e3;
@@ -1778,14 +1804,14 @@ __global__ void drprecpc_app(_prec *xx, _prec *yy, _prec *zz,
 }
 
 __global__ void addsrc_cu(int i,      int READ_STEP, int dim,    int* psrc,  int npsrc,
-                          _prec*  axx, _prec*  ayy,    _prec*  azz, _prec*  axz, _prec*  ayz, _prec*  axy,
-                          _prec*  xx,  _prec*  yy,     _prec*  zz,  _prec*  xy,  _prec*  yz,  _prec*  xz, int d_i)
+                          float* axx, float* ayy,    float* azz, float* axz, float* ayz, float* axy,
+                          float* xx,  float* yy,     float* zz,  float* xy,  float* yz,  float* xz, int d_i)
 {
         register _prec vtst;
         register int idx, idy, idz, j, pos;
         j = blockIdx.x*blockDim.x+threadIdx.x;
         if(j >= npsrc) return;
-        vtst = (_prec)d_DT/(d_DH[d_i]*d_DH[d_i]*d_DH[d_i]);
+        vtst = (float)d_DT/(d_DH[d_i]*d_DH[d_i]*d_DH[d_i]);
 
         i   = i - 1;
         idx = psrc[j*dim]   + 1 + ngsl;
@@ -1795,6 +1821,7 @@ __global__ void addsrc_cu(int i,      int READ_STEP, int dim,    int* psrc,  int
 
         /*cuPrintf("addsrc_cu: (%d,%d,%d) (%e,%e,%e,%e,%e,%e)\n", idx, idy, idz, 
            axx[j*READ_STEP+i], ayy[j*READ_STEP+i], azz[j*READ_STEP+i], axz[j*READ_STEP+i], ayz[j*READ_STEP+i], axy[j*READ_STEP+i]);*/
+
 	xx[pos] = xx[pos] - vtst*axx[j*READ_STEP+i];
 	yy[pos] = yy[pos] - vtst*ayy[j*READ_STEP+i];
 	zz[pos] = zz[pos] - vtst*azz[j*READ_STEP+i];
@@ -1805,9 +1832,10 @@ __global__ void addsrc_cu(int i,      int READ_STEP, int dim,    int* psrc,  int
         return;
 }
 
+extern "C"
 void frcvel_H(int i,      int READ_STEP, int dim,    int* psrc,  int npsrc,  int tskp, cudaStream_t St,
-              _prec*  axx, _prec*  ayy,    _prec*  azz, _prec*  axz, _prec*  ayz, _prec*  axy,
-              _prec*  u1,  _prec*  v1,     _prec*  w1, int ymin, int ymax, int d_i)
+              float* axx, float* ayy,    float* azz, float* axz, float* ayz, float* axy,
+              float* u1,  float* v1,     float* w1, int ymin, int ymax, int d_i)
 {
     dim3 grid, block;
     if(npsrc < 256)
@@ -1834,8 +1862,8 @@ void frcvel_H(int i,      int READ_STEP, int dim,    int* psrc,  int npsrc,  int
 }
 
 __global__ void frcvel_cu(int i,      int READ_STEP, int dim,    int* psrc,  int npsrc, int tskp,
-                          _prec*  axx, _prec*  ayy,    _prec*  azz, _prec*  axz, _prec*  ayz, _prec*  axy,
-                          _prec*  u1,  _prec*  v1,     _prec*  w1, int xmin, int xmax, int d_i)
+                          float* axx, float* ayy,    float* azz, float* axz, float* ayz, float* axy,
+                          float* u1,  float* v1,     float* w1, int xmin, int xmax, int d_i)
 {
         register int idx, idy, idz, j, pos;
         register int i0, i1;
@@ -1934,19 +1962,20 @@ __global__ void frcvel_cu(int i,      int READ_STEP, int dim,    int* psrc,  int
 
 
 /* kernel function to apply free-surface B.C. to velocities - (Daniel) */
-void fvel_H(_prec*  u1, _prec*  v1, _prec*  w1, cudaStream_t St, _prec*  lam_mu, int NX, int rankx, int ranky, 
+extern "C"
+void fvel_H(float* u1, float* v1, float* w1, cudaStream_t St, float* lam_mu, int NX, int rankx, int ranky, 
      int s_i, int e_i, int s_j, int e_j)
 {
     dim3 block (2, BLOCK_SIZE_Y, 1);
     dim3 grid (1,(e_j-s_j+1+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(fstr, cudaFuncCachePreferL1));
+    CUCHK(cudaFuncSetCacheConfig(fvel, cudaFuncCachePreferL1)); /* configure the kernel launched below; keep the runtime error check */
     fvel<<<grid, block, 0, St>>>(u1, v1, w1, lam_mu, NX, rankx, ranky, s_i, e_i, s_j);
     return;
 }
 
 
 /* kernel functions to apply free-surface B.C.s to velocity */
-__global__ void fvel (_prec*  u1, _prec*  v1, _prec*  w1, _prec*  lam_mu, int NX, int rankx, int ranky, int s_i, int e_i, int s_j)
+__global__ void fvel (float* u1, float* v1, float* w1, float* lam_mu, int NX, int rankx, int ranky, int s_i, int e_i, int s_j)
 {
     register int i, j, k;
     //register _prec w1_im1, w1_im2, u1_ip1, f_u1, f_v1, f_w1;
@@ -2033,7 +2062,8 @@ __global__ void fvel (_prec*  u1, _prec*  v1, _prec*  w1, _prec*  lam_mu, int NX
 
 }
 
-void update_yldfac_buffer_x_H(_prec*  yldfac, _prec *buf_L, _prec *buf_R, int nyt, int nzt, cudaStream_t St1, cudaStream_t St2, 
+extern "C"
+void update_yldfac_buffer_x_H(float* yldfac, _prec *buf_L, _prec *buf_R, int nyt, int nzt, cudaStream_t St1, cudaStream_t St2, 
      int rank_L, int rank_R, int d_i) {
      if(rank_L==-1 && rank_R==-1) return;
 
@@ -2044,16 +2074,16 @@ void update_yldfac_buffer_x_H(_prec*  yldfac, _prec *buf_L, _prec *buf_R, int ny
      //cudaPrintfInit();
      CUCHK(cudaFuncSetCacheConfig(update_yldfac_buffer_x, cudaFuncCachePreferL1));
      update_yldfac_buffer_x<<<grid, block, 0, St1>>>(yldfac, buf_L, rank_L, Left, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      update_yldfac_buffer_x<<<grid, block, 0, St2>>>(yldfac, buf_R, rank_R, Right, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      //cudaPrintfDisplay(stdout, 1);
      //cudaPrintfEnd();
      return;
 }
 
 /* buffer exchanged for the swap area */
-__global__ void update_yldfac_buffer_x(_prec*  yldfac, _prec *buf, int rank, int flag, int d_i)
+__global__ void update_yldfac_buffer_x(float* yldfac, _prec *buf, int rank, int flag, int d_i)
 {
     register int i, j, k, pos, bpos;
     register int b_slice_1, b_yline_1;
@@ -2097,7 +2127,8 @@ __global__ void update_yldfac_buffer_x(_prec*  yldfac, _prec *buf, int rank, int
     return;
 }
 
-void update_yldfac_data_x_H(_prec*  yldfac, _prec *buf_L, _prec *buf_R, int nyt, int nzt, cudaStream_t St1, cudaStream_t St2, 
+extern "C"
+void update_yldfac_data_x_H(float* yldfac, _prec *buf_L, _prec *buf_R, int nyt, int nzt, cudaStream_t St1, cudaStream_t St2, 
      int rank_L, int rank_R, int d_i) {
      if(rank_L==-1 && rank_R==-1) return;
 
@@ -2106,16 +2137,16 @@ void update_yldfac_data_x_H(_prec*  yldfac, _prec *buf_L, _prec *buf_R, int nyt,
      //cudaPrintfInit();
      CUCHK(cudaFuncSetCacheConfig(update_yldfac_buffer_x, cudaFuncCachePreferL1));
      update_yldfac_data_x<<<grid, block, 0, St1>>>(yldfac, buf_L, rank_L, Left, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      update_yldfac_data_x<<<grid, block, 0, St2>>>(yldfac, buf_R, rank_R, Right, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      //cudaPrintfDisplay(stdout, 1);
      //cudaPrintfEnd();
      return;
 }
 
 /* copy exchanged buffer data back to swap zone*/
-__global__ void update_yldfac_data_x(_prec*  yldfac, _prec *buf, int rank, int flag, int d_i)
+__global__ void update_yldfac_data_x(float* yldfac, _prec *buf, int rank, int flag, int d_i)
 {
     register int i, j, k, pos, bpos;
     register int b_slice_1, b_yline_1;
@@ -2159,7 +2190,8 @@ __global__ void update_yldfac_data_x(_prec*  yldfac, _prec *buf, int rank, int f
     return;
 }
 
-void update_yldfac_buffer_y_H(_prec*  yldfac, _prec *buf_F, _prec *buf_B, int nxt, int nzt,
+extern "C"
+void update_yldfac_buffer_y_H(float* yldfac, _prec *buf_F, _prec *buf_B, int nxt, int nzt,
    cudaStream_t St1, cudaStream_t St2, int rank_F, int rank_B, int d_i) {
      if(rank_F==-1 && rank_B==-1) return;
 
@@ -2168,16 +2200,16 @@ void update_yldfac_buffer_y_H(_prec*  yldfac, _prec *buf_F, _prec *buf_B, int nx
      //cudaPrintfInit();
      CUCHK(cudaFuncSetCacheConfig(update_yldfac_buffer_y, cudaFuncCachePreferL1));
      update_yldfac_buffer_y<<<grid, block, 0, St1>>>(yldfac, buf_F, rank_F, Front, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      update_yldfac_buffer_y<<<grid, block, 0, St2>>>(yldfac, buf_B, rank_B, Back, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      /*cudaPrintfDisplay(stdout, 1);
-     CUCHK(cudaPrintfEnd());*/
+     cudaPrintfEnd();*/
      return;
 }
 
 /* buffer exchanged for the swap area along Y*/
-__global__ void update_yldfac_buffer_y(_prec*  yldfac, _prec *buf, int rank, int flag, int d_i)
+__global__ void update_yldfac_buffer_y(float* yldfac, _prec *buf, int rank, int flag, int d_i)
 {
     register int i, j, k, pos, bpos;
     register int b_slice_1, b_yline_1;
@@ -2222,7 +2254,8 @@ __global__ void update_yldfac_buffer_y(_prec*  yldfac, _prec *buf, int rank, int
     return;
 }
 
-void update_yldfac_data_y_H(_prec*  yldfac, _prec *buf_F, _prec *buf_B, int nxt, int nzt,
+extern "C"
+void update_yldfac_data_y_H(float* yldfac, _prec *buf_F, _prec *buf_B, int nxt, int nzt,
     cudaStream_t St1, cudaStream_t St2, int rank_F, int rank_B, int d_i) {
      if(rank_F==-1 && rank_B==-1) return;
 
@@ -2231,16 +2264,16 @@ void update_yldfac_data_y_H(_prec*  yldfac, _prec *buf_F, _prec *buf_B, int nxt,
      //cudaPrintfInit();
      CUCHK(cudaFuncSetCacheConfig(update_yldfac_buffer_y, cudaFuncCachePreferL1));
      update_yldfac_data_y<<<grid, block, 0, St1>>>(yldfac, buf_F, rank_F, Front, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      update_yldfac_data_y<<<grid, block, 0, St2>>>(yldfac, buf_B, rank_B, Back, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() );
      /*cudaPrintfDisplay(stdout, 1);
-     CUCHK(cudaPrintfEnd());*/
+     cudaPrintfEnd();*/
      return;
 }
 
 /* copy exchanged buffer data back to swap zone*/
-__global__ void update_yldfac_data_y(_prec*  yldfac, _prec *buf, int rank, int flag, int d_i)
+__global__ void update_yldfac_data_y(float* yldfac, _prec *buf, int rank, int flag, int d_i)
 {
     register int i, j, k, pos, bpos;
     register int b_slice_1, b_yline_1;
@@ -2286,8 +2319,8 @@ __global__ void update_yldfac_data_y(_prec*  yldfac, _prec *buf, int rank, int f
     return;
 }
 
-__global__ void dvelc2(_prec*  u1,    _prec*  v1,    _prec*  w1,    _prec*  xx,  _prec*  yy, _prec*  zz, _prec*  xy, 
-             _prec*  xz, _prec*  yz, _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz, _prec*  d_1, int d_i)
+__global__ void dvelc2(float* u1,    float* v1,    float* w1,    float* xx,  float* yy, float* zz, float* xy, 
+             float* xz, float* yz, float* dcrjx, float* dcrjy, float* dcrjz, float* d_1, int d_i)
 {
     register int   i, j, k, pos,     pos_im1;
     register int   pos_km1, pos_kp1;
@@ -2342,35 +2375,39 @@ __global__ void dvelc2(_prec*  u1,    _prec*  v1,    _prec*  w1,    _prec*  xx,
      return;
 }
 
-void dvelc2_H(_prec*  u1,    _prec*  v1,    _prec*  w1,    _prec*  xx,  _prec*  yy, _prec*  zz, _prec*  xy, _prec*  xz, _prec*  yz,
-             _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz, _prec*  d_1, int nxt,   int nyt, cudaStream_t St, int d_i)
+extern "C"
+void dvelc2_H(float* u1,    float* v1,    float* w1,    float* xx,  float* yy, float* zz, float* xy, float* xz, float* yz,
+             float* dcrjx, float* dcrjy, float* dcrjz, float* d_1, int nxt,   int nyt, cudaStream_t St, int d_i)
 {
     dim3 block (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1);
     dim3 grid ((nxt+BLOCK_SIZE_X-1)/BLOCK_SIZE_X, (nyt+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(dvelc2, cudaFuncCachePreferL1));
+    CUCHK(cudaFuncSetCacheConfig(dvelc2, cudaFuncCachePreferL1)); /* keep the runtime error check on the cache-config call */
 
-    //cudaPrintfInit();
+    /*cudaPrintfInit();*/
     dvelc2<<<grid,block,0,St>>>(u1,v1,w1,xx,yy,zz,xy,xz,yz,dcrjx,dcrjy,dcrjz,d_1,d_i);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    /*cudaPrintfDisplay(stdout, 1);
+    cudaPrintfEnd();*/
 }
 
 
-__global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *yyl, _prec *zzl, 
-        _prec *xyl, _prec * xzl, _prec*  yzl,
-        _prec *u1h, _prec *v1h, _prec*  w1h, _prec *xxh, _prec *yyh, _prec *zzh, 
-        _prec *xyh, _prec *xzh, _prec*  yzh, int rank, int d_i)
+__global__ void intp3d(_prec *u1l, float* v1l, _prec *w1l, _prec *xxl, _prec *yyl, _prec *zzl, 
+        _prec *xyl, _prec * xzl, float* yzl,
+        _prec *u1h, _prec *v1h, float* w1h, _prec *xxh, _prec *yyh, _prec *zzh, 
+        _prec *xyh, _prec *xzh, float* yzh, int rank, int d_i)
 {
     register int i,j,k,ii,jj,posl;
     register int posl_ip1,posl_jp1,posl_ij1;
     register int ih,jh,kh,posh,index;
     register _prec w[4],var[4];
+    register int maxindex;
 
     w[0]=1.;
     w[1]=2./3.;
     w[2]=1./3.;
     w[3]=0.;
 
+    maxindex=(d_nxt[d_i-1]+4+ngsl2)*(d_nyt[d_i-1]+4+ngsl2)*(d_nzt[d_i-1]+2*align)-1;
+
     i = blockIdx.x*blockDim.x+threadIdx.x+ngsl;
     j = blockIdx.y*blockDim.y+threadIdx.y+ngsl;
 
@@ -2404,11 +2441,14 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii-1) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1];
-                xzh[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4-ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4-jj] +
-                             var[3]*w[4-ii]*w[4-jj];
+                   //This would be the correct way, but the if condition results in different results during 
+                   //each run (thread divergence issues?)
+                   //if ((ih+ii) < (d_nxt[d_i-1]+4+ngsl2) && (jh+jj) < (d_nyt[d_i-1]+4+ngsl2)){
+		   index = min(posh + (ii-1) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1], maxindex);
+		   xzh[index] =    var[0]*w[ii-1]*w[jj-1] +
+				   var[1]*w[4-ii]*w[jj-1] +
+				   var[2]*w[ii-1]*w[4-jj] +
+				   var[3]*w[4-ii]*w[4-jj];
             }
         }
 
@@ -2422,11 +2462,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii) * d_slice_1[d_i-1] + (jj) * d_yline_1[d_i-1];
-                yzh[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4 - ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4 - jj] +
-                             var[3]*w[4-ii]*w[4 - jj];
+		   index = min(posh + (ii) * d_slice_1[d_i-1] + (jj) * d_yline_1[d_i-1], maxindex);
+		   yzh[index] =    var[0]*w[ii-1]*w[jj-1] +
+				   var[1]*w[4 - ii]*w[jj-1] +
+				   var[2]*w[ii-1]*w[4 - jj] +
+				   var[3]*w[4-ii]*w[4 - jj];
             }
         }
 
@@ -2434,29 +2474,23 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
   	var[1] = w1l[posl_ip1];
  	var[2] = w1l[posl_jp1];
  	var[3] = w1l[posl_ij1];
-        /*if (((rank==1) && (i==4)) && (j==35)){
-           cuPrintf("%d>> var=[%.16g, %.16g, %.16g, %.16g]\n", rank, var[0], var[1], var[2], var[3]);
-           cuPrintf("%d>>i=%d, j=%d, k=%d\n", rank, i, j, k);
-           cuPrintf("%d>>posl, ip1, jp1, ij1=%d %d %d %d\n", rank, posl, posl_ip1, posl_jp1, posl_ij1);
-        }*/
-        /*if (((rank==0) && (i==36)) && (j==35)){
-           cuPrintf("%d>> var=[%.16g, %.16g, %.16g, %.16g]\n", rank, var[0], var[1], var[2], var[3]);
-           cuPrintf("%d>> i=%d, j=%d, k=%d\n", rank, i, j, k);
-           cuPrintf("%d>> posl, ip1, jp1, ij1=%d %d %d %d\n", rank, posl, posl_ip1, posl_jp1, posl_ij1);
-        }*/
+
         for(jj = 1; jj<=4; jj++ )
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1];
-                w1h[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4-ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4-jj] +
-                             var[3]*w[4-ii]*w[4-jj];
+		   index = min(posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1], maxindex);
+		   w1h[index] =    var[0]*w[ii-1]*w[jj-1] +
+				   var[1]*w[4-ii]*w[jj-1] +
+				   var[2]*w[ii-1]*w[4-jj] +
+				   var[3]*w[4-ii]*w[4-jj];
             }
         }
 
-        var[0] = xxl[posl];
+    /* xx,yy,zz,u1,v1 and xy can not be interpolated horizontally from kh=align+1 (not vertically aligned) */
+    /* Commented out this code segment.  Daniel Roten, December 6 2018 */
+
+    /*    var[0] = xxl[posl];
   	var[1] = xxl[posl_ip1];
  	var[2] = xxl[posl_jp1];
  	var[3] = xxl[posl_ij1];
@@ -2464,11 +2498,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1];
-                xxh[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4-ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4-jj] +
-                             var[3]*w[4-ii]*w[4-jj];
+		   index = min(posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1], maxindex);
+		   xxh[index] = var[0]*w[ii-1]*w[jj-1] +
+				var[1]*w[4-ii]*w[jj-1] +
+				var[2]*w[ii-1]*w[4-jj] +
+				var[3]*w[4-ii]*w[4-jj];
             }
         }
 
@@ -2480,11 +2514,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1];
-                yyh[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4-ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4-jj] +
-                             var[3]*w[4-ii]*w[4-jj];
+		   index = min(posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1], maxindex);
+		   yyh[index] = var[0]*w[ii-1]*w[jj-1] +
+				var[1]*w[4-ii]*w[jj-1] +
+				var[2]*w[ii-1]*w[4-jj] +
+				var[3]*w[4-ii]*w[4-jj];
             }
         }
 
@@ -2496,11 +2530,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1];
-                zzh[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4-ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4-jj] +
-                             var[3]*w[4-ii]*w[4-jj];
+		   index = min(posh + (ii) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1], maxindex);
+		   zzh[index] = var[0]*w[ii-1]*w[jj-1] +
+				var[1]*w[4-ii]*w[jj-1] +
+				var[2]*w[ii-1]*w[4-jj] +
+				var[3]*w[4-ii]*w[4-jj];
             }
         }
 
@@ -2513,11 +2547,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii-1) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1];
-                u1h[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4-ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4-jj] +
-                             var[3]*w[4-ii]*w[4-jj];
+		   index = min(posh + (ii-1) * d_slice_1[d_i-1] + (jj-1) * d_yline_1[d_i-1], maxindex);
+		   u1h[index] = var[0]*w[ii-1]*w[jj-1] +
+				var[1]*w[4-ii]*w[jj-1] +
+				var[2]*w[ii-1]*w[4-jj] +
+				var[3]*w[4-ii]*w[4-jj];
             }
         }
 
@@ -2530,11 +2564,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii) * d_slice_1[d_i-1] + (jj) * d_yline_1[d_i-1];
-                v1h[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4 - ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4 - jj] +
-                             var[3]*w[4-ii]*w[4 - jj];
+		   index = min(posh + (ii) * d_slice_1[d_i-1] + (jj) * d_yline_1[d_i-1], maxindex);
+		   v1h[index] = var[0]*w[ii-1]*w[jj-1] +
+				var[1]*w[4 - ii]*w[jj-1] +
+				var[2]*w[ii-1]*w[4 - jj] +
+				var[3]*w[4-ii]*w[4 - jj];
             }
         }
 
@@ -2547,13 +2581,13 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
         {
             for(ii = 1; ii<=4; ii++ )
             {
-                index = posh + (ii-1) * d_slice_1[d_i-1] + (jj) * d_yline_1[d_i-1];
-                xyh[index] = var[0]*w[ii-1]*w[jj-1] +
-                             var[1]*w[4 - ii]*w[jj-1] +
-                             var[2]*w[ii-1]*w[4 - jj] +
-                             var[3]*w[4-ii]*w[4 - jj];
+		   index = min(posh + (ii-1) * d_slice_1[d_i-1] + (jj) * d_yline_1[d_i-1], maxindex);
+		   xyh[index] = var[0]*w[ii-1]*w[jj-1] +
+				var[1]*w[4 - ii]*w[jj-1] +
+				var[2]*w[ii-1]*w[4 - jj] +
+				var[3]*w[4-ii]*w[4 - jj];
             }
-        }
+        } */
     } // if (1<i && i<d_nxtl+1 && 1<j && j<d_nytl+1)
 //
 
@@ -2561,11 +2595,11 @@ __global__ void intp3d(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *y
 }
 
 // 2nd order stress update
-__global__ void dstrqc2(_prec*  xx, _prec*  yy,    _prec*  zz,    _prec*  xy,    _prec*  xz,  _prec*  yz,
-                       _prec*  r1, _prec*  r2,    _prec*  r3,    _prec*  r4,    _prec*  r5,  _prec*  r6,
-                       _prec*  u1, _prec*  v1,    _prec*  w1,    _prec*  lam,   _prec*  mu,  _prec*  qp,
-                       _prec*  qs, _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz, 
-                       _prec*  coeff, _prec *d_vx1, _prec *d_vx2, int *d_ww, _prec *d_wwo,
+__global__ void dstrqc2(float* xx, float* yy,    float* zz,    float* xy,    float* xz,  float* yz,
+                       float* r1, float* r2,    float* r3,    float* r4,    float* r5,  float* r6,
+                       float* u1, float* v1,    float* w1,    float* lam,   float* mu,  float* qp,
+                       float* qs, float* dcrjx, float* dcrjy, float* dcrjz, 
+                       float* coeff, _prec *d_vx1, _prec *d_vx2, int *d_ww, _prec *d_wwo,
                        int s_i, int e_i, int s_j, int e_j, int d_i)
 {
     register int   i,  j,  k;
@@ -2766,16 +2800,17 @@ __global__ void dstrqc2(_prec*  xx, _prec*  yy,    _prec*  zz,    _prec*  xy,
 	return;
 }
 
-void dstrqc2_H(_prec*  xx, _prec*  yy,    _prec*  zz,    _prec*  xy,    _prec*  xz,  _prec*  yz,
-              _prec*  r1, _prec*  r2,    _prec*  r3,    _prec*  r4,    _prec*  r5,  _prec*  r6,
-              _prec*  u1, _prec*  v1,    _prec*  w1,    _prec*  lam,   _prec*  mu,  _prec*  qp,
-              _prec*  qs, _prec*  dcrjx, _prec*  dcrjy, _prec*  dcrjz, int nxt,    int nyt,
+extern "C"
+void dstrqc2_H(float* xx, float* yy,    float* zz,    float* xy,    float* xz,  float* yz,
+              float* r1, float* r2,    float* r3,    float* r4,    float* r5,  float* r6,
+              float* u1, float* v1,    float* w1,    float* lam,   float* mu,  float* qp,
+              float* qs, float* dcrjx, float* dcrjy, float* dcrjz, int nxt,    int nyt,
               cudaStream_t St, 
-              _prec*  coeff, _prec *vx1, _prec *vx2, int *ww, _prec *wwo,
+              float* coeff, _prec *vx1, _prec *vx2, int *ww, _prec *wwo,
               int s_i, int e_i, int s_j, int e_j, int d_i) {
     dim3 block (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1);
     dim3 grid ((nxt+BLOCK_SIZE_X+ngsl2-1)/BLOCK_SIZE_X, (nyt+BLOCK_SIZE_Y+ngsl2-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(dstrqc2, cudaFuncCachePreferL1));
+    cudaFuncSetCacheConfig(dstrqc2, cudaFuncCachePreferL1);
     dstrqc2<<<grid, block, 0, St>>>(xx, yy, zz, xy, xz, yz, r1, r2, r3, r4, r5, r6, u1, v1, w1,
                             lam, mu, qp, qs, dcrjx, dcrjy, dcrjz, coeff, vx1, vx2, ww, wwo, 
                             s_i, e_i, s_j, e_j, d_i);
@@ -2783,23 +2818,34 @@ void dstrqc2_H(_prec*  xx, _prec*  yy,    _prec*  zz,    _prec*  xy,    _prec*
     return;
 }
 
-void intp3d_H(_prec *u1l, _prec*  v1l, _prec *w1l, _prec *xxl, _prec *yyl, _prec *zzl, 
-        _prec *xyl, _prec * xzl, _prec*  yzl,
-        _prec *u1h, _prec *v1h, _prec*  w1h, _prec *xxh, _prec *yyh, _prec *zzh, 
-        _prec *xyh, _prec *xzh, _prec*  yzh,
+extern "C" /* host wrapper: launches the intp3d kernel on stream St with the low-/high-grid arrays */
+void intp3d_H(_prec *u1l, float* v1l, _prec *w1l, _prec *xxl, _prec *yyl, _prec *zzl, 
+        _prec *xyl, _prec * xzl, float* yzl,
+        _prec *u1h, _prec *v1h, float* w1h, _prec *xxh, _prec *yyh, _prec *zzh, 
+        _prec *xyh, _prec *xzh, float* yzh,
         int nxtl, int nytl, int rank, cudaStream_t St, int d_i) {
 
     /* here, d_i is the grid number of the "low" grid, to which xzl, yzl, and w1l pertain */
 
     dim3 block (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1);
     dim3 grid ((nxtl+BLOCK_SIZE_X+ngsl2-1)/BLOCK_SIZE_X, (nytl+BLOCK_SIZE_Y+ngsl2-1)/BLOCK_SIZE_Y,1);
-    CUCHK(cudaFuncSetCacheConfig(intp3d, cudaFuncCachePreferL1));
+    /*cudaEvent_t start, stop;
+    _prec duration = 0;*/
+    CUCHK(cudaFuncSetCacheConfig(intp3d, cudaFuncCachePreferL1)); /* keep the return-code check, as the update_swapzone_*_H wrappers do */
     //cudaPrintfInit();
+   
+    /*cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start);*/
     intp3d<<<grid, block, 0, St>>>(u1l,v1l,w1l,xxl,yyl,zzl,xyl,xzl,yzl,
                                    u1h,v1h,w1h,xxh,yyh,zzh,xyh,xzh,yzh,
                                    rank,d_i); 
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    /*cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&duration, start, stop);
+    fprintf(stdout, "Time for intp3d: %f ms\n", duration);*/
+    /*cudaPrintfDisplay(stdout, 1);
+    cudaPrintfEnd();*/
     return;
 }
 
@@ -2816,7 +2862,7 @@ __device__ _prec bgcaccess(_prec *dsub, int varpos, _prec *buf_L, _prec *buf_R,
    register int xs_left = -WWL, xs_right = d_nxt[d_i]+4+ngsl2-6;  /*check this ! */
    register int ys_front = -WWL, ys_back = d_nyt[d_i]+4+ngsl2-6; 
    register int ys_lr = -WWL;
-   register int zs=align+2, ze=align+7;
+   register int zs=align+1, ze=align+8;
    _prec nval;
 
    blr_slice_1  = (d_nyt[d_i]+4+ngsl2+2*WWL)*(ze-zs+1);
@@ -2851,8 +2897,8 @@ __device__ _prec bgcaccess(_prec *dsub, int varpos, _prec *buf_L, _prec *buf_R,
    return(nval);
 }
 
-__global__ void swap(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*  xzl,_prec*  yzl,_prec*  u1l, _prec*  v1l, _prec*  w1l,
-                     _prec * xxh, _prec*  yyh, _prec*  zzh, _prec*  xyh, _prec*  xzh, _prec*  yzh,_prec*  u1h, _prec*  v1h, _prec*  w1h, 
+__global__ void swap(_prec * xxl, float* yyl, float* zzl, float* xyl,float* xzl,float* yzl,float* u1l, float* v1l, float* w1l,
+                     _prec * xxh, float* yyh, float* zzh, float* xyh, float* xzh, float* yzh,float* u1h, float* v1h, float* w1h, 
                      _prec *buf_L, _prec *buf_R, _prec *buf_F, _prec *buf_B, int rank, int d_i) {
     register int i,j,k,ih,jh,kh,posl,posh,ii,jj,poshij;
     register _prec sum1, sum2, sum3;
@@ -3029,8 +3075,8 @@ __global__ void swap(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*
     return;
 }
 
-__global__ void swap3(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*  xzl,_prec*  yzl,_prec*  u1l, _prec*  v1l, _prec*  w1l,
-                     _prec * xxh, _prec*  yyh, _prec*  zzh, _prec*  xyh, _prec*  xzh, _prec*  yzh,_prec*  u1h, _prec*  v1h, _prec*  w1h, 
+__global__ void swap3(_prec * xxl, float* yyl, float* zzl, float* xyl,float* xzl,float* yzl,float* u1l, float* v1l, float* w1l,
+                     _prec * xxh, float* yyh, float* zzh, float* xyh, float* xzh, float* yzh,float* u1h, float* v1h, float* w1h, 
                      _prec *buf_L, _prec *buf_R, _prec *buf_F, _prec *buf_B, int rank, int d_i) {
     register int i,j,k,ih,jh,kh,posl,posh,ii,jj,kk,poshij;
     register _prec sum1, sum2, sum3;
@@ -3038,7 +3084,7 @@ __global__ void swap3(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*
     register long int b_offset, bpos, bposij;
     //register int zs=2, ze=7;
     register double ttlwght2=0., ttlwght3=0., ttlwght4=0.;
-    register int wwl_kk2=2, wwl_kk3, wwl_kk4;
+    register int wwl_kk2=1, wwl_kk3, wwl_kk4;
 
     /*b_slice_1  = (2+ngsl+WWL)*(ze-zs+1);
     b_yline_1  = ze-zs+1;
@@ -3048,7 +3094,7 @@ __global__ void swap3(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*
     j = blockIdx.y*blockDim.y+threadIdx.y+ngsl;
 
     if (WWL >= 3) wwl_kk3 = 3; else wwl_kk3=WWL;
-    if (WWL >= 4) wwl_kk3 = 4; else wwl_kk4=WWL;
+    if (WWL >= 4) wwl_kk4 = 4; else wwl_kk4=WWL;
 
     for(jj=-WWL;jj<=WWL;jj++)
       for(ii=-WWL;ii<=WWL;ii++){
@@ -3216,20 +3262,21 @@ __global__ void swap3(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*
     return;
 }
 
-void swap_H(_prec * xxl, _prec*  yyl, _prec*  zzl, _prec*  xyl,_prec*  xzl,_prec*  yzl,_prec*  u1l, _prec*  v1l, _prec*  w1l,
-            _prec * xxh, _prec*  yyh, _prec*  zzh, _prec*  xyh, _prec*  xzh, _prec*  yzh,_prec*  u1h, _prec*  v1h, _prec*  w1h, 
+extern "C" /* host wrapper: launches the swap3 kernel on stream St with the low-/high-grid arrays and the four swap buffers */
+void swap_H(_prec * xxl, float* yyl, float* zzl, float* xyl,float* xzl,float* yzl,float* u1l, float* v1l, float* w1l,
+            _prec * xxh, float* yyh, float* zzh, float* xyh, float* xzh, float* yzh,float* u1h, float* v1h, float* w1h, 
             int nxtl,int nytl, _prec *buf_L, _prec *buf_R, _prec *buf_F, _prec *buf_B, int rank, cudaStream_t St, int d_i) {
 
     /* here, d_i is the grid number of the "high" grid, to which xxh, yyh, ... pertain */
 
     dim3 block (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1);
     dim3 grid ((nxtl+BLOCK_SIZE_X+ngsl-1)/(BLOCK_SIZE_X), (nytl+BLOCK_SIZE_Y+ngsl-1)/(BLOCK_SIZE_Y),1);
-    CUCHK(cudaFuncSetCacheConfig(swap, cudaFuncCachePreferL1));
-    //cudaPrintfInit();
+    CUCHK(cudaFuncSetCacheConfig(swap3, cudaFuncCachePreferL1)); /* configure swap3, the kernel actually launched below (was: swap) */
+    /*cudaPrintfInit();*/
     swap3<<<grid,block,0,St>>>(xxl,yyl,zzl,xyl,xzl,yzl,u1l,v1l,w1l,xxh,yyh,zzh,xyh,xzh,yzh,u1h,v1h,w1h,
                          buf_L, buf_R, buf_F, buf_B, rank, d_i);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    /*cudaPrintfDisplay(stdout, 1);
+    cudaPrintfEnd();*/
     return;
 }
 
@@ -3237,7 +3284,7 @@ __global__ void print_nonzero(_prec *array, int nx, int ny, int nz, int d_i)
 {
     int ix, iy, iz;
 
-    //cuPrintf("nonzeros in grid %d: =====================================================\n", d_i);
+    cuPrintf("nonzeros in grid %d: =====================================================\n", d_i);
     for (iz=0; iz<nz; iz++)
     {
         for (iy=0; iy<ny; iy++)
@@ -3249,8 +3296,8 @@ __global__ void print_nonzero(_prec *array, int nx, int ny, int nz, int d_i)
                 // {
                 //     printf("|   [%d][%d][%d] = %+le\n", ix, iy, iz, array[idx]);
                 // }
-                //if (array[idx] != 0.0)
-                   //cuPrintf("%d %d %d %e\n", ix, iy, iz, array[idx]);
+                if (array[idx] != 0.0)
+                   cuPrintf("%d %d %d %e\n", ix, iy, iz, array[idx]);
             }
         }
     }
@@ -3258,10 +3305,10 @@ __global__ void print_nonzero(_prec *array, int nx, int ny, int nz, int d_i)
 
 void print_nonzero_H(_prec *array, int nx, int ny, int nz, int d_i)
 {
-    //cudaPrintfInit();
+    cudaPrintfInit(); /* cuPrintf helper, called before the single-thread print_nonzero launch */
     print_nonzero<<<1,1>>>(array, nx, ny, nz, d_i);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    cudaPrintfDisplay(stdout, 1); /* NOTE(review): presumably writes the kernel's cuPrintf output to stdout -- confirm against the cuPrintf helper */
+    cudaPrintfEnd(); /* NOTE(review): unlike print_nonzero_mat_H and print_nan_H, this wrapper gets no extern "C" in this patch -- confirm that is intended */
 }
 
 __global__ void print_nonzero_mat(_prec *array, int nx, int ny, int nz, int d_i,
@@ -3269,26 +3316,27 @@ __global__ void print_nonzero_mat(_prec *array, int nx, int ny, int nz, int d_i,
 {
     int ix, iy, iz;
 
-    //cuPrintf("nonzeros in grid %d: =====================================================\n", d_i);
+    cuPrintf("nonzeros in grid %d: =====================================================\n", d_i);
     for (iz=0; iz<nz; iz++) {
         for (iy=0; iy<ny; iy++) {
             for (ix=0; ix<nx; ix++) {
                 int idx = ix*ny*nz + iy*nz + iz;
-                //if (array[idx] != 0.0)
-                   //cuPrintf("%d: mat @ %d %d %d: %e, %e, %e, %e, %e; val=%e\n", rank, ix, iy, iz, d1[idx], mu[idx], lam[idx], qp[idx], 
-                      //qs[idx], array[idx]);
+                if (array[idx] != 0.0)
+                   cuPrintf("%d: mat @ %d %d %d: %e, %e, %e, %e, %e; val=%e\n", rank, ix, iy, iz, d1[idx], mu[idx], lam[idx], qp[idx], 
+                      qs[idx], array[idx]);
             }
         }
     }
 }
 
+extern "C" /* host wrapper: runs print_nonzero_mat on a single thread (<<<1,1>>>) */
 void print_nonzero_mat_H(_prec *array, int nx, int ny, int nz, int d_i, 
      _prec *d1, _prec *mu, _prec *lam, _prec *qp, _prec *qs, int rank)
 {
-    //cudaPrintfInit();
+    cudaPrintfInit(); /* cuPrintf helper, called before the launch */
     print_nonzero_mat<<<1,1>>>(array, nx, ny, nz, d_i, d1, mu, lam, qp, qs, rank);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    cudaPrintfDisplay(stdout, 1); /* NOTE(review): presumably writes the kernel's cuPrintf output to stdout -- confirm */
+    cudaPrintfEnd(); /* cuPrintf helper, called after the output has been displayed */
 }
 
 __global__ void print_nan(_prec *array, int nx, int ny, int nz, char *vname)
@@ -3302,40 +3350,42 @@ __global__ void print_nan(_prec *array, int nx, int ny, int nz, char *vname)
             for (ix=0; ix<nx; ix++)
             {
                 int idx = ix*ny*nz + iy*nz + iz;
-                //if (array[idx] != array[idx])
-                   //cuPrintf("%s(%d,%d,%d)=%e\n", vname, ix, iy, iz, array[idx]);
+                if (array[idx] != array[idx])
+                   cuPrintf("%s(%d,%d,%d)=%e\n", vname, ix, iy, iz, array[idx]);
             }
         }
     }
 }
 
+extern "C" /* host wrapper: runs print_nan on a single thread (<<<1,1>>>) */
 void print_nan_H(_prec *array, int nx, int ny, int nz, char *vname)
 {
-    //cudaPrintfInit();
+    cudaPrintfInit(); /* cuPrintf helper, called before the launch */
     print_nan<<<1,1>>>(array, nx, ny, nz, vname);
-    //cudaPrintfDisplay(stdout, 1);
-    //cudaPrintfEnd();
+    cudaPrintfDisplay(stdout, 1); /* NOTE(review): presumably writes the kernel's cuPrintf output to stdout -- confirm */
+    cudaPrintfEnd(); /* cuPrintf helper, called after the output has been displayed */
 }
 
-void update_swapzone_buffer_x_H(_prec*  u1, _prec*  v1, _prec*  w1, _prec*  xx, _prec*  yy, _prec*  zz, _prec *xy, _prec *xz, _prec *yz, 
+extern "C" /* host wrapper: launches update_swapzone_buffer_x twice, once per side (buf_L on St1, buf_R on St2) */
+void update_swapzone_buffer_x_H(float* u1, float* v1, float* w1, float* xx, float* yy, float* zz, _prec *xy, _prec *xz, _prec *yz, 
    _prec *buf_L, _prec *buf_R, int nyt, cudaStream_t St1, cudaStream_t St2, int rank_L, int rank_R, int zs, int ze, int d_i) {
      if(rank_L==-1 && rank_R==-1) return;
 
      dim3 block (1, BLOCK_SIZE_Y, 1);
      dim3 grid (1, (nyt+4+ngsl2+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-     //cudaPrintfInit();
+     /*cudaPrintfInit();*/
      CUCHK(cudaFuncSetCacheConfig(update_swapzone_buffer_x, cudaFuncCachePreferL1));
      update_swapzone_buffer_x<<<grid, block, 0, St1>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_L, rank_L, Left, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() ); /* check the Left launch issued on St1 */
      update_swapzone_buffer_x<<<grid, block, 0, St2>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_R, rank_R, Right, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
-     //cudaPrintfDisplay(stdout, 1);
-     //cudaPrintfEnd();
+     CUCHK( cudaGetLastError() ); /* check the Right launch issued on St2 */
+     /*cudaPrintfDisplay(stdout, 1);
+     cudaPrintfEnd();*/
      return;
 }
 
 /* buffer exchanged for the swap area */
-__global__ void update_swapzone_buffer_x(_prec*  u1, _prec*  v1, _prec*  w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
+__global__ void update_swapzone_buffer_x(float* u1, float* v1, float* w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
    _prec *buf, int rank, int flag, int zs, int ze, int d_i)
 {
     register int i, j, k, pos, bpos;
@@ -3405,25 +3455,26 @@ __global__ void update_swapzone_buffer_x(_prec*  u1, _prec*  v1, _prec*  w1, _pr
     return;
 }
 
-void update_swapzone_data_x_H(_prec*  u1, _prec*  v1, _prec*  w1, _prec*  xx, _prec*  yy, _prec*  zz, _prec *xy, _prec *xz, _prec *yz, 
+extern "C" /* host wrapper: launches update_swapzone_data_x twice, once per side (buf_L on St1, buf_R on St2) */
+void update_swapzone_data_x_H(float* u1, float* v1, float* w1, float* xx, float* yy, float* zz, _prec *xy, _prec *xz, _prec *yz, 
    _prec *buf_L, _prec *buf_R, int nyt, cudaStream_t St1, cudaStream_t St2, int rank_L, int rank_R, int zs, int ze, int d_i) {
      if(rank_L==-1 && rank_R==-1) return;
 
      dim3 block (1, BLOCK_SIZE_Y, 1);
      dim3 grid (1, (nyt+4+ngsl2+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,1);
-     //cudaPrintfInit();
+     /*cudaPrintfInit();*/
-     CUCHK(cudaFuncSetCacheConfig(update_swapzone_buffer_x, cudaFuncCachePreferL1));
+     CUCHK(cudaFuncSetCacheConfig(update_swapzone_data_x, cudaFuncCachePreferL1)); /* configure the kernel launched below (was: update_swapzone_buffer_x) */
      update_swapzone_data_x<<<grid, block, 0, St1>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_L, rank_L, Left, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() ); /* check the Left launch issued on St1 */
      update_swapzone_data_x<<<grid, block, 0, St2>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_R, rank_R, Right, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
-     //cudaPrintfDisplay(stdout, 1);
-     //cudaPrintfEnd();
+     CUCHK( cudaGetLastError() ); /* check the Right launch issued on St2 */
+     /*cudaPrintfDisplay(stdout, 1);
+     cudaPrintfEnd();*/
      return;
 }
 
 /* copy exchanged buffer data back to swap zone*/
-__global__ void update_swapzone_data_x(_prec*  u1, _prec*  v1, _prec*  w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
+__global__ void update_swapzone_data_x(float* u1, float* v1, float* w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
    _prec *buf, int rank, int flag, int zs, int ze, int d_i)
 {
     register int i, j, k, pos, bpos;
@@ -3496,25 +3547,26 @@ __global__ void update_swapzone_data_x(_prec*  u1, _prec*  v1, _prec*  w1, _prec
     return;
 }
 
-void update_swapzone_buffer_y_H(_prec*  u1, _prec*  v1, _prec*  w1, _prec*  xx, _prec*  yy, _prec*  zz, _prec *xy, _prec *xz, _prec *yz, 
+extern "C" /* host wrapper: launches update_swapzone_buffer_y twice, once per side (buf_F on St1, buf_B on St2) */
+void update_swapzone_buffer_y_H(float* u1, float* v1, float* w1, float* xx, float* yy, float* zz, _prec *xy, _prec *xz, _prec *yz, 
    _prec *buf_F, _prec *buf_B, int nxt, cudaStream_t St1, cudaStream_t St2, int rank_F, int rank_B, int zs, int ze, int d_i) {
      if(rank_F==-1 && rank_B==-1) return;
 
      dim3 block (BLOCK_SIZE_X, 1, 1);
      dim3 grid ((nxt+BLOCK_SIZE_X-1)/BLOCK_SIZE_X, 1,1);
-     //cudaPrintfInit();
+     /*cudaPrintfInit();*/
      CUCHK(cudaFuncSetCacheConfig(update_swapzone_buffer_y, cudaFuncCachePreferL1));
      update_swapzone_buffer_y<<<grid, block, 0, St1>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_F, rank_F, Front, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() ); /* check the Front launch issued on St1 */
      update_swapzone_buffer_y<<<grid, block, 0, St2>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_B, rank_B, Back, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
-     //cudaPrintfDisplay(stdout, 1);
-     //cudaPrintfEnd();
+     CUCHK( cudaGetLastError() ); /* check the Back launch issued on St2 */
+     /*cudaPrintfDisplay(stdout, 1);
+     cudaPrintfEnd();*/
      return;
 }
 
 /* buffer exchanged for the swap area along Y*/
-__global__ void update_swapzone_buffer_y(_prec*  u1, _prec*  v1, _prec*  w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
+__global__ void update_swapzone_buffer_y(float* u1, float* v1, float* w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
    _prec *buf, int rank, int flag, int zs, int ze, int d_i)
 {
     register int i, j, k, pos, bpos;
@@ -3584,25 +3636,26 @@ __global__ void update_swapzone_buffer_y(_prec*  u1, _prec*  v1, _prec*  w1, _pr
     return;
 }
 
-void update_swapzone_data_y_H(_prec*  u1, _prec*  v1, _prec*  w1, _prec*  xx, _prec*  yy, _prec*  zz, _prec *xy, _prec *xz, _prec *yz, 
+extern "C" /* host wrapper: launches update_swapzone_data_y twice, once per side (buf_F on St1, buf_B on St2) */
+void update_swapzone_data_y_H(float* u1, float* v1, float* w1, float* xx, float* yy, float* zz, _prec *xy, _prec *xz, _prec *yz, 
    _prec *buf_F, _prec *buf_B, int nxt, cudaStream_t St1, cudaStream_t St2, int rank_F, int rank_B, int zs, int ze, int d_i) {
      if(rank_F==-1 && rank_B==-1) return;
 
      dim3 block (BLOCK_SIZE_X, 1, 1);
      dim3 grid ((nxt+BLOCK_SIZE_X-1)/BLOCK_SIZE_X, 1,1);
-     //cudaPrintfInit();
+     /*cudaPrintfInit();*/
-     CUCHK(cudaFuncSetCacheConfig(update_swapzone_buffer_y, cudaFuncCachePreferL1));
+     CUCHK(cudaFuncSetCacheConfig(update_swapzone_data_y, cudaFuncCachePreferL1)); /* configure the kernel launched below (was: update_swapzone_buffer_y) */
      update_swapzone_data_y<<<grid, block, 0, St1>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_F, rank_F, Front, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
+     CUCHK( cudaGetLastError() ); /* check the Front launch issued on St1 */
      update_swapzone_data_y<<<grid, block, 0, St2>>>(u1, v1, w1, xx, yy, zz, xy, xz, yz, buf_B, rank_B, Back, zs, ze, d_i);
-      CUCHK(cudaGetLastError()) ;
-     //cudaPrintfDisplay(stdout, 1);
-     //cudaPrintfEnd();
+     CUCHK( cudaGetLastError() ); /* check the Back launch issued on St2 */
+     /*cudaPrintfDisplay(stdout, 1);
+     cudaPrintfEnd();*/
      return;
 }
 
 /* copy exchanged buffer data back to swap zone*/
-__global__ void update_swapzone_data_y(_prec*  u1, _prec*  v1, _prec*  w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
+__global__ void update_swapzone_data_y(float* u1, float* v1, float* w1, _prec *xx, _prec *yy, _prec *zz, _prec *xy, _prec *xz, _prec *yz,
    _prec *buf, int rank, int flag, int zs, int ze, int d_i)
 {
     register int i, j, k, pos, bpos;
@@ -3674,10 +3727,11 @@ __global__ void update_swapzone_data_y(_prec*  u1, _prec*  v1, _prec*  w1, _prec
     return;
 }
 
-void addkinsrc_H(int i,   int dim,    int* psrc,  int npsrc,  cudaStream_t St, _prec*  mu,
-              _prec*  axx, _prec*  ayy,    _prec*  azz, _prec*  axz, _prec*  ayz, _prec*  axy,
-              _prec*  xx,  _prec*  yy,     _prec*  zz,  _prec*  xy,  _prec*  yz,  _prec*  xz, 
-              _prec*  mom, double *srcfilt_d, int d_i)
+extern "C"
+void addkinsrc_H(int i,   int dim,    int* psrc,  int npsrc,  cudaStream_t St, float* mu,
+              float* axx, float* ayy,    float* azz, float* axz, float* ayz, float* axy,
+              float* xx,  float* yy,     float* zz,  float* xy,  float* yz,  float* xz, 
+              float* mom, double *srcfilt_d, int d_i)
 {
     dim3 grid, block;
     if(npsrc < 256)
@@ -3693,12 +3747,12 @@ void addkinsrc_H(int i,   int dim,    int* psrc,  int npsrc,  cudaStream_t St, _
     cudaError_t cerr;
     cerr=cudaGetLastError();
     if(cerr!=cudaSuccess) printf("CUDA ERROR: addkinsrc before kernel: %s\n",cudaGetErrorString(cerr));
-    /*cudaPrintfInit();*/
+    //cudaPrintfInit();
     addkinsrc_cu<<<grid, block, 0, St>>>(i, dim, psrc, npsrc, mu, axx, ayy, azz, axz, ayz, axy,
                                       xx, yy, zz,  xy,   yz,  xz, mom, srcfilt_d, d_i);
     cerr=cudaGetLastError();
     /*cudaPrintfDisplay(stdout, 1);
-    CUCHK(cudaPrintfEnd());*/
+    cudaPrintfEnd();*/
     if(cerr!=cudaSuccess) printf("CUDA ERROR: addkinsrc after kernel: %s\n",cudaGetErrorString(cerr));
     return;
 }
@@ -3757,6 +3811,28 @@ __device__ _prec liu(_prec tau, _prec time){
    return(stf);
 }
 
+/* GP 2010 source time function, revised based on Liu et al. (2006). tau = risetime; time = time since onset (addkinsrc_cu passes atime - ruptime). Returns 0 for time < 0 or time >= tau. */
+__device__ _prec gp10(_prec tau, _prec time){
+   register _prec tau1, tau2, CN, stf;
+
+   tau1 = 0.13 * tau; /* length of the first sub-interval: 13% of the rise time */
+   tau2 = tau-tau1; /* remainder of the rise time */
+
+   CN=M_PI / (1.5 * M_PI*tau1 + 1.2*tau1 + 0.2 * M_PI * tau2); /* scale factor shared by the three non-zero branches; makes the function integrate to one over [0, tau) */
+   if (time < 0.)
+      stf = 0.; /* before onset */
+   else if (time < tau1)
+      stf = CN*(0.7 - 0.7*cosf(M_PI*time/tau1) + 0.6*sinf(0.5*M_PI*time/tau1)); /* 0 <= time < tau1 */
+   else if (time < 2*tau1)
+      stf = CN*(1.0 - 0.8*cosf(M_PI*time/tau1) + 0.2*cosf(M_PI*(time-tau1)/tau2)); /* tau1 <= time < 2*tau1 */
+   else if (time < tau)
+      stf = CN*(0.2 + 0.2*cosf(M_PI*(time-tau1) / tau2)); /* 2*tau1 <= time < tau */
+   else
+      stf = 0.; /* at or after the end of the rise time */
+
+   return(stf);
+}
+
 /* 1-D FIR or IIR filter, modeled after scipy implementation of lfilter (Daniel) */
 __device__ double lfilter(int order, double *b, double *a, double x, double *d){
    register int n; 
@@ -3772,10 +3848,10 @@ __device__ double lfilter(int order, double *b, double *a, double x, double *d){
    return y;
 }
 
-__global__ void addkinsrc_cu(int i, int dim,    int* psrc,  int npsrc, _prec*  mu,
-                          _prec*  axx, _prec*  ayy,    _prec*  azz, _prec*  axz, _prec*  ayz, _prec*  axy,
-                          _prec*  xx,  _prec*  yy,     _prec*  zz,  _prec*  xy,  _prec*  yz,  _prec*  xz,  
-                          _prec*  mom, double *d_srcfilt_d, int d_i)
+__global__ void addkinsrc_cu(int i, int dim,    int* psrc,  int npsrc, float* mu,
+                          float* axx, float* ayy,    float* azz, float* axz, float* ayz, float* axy,
+                          float* xx,  float* yy,     float* zz,  float* xy,  float* yz,  float* xz,  
+                          float* mom, double *d_srcfilt_d, int d_i)
 {
 
         register _prec vtst;
@@ -3817,14 +3893,16 @@ __global__ void addkinsrc_cu(int i, int dim,    int* psrc,  int npsrc, _prec*  m
 	   }
 	   else if (stf_type == 2.0f)
 	      stf = liu(risetime, atime - ruptime);
-	   else 
+	   else if (stf_type == 3.0f)
+        stf = gp10(risetime, atime - ruptime);
+     else
 	      stf = 0.;               
 
            if (d_filtorder > 0)
-	      stf = (_prec) lfilter(d_filtorder, d_srcfilt_b, d_srcfilt_a, (double) stf, 
+	      stf = (float) lfilter(d_filtorder, d_srcfilt_b, d_srcfilt_a, (double) stf, 
 		   d_srcfilt_d+j*(d_filtorder+1));
 	   
-	   vtst = (_prec)d_DT/(d_DH[d_i]*d_DH[d_i]*d_DH[d_i]);
+	   vtst = (float)d_DT/(d_DH[d_i]*d_DH[d_i]*d_DH[d_i]);
 
 	   idx = psrc[j*dim]   + 1 + ngsl;
 	   idy = psrc[j*dim+1] + 1 + ngsl;
@@ -3837,14 +3915,14 @@ __global__ void addkinsrc_cu(int i, int dim,    int* psrc,  int npsrc, _prec*  m
            //cuPrintf("stf: %d %e %e %e %e\n", j, atime, stf, slip, area);
            //cuPrintf("mom: %d %e\n", j, mom[j]);
 
-           //if (j == 0)
-   	      /*cuPrintf("addkinsrc_cu: (%d,%d,%d) (%e, %e,%e,%e,%e,%e,%e)\n", idx, idy, idz, 
-	         stf, axxt, ayyt, azzt, axzt, ayzt, axyt);*/
-	      /*cuPrintf("addkinsrc_cu: (%d,%d,%d) (%e, %e)\n", idx, idy, idz, 
-	         stf, 1./mu[pos]);*/
-
            stf *= vtst;
 
+           /*if (j == 0)
+   	      cuPrintf("addkinsrc_cu: (%d,%d,%d) (%e, %e,%e,%e,%e,%e,%e)\n", idx, idy, idz, 
+	         stf, axxt, ayyt, azzt, axzt, ayzt, axyt);
+	      cuPrintf("addkinsrc_cu: (%d,%d,%d) (%e, %e, %e m^2, %f m)\n", idx, idy, idz, 
+	         stf, 1./mu[pos], area, slip);*/
+
 	   xx[pos] = xx[pos] - stf*axxt;
 	   yy[pos] = yy[pos] - stf*ayyt;
 	   zz[pos] = zz[pos] - stf*azzt;
@@ -3857,10 +3935,11 @@ __global__ void addkinsrc_cu(int i, int dim,    int* psrc,  int npsrc, _prec*  m
         return;
 }
 
+extern "C"
 void addplanesrc_H(int i,  int dim,   int NST,  cudaStream_t St,
               _prec *mu, _prec *lambda, int ND, int nxt, int nyt, 
-              _prec*  axx, _prec*  ayy,    _prec*  azz,
-              _prec*  xx,  _prec*  yy,     _prec*  zz,  _prec*  xy,  _prec*  yz,  _prec*  xz, int d_i){
+              float* axx, float* ayy,    float* azz,
+              float* xx,  float* yy,     float* zz,  float* xy,  float* yz,  float* xz, int d_i){
 
     dim3 grid, block;
     int nx, ny;
@@ -3888,15 +3967,15 @@ void addplanesrc_H(int i,  int dim,   int NST,  cudaStream_t St,
     addplanesrc_cu<<<grid, block, 0, St>>>(i, dim, NST, mu, lambda, ND, axx, ayy, azz, 
                                       xx, yy, zz, xy, yz, xz, d_i);
     /*CUCHK(cudaPrintfDisplay(stdout, 1));
-    CUCHK(cudaPrintfEnd());*/
+    cudaPrintfEnd();*/
     cerr=cudaGetLastError();
     if(cerr!=cudaSuccess) printf("CUDA ERROR: addplanesrc after kernel: %s\n",cudaGetErrorString(cerr));
 }
 
 
-__global__ void addplanesrc_cu(int n, int dim,  int NST, _prec*  mu, _prec*  lambda, int ND,
-                          _prec*  axx, _prec*  ayy,    _prec*  azz,
-                          _prec*  xx,  _prec*  yy,     _prec*  zz,  _prec*  xy,  _prec*  yz,  _prec*  xz,  
+__global__ void addplanesrc_cu(int n, int dim,  int NST, float* mu, float* lambda, int ND,
+                          float* axx, float* ayy,    float* azz,
+                          float* xx,  float* yy,     float* zz,  float* xy,  float* yz,  float* xz,  
                           int d_i)
 {
         register int j, i, k, pos;
@@ -3905,9 +3984,9 @@ __global__ void addplanesrc_cu(int n, int dim,  int NST, _prec*  mu, _prec*  lam
         i = blockIdx.x*blockDim.x+threadIdx.x + 4;
         j = blockIdx.y*blockDim.y+threadIdx.y + 4;
 
-        vtst = (_prec) d_DT/d_DH[d_i];
+        vtst = (float) d_DT/d_DH[d_i];
 
-        k = align + ND + 1;
+        k = align + ND + 9; //chosing value consistent with CPU code
 
         pos = i*d_slice_1[d_i] + j*d_yline_1[d_i] + k;
 
@@ -3924,8 +4003,9 @@ __global__ void addplanesrc_cu(int n, int dim,  int NST, _prec*  mu, _prec*  lam
         return;
 }
 
-void velbuffer_H(const float *u1, const float *v1, const float *w1, const float *neta,
-       float *Bufx, float *Bufy, float *Bufz, float *Bufeta, int NVE, 
+extern "C"
+void velbuffer_H(const _prec *u1, const _prec *v1, const _prec *w1, const _prec *neta,
+       _prec *Bufx, _prec *Bufy, _prec *Bufz, _prec *Bufeta, int NVE, 
        int nbgx, int nedx, int nskpx, int nbgy, int nedy, int nskpy, int nbgz, int nedz, int nskpz,
        int rec_nxt, int rec_nyt, int rec_nzt, cudaStream_t St, int FOLLOWBATHY, const int* bathy, int d_i){
 
@@ -3936,23 +4016,23 @@ void velbuffer_H(const float *u1, const float *v1, const float *w1, const float
 
     cudaFuncSetCacheConfig(velbuffer, cudaFuncCachePreferL1);
     CUCHK(cudaGetLastError());
-    /*CUCHK(cudaPrintfInit());*/
     velbuffer <<<grid, block, 0, St>>>(u1, v1, w1, neta, Bufx, Bufy, Bufz, Bufeta, NVE, 
          nbgx, nedx, nskpx, nbgy, nedy, nskpy, nbgz, nedz, nskpz, rec_nxt, rec_nyt, FOLLOWBATHY, bathy, d_i);
-    /*CUCHK(cudaPrintfDisplay(stdout, 1));
-    cudaPrintfEnd();*/
 
-    CUCHK(cudaGetLastError());
+    cudaError_t cerr;
+    CUCHK(cerr=cudaGetLastError());
+
+    if(cerr!=cudaSuccess) printf("CUDA ERROR: velbuffer_H after kernel: %s\n",cudaGetErrorString(cerr));
 }
 
-__global__ void velbuffer(const float *u1, const float *v1, const float *w1, const float *neta,
-       float *Bufx, float *Bufy, float *Bufz, float *Bufeta, int NVE, 
+__global__ void velbuffer(const _prec *u1, const _prec *v1, const _prec *w1, const _prec *neta,
+       _prec *Bufx, _prec *Bufy, _prec *Bufz, _prec *Bufeta, int NVE, 
        int nbgx, int nedx, int nskpx, int nbgy, int nedy, int nskpy, int nbgz, int nedz, int nskpz,
        int rec_nxt, int rec_nyt, int FOLLOWBATHY, const int *bathy, int d_i)
 {
-    register int i, j, k, ko, koz;
+    register int i, j, k, ko;
 
-    int tmpInd, pos, posz, bpos;
+    int tmpInd, pos, bpos;
 
     i = 2+ngsl+nbgx + (blockIdx.x*blockDim.x+threadIdx.x) * nskpx;
     j = 2+ngsl+nbgy + (blockIdx.y*blockDim.y+threadIdx.y) * nskpy;
@@ -3961,51 +4041,38 @@ __global__ void velbuffer(const float *u1, const float *v1, const float *w1, con
     if (i > 2+ngsl+nedx) return;
     if (j > 2+ngsl+nedy) return;
     if (k > nedz) return;
-   
-/*    if (FOLLOWBATHY && d_i == 0){
-       bpos=j*(d_nxt[0]+4+ngsl2)+i;
-       ko=bathy[bpos] - k;
+    //This implementation assumes topography is turned on.
+    //Vx and Vy at k=d_nzt[0]+align-1 can be used.
+    //Vz at k=d_nzt[0]+align-1 is always zero and should be avoided, so the Vz one grid point below is output instead.
+    register int koxy, koz, posxy, posz, sfcidx;
+    if (FOLLOWBATHY && d_i == 0)
+    {
+    bpos=j*(d_nxt[0]+4+ngsl2)+i;
+    sfcidx=bathy[bpos];
     }
-    else ko=d_nzt[d_i]+align-1-k;
-                      
-    pos = i*d_slice_1[d_i]+j*d_yline_1[d_i]+ko;
-*/
-
-    if(d_i > 0)
-        {
-        ko=d_nzt[d_i]+align-1-k;
-        koz=ko;
-        }
-
     else
-        {
-        if(FOLLOWBATHY == 0)
-                {
-                ko=d_nzt[d_i]+align-1-k;
-                koz=ko-1;
-                }
-        else if(FOLLOWBATHY == 1)
-                {
-                bpos=j*(d_nxt[0]+4+ngsl2)+i;
-                ko=bathy[bpos] - k;
-                if(bathy[bpos] == d_nzt[d_i]+align-1 )koz=ko-1;
-                if(bathy[bpos] < d_nzt[d_i]+align-1 )koz=ko;
-                }
-        }
-
-     pos = i*d_slice_1[d_i]+j*d_yline_1[d_i]+ko;
-     posz = i*d_slice_1[d_i]+j*d_yline_1[d_i]+koz;
+    {
+    sfcidx=d_nzt[d_i]+align-1;
+    }
 
-    tmpInd =  (k - nbgz)/nskpz*rec_nxt*rec_nyt + 
-	       (j-2-ngsl-nbgy)/nskpy*rec_nxt + 
-	       (i-2-ngsl-nbgx)/nskpx;
+    koxy = sfcidx - k;
+    koz = sfcidx - k - 1;
 
-    /*if (i==48 && j==48 && k==1) 
-        cuPrintf("velbuffer: i=%d,j=%d,k=%d,pos=%d,tmpInd=%d,u1=%e\n", i,j,k,pos,tmpInd,u1[pos]);*/
+    posxy = i*d_slice_1[d_i]+j*d_yline_1[d_i]+koxy;
+    posz = i*d_slice_1[d_i]+j*d_yline_1[d_i]+koz;
 
-    Bufx[tmpInd] = u1[pos];
-    Bufy[tmpInd] = v1[pos];
+    tmpInd =  (k - nbgz)/nskpz*rec_nxt*rec_nyt +
+           (j-2-ngsl-nbgy)/nskpy*rec_nxt +
+           (i-2-ngsl-nbgx)/nskpx;
+    Bufx[tmpInd] = u1[posxy];
+    Bufy[tmpInd] = v1[posxy];
     Bufz[tmpInd] = w1[posz];
 
-    if (NVE == 3) Bufeta[tmpInd] = neta[pos];
+    if (NVE == 3) Bufeta[tmpInd] = neta[posz];
+   
 }
+
+
+
+
+
diff --git a/src/awp/mesh.c b/src/awp/mesh.c
index 61e58f0..1d3a593 100644
--- a/src/awp/mesh.c
+++ b/src/awp/mesh.c
@@ -24,6 +24,7 @@ void inimesh(int rank, int MEDIASTART, Grid3D d1, Grid3D mu, Grid3D lam, Grid3D
 	     Grid3D tau, Grid3D weights,Grid1D coeff, 
 	     int nvar, _prec FP,  _prec FAC, _prec Q0, _prec EX, int nxt, int nyt, int nzt, int PX, int PY, int NX, int NY, 
              int NZ, int *coords, MPI_Comm MCW, int IDYNA, int NVE, int SoCalQ, char *INVEL, 
+            _prec qsi, _prec qpqsr, _prec maxvpvsr, _prec vmin, _prec vmax, _prec dmin, 
              _prec *vse, _prec *vpe, _prec *dde)
 {
   int i,j,k,err;
@@ -276,40 +277,83 @@ void inimesh(int rank, int MEDIASTART, Grid3D d1, Grid3D mu, Grid3D lam, Grid3D
         for(j=0;j<nyt;j++)
           for(k=0;k<nzt;k++)
           {
-	    //             tmpvs[i][j][k] = tmpvs[i][j][k]*(1+ ( log(w2/w0) )/(pi*tmpsq[i][j][k]) );
-            // tmpvp[i][j][k] = tmpvp[i][j][k]*(1+ ( log(w2/w0) )/(pi*tmppq[i][j][k]) );
-        //    tmpsq[i][j][k] = 10000.;
-        //    tmppq[i][j][k] = 10000.;
-        //      tmpsq[i][j][k] = 200;
-        //      tmppq[i][j][k] = 200;
-        //   if(tmpvs[i][j][k]>0.0){
-        //   if(tmpvs[i][j][k]<2500.0){
-        //      vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
-        //      tmpvs[i][j][k]=2500.0;
-        //      tmpvp[i][j][k]=tmpvs[i][j][k]*vpvs;
-        //      }
-        //      }
-             /*if(tmpvs[i][j][k]>0.0){
-             if(tmpvs[i][j][k]<800.0){
-                vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
-                tmpvs[i][j][k]=800.0;
-                tmpvp[i][j][k]=tmpvs[i][j][k]*vpvs;
-                }
-                }*/
-	    /*if(tmpvs[i][j][k]<333.33)
-	      {
-		//		tmpvs[i][j][k]=200.0;
-                //tmpvp[i][j][k]=600.0;
-		//                 tmpsq[i][j][k] = 20;                                                                                                                             
-                // tmppq[i][j][k] = 20;                                                                                                                             
-                tmpvp[i][j][k] = 1500.;
-                tmpvs[i][j][k] = 0.;
-                tmpdd[i][j][k] = 1025.;
-                tmpsq[i][j][k] = 50.;
-                tmppq[i][j][k] = 100.;
-	      }*/
-	    /*tmpsq[i][j][k] = 0.05  * tmpvs[i][j][k];
-	    tmppq[i][j][k] = 2.0   * tmpsq[i][j][k];*/
+
+
+    //for solid materials
+    if(tmpvs[i][j][k] > 1.f && tmpvp[i][j][k] > 1.f)
+    {
+        if(nvar == 3)
+        {
+            //assigning Q value
+            if(qsi <= 1.)
+            {
+            tmpsq[i][j][k]=tmpvs[i][j][k]*qsi;
+            }
+            else if(qsi > 1.)
+            {
+            tmpsq[i][j][k]=qsi;
+            }
+            tmppq[i][j][k]=tmpsq[i][j][k]*qpqsr;
+        }
+
+        //capping max vp/vs ratio
+        vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
+        if(vpvs > maxvpvsr)
+        {
+        tmpvs[i][j][k]=tmpvp[i][j][k]/maxvpvsr;
+        }
+
+        //enforce a minimum vp/vs ratio of 1.5 to avoid a negative lambda:
+        //if vp/vs <= 1.5, vs is reset to vp/1.5 so that the ratio becomes 1.5
+        vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
+        if(vpvs <= 1.5)
+        {
+        tmpvs[i][j][k]=tmpvp[i][j][k]/1.5;
+        }
+
+        //clamp vs to at least vmin and vp to at most vmax, preserving the vp/vs ratio
+        if(tmpvs[i][j][k] < vmin)
+        {
+        vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
+        tmpvs[i][j][k]=vmin;
+        tmpvp[i][j][k]=tmpvs[i][j][k]*vpvs;
+        }
+
+        if(tmpvp[i][j][k]>vmax)
+        {
+        vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
+        tmpvp[i][j][k]=vmax;
+        tmpvs[i][j][k]=tmpvp[i][j][k]/vpvs;
+        }
+
+
+        //constrain minimum density 
+        if(tmpdd[i][j][k]<dmin) tmpdd[i][j][k]=dmin;
+
+    }
+
+
+
+    //special treatment for water
+    if (tmpvs[i][j][k] < 1.f && tmpvp[i][j][k] > 1.f)
+    {
+    tmpvs[i][j][k] = 0.00001;
+    tmpvp[i][j][k] = 1492.;
+    tmpdd[i][j][k] = 1050.;
+    tmpsq[i][j][k] = 25.;
+    tmppq[i][j][k] = 10000.;
+    }
+
+    //enforce a minimum Q value of 25 for both tmppq and tmpsq
+    if (tmppq[i][j][k] <= 25.)
+    {
+    tmppq[i][j][k] = 25.;
+    }
+    if (tmpsq[i][j][k] <= 25.)
+    {
+    tmpsq[i][j][k] = 25.;
+    }
+
 
 	    if(tmppq[i][j][k]>200.0)
 	      {
@@ -402,46 +446,12 @@ void inimesh(int rank, int MEDIASTART, Grid3D d1, Grid3D mu, Grid3D lam, Grid3D
             #endif
 	    // QF - end  
 
-            if (tmpvs[i][j][k] == 0.f) {
-                //tmpvs[i][j][k] = 1.000;
-                tmpsq[i][j][k] = 50.;
-                tmppq[i][j][k] = 100.;
-            }
-
              if (SoCalQ==1)
              {
                 vpvs=tmpvp[i][j][k]/tmpvs[i][j][k];
                 if (vpvs<1.45)  tmpvs[i][j][k]=tmpvp[i][j][k]/1.45;
              }
-             /*if(tmpvp[i][j][k]>9000.0){
-                tmpvs[i][j][k]=5196.0;
-                tmpvp[i][j][k]=9000.0;
-                }*/
-
-             /*if(tmpvp[i][j][k]>7600.0){
-	       tmpvs[i][j][k]=4387.0;
-	       tmpvp[i][j][k]=7600.0;
-             }*/
-           /*if(tmpvs[i][j][k]<500.0){
-	      tmpvs[i][j][k]=500.0;
-	      if (tmpvp[i][j][k] < 725.0) tmpvp[i][j][k] = 725.0;
-           }*/
-               //if(tmpdd[i][j][k]<1700.0) tmpdd[i][j][k]=1700.0;
-
-
-             //if(tmpvs[i][j][k]<400.0)
-             /*if(tmpvs[i][j][k]<200.0)
-             {
-                //tmpvs[i][j][k]=400.0;
-                //tmpvp[i][j][k]=1200.0;
-                tmpvs[i][j][k]=200.0;
-                tmpvp[i][j][k]=600.0;
-             }*/
-             /*if(tmpvp[i][j][k]>6500.0){
-                tmpvs[i][j][k]=3752.0;
-                tmpvp[i][j][k]=6500.0;
-             }*/
-//             if(tmpdd[i][j][k]<1700.0) tmpdd[i][j][k]=1700.0;   
+
              mu[i+2+ngsl][j+2+ngsl][(nzt+align-1) - k]  = 1./(tmpdd[i][j][k]*tmpvs[i][j][k]*tmpvs[i][j][k]);
              lam[i+2+ngsl][j+2+ngsl][(nzt+align-1) - k] = 1./(tmpdd[i][j][k]*(tmpvp[i][j][k]*tmpvp[i][j][k]
                                                                               -2.*tmpvs[i][j][k]*tmpvs[i][j][k]));
@@ -1317,7 +1327,7 @@ void inidrpr_hoekbrown_light(int nxt, int nyt, int nzt, int nve, int *coords,
         }
 
         ypos = coords[1] * nyt + j - 2 - ngsl;
-        fltdist = fabsf((_prec) ypos + 0.5 - (_prec) fltpos) * dh;
+        fltdist = fabs((_prec) ypos + 0.5 - (_prec) fltpos) * dh;
         if (fltdist < 225.) {
           GSI_d = GSI_core;
         }
diff --git a/src/awp/pmcl3d.c b/src/awp/pmcl3d.c
index 69e5ae4..60ced1a 100644
--- a/src/awp/pmcl3d.c
+++ b/src/awp/pmcl3d.c
@@ -1,11 +1,11 @@
-/*  
+ /*  
 ********************************************************************************
 * pmcl3d.c                                                                     *
 * programming in C&CUDA language                                                    *
 * Author: Jun Zhou                                                             * 
 * First Version: Cerjan Mode and Homogenous                                    *
 ********************************************************************************
-*/  
+*/
 #include <sys/time.h>
 #include <time.h>
 #include <stdio.h>
@@ -24,501 +24,571 @@
 #include <awp/init.h>
 #include <topography/topography.h>
 #include <topography/velocity.cuh>
-#include <topography/stress_attenuation.cuh>
+#include <topography/stress.cuh>
 #include <topography/sources/sources.h>
 #include <topography/sources/forces.h>
 #include <topography/receivers/receivers.h>
 #include <topography/receivers/sgt.h>
 #include <topography/geometry/geometry.h>
+#include <topography/geometry.h>
+#include <topography/mms.cuh>
+#include <topography/energy.cuh>
 #include <buffers/buffer.h>
 
 #define VERBOSE 1
 
-int main(int argc,char **argv)
+// Uncomment the #define below to make MPI rank 0 print its process ID and
+// wait in a sleep loop so that gdb can be attached to it
+// #define GDB_ATTACH
+
+int main(int argc, char **argv)
 {
-//  variable definition begins
-    float TMAX, DH[MAXGRIDS], DT, ARBC, PHT;
-    int   NPC, ND, NSRC[MAXGRIDS], NST;
-    int   NVE, NVAR, MEDIASTART, IFAULT, READ_STEP, READ_STEP_GPU;
-    int   NX, NY, NZ[MAXGRIDS], PX, PY, IDYNA, SoCalQ, FOLLOWBATHY;
-    int   NBGX[MAXGRIDS], NEDX[MAXGRIDS], NSKPX[MAXGRIDS]; 
-    int   NBGY[MAXGRIDS], NEDY[MAXGRIDS], NSKPY[MAXGRIDS]; 
-    int   NBGZ[MAXGRIDS], NEDZ[MAXGRIDS], NSKPZ[MAXGRIDS];
-    int   nxt[MAXGRIDS], nyt[MAXGRIDS], nzt[MAXGRIDS];
-    MPI_Offset displacement[MAXGRIDS];
-    float FAC, Q0, EX, FP; 
-    char  INSRC[50], INVEL[50], OUT[50], INSRC_I2[50], CHKFILE[50];
-    char  insrcgrid[52], insrc_i2_grid[50];
-    double GFLOPS = 1.0;
-    double GFLOPS_SUM = 0.0;
-    Grid3D *u1=NULL, *v1=NULL, *w1=NULL;
-    Grid3D *d1=NULL, *mu=NULL, *lam=NULL;
-    Grid3D *xx=NULL, *yy=NULL, *zz=NULL, *xy=NULL, *yz=NULL, *xz=NULL;
-    Grid3D *r1=NULL, *r2=NULL, *r3=NULL, *r4=NULL, *r5=NULL, *r6=NULL;
-    Grid3D *qp=NULL, *qs=NULL;
-    PosInf *tpsrc=NULL;
-    Grid1D *taxx=NULL, *tayy=NULL, *tazz=NULL, *taxz=NULL, *tayz=NULL, *taxy=NULL; 
-    Grid1D *Bufx=NULL,coeff=NULL;
-    Grid1D *Bufy=NULL, *Bufz=NULL;
-    //Plasticity output buffers
-    Grid1D *Bufeta=NULL, *Bufeta2=NULL;
-    Grid3D *vx1=NULL,   *vx2=NULL,  *wwo=NULL,  *lam_mu=NULL;
-    Grid3Dww *ww=NULL;
-    Grid1D *dcrjx, *dcrjy, *dcrjz;
-    float **vse, **vpe, **dde;
-    FILE *fchk;
-    // plasticity variables
-    Grid3D *sigma2=NULL;
-    Grid3D *cohes=NULL, *phi=NULL;
-    Grid3D *yldfac=NULL, *neta=NULL;
-    /*Grid3D EPxx=NULL, EPyy=NULL, EPzz=NULL;
+   //  variable definition begins
+   float TMAX, DH[MAXGRIDS], DT, ARBC, PHT;
+   int NPC, ND, NSRC[MAXGRIDS], NST;
+   int NVE, NVAR, MEDIASTART, IFAULT, READ_STEP, READ_STEP_GPU;
+   int NX, NY, NZ[MAXGRIDS], PX, PY, IDYNA, SoCalQ, FOLLOWBATHY;
+   int NBGX[MAXGRIDS], NEDX[MAXGRIDS], NSKPX[MAXGRIDS];
+   int NBGY[MAXGRIDS], NEDY[MAXGRIDS], NSKPY[MAXGRIDS];
+   int NBGZ[MAXGRIDS], NEDZ[MAXGRIDS], NSKPZ[MAXGRIDS];
+   int nxt[MAXGRIDS], nyt[MAXGRIDS], nzt[MAXGRIDS];
+   MPI_Offset displacement[MAXGRIDS];
+   float FAC, Q0, EX, FP;
+   float QSI, QPQSR, MAXVPVSR, VMIN, VMAX, DMIN;
+   char INSRC[50], INVEL[50], OUT[50], INSRC_I2[50], CHKFILE[50];
+   char insrcgrid[52], insrc_i2_grid[50];
+   double GFLOPS = 1.0;
+   double GFLOPS_SUM = 0.0;
+   Grid3D *u1 = NULL, *v1 = NULL, *w1 = NULL;
+   Grid3D *d1 = NULL, *mu = NULL, *lam = NULL;
+   Grid3D *xx = NULL, *yy = NULL, *zz = NULL, *xy = NULL, *yz = NULL, *xz = NULL;
+   Grid3D *r1 = NULL, *r2 = NULL, *r3 = NULL, *r4 = NULL, *r5 = NULL, *r6 = NULL;
+   Grid3D *qp = NULL, *qs = NULL;
+   PosInf *tpsrc = NULL;
+   Grid1D *taxx = NULL, *tayy = NULL, *tazz = NULL, *taxz = NULL, *tayz = NULL, *taxy = NULL;
+   Grid1D *Bufx = NULL, coeff = NULL;
+   Grid1D *Bufy = NULL, *Bufz = NULL;
+   //Plasticity output buffers
+   Grid1D *Bufeta = NULL, *Bufeta2 = NULL;
+   Grid3D *vx1 = NULL, *vx2 = NULL, *wwo = NULL, *lam_mu = NULL;
+   Grid3Dww *ww = NULL;
+   Grid1D *dcrjx, *dcrjy, *dcrjz;
+   float **vse, **vpe, **dde;
+   FILE *fchk;
+   // plasticity variables
+   Grid3D *sigma2 = NULL;
+   Grid3D *cohes = NULL, *phi = NULL;
+   Grid3D *yldfac = NULL, *neta = NULL;
+   /*Grid3D EPxx=NULL, EPyy=NULL, EPzz=NULL;
     Grid3D EPxy=NULL, EPyz=NULL, EPxz=NULL;*/
 
-    // topography variables
-    int usetopo = 0;
-    char INTOPO[IN_FILE_LEN];
-
-    int usesourcefile = 0;
-    char SOURCEFILE[IN_FILE_LEN];
-
-    int userecvfile = 0;
-    char RECVFILE[IN_FILE_LEN];
-
-    int useforcefile = 0;
-    char FORCEFILE[IN_FILE_LEN];
-
-    int usesgtfile = 0;
-    char SGTFILE[IN_FILE_LEN];
-
-//  GPU variables
-    long int num_bytes;
-    float** d_d1;
-    float** d_u1;
-    float** d_v1;
-    float** d_w1;
-    float** d_f_u1;
-    float** d_f_v1;
-    float** d_f_w1;
-    float** d_b_u1;
-    float** d_b_v1;
-    float** d_b_w1;
-    float** d_dcrjx;
-    float** d_dcrjy;
-    float** d_dcrjz;
-    float** d_lam;
-    float** d_mu;
-    float** d_qp;
-    float* d_coeff;
-    float** d_qs;
-    float** d_vx1;
-    float** d_vx2;
-    int** d_ww;
-    float** d_wwo;
-    float** d_xx;
-    float** d_yy;
-    float** d_zz;
-    float** d_xy;
-    float** d_xz;
-    float** d_yz;
-    float** d_r1;
-    float** d_r2;
-    float** d_r3;
-    float** d_r4;
-    float** d_r5;
-    float** d_r6;
-    float** d_lam_mu;
-    int **d_tpsrc;
-    float** d_taxx;
-    float** d_tayy;
-    float** d_tazz;
-    float** d_taxz;
-    float** d_tayz;
-    float** d_taxy;
-
-    float **d_Bufx, **d_Bufy, **d_Bufz, **d_Bufeta;
-    // plasticity
-    float **d_sigma2;
-    float **d_yldfac,**d_cohes, **d_phi, **d_neta;
-//  end of GPU variables
-    int i,j,k,idx,idy,idz;
-    long int idtmp;
-    long int tmpInd;
-    const int maxdim = 3;
-    float taumax, taumin, tauu;
-    Grid3D tau=NULL, tau1=NULL, tau2=NULL;
-    Grid3D weights=NULL; 
-    int npsrc[MAXGRIDS];
-    long int nt, cur_step, source_step;
-    double time_un = 0.0;
-    double time_init = 0.0;
-    // time_src and time_mesh measures the time spent
-    // in source and mesh reading 
-    double time_src = 0.0, time_mesh = 0.0;
-    // time_gpuio measures the time spent in gpu memory copying for IO
-    double time_gpuio = 0.0;
-    double time_gpuio_tmp = 0.0; 
-//  MPI+CUDA variables
-    cudaError_t cerr = 0;
-    size_t cmemfree, cmemtotal;
-    cudaStream_t stream_1, /*stream_1b,*/ stream_2, /*stream_2b,*/ stream_i, stream_i2;;
-    cudaStream_t stream_o;
-    int   rank, size, err, srcproc[MAXGRIDS], rank_gpu;
-    int   dim[2], period[2], coord[2], reorder;
-    int   x_rank_L  = -1,  x_rank_R  = -1,  y_rank_F = -1,  y_rank_B = -1;
-    MPI_Comm MCW, MC1;
-    MPI_Request  request_x[MAXGRIDS][4], request_y[MAXGRIDS][4];
-    MPI_Status   status_x[MAXGRIDS][4],  status_y[MAXGRIDS][4], filestatus;
-    MPI_File fh;
-    int maxNX_NY_NZ_WS; 
-    #ifdef NOBGIO
-    /*int   fmtype[3], fptype[3], foffset[3];*/
-    int **ones;
-    MPI_Aint **dispArray;
-    MPI_Datatype filetype[MAXGRIDS];
-    #endif
-
-    int   msg_v_size_x[MAXGRIDS], msg_v_size_y[MAXGRIDS], count_x[MAXGRIDS], count_y[MAXGRIDS];
-    int   xls[MAXGRIDS], xre[MAXGRIDS], xvs[MAXGRIDS], xve[MAXGRIDS], xss1[MAXGRIDS]; 
-    int   xse1[MAXGRIDS], xss2[MAXGRIDS], xse2[MAXGRIDS], xss3[MAXGRIDS], xse3[MAXGRIDS];
-    int   yfs[MAXGRIDS], yfe[MAXGRIDS], ybs[MAXGRIDS], ybe[MAXGRIDS], yls[MAXGRIDS],  yre[MAXGRIDS];
-    /* Added by Daniel for plasticity computation boundaries */
-    int  xlsp[MAXGRIDS], xrep[MAXGRIDS], ylsp[MAXGRIDS], yrep[MAXGRIDS];
-    float** SL_vel;     // Velocity to be sent to   Left  in x direction (u1,v1,w1)
-    float** SR_vel;     // Velocity to be Sent to   Right in x direction (u1,v1,w1)
-    float** RL_vel;     // Velocity to be Recv from Left  in x direction (u1,v1,w1)
-    float** RR_vel;     // Velocity to be Recv from Right in x direction (u1,v1,w1)
-    float** SF_vel;     // Velocity to be sent to   Front in y direction (u1,v1,w1)
-    float** SB_vel;     // Velocity to be Sent to   Back  in y direction (u1,v1,w1)
-    float** RF_vel;     // Velocity to be Recv from Front in y direction (u1,v1,w1)
-    float** RB_vel;     // Velocity to be Recv from Back  in y direction (u1,v1,w1)
-
-//  variable definition ends    
-
-    int tmpSize;
-    int WRITE_STEP;
-    int NTISKP;
-    int rec_NX[MAXGRIDS];
-    int rec_NY[MAXGRIDS];
-    int rec_NZ[MAXGRIDS];
-    int rec_nxt[MAXGRIDS];
-    int rec_nyt[MAXGRIDS];
-    int rec_nzt[MAXGRIDS];
-    int rec_nbgx[MAXGRIDS];   // 0-based indexing, however NBG* is 1-based
-    int rec_nedx[MAXGRIDS];   // 0-based indexing, however NED* is 1-based
-    int rec_nbgy[MAXGRIDS];   // 0-based indexing
-    int rec_nedy[MAXGRIDS];   // 0-based indexing
-    int rec_nbgz[MAXGRIDS];   // 0-based indexing
-    int rec_nedz[MAXGRIDS];   // 0-based indexing
-    char filename[50];
-    #ifdef NOBGIO
-    char filenamebasex[50];
-    char filenamebasey[50];
-    char filenamebasez[50];
-    char filenamebaseeta[50];
-    char filenamebaseep[50];
-    #endif
-
-    // moving initial stress computation to GPU
-    float fmajor=0, fminor=0, strike[3], dip[3], Rz[9], RzT[9];
-
-    // variables for fault boundary condition (Daniel)
-    int fbc_ext[6], fbc_off[3], fbc_extl[6], fbc_dim[3], fbc_seismio, fbc_tskp=1;
-    char fbc_pmask[200];
-    long int nel[MAXGRIDS];
-
-    int ranktype=0, size_tot;
-    MPI_Comm MCT, MCS, MCI;
-
-    /*Daniel - Buffers for exchange of yield factors, same naming as with velocity */
-    float **SL_yldfac, **SR_yldfac, **RL_yldfac, **RR_yldfac; 
-    float **SF_yldfac, **SB_yldfac, **RF_yldfac, **RB_yldfac; 
-    float **d_SL_yldfac, **d_SR_yldfac, **d_RL_yldfac, **d_RR_yldfac; 
-    float **d_SF_yldfac, **d_SB_yldfac, **d_RF_yldfac, **d_RB_yldfac; 
-    int *yldfac_msg_size_x, *yldfac_msg_size_y;
-    long int num_bytes2;
-    MPI_Request  request_x_yldfac[MAXGRIDS][4], request_y_yldfac[MAXGRIDS][4];
-    MPI_Status   status_x_yldfac[MAXGRIDS][4], status_y_yldfac[MAXGRIDS][4];
-    int   count_x_yldfac[MAXGRIDS], count_y_yldfac[MAXGRIDS];
-    int yls2[MAXGRIDS], yre2[MAXGRIDS];
-
-    /* DM variables added by Daniel */
-    int p;
-    int ngrids;
-    int grdfct[MAXGRIDS]; /* Horizontal grid extent with respect to coarsest grid */
-
-    //int dm = 53;
-    //int dm = 41;
-
-    /*buffers for overlap zone variables */
-    float **SL_swap, **SR_swap, **RL_swap, **RR_swap; 
-    float **SF_swap, **SB_swap, **RF_swap, **RB_swap; 
-    float **d_SL_swap, **d_SR_swap, **d_RL_swap, **d_RR_swap;
-    float **d_SF_swap, **d_SB_swap, **d_RF_swap, **d_RB_swap;
-    int *swp_msg_size_x; //*swp_msg_size_x_l;
-    int *swp_msg_size_y; //*swp_msg_size_y_l;
-    MPI_Request  request_x_swp[MAXGRIDS][4], request_y_swp[MAXGRIDS][4];
-    MPI_Status   status_x_swp[MAXGRIDS][4], status_y_swp[MAXGRIDS][4];
-    int   count_x_swp[MAXGRIDS], count_y_swp[MAXGRIDS];
-    int intlev[MAXGRIDS], swaplevmin = align + 1, swaplevmax = align + 8, nswaplev; 
-    int grid_output[MAXGRIDS];
-    int islowest=0;
-
-    /*computation of moment and magnitude for kinematic source */
-    float **mom, **d_mom, tmom=0.0f, gmom, mag;
-    int n;
-
-    #ifdef SEISMIO
-    //int ghostx=ngsl+2, ghosty=ngsl+2, ghostz=align;
-    int ghostx=0, ghosty=0, ghostz=0;
-    char seism_method[]="mpiio";
-    int nx, ny, PZ=1;
-    int seism_regGridID[MAXGRIDS];
-    int seism_filex[MAXGRIDS], seism_filey[MAXGRIDS], seism_filez[MAXGRIDS];
-    int seism_fileeta[MAXGRIDS], seism_fileep[MAXGRIDS];
-    int one=1;
-    #endif
-
-    // Variables for filtering of source-time-function (Daniel)
-    int filtorder=-1;
-    /* filter parameters b and a, and state variable d */
-    double srcfilt_b[MAXFILT], srcfilt_a[MAXFILT], **d_srcfilt_d;  
-    FILE *fltfid;
-
-    int outsize, nout;
-    time_t time1, time2;
-
-    /* for FOLLOWBATHY option - save surface output on ocean floor */
-    int *bathy;
-    int *d_bathy;
-    float tmpvs;
-
-//  variable initialization begins 
-    //NZ=(int*) calloc(MAXGRIDS, sizeof(int));
-    command(argc, argv, &TMAX, DH, &DT, &ARBC, &PHT, &NPC, &ND, NSRC, &NST,
-            &NVAR, &NVE, &MEDIASTART, &IFAULT, &READ_STEP, &READ_STEP_GPU,
-            &NTISKP, &WRITE_STEP, &NX, &NY, NZ, &PX, &PY, NBGX, NEDX, NSKPX,
-            NBGY, NEDY, NSKPY, NBGZ, NEDZ, NSKPZ, &FAC, &Q0, &EX, &FP, &IDYNA,
-            &SoCalQ, INSRC, INVEL, OUT, INSRC_I2, CHKFILE, &ngrids,
-            &FOLLOWBATHY, INTOPO, &usetopo, SOURCEFILE,
-            &usesourcefile, RECVFILE, &userecvfile, FORCEFILE, &useforcefile,
-            SGTFILE, &usesgtfile);
-
-    #ifndef SEISMIO
-     #ifdef NOBGIO
-      sprintf(filenamebasex,"%s/SX",OUT);
-      sprintf(filenamebasey,"%s/SY",OUT);
-      sprintf(filenamebasez,"%s/SZ",OUT);
-      sprintf(filenamebaseeta,"%s/Eta",OUT);
-      sprintf(filenamebaseep,"%s/EP",OUT);
-     #endif
-    #endif
-
-    MPI_Init(&argc,&argv);
-    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
-    MPI_Comm_size(MPI_COMM_WORLD,&size_tot);
-
-    time_init = gethrtime();
-    if (rank == 0) fprintf(stdout, "Initializing...\n");
+   // topography variables
+   int usetopo = 0;
+   char INTOPO[IN_FILE_LEN];
+
+   int usesourcefile = 0;
+   char SOURCEFILE[IN_FILE_LEN];
+
+   int userecvfile = 0;
+   char RECVFILE[IN_FILE_LEN];
+
+   int useforcefile = 0;
+   char FORCEFILE[IN_FILE_LEN];
+
+   int usesgtfile = 0;
+   char SGTFILE[IN_FILE_LEN];
+
+   int usemms = 0;
+   char MMSFILE[IN_FILE_LEN];
+
+   int useenergy = 0;
+   char ENERGYFILE[IN_FILE_LEN];
+
+   float DHB = -1.0;
+   float DHT = -1.0;
+
+   //  GPU variables
+   long int num_bytes;
+   float **d_d1;
+   float **d_u1;
+   float **d_v1;
+   float **d_w1;
+   float **d_f_u1;
+   float **d_f_v1;
+   float **d_f_w1;
+   float **d_b_u1;
+   float **d_b_v1;
+   float **d_b_w1;
+   float **d_dcrjx;
+   float **d_dcrjy;
+   float **d_dcrjz;
+   float **d_lam;
+   float **d_mu;
+   float **d_qp;
+   float *d_coeff;
+   float **d_qs;
+   float **d_vx1;
+   float **d_vx2;
+   int **d_ww;
+   float **d_wwo;
+   float **d_xx;
+   float **d_yy;
+   float **d_zz;
+   float **d_xy;
+   float **d_xz;
+   float **d_yz;
+   float **d_r1;
+   float **d_r2;
+   float **d_r3;
+   float **d_r4;
+   float **d_r5;
+   float **d_r6;
+   float **d_lam_mu;
+   int **d_tpsrc;
+   float **d_taxx;
+   float **d_tayy;
+   float **d_tazz;
+   float **d_taxz;
+   float **d_tayz;
+   float **d_taxy;
+
+   float **d_Bufx, **d_Bufy, **d_Bufz, **d_Bufeta;
+   // plasticity
+   float **d_sigma2;
+   float **d_yldfac, **d_cohes, **d_phi, **d_neta;
+   //  end of GPU variables
+   int i, j, k, idx, idy, idz;
+   long int idtmp;
+   long int tmpInd;
+   const int maxdim = 3;
+   float taumax, taumin, tauu;
+   Grid3D tau = NULL, tau1 = NULL, tau2 = NULL;
+   Grid3D weights = NULL;
+   int npsrc[MAXGRIDS];
+   long int nt, cur_step, source_step;
+   double time_un = 0.0;
+   double time_init = 0.0;
+   // time_src and time_mesh measure the time spent
+   // in source and mesh reading
+   double time_src = 0.0, time_mesh = 0.0;
+   // time_gpuio measures the time spent in gpu memory copying for IO
+   double time_gpuio = 0.0;
+   double time_gpuio_tmp = 0.0;
+   //  MPI+CUDA variables
+   cudaError_t cerr = 0;
+   size_t cmemfree, cmemtotal;
+   cudaStream_t stream_1, /*stream_1b,*/ stream_2, /*stream_2b,*/ stream_i, stream_i2;
+   ;
+   cudaStream_t stream_o;
+   int rank, size, err, srcproc[MAXGRIDS], rank_gpu;
+   int dim[2], period[2], coord[2], reorder;
+   int x_rank_L = -1, x_rank_R = -1, y_rank_F = -1, y_rank_B = -1;
+   MPI_Comm MCW, MC1;
+   MPI_Request request_x[MAXGRIDS][4], request_y[MAXGRIDS][4];
+   MPI_Status status_x[MAXGRIDS][4], status_y[MAXGRIDS][4], filestatus;
+   MPI_File fh;
+   int maxNX_NY_NZ_WS;
+#ifdef NOBGIO
+   /*int   fmtype[3], fptype[3], foffset[3];*/
+   int **ones;
+   MPI_Aint **dispArray;
+   MPI_Datatype filetype[MAXGRIDS];
+#endif
 
-#if VERBOSE
-    if (rank==0) fprintf(stdout, "AWP-ODC-DM: Number of grid resolutions = %d\n", ngrids);
+   int msg_v_size_x[MAXGRIDS], msg_v_size_y[MAXGRIDS], count_x[MAXGRIDS], count_y[MAXGRIDS];
+   int xls[MAXGRIDS], xre[MAXGRIDS], xvs[MAXGRIDS], xve[MAXGRIDS], xss1[MAXGRIDS];
+   int xse1[MAXGRIDS], xss2[MAXGRIDS], xse2[MAXGRIDS], xss3[MAXGRIDS], xse3[MAXGRIDS];
+   int yfs[MAXGRIDS], yfe[MAXGRIDS], ybs[MAXGRIDS], ybe[MAXGRIDS], yls[MAXGRIDS], yre[MAXGRIDS];
+   /* Added by Daniel for plasticity computation boundaries */
+   int xlsp[MAXGRIDS], xrep[MAXGRIDS], ylsp[MAXGRIDS], yrep[MAXGRIDS];
+   float **SL_vel; // Velocity to be sent to   Left  in x direction (u1,v1,w1)
+   float **SR_vel; // Velocity to be Sent to   Right in x direction (u1,v1,w1)
+   float **RL_vel; // Velocity to be Recv from Left  in x direction (u1,v1,w1)
+   float **RR_vel; // Velocity to be Recv from Right in x direction (u1,v1,w1)
+   float **SF_vel; // Velocity to be sent to   Front in y direction (u1,v1,w1)
+   float **SB_vel; // Velocity to be Sent to   Back  in y direction (u1,v1,w1)
+   float **RF_vel; // Velocity to be Recv from Front in y direction (u1,v1,w1)
+   float **RB_vel; // Velocity to be Recv from Back  in y direction (u1,v1,w1)
+
+   //  variable definition ends
+
+   int tmpSize;
+   int WRITE_STEP;
+   int NTISKP;
+   int rec_NX[MAXGRIDS];
+   int rec_NY[MAXGRIDS];
+   int rec_NZ[MAXGRIDS];
+   int rec_nxt[MAXGRIDS];
+   int rec_nyt[MAXGRIDS];
+   int rec_nzt[MAXGRIDS];
+   int rec_nbgx[MAXGRIDS]; // 0-based indexing, however NBG* is 1-based
+   int rec_nedx[MAXGRIDS]; // 0-based indexing, however NED* is 1-based
+   int rec_nbgy[MAXGRIDS]; // 0-based indexing
+   int rec_nedy[MAXGRIDS]; // 0-based indexing
+   int rec_nbgz[MAXGRIDS]; // 0-based indexing
+   int rec_nedz[MAXGRIDS]; // 0-based indexing
+   char filename[50];
+#ifdef NOBGIO
+   char filenamebasex[50];
+   char filenamebasey[50];
+   char filenamebasez[50];
+   char filenamebaseeta[50];
+   char filenamebaseep[50];
 #endif
-    fflush(stdout);
 
-    #ifndef NOBGIO
-    if ((size_tot % 3) != 0){
-       if (rank==0) fprintf(stderr, "Error. Number of CPUs %d must be divisible by 3.\n", size_tot);
-       MPI_Finalize();
-       return(0);
-    }
-    size = size_tot / 3;
+   // moving initial stress computation to GPU
+   float fmajor = 0, fminor = 0, strike[3], dip[3], Rz[9], RzT[9];
+
+   // variables for fault boundary condition (Daniel)
+   int fbc_ext[6], fbc_off[3], fbc_extl[6], fbc_dim[3], fbc_seismio, fbc_tskp = 1;
+   char fbc_pmask[200];
+   long int nel[MAXGRIDS];
+
+   int ranktype = 0, size_tot;
+   MPI_Comm MCT, MCS, MCI;
+
+   /*Daniel - Buffers for exchange of yield factors, same naming as with velocity */
+   float **SL_yldfac, **SR_yldfac, **RL_yldfac, **RR_yldfac;
+   float **SF_yldfac, **SB_yldfac, **RF_yldfac, **RB_yldfac;
+   float **d_SL_yldfac, **d_SR_yldfac, **d_RL_yldfac, **d_RR_yldfac;
+   float **d_SF_yldfac, **d_SB_yldfac, **d_RF_yldfac, **d_RB_yldfac;
+   int *yldfac_msg_size_x, *yldfac_msg_size_y;
+   long int num_bytes2;
+   MPI_Request request_x_yldfac[MAXGRIDS][4], request_y_yldfac[MAXGRIDS][4];
+   MPI_Status status_x_yldfac[MAXGRIDS][4], status_y_yldfac[MAXGRIDS][4];
+   int count_x_yldfac[MAXGRIDS], count_y_yldfac[MAXGRIDS];
+   int yls2[MAXGRIDS], yre2[MAXGRIDS];
+
+   /* DM variables added by Daniel */
+   int p;
+   int ngrids;
+   int grdfct[MAXGRIDS]; /* Horizontal grid extent with respect to coarsest grid */
+
+   //int dm = 53;
+   //int dm = 41;
+
+   /*buffers for overlap zone variables */
+   float **SL_swap, **SR_swap, **RL_swap, **RR_swap;
+   float **SF_swap, **SB_swap, **RF_swap, **RB_swap;
+   float **d_SL_swap, **d_SR_swap, **d_RL_swap, **d_RR_swap;
+   float **d_SF_swap, **d_SB_swap, **d_RF_swap, **d_RB_swap;
+   int *swp_msg_size_x; //*swp_msg_size_x_l;
+   int *swp_msg_size_y; //*swp_msg_size_y_l;
+   MPI_Request request_x_swp[MAXGRIDS][4], request_y_swp[MAXGRIDS][4];
+   MPI_Status status_x_swp[MAXGRIDS][4], status_y_swp[MAXGRIDS][4];
+   int count_x_swp[MAXGRIDS], count_y_swp[MAXGRIDS];
+   int intlev[MAXGRIDS], swaplevmin = align + 1, swaplevmax = align + 8, nswaplev;
+   int grid_output[MAXGRIDS];
+   int islowest = 0;
+
+   /*computation of moment and magnitude for kinematic source */
+   float **mom, **d_mom, tmom = 0.0f, gmom, mag;
+   int n;
+
+#ifdef SEISMIO
+   //int ghostx=ngsl+2, ghosty=ngsl+2, ghostz=align;
+   int ghostx = 0, ghosty = 0, ghostz = 0;
+   char seism_method[] = "mpiio";
+   int nx, ny, PZ = 1;
+   int seism_regGridID[MAXGRIDS];
+   int seism_filex[MAXGRIDS], seism_filey[MAXGRIDS], seism_filez[MAXGRIDS];
+   int seism_fileeta[MAXGRIDS], seism_fileep[MAXGRIDS];
+   int one = 1;
+#endif
 
-    if ((NX % PX) != 0) {
-        if (rank==0) fprintf(stderr, "NX on grid %d (%d) is not divisible by PX (%d)\n", 
-           ngrids-1, NX, PX);
-        MPI_Finalize();
-        return(0);
-    }
+   // Variables for filtering of source-time-function (Daniel)
+   int filtorder = -1;
+   /* filter parameters b and a, and state variable d */
+   double srcfilt_b[MAXFILT], srcfilt_a[MAXFILT], **d_srcfilt_d;
+   FILE *fltfid;
+
+   int outsize, nout;
+   time_t time1, time2;
+
+   /* for FOLLOWBATHY option - save surface output on ocean floor */
+   int *bathy;
+   int *d_bathy;
+   float tmpvs;
+
+   //  variable initialization begins
+   //NZ=(int*) calloc(MAXGRIDS, sizeof(int));
+   command(argc, argv, &TMAX, DH, &DT, &ARBC, &PHT, &NPC, &ND, NSRC, &NST,
+           &NVAR, &NVE, &MEDIASTART, &IFAULT, &READ_STEP, &READ_STEP_GPU,
+           &NTISKP, &WRITE_STEP, &NX, &NY, NZ, &PX, &PY, NBGX, NEDX, NSKPX,
+           NBGY, NEDY, NSKPY, NBGZ, NEDZ, NSKPZ, &FAC, &Q0, &EX, &FP, &IDYNA,
+           &SoCalQ, INSRC, INVEL, OUT, INSRC_I2, CHKFILE, &ngrids,
+           &FOLLOWBATHY, INTOPO, &usetopo, SOURCEFILE,
+           &usesourcefile, RECVFILE, &userecvfile, FORCEFILE, &useforcefile,
+           SGTFILE, &usesgtfile, MMSFILE, &usemms, &DHB, &DHT, ENERGYFILE, &useenergy, 
+           &QSI, &QPQSR, &MAXVPVSR, &VMIN, &VMAX, &DMIN);
+
+
+
+#ifndef SEISMIO
+#ifdef NOBGIO
+   sprintf(filenamebasex, "%s/SX", OUT);
+   sprintf(filenamebasey, "%s/SY", OUT);
+   sprintf(filenamebasez, "%s/SZ", OUT);
+   sprintf(filenamebaseeta, "%s/Eta", OUT);
+   sprintf(filenamebaseep, "%s/EP", OUT);
+#endif
+#endif
 
-    if ((NY % PY) != 0) {
-        if (rank==0) fprintf(stderr, "NY on grid %d (%d) is not divisible by PY (%d)\n", 
-           ngrids-1, NY, PY);
-        MPI_Finalize();
-    }
+   MPI_Init(&argc, &argv);
+   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+   MPI_Comm_size(MPI_COMM_WORLD, &size_tot);
+
+#ifdef GDB_ATTACH
+   if (rank == 0)
+   {
+      volatile int i = 0;
+      printf("Process ID %d is ready for attach\n", getpid());
+      fflush(stdout);
+      while (0 == i)
+         sleep(5);
+   }
+#endif
 
-    MPI_Comm_dup(MPI_COMM_WORLD, &MCT );
-    /* The communicator MCW includes all ranks involved in GPU computations */
-    /* colors for MPI_Comm_split: 0=launches kernels; 1=source I/O for IFAULT=4; 2=time series output*/
-    if (rank < size) ranktype=0;
-    else if (rank < size*2) ranktype=1;
-    else ranktype=2;
-    MPI_Comm_split(MCT, ranktype, 0, &MCW);
-    MPI_Comm_split(MCT, ranktype, 1, &MCS);
-    MPI_Comm_split(MCT, ranktype, 2, &MCI);
-
-    for (p=0; p<ngrids; p++) count_y_yldfac[p] = count_x_yldfac[p] = 0;
-
-    MPI_Barrier(MCT);
-
-    /* Business as usual for these ranks */
-    if (ranktype==0) {
-    #else
-    size = size_tot ;
-    MPI_Comm_dup(MPI_COMM_WORLD, &MCW );
-    #endif
-
-    grdfct[ngrids-1]=1;
-    for (p=ngrids-2; p>-1; p--) grdfct[p] = grdfct[p+1] * 3;
-
-    DH[ngrids-1] = DH[0];
-    for (p=0; p<ngrids; p++){
-       DH[p] = DH[ngrids-1] / grdfct[p];
-       nxt[p] = NX/PX * grdfct[p];
-       nyt[p] = NY/PY * grdfct[p];
-    }
+   time_init = gethrtime();
+   if (rank == 0)
+      fprintf(stdout, "Initializing...\n");
 
-    for (p=0; p<ngrids; p++){
-       nzt[p] = NZ[p];
-       if ((nzt[p] % BLOCK_SIZE_Z) != 0){
-	  if (rank==0) {
-	      fprintf(stderr, "NZT for grid %d is not divisble by BLOCK_SIZE_Z.\n", p);
-	      fprintf(stderr, "NZT = %d, BLOCK_SIZE_Z=%d\n", nzt[p], BLOCK_SIZE_Z);
-	      fprintf(stderr, "Aborting.  Please change NZT or change BLOCK_SIZE_Z in pmcl3d_cons.h and recompile.\n");
-	  }
-	  MPI_Finalize();
-	  return(0);
-       }
-    }
+#if VERBOSE
+   if (rank == 0)
+      fprintf(stdout, "AWP-ODC-DM: Number of grid resolutions = %d\n", ngrids);
+#endif
+   fflush(stdout);
+
+#ifndef NOBGIO
+   if ((size_tot % 3) != 0)
+   {
+      if (rank == 0)
+         fprintf(stderr, "Error. Number of CPUs %d must be divisible by 3.\n", size_tot);
+      MPI_Finalize();
+      return (0);
+   }
+   size = size_tot / 3;
+
+   if ((NX % PX) != 0)
+   {
+      if (rank == 0)
+         fprintf(stderr, "NX on grid %d (%d) is not divisible by PX (%d)\n",
+                 ngrids - 1, NX, PX);
+      MPI_Finalize();
+      return (0);
+   }
+
+   if ((NY % PY) != 0)
+   {
+      if (rank == 0)
+         fprintf(stderr, "NY on grid %d (%d) is not divisible by PY (%d)\n",
+                 ngrids - 1, NY, PY);
+      MPI_Finalize();
+   }
+
+   MPI_Comm_dup(MPI_COMM_WORLD, &MCT);
+   /* The communicator MCW includes all ranks involved in GPU computations */
+   /* colors for MPI_Comm_split: 0=launches kernels; 1=source I/O for IFAULT=4; 2=time series output*/
+   if (rank < size)
+      ranktype = 0;
+   else if (rank < size * 2)
+      ranktype = 1;
+   else
+      ranktype = 2;
+   MPI_Comm_split(MCT, ranktype, 0, &MCW);
+   MPI_Comm_split(MCT, ranktype, 1, &MCS);
+   MPI_Comm_split(MCT, ranktype, 2, &MCI);
+
+   for (p = 0; p < ngrids; p++)
+      count_y_yldfac[p] = count_x_yldfac[p] = 0;
+
+   MPI_Barrier(MCT);
+
+   /* Business as usual for these ranks */
+   if (ranktype == 0)
+   {
+#else
+   size = size_tot;
+   MPI_Comm_dup(MPI_COMM_WORLD, &MCW);
+#endif
 
-    nt        = (int)(TMAX/DT) + 1;
-    dim[0]    = PX;
-    dim[1]    = PY;
-    if (NPC < 2) { 
-       period[0] = 0;
-       period[1] = 0;
-    }
-    else { /* Periodic PCs - Daniel */
-       period[0] = 1;
-       period[1] = 1;
-    }
-    reorder   = 1;
-    err       = MPI_Cart_create(MCW, 2, dim, period, reorder, &MC1);
-    err       = MPI_Cart_shift(MC1, 0,  1,  &x_rank_L, &x_rank_R );
-    err       = MPI_Cart_shift(MC1, 1,  1,  &y_rank_F, &y_rank_B ); 
+      grdfct[ngrids - 1] = 1;
+      for (p = ngrids - 2; p > -1; p--)
+         grdfct[p] = grdfct[p + 1] * 3;
+
+      DH[ngrids - 1] = DH[0];
+      for (p = 0; p < ngrids; p++)
+      {
+         DH[p] = DH[ngrids - 1] / grdfct[p];
+         nxt[p] = NX / PX * grdfct[p];
+         nyt[p] = NY / PY * grdfct[p];
+      }
+
+      DHB = DHB == -1.0 ? DH[0] : DHB;
+      DHT = DHT == -1.0 ? DH[0] : DHT;
+
+      for (p = 0; p < ngrids; p++)
+      {
+         nzt[p] = NZ[p];
+         if ((nzt[p] % BLOCK_SIZE_Z) != 0)
+         {
+            if (rank == 0)
+            {
+               fprintf(stderr, "NZT for grid %d is not divisble by BLOCK_SIZE_Z.\n", p);
+               fprintf(stderr, "NZT = %d, BLOCK_SIZE_Z=%d\n", nzt[p], BLOCK_SIZE_Z);
+               fprintf(stderr, "Aborting.  Please change NZT or change BLOCK_SIZE_Z in pmcl3d_cons.h and recompile.\n");
+            }
+            MPI_Finalize();
+            return (0);
+         }
+      }
+
+      nt = (int)(TMAX / DT) + 1;
+      dim[0] = PX;
+      dim[1] = PY;
+      if (NPC < 2)
+      {
+         period[0] = 0;
+         period[1] = 0;
+      }
+      else
+      { /* Periodic PCs - Daniel */
+         period[0] = 1;
+         period[1] = 1;
+      }
+      reorder = 1;
+      err = MPI_Cart_create(MCW, 2, dim, period, reorder, &MC1);
+      err = MPI_Cart_shift(MC1, 0, 1, &x_rank_L, &x_rank_R);
+      err = MPI_Cart_shift(MC1, 1, 1, &y_rank_F, &y_rank_B);
 
-    /*if (x_rank_L == MPI_UNDEFINED) x_rank_L = -1;
+      /*if (x_rank_L == MPI_UNDEFINED) x_rank_L = -1;
     if (x_rank_R == MPI_UNDEFINED) x_rank_R = -1;
     if (y_rank_F == MPI_UNDEFINED) y_rank_F = -1;
     if (y_rank_B == MPI_UNDEFINED) y_rank_B = -1;*/
-    if (x_rank_L < 0) x_rank_L = -1;
-    if (x_rank_R < 0 ) x_rank_R = -1;
-    if (y_rank_F < 0) y_rank_F = -1;
-    if (y_rank_B < 0) y_rank_B = -1;   
-
-    err       = MPI_Cart_coords(MC1, rank, 2, coord);
-    err       = MPI_Barrier(MCW);
-    // Below line is only for HPGPU4 machine!
-    //rank_gpu = rank%4;
-    // Below line is for 1 GPU/node systems or Summit using 1 GPU per resource set
-    rank_gpu = 0;
-    CUCHK(cudaSetDevice(rank_gpu));
+      if (x_rank_L < 0)
+         x_rank_L = -1;
+      if (x_rank_R < 0)
+         x_rank_R = -1;
+      if (y_rank_F < 0)
+         y_rank_F = -1;
+      if (y_rank_B < 0)
+         y_rank_B = -1;
+
+      err = MPI_Cart_coords(MC1, rank, 2, coord);
+      err = MPI_Barrier(MCW);
+      // The line below is only for the HPGPU4 machine!
+      //rank_gpu = rank%4;
+      // The line below is for systems with 1 GPU per node, or Summit using 1 GPU per resource set
+      rank_gpu = 0;
+      CUCHK(cudaSetDevice(rank_gpu));
 
 #if VERBOSE
-printf("\n\nrank=%d) RS=%d, RSG=%d, NST=%d, IF=%d\n\n\n", 
-rank, READ_STEP, READ_STEP_GPU, NST, IFAULT);
+      printf("\n\nrank=%d) RS=%d, RSG=%d, NST=%d, IF=%d\n\n\n",
+             rank, READ_STEP, READ_STEP_GPU, NST, IFAULT);
 #endif
 
-    for (p=0; p<ngrids; p++){
-       if (p==0){
-	  if(NEDX[p]==-1) NEDX[p] = NX*grdfct[p];
-	  if(NEDY[p]==-1) NEDY[p] = NY*grdfct[p];
-	  if(NEDZ[p]==-1) NEDZ[p] = NZ[p];
-          grid_output[p] = 1;
-       }
-       // make NED's a record point
-       // for instance if NBGX:NSKPX:NEDX = 1:3:9
-       // then we have 1,4,7 but NEDX=7 is better
-       NEDX[p] = NEDX[p]-(NEDX[p]-NBGX[p])%NSKPX[p];
-       NEDY[p] = NEDY[p]-(NEDY[p]-NBGY[p])%NSKPY[p];
-       NEDZ[p] = NEDZ[p]-(NEDZ[p]-NBGZ[p])%NSKPZ[p];
-       if (NEDX[p] > -1 && NEDY[p] > -1 && NEDZ[p] > -1) grid_output[p] = 1;
+      for (p = 0; p < ngrids; p++)
+      {
+         if (p == 0)
+         {
+            if (NEDX[p] == -1)
+               NEDX[p] = NX * grdfct[p];
+            if (NEDY[p] == -1)
+               NEDY[p] = NY * grdfct[p];
+            if (NEDZ[p] == -1)
+               NEDZ[p] = NZ[p];
+            grid_output[p] = 1;
+         }
+         // Snap each NED down onto the last index that is actually recorded.
+         // For instance, with NBGX:NSKPX:NEDX = 1:3:9 the recorded indices are
+         // 1,4,7, so NEDX is reduced from 9 to 7.
+         NEDX[p] = NEDX[p] - (NEDX[p] - NBGX[p]) % NSKPX[p];
+         NEDY[p] = NEDY[p] - (NEDY[p] - NBGY[p]) % NSKPY[p];
+         NEDZ[p] = NEDZ[p] - (NEDZ[p] - NBGZ[p]) % NSKPZ[p];
+         if (NEDX[p] > -1 && NEDY[p] > -1 && NEDZ[p] > -1)
+            grid_output[p] = 1;
 #if VERBOSE
-       fprintf(stdout, "%d: X: %d:%d:%d.  Y: %d:%d:%d.  Z:%d:%d:%d\n",
-          p, NBGX[p], NSKPX[p], NEDX[p],  NBGY[p], NSKPY[p], NEDY[p], NBGZ[p], NSKPZ[p], NEDZ[p]);
-       fflush(stdout);
+         fprintf(stdout, "%d: X: %d:%d:%d.  Y: %d:%d:%d.  Z:%d:%d:%d\n",
+                 p, NBGX[p], NSKPX[p], NEDX[p], NBGY[p], NSKPY[p], NEDY[p], NBGZ[p], NSKPZ[p], NEDZ[p]);
+         fflush(stdout);
 #endif
-    }
-    #ifndef SEISMIO
-    // number of recording points in total
-    for (p=0; p<ngrids; p++){
-       rec_NX[p] = (NEDX[p]-NBGX[p])/NSKPX[p]+1;
-       rec_NY[p] = (NEDY[p]-NBGY[p])/NSKPY[p]+1;
-       rec_NZ[p] = (NEDZ[p]-NBGZ[p])/NSKPZ[p]+1;
-
-       // specific to each processor:
-       calcRecordingPoints(&rec_nbgx[p], &rec_nedx[p], &rec_nbgy[p], &rec_nedy[p], 
-	 &rec_nbgz[p], &rec_nedz[p], &rec_nxt[p], &rec_nyt[p], &rec_nzt[p], &displacement[p],
-	 (long int)nxt[p],(long int)nyt[p],(long int)nzt[p], rec_NX[p], rec_NY[p], rec_NZ[p], 
-	 NBGX[p],NEDX[p],NSKPX[p], NBGY[p],NEDY[p],NSKPY[p], NBGZ[p],NEDZ[p],NSKPZ[p], coord);
+      }
+#ifndef SEISMIO
+      // number of recording points in total
+      for (p = 0; p < ngrids; p++)
+      {
+         rec_NX[p] = (NEDX[p] - NBGX[p]) / NSKPX[p] + 1;
+         rec_NY[p] = (NEDY[p] - NBGY[p]) / NSKPY[p] + 1;
+         rec_NZ[p] = (NEDZ[p] - NBGZ[p]) / NSKPZ[p] + 1;
+
+         // specific to each processor:
+         calcRecordingPoints(&rec_nbgx[p], &rec_nedx[p], &rec_nbgy[p], &rec_nedy[p],
+                             &rec_nbgz[p], &rec_nedz[p], &rec_nxt[p], &rec_nyt[p], &rec_nzt[p], &displacement[p],
+                             (long int)nxt[p], (long int)nyt[p], (long int)nzt[p], rec_NX[p], rec_NY[p], rec_NZ[p],
+                             NBGX[p], NEDX[p], NSKPX[p], NBGY[p], NEDY[p], NSKPY[p], NBGZ[p], NEDZ[p], NSKPZ[p], coord);
 #if VERBOSE
-       printf("%d = (%d,%d)) NX,NY,NZ=%d,%d,%d\nnxt,nyt,nzt=%d,%d,%d\nrec_N=(%d,%d,%d)\nrec_nxt,=%d,%d,%d\nNBGX,SKP,END=(%d:%d:%d),(%d:%d:%d),(%d:%d:%d)\nrec_nbg,ed=(%d,%d),(%d,%d),(%d,%d)\ndisp=%ld\n",
-	   rank,coord[p],coord[1],NX,NY,NZ[p],nxt[p],nyt[p],nzt[p],
-	   rec_NX[p], rec_NY[p], rec_NZ[p], rec_nxt[p], rec_nyt[p], rec_nzt[p],
-	   NBGX[p],NSKPX[p],NEDX[p],NBGY[p],NSKPY[p],NEDY[p],NBGZ[p],NSKPZ[p],NEDZ[p],
-	   rec_nbgx[p],rec_nedx[p],rec_nbgy[p],rec_nedy[p],rec_nbgz[p],rec_nedz[p],(long int)displacement[p]);
+         printf("%d = (%d,%d)) NX,NY,NZ=%d,%d,%d\nnxt,nyt,nzt=%d,%d,%d\nrec_N=(%d,%d,%d)\nrec_nxt,=%d,%d,%d\nNBGX,SKP,END=(%d:%d:%d),(%d:%d:%d),(%d:%d:%d)\nrec_nbg,ed=(%d,%d),(%d,%d),(%d,%d)\ndisp=%ld\n",
+                rank, coord[p], coord[1], NX, NY, NZ[p], nxt[p], nyt[p], nzt[p],
+                rec_NX[p], rec_NY[p], rec_NZ[p], rec_nxt[p], rec_nyt[p], rec_nzt[p],
+                NBGX[p], NSKPX[p], NEDX[p], NBGY[p], NSKPY[p], NEDY[p], NBGZ[p], NSKPZ[p], NEDZ[p],
+                rec_nbgx[p], rec_nedx[p], rec_nbgy[p], rec_nedy[p], rec_nbgz[p], rec_nedz[p], (long int)displacement[p]);
 #endif
+      }
 
-    }
-
-    #ifndef NOBGIO
-    MPI_Send(rec_nxt, ngrids, MPI_INT, rank+2*size, MPIRANKIO, MPI_COMM_WORLD);
-    MPI_Send(rec_nyt, ngrids, MPI_INT, rank+2*size, MPIRANKIO+1, MPI_COMM_WORLD);
-    MPI_Send(rec_nzt, ngrids, MPI_INT, rank+2*size, MPIRANKIO+2, MPI_COMM_WORLD);
-    MPI_Send(rec_NX, ngrids, MPI_INT, rank+2*size, MPIRANKIO+3, MPI_COMM_WORLD);
-    MPI_Send(rec_NY, ngrids, MPI_INT, rank+2*size, MPIRANKIO+4, MPI_COMM_WORLD);
-    MPI_Send(rec_NZ, ngrids, MPI_INT, rank+2*size, MPIRANKIO+5, MPI_COMM_WORLD);
-    MPI_Send(grid_output, ngrids, MPI_INT, rank+2*size, MPIRANKIO+6, MPI_COMM_WORLD);
-    MPI_Send(displacement, ngrids, MPI_OFFSET, rank+2*size, MPIRANKIO+7, MPI_COMM_WORLD);
-    #else
-    dispArray=(MPI_Aint**) calloc(ngrids, sizeof(MPI_Aint*));
-    ones=(int**) calloc(ngrids, sizeof(int*));
-    for (p=0; p<ngrids; p++){
-       maxNX_NY_NZ_WS = (rec_NX[p]>rec_NY[p]?rec_NX[p]:rec_NY[p]);
-       maxNX_NY_NZ_WS = (maxNX_NY_NZ_WS>rec_NZ[p]?maxNX_NY_NZ_WS:rec_NZ[p]);
-       maxNX_NY_NZ_WS = (maxNX_NY_NZ_WS>WRITE_STEP?maxNX_NY_NZ_WS:WRITE_STEP);
-       ones[p]=(int*) calloc(maxNX_NY_NZ_WS, sizeof(int));
-       for(i=0;i<maxNX_NY_NZ_WS;++i) ones[p][i] = 1;
-       dispArray[p] = (MPI_Aint*) calloc(maxNX_NY_NZ_WS, sizeof(MPI_Aint));
-
-       err = MPI_Type_contiguous(rec_nxt[p], MPI_FLOAT, &filetype[p]);
-       err = MPI_Type_commit(&filetype[p]);
-       for(i=0;i<rec_nyt[p];i++){
-	 dispArray[p][i] = sizeof(float);
-	 dispArray[p][i] = dispArray[p][i]*rec_NX[p]*i;
-       }
-       err = MPI_Type_create_hindexed(rec_nyt[p], ones[p], dispArray[p], filetype[p], &filetype[p]);
-       err = MPI_Type_commit(&filetype[p]);
-       for(i=0;i<rec_nzt[p];i++){
-	 dispArray[p][i] = sizeof(float);
-	 dispArray[p][i] = dispArray[p][i]*rec_NY[p]*rec_NX[p]*i;
-       }
-       err = MPI_Type_create_hindexed(rec_nzt[p], ones[p], dispArray[p], filetype[p], &filetype[p]);
-       err = MPI_Type_commit(&filetype[p]);
-       for(i=0;i<WRITE_STEP;i++){
-	 dispArray[p][i] = sizeof(float);
-	 dispArray[p][i] = dispArray[p][i]*rec_NZ[p]*rec_NY[p]*rec_NX[p]*i;
-       }
-       err = MPI_Type_create_hindexed(WRITE_STEP, ones[p], dispArray[p], filetype[p], &filetype[p]);
-       err = MPI_Type_commit(&filetype[p]);
-       MPI_Type_size(filetype[p], &tmpSize);
+#ifndef NOBGIO
+      MPI_Send(rec_nxt, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO, MPI_COMM_WORLD);
+      MPI_Send(rec_nyt, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO + 1, MPI_COMM_WORLD);
+      MPI_Send(rec_nzt, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO + 2, MPI_COMM_WORLD);
+      MPI_Send(rec_NX, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO + 3, MPI_COMM_WORLD);
+      MPI_Send(rec_NY, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO + 4, MPI_COMM_WORLD);
+      MPI_Send(rec_NZ, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO + 5, MPI_COMM_WORLD);
+      MPI_Send(grid_output, ngrids, MPI_INT, rank + 2 * size, MPIRANKIO + 6, MPI_COMM_WORLD);
+      MPI_Send(displacement, ngrids, MPI_OFFSET, rank + 2 * size, MPIRANKIO + 7, MPI_COMM_WORLD);
+#else
+      dispArray = (MPI_Aint **)calloc(ngrids, sizeof(MPI_Aint *));
+      ones = (int **)calloc(ngrids, sizeof(int *));
+      for (p = 0; p < ngrids; p++)
+      {
+         maxNX_NY_NZ_WS = (rec_NX[p] > rec_NY[p] ? rec_NX[p] : rec_NY[p]);
+         maxNX_NY_NZ_WS = (maxNX_NY_NZ_WS > rec_NZ[p] ? maxNX_NY_NZ_WS : rec_NZ[p]);
+         maxNX_NY_NZ_WS = (maxNX_NY_NZ_WS > WRITE_STEP ? maxNX_NY_NZ_WS : WRITE_STEP);
+         ones[p] = (int *)calloc(maxNX_NY_NZ_WS, sizeof(int));
+         for (i = 0; i < maxNX_NY_NZ_WS; ++i)
+            ones[p][i] = 1;
+         dispArray[p] = (MPI_Aint *)calloc(maxNX_NY_NZ_WS, sizeof(MPI_Aint));
+
+         err = MPI_Type_contiguous(rec_nxt[p], MPI_FLOAT, &filetype[p]);
+         err = MPI_Type_commit(&filetype[p]);
+         for (i = 0; i < rec_nyt[p]; i++)
+         {
+            dispArray[p][i] = sizeof(float);
+            dispArray[p][i] = dispArray[p][i] * rec_NX[p] * i;
+         }
+         err = MPI_Type_create_hindexed(rec_nyt[p], ones[p], dispArray[p], filetype[p], &filetype[p]);
+         err = MPI_Type_commit(&filetype[p]);
+         for (i = 0; i < rec_nzt[p]; i++)
+         {
+            dispArray[p][i] = sizeof(float);
+            dispArray[p][i] = dispArray[p][i] * rec_NY[p] * rec_NX[p] * i;
+         }
+         err = MPI_Type_create_hindexed(rec_nzt[p], ones[p], dispArray[p], filetype[p], &filetype[p]);
+         err = MPI_Type_commit(&filetype[p]);
+         for (i = 0; i < WRITE_STEP; i++)
+         {
+            dispArray[p][i] = sizeof(float);
+            dispArray[p][i] = dispArray[p][i] * rec_NZ[p] * rec_NY[p] * rec_NX[p] * i;
+         }
+         err = MPI_Type_create_hindexed(WRITE_STEP, ones[p], dispArray[p], filetype[p], &filetype[p]);
+         err = MPI_Type_commit(&filetype[p]);
+         MPI_Type_size(filetype[p], &tmpSize);
 #if VERBOSE
-       if(rank==0) printf("filetype size grid %d (supposedly=rec_nxt*nyt*nzt*WS*4=%ld) =%d\n", 
-          p, rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP*sizeof(float),tmpSize);
+         if (rank == 0)
+            printf("filetype size grid %d (supposedly=rec_nxt*nyt*nzt*WS*4=%ld) =%d\n",
+                   p, rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP * sizeof(float), tmpSize);
 #endif
+      }
 
-    }
-
-    /*
+      /*
     fmtype[0]  = WRITE_STEP;
     fmtype[1]  = NY;
     fmtype[2]  = NX;
@@ -531,752 +601,871 @@ rank, READ_STEP, READ_STEP_GPU, NST, IFAULT);
     err = MPI_Type_create_subarray(3, fmtype, fptype, foffset, MPI_ORDER_C, MPI_FLOAT, &filetype);
     err = MPI_Type_commit(&filetype);*/
 
-    #endif
-
-    #else
-    err = 0;
-    // 2 <= maxdim <= 3
-    for (p=0; p<ngrids; p++){
-       if (grid_output[p]){
-	  nx=NX*grdfct[p];
-	  ny=NY*grdfct[p];
-	  if (rank == 0) fprintf(stdout, "|    initializing SEISM-IO for grid %d\n", p);
-          fflush(stdout);
-	  seism_init(&MC1,&rank,coord,(int*)&maxdim,&nx,&ny,&nzt[p],&nxt[p],&nyt[p],&nzt[p],
-		     &ghostx,&ghosty,&ghostz,&PX,&PY,&PZ,seism_method,&err);
-	  if (err != 0) {
-	      fprintf(stderr, "|    SEISM ERROR! Init failed for grid %d!\n", p);
-	      MPI_Abort(MCW, 1);
-	      MPI_Finalize();
-	  }
-	  if (rank == 0) fprintf(stdout, "|    done initializing SEISM-IO for grid %d\n", p);
-
-	  seism_createRegularGrid(NBGX+p, NEDX+p, NSKPX+p, NBGY+p, NEDY+p, NSKPY+p,
-				  NBGZ+p, NEDZ+p, NSKPZ+p, seism_regGridID+p, &err);
-
-	  sprintf(filenamebasex,"%s/SX_%d", OUT, p);
-	  sprintf(filenamebasey,"%s/SY_%d", OUT, p);
-	  sprintf(filenamebasez,"%s/SZ_%d", OUT, p);
-	  sprintf(filenamebaseeta,"%s/Eta_%d",OUT, p);
-	  //sprintf(filenamebaseep,"%s/EP",OUT);
-
-          seism_file_open(filenamebasex, "w", &WRITE_STEP, "float", seism_regGridID+p, seism_filex+p, &err);
-          seism_file_open(filenamebasey, "w", &WRITE_STEP, "float", seism_regGridID+p, seism_filey+p, &err);
-          seism_file_open(filenamebasez, "w", &WRITE_STEP, "float", seism_regGridID+p, seism_filez+p, &err);
-          if (NVE == 3)
-             seism_file_open(filenamebaseeta, "w", &WRITE_STEP, "float", seism_regGridID+p, seism_fileeta+p, &err);
-       }
-    }
-    #endif 
-
-    for (p=0; p<ngrids; p++){
-
-       if(x_rank_L<0) {
-	  xls[p] = 2+ngsl;
-	  xlsp[p] = xls[p];
-       }
-       else {
-	  xls[p] = 4;
-	  xlsp[p] = xls[p] -1;
-       }
-
-       if(x_rank_R<0) {
-	  xre[p] = nxt[p]+ngsl+1;
-	  xrep[p] = xre[p];
-       }
-       else {
-	  xre[p] = nxt[p] + ngsl2 - 1;
-	  xrep[p] = xre[p] + 1;
-       }
-
-       xvs[p]   = 2+ngsl;
-       xve[p]   = nxt[p]+ngsl+1;
-
-       xss1[p]  = xls[p];
-       xse1[p]  = ngsl+3;
-       xss2[p]  = ngsl+4;
-       xse2[p]  = nxt[p]+ngsl-1;
-       xss3[p]  = nxt[p]+ngsl;
-       xse3[p]  = xre[p];
-
-       if(y_rank_F<0) {
-	  yls[p] = 2+ngsl;
-	  ylsp[p] = yls[p];
-       }
-       else {
-	  yls[p] = 4;
-	  ylsp[p] = yls[p] -1;
-       }
-
-       if(y_rank_B<0) {
-	  yre[p] = nyt[p]+ngsl+1;
-	  yrep[p] = yre[p];
-       }
-       else {
-	  yre[p] = nyt[p] + ngsl2 - 1;
-	  yrep[p] = yre[p] + 1;
-       }
-
-       /*margins for division of inner stress region*/
-       yls2[p]=yls[p] + (int) (yre[p]-yls[p])*0.25;
-       if (yls2[p] % 2 !=0) yls2[p]=yls2[p]+1;  /* yls2 must be even */
-       yre2[p]=yls[p] + (int) (yre[p]-yls[p])*0.75;
-       if (yre2[p] % 2 ==0) yre2[p]=yre2[p]-1; /* yre2 must be uneven */
-
-       yls2[p]=max(yls2[p], ylsp[p]+ngsl+2);
-       yre2[p]=min(yre2[p], yrep[p]-ngsl-2);
+#endif
+
+#else
+   err = 0;
+   // 2 <= maxdim <= 3
+   for (p = 0; p < ngrids; p++)
+   {
+      if (grid_output[p])
+      {
+         nx = NX * grdfct[p];
+         ny = NY * grdfct[p];
+         if (rank == 0)
+            fprintf(stdout, "|    initializing SEISM-IO for grid %d\n", p);
+         fflush(stdout);
+         seism_init(&MC1, &rank, coord, (int *)&maxdim, &nx, &ny, &nzt[p], &nxt[p], &nyt[p], &nzt[p],
+                    &ghostx, &ghosty, &ghostz, &PX, &PY, &PZ, seism_method, &err);
+         if (err != 0)
+         {
+            fprintf(stderr, "|    SEISM ERROR! Init failed for grid %d!\n", p);
+            MPI_Abort(MCW, 1);
+            MPI_Finalize();
+         }
+         if (rank == 0)
+            fprintf(stdout, "|    done initializing SEISM-IO for grid %d\n", p);
+
+         seism_createRegularGrid(NBGX + p, NEDX + p, NSKPX + p, NBGY + p, NEDY + p, NSKPY + p,
+                                 NBGZ + p, NEDZ + p, NSKPZ + p, seism_regGridID + p, &err);
+
+         sprintf(filenamebasex, "%s/SX_%d", OUT, p);
+         sprintf(filenamebasey, "%s/SY_%d", OUT, p);
+         sprintf(filenamebasez, "%s/SZ_%d", OUT, p);
+         sprintf(filenamebaseeta, "%s/Eta_%d", OUT, p);
+         //sprintf(filenamebaseep,"%s/EP",OUT);
+
+         seism_file_open(filenamebasex, "w", &WRITE_STEP, "float", seism_regGridID + p, seism_filex + p, &err);
+         seism_file_open(filenamebasey, "w", &WRITE_STEP, "float", seism_regGridID + p, seism_filey + p, &err);
+         seism_file_open(filenamebasez, "w", &WRITE_STEP, "float", seism_regGridID + p, seism_filez + p, &err);
+         if (NVE == 3)
+            seism_file_open(filenamebaseeta, "w", &WRITE_STEP, "float", seism_regGridID + p, seism_fileeta + p, &err);
+      }
+   }
+#endif
+
+      for (p = 0; p < ngrids; p++)
+      {
+
+         if (x_rank_L < 0)
+         {
+            xls[p] = 2 + ngsl;
+            xlsp[p] = xls[p];
+         }
+         else
+         {
+            xls[p] = 4;
+            xlsp[p] = xls[p] - 1;
+         }
+
+         if (x_rank_R < 0)
+         {
+            xre[p] = nxt[p] + ngsl + 1;
+            xrep[p] = xre[p];
+         }
+         else
+         {
+            xre[p] = nxt[p] + ngsl2 - 1;
+            xrep[p] = xre[p] + 1;
+         }
+
+         xvs[p] = 2 + ngsl;
+         xve[p] = nxt[p] + ngsl + 1;
+
+         xss1[p] = xls[p];
+         xse1[p] = ngsl + 3;
+         xss2[p] = ngsl + 4;
+         xse2[p] = nxt[p] + ngsl - 1;
+         xss3[p] = nxt[p] + ngsl;
+         xse3[p] = xre[p];
+
+         if (y_rank_F < 0)
+         {
+            yls[p] = 2 + ngsl;
+            ylsp[p] = yls[p];
+         }
+         else
+         {
+            yls[p] = 4;
+            ylsp[p] = yls[p] - 1;
+         }
+
+         if (y_rank_B < 0)
+         {
+            yre[p] = nyt[p] + ngsl + 1;
+            yrep[p] = yre[p];
+         }
+         else
+         {
+            yre[p] = nyt[p] + ngsl2 - 1;
+            yrep[p] = yre[p] + 1;
+         }
+
+         /*margins for division of inner stress region*/
+         yls2[p] = yls[p] + (int)(yre[p] - yls[p]) * 0.25;
+         if (yls2[p] % 2 != 0)
+            yls2[p] = yls2[p] + 1; /* yls2 must be even */
+         yre2[p] = yls[p] + (int)(yre[p] - yls[p]) * 0.75;
+         if (yre2[p] % 2 == 0)
+            yre2[p] = yre2[p] - 1; /* yre2 must be uneven */
+
+         yls2[p] = max(yls2[p], ylsp[p] + ngsl + 2);
+         yre2[p] = min(yre2[p], yrep[p] - ngsl - 2);
 
 #if VERBOSE
-       if (rank == 0)
-         fprintf(stdout, "%d: yls[%d]=%d, yls2[%d]=%d, yre2[%d]=%d, yre[%d]=%d\n", rank, 
-            p, yls[p], p, yls2[p], p, yre2[p], p, yre[p]);
-          fflush(stdout);
+         if (rank == 0)
+            fprintf(stdout, "%d: yls[%d]=%d, yls2[%d]=%d, yre2[%d]=%d, yre[%d]=%d\n", rank,
+                    p, yls[p], p, yls2[p], p, yre2[p], p, yre[p]);
+         fflush(stdout);
 #endif
 
-       yfs[p]  = 2+ngsl;
-       yfe[p]  = 2+ngsl2-1;   
-       ybs[p]  = nyt[p]+2;
-       ybe[p]  = nyt[p]+ngsl+1;   
-    
-    }
+         yfs[p] = 2 + ngsl;
+         yfe[p] = 2 + ngsl2 - 1;
+         ybs[p] = nyt[p] + 2;
+         ybe[p] = nyt[p] + ngsl + 1;
+      }
 
-    time_src -= gethrtime();
+      time_src -= gethrtime();
 
 #if VEROBSE
-    if(rank==0) printf("Before inisource\n");
-    fflush(stdout);
+      if (rank == 0)
+         printf("Before inisource\n");
+      fflush(stdout);
 #endif
 
-    if (rank==0) {
-       if (access("sourcefilter.dat", F_OK) != -1){
-          fltfid=fopen("sourcefilter.dat", "r");
-          fscanf(fltfid, "%d\n", &filtorder);
-          fprintf(stdout, "Order of source filter: %d.  Parameters:\n", filtorder);
-          for (k=0; k<filtorder+1; k++){
-             fscanf(fltfid, "%le %le\n", srcfilt_b+k, srcfilt_a+k);
-             fprintf(stdout, "b[%d]=%le, a[%d]=%le\n", k, srcfilt_b[k], k, srcfilt_a[k]);
-          }
-          fclose(fltfid); 
-       }
-       else {
+      if (rank == 0)
+      {
+         if (access("sourcefilter.dat", F_OK) != -1)
+         {
+            fltfid = fopen("sourcefilter.dat", "r");
+            fscanf(fltfid, "%d\n", &filtorder);
+            fprintf(stdout, "Order of source filter: %d.  Parameters:\n", filtorder);
+            for (k = 0; k < filtorder + 1; k++)
+            {
+               fscanf(fltfid, "%le %le\n", srcfilt_b + k, srcfilt_a + k);
+               fprintf(stdout, "b[%d]=%le, a[%d]=%le\n", k, srcfilt_b[k], k, srcfilt_a[k]);
+            }
+            fclose(fltfid);
+         }
+         else
+         {
 #if VERBOSE
-          fprintf(stdout, "File sourcefilter.dat not found, no STF filtering applied.\n");
+            fprintf(stdout, "File sourcefilter.dat not found, no STF filtering applied.\n");
 #endif
-       }
-    }
-    MPI_Bcast(&filtorder, 1, MPI_INT, 0, MCW);
-    fflush(stdout);
-    
-    if (filtorder > 0){
-       MPI_Bcast(srcfilt_b, filtorder+1, MPI_DOUBLE, 0, MCW);
-       MPI_Bcast(srcfilt_a, filtorder+1, MPI_DOUBLE, 0, MCW);
-    }
-
-    SetDeviceFilterParameters(filtorder, srcfilt_b, srcfilt_a);
+         }
+      }
+      MPI_Bcast(&filtorder, 1, MPI_INT, 0, MCW);
+      fflush(stdout);
 
-    tpsrc = (PosInf*) calloc(ngrids, sizeof(PosInf));
-    taxx = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    tayy = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    tazz = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    taxy = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    taxz = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    tayz = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
+      if (filtorder > 0)
+      {
+         MPI_Bcast(srcfilt_b, filtorder + 1, MPI_DOUBLE, 0, MCW);
+         MPI_Bcast(srcfilt_a, filtorder + 1, MPI_DOUBLE, 0, MCW);
+      }
 
-    for (p=0; p<ngrids; p++) {
-       npsrc[p] = 0;
-       tpsrc[p] = NULL;
-       taxx[p] = tayy[p] = tazz[p] = taxy[p] = taxz[p] = tayz[p] = NULL;
-    }
+      SetDeviceFilterParameters(filtorder, srcfilt_b, srcfilt_a);
+
+      tpsrc = (PosInf *)calloc(ngrids, sizeof(PosInf));
+      taxx = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      tayy = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      tazz = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      taxy = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      taxz = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      tayz = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+
+      for (p = 0; p < ngrids; p++)
+      {
+         npsrc[p] = 0;
+         tpsrc[p] = NULL;
+         taxx[p] = tayy[p] = tazz[p] = taxy[p] = taxz[p] = tayz[p] = NULL;
+      }
 
-    if (IFAULT == 5){
-       if (rank==0) fprintf(stdout, "Using IFAULT=5: kinematic source.\n");
-       if ((NST != 2) || (READ_STEP != 2)) {
-             if (rank==0) fprintf(stderr, "IFAULT=5 requires NST = READ_STEP =2.\nQuitting.");
-          MPI_Finalize();
-          return(-1);
-          } 
-    }
+      if (IFAULT == 5)
+      {
+         if (rank == 0)
+            fprintf(stdout, "Using IFAULT=5: kinematic source.\n");
+         if ((NST != 2) || (READ_STEP != 2))
+         {
+            if (rank == 0)
+               fprintf(stderr, "IFAULT=5 requires NST = READ_STEP =2.\nQuitting.");
+            MPI_Finalize();
+            return (-1);
+         }
+      }
 
-    if (IFAULT < 3 || IFAULT == 5) {
-        for (p=0; p<ngrids; p++){
-           if (NSRC[p] > 0) {
-	      sprintf(insrcgrid, "%s_%d", INSRC, p);
-	      sprintf(insrc_i2_grid, "%s_%d", INSRC_I2, p);
+      if (IFAULT < 3 || IFAULT == 5)
+      {
+         for (p = 0; p < ngrids; p++)
+         {
+            if (NSRC[p] > 0)
+            {
+               sprintf(insrcgrid, "%s_%d", INSRC, p);
+               sprintf(insrc_i2_grid, "%s_%d", INSRC_I2, p);
 #if VERBOSE
-	      fprintf(stdout, "opening %s\n", insrcgrid);
+               fprintf(stdout, "opening %s\n", insrcgrid);
 #endif
-	      err = inisource(rank,   IFAULT, NSRC[p],  READ_STEP, NST,   srcproc+p, NZ[p], MCW, nxt[p], nyt[p], nzt[p], 
-		 coord, maxdim, npsrc+p, tpsrc+p, taxx+p, tayy+p, tazz+p, taxz+p, tayz+p, taxy+p, insrcgrid, insrc_i2_grid);
-           }
-           else srcproc[p] = -1;
-       }
-    }
-    else if(IFAULT == 4){
-	err = read_src_ifault_4(rank, READ_STEP,
-	INSRC, maxdim, coord, NZ[0],
-	nxt[0], nyt[0], nzt[0],
-	&npsrc[0], &srcproc[0],
-	&tpsrc[0], &taxx[0], &tayy[0], &tazz[0], 1, 
-	fbc_ext, fbc_off, fbc_pmask, fbc_extl, fbc_dim, 
-	&fbc_seismio, &fbc_tskp, NST, size);
-    }
+               err = inisource(rank, IFAULT, NSRC[p], READ_STEP, NST, srcproc + p, NZ[p], MCW, nxt[p], nyt[p], nzt[p],
+                               coord, maxdim, npsrc + p, tpsrc + p, taxx + p, tayy + p, tazz + p, taxz + p, tayz + p, taxy + p, insrcgrid, insrc_i2_grid);
+            }
+            else
+               srcproc[p] = -1;
+         }
+      }
+      else if (IFAULT == 4)
+      {
+         err = read_src_ifault_4(rank, READ_STEP,
+                                 INSRC, maxdim, coord, NZ[0],
+                                 nxt[0], nyt[0], nzt[0],
+                                 &npsrc[0], &srcproc[0],
+                                 &tpsrc[0], &taxx[0], &tayy[0], &tazz[0], 1,
+                                 fbc_ext, fbc_off, fbc_pmask, fbc_extl, fbc_dim,
+                                 &fbc_seismio, &fbc_tskp, NST, size);
+      }
 
-    if (IFAULT == 5){
-        mom=(float**) calloc(ngrids, sizeof(float*));
-        d_mom=(float**) calloc(ngrids, sizeof(float*));
-        for (p=0; p<ngrids; p++) {
-           if (rank==srcproc[p]) {
-              num_bytes = npsrc[p] * sizeof(float);
-              mom[p] = (float*) calloc(npsrc[p], sizeof(float));
-              CUCHK(cudaMalloc((void**) &d_mom[p], num_bytes));
-	      CUCHK(cudaMemcpy(d_mom[p], mom[p], num_bytes, cudaMemcpyHostToDevice));
-           }
-        }
-        /*if (srcproc[0] == rank){
+      if (IFAULT == 5)
+      {
+         mom = (float **)calloc(ngrids, sizeof(float *));
+         d_mom = (float **)calloc(ngrids, sizeof(float *));
+         for (p = 0; p < ngrids; p++)
+         {
+            if (rank == srcproc[p])
+            {
+               num_bytes = npsrc[p] * sizeof(float);
+               mom[p] = (float *)calloc(npsrc[p], sizeof(float));
+               CUCHK(cudaMalloc((void **)&d_mom[p], num_bytes));
+               CUCHK(cudaMemcpy(d_mom[p], mom[p], num_bytes, cudaMemcpyHostToDevice));
+            }
+         }
+         /*if (srcproc[0] == rank){
            for (n=0; n<npsrc[0]; n++) fprintf(stdout, "src at rank %d: %d,%d,%d\n", 
               rank, tpsrc[0][n*3], tpsrc[0][n*3+1], tpsrc[0][n*3+2]);
         }*/
 
-        /* allocate state variables required for filtering */
-        d_srcfilt_d = (double**) calloc(ngrids, sizeof(double*));
-        if (filtorder > 0){
-           for (p=0; p<ngrids; p++){
-              num_bytes = npsrc[p] * (filtorder+1) * sizeof(double);
-              CUCHK(cudaMalloc((void**) &d_srcfilt_d[p], num_bytes));
-              CUCHK(cudaMemset(d_srcfilt_d[p], 0., num_bytes));
-           }
-        }
-    }
+         /* allocate state variables required for filtering */
+         d_srcfilt_d = (double **)calloc(ngrids, sizeof(double *));
+         if (filtorder > 0)
+         {
+            for (p = 0; p < ngrids; p++)
+            {
+               num_bytes = npsrc[p] * (filtorder + 1) * sizeof(double);
+               CUCHK(cudaMalloc((void **)&d_srcfilt_d[p], num_bytes));
+               CUCHK(cudaMemset(d_srcfilt_d[p], 0., num_bytes));
+            }
+         }
+      }
 
-    if (IFAULT == 6){
-       if (rank==0) fprintf(stdout, "Using plane wave input at grid position %d in grid %d\n", NZ[ngrids-1]-ND-1, ngrids-1);
-       if (READ_STEP != NST) {
-          if (rank==0) fprintf(stderr, "Error.  READ_STEP should be equal NST for IFAULT=6\n");
-          MPI_Finalize();
-       }
-       if (NST < 1) {
-          if (rank==0) fprintf(stderr, "Error.  NST=%d, but should be > 0 for IFAULT=6.\n", NST);
-          MPI_Finalize();
-       }
-       for (p=0; p<ngrids-1; p++) srcproc[p] = -1;
-       srcproc[ngrids-1]=rank;
-       err=ini_plane_wave(rank, MCW, INSRC, NST, taxx+ngrids-1, tayy+ngrids-1, tazz+ngrids-1);
-       if (rank==0) fprintf(stdout, "taxx[%d]=%e\n", NST-1, taxx[ngrids-1][NST-1]);
-    }
+      if (IFAULT == 6)
+      {
+         if (rank == 0)
+            fprintf(stdout, "Using plane wave input at grid position %d in grid %d\n", NZ[ngrids - 1] - ND - 1, ngrids - 1);
+         if (READ_STEP != NST)
+         {
+            if (rank == 0)
+               fprintf(stderr, "Error.  READ_STEP should be equal NST for IFAULT=6\n");
+            MPI_Finalize();
+         }
+         if (NST < 1)
+         {
+            if (rank == 0)
+               fprintf(stderr, "Error.  NST=%d, but should be > 0 for IFAULT=6.\n", NST);
+            MPI_Finalize();
+         }
+         for (p = 0; p < ngrids - 1; p++)
+            srcproc[p] = -1;
+         srcproc[ngrids - 1] = rank;
+         err = ini_plane_wave(rank, MCW, INSRC, NST, taxx + ngrids - 1, tayy + ngrids - 1, tazz + ngrids - 1);
+         if (rank == 0)
+            fprintf(stdout, "taxx[%d]=%e\n", NST - 1, taxx[ngrids - 1][NST - 1]);
+      }
 
-    if(err)
-    {
-       printf("source initialization failed\n");
-       return -1;
-    }
-    time_src += gethrtime(); 
+      if (err)
+      {
+         printf("source initialization failed\n");
+         return -1;
+      }
+      time_src += gethrtime();
 #if VERBOSE
-    if(rank==0) printf("After inisource. Time elapsed (seconds): %lf\n", time_src); 
-    fflush(stdout);
+      if (rank == 0)
+         printf("After inisource. Time elapsed (seconds): %lf\n", time_src);
+      fflush(stdout);
 #endif
 
-    d_tpsrc = (int**) calloc(ngrids, sizeof(int*));
-    d_taxx = (float**) calloc(ngrids, sizeof(float*));
-    d_tayy = (float**) calloc(ngrids, sizeof(float*));
-    d_tazz = (float**) calloc(ngrids, sizeof(float*));
-    d_taxy = (float**) calloc(ngrids, sizeof(float*));
-    d_taxz = (float**) calloc(ngrids, sizeof(float*));
-    d_tayz = (float**) calloc(ngrids, sizeof(float*));
-
-    for (p=0; p<ngrids; p++){
-       if(rank==srcproc[p]) {
+      d_tpsrc = (int **)calloc(ngrids, sizeof(int *));
+      d_taxx = (float **)calloc(ngrids, sizeof(float *));
+      d_tayy = (float **)calloc(ngrids, sizeof(float *));
+      d_tazz = (float **)calloc(ngrids, sizeof(float *));
+      d_taxy = (float **)calloc(ngrids, sizeof(float *));
+      d_taxz = (float **)calloc(ngrids, sizeof(float *));
+      d_tayz = (float **)calloc(ngrids, sizeof(float *));
+
+      for (p = 0; p < ngrids; p++)
+      {
+         if (rank == srcproc[p])
+         {
 #if VERBOSE
-	  printf("rank=%d, grid=%d, source rank, npsrc=%d, srcproc=%d\n", rank, p, npsrc[p], srcproc[p]);
-#endif 
-	  /* here, we allocate data for keeping prevoius timestep */
-          if (IFAULT == 4) num_bytes = sizeof(float)*npsrc[p]*(READ_STEP_GPU+1);
-          else if (IFAULT == 6) num_bytes = sizeof(float)*NST;
-          else num_bytes = sizeof(float)*npsrc[p]*READ_STEP_GPU;
-	  CUCHK(cudaMalloc((void**)&d_taxx[p], num_bytes));
-	  CUCHK(cudaMalloc((void**)&d_tayy[p], num_bytes));
-	  CUCHK(cudaMalloc((void**)&d_tazz[p], num_bytes));
-	  /*Added by Daniel for fault B.C. and plane wave*/
-	  if (IFAULT != 4 && IFAULT != 6){
-	     CUCHK(cudaMalloc((void**)&d_taxz[p], num_bytes));
-	     CUCHK(cudaMalloc((void**)&d_tayz[p], num_bytes));
-	     CUCHK(cudaMalloc((void**)&d_taxy[p], num_bytes));
-	  }
-	  CUCHK(cudaMemcpy(d_taxx[p],taxx[p],num_bytes,cudaMemcpyHostToDevice));
-	  CUCHK(cudaMemcpy(d_tayy[p],tayy[p],num_bytes,cudaMemcpyHostToDevice));
-	  CUCHK(cudaMemcpy(d_tazz[p],tazz[p],num_bytes,cudaMemcpyHostToDevice));
-	  /*Added by Daniel for fault B.C.*/
-	  if (IFAULT != 4 && IFAULT != 6) {
-	     CUCHK(cudaMemcpy(d_taxz[p],taxz[p],num_bytes,cudaMemcpyHostToDevice));
-	     CUCHK(cudaMemcpy(d_tayz[p],tayz[p],num_bytes,cudaMemcpyHostToDevice));
-	     CUCHK(cudaMemcpy(d_taxy[p],taxy[p],num_bytes,cudaMemcpyHostToDevice));
-	  }
-          if (IFAULT !=6) {
-	     num_bytes = sizeof(int)*npsrc[p]*maxdim;
-	     CUCHK(cudaMalloc((void**)&d_tpsrc[p], num_bytes));
-	     CUCHK(cudaMemcpy(d_tpsrc[p],tpsrc[p],num_bytes,cudaMemcpyHostToDevice));
-          }
-       }
-    }
-    fflush(stdout);
-
-    d1 = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    mu = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    lam = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    lam_mu = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    for (p=0; p<ngrids; p++){
-       d1[p]     = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align); 
-       mu[p]     = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       lam[p]    = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       lam_mu[p] = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, 1); 
-    }
+            printf("rank=%d, grid=%d, source rank, npsrc=%d, srcproc=%d\n", rank, p, npsrc[p], srcproc[p]);
+#endif
+            /* here, we allocate data for keeping the previous timestep */
+            if (IFAULT == 4)
+               num_bytes = sizeof(float) * npsrc[p] * (READ_STEP_GPU + 1);
+            else if (IFAULT == 6)
+               num_bytes = sizeof(float) * NST;
+            else
+               num_bytes = sizeof(float) * npsrc[p] * READ_STEP_GPU;
+            CUCHK(cudaMalloc((void **)&d_taxx[p], num_bytes));
+            CUCHK(cudaMalloc((void **)&d_tayy[p], num_bytes));
+            CUCHK(cudaMalloc((void **)&d_tazz[p], num_bytes));
+            /*Added by Daniel for fault B.C. and plane wave*/
+            if (IFAULT != 4 && IFAULT != 6)
+            {
+               CUCHK(cudaMalloc((void **)&d_taxz[p], num_bytes));
+               CUCHK(cudaMalloc((void **)&d_tayz[p], num_bytes));
+               CUCHK(cudaMalloc((void **)&d_taxy[p], num_bytes));
+            }
+            CUCHK(cudaMemcpy(d_taxx[p], taxx[p], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMemcpy(d_tayy[p], tayy[p], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMemcpy(d_tazz[p], tazz[p], num_bytes, cudaMemcpyHostToDevice));
+            /*Added by Daniel for fault B.C.*/
+            if (IFAULT != 4 && IFAULT != 6)
+            {
+               CUCHK(cudaMemcpy(d_taxz[p], taxz[p], num_bytes, cudaMemcpyHostToDevice));
+               CUCHK(cudaMemcpy(d_tayz[p], tayz[p], num_bytes, cudaMemcpyHostToDevice));
+               CUCHK(cudaMemcpy(d_taxy[p], taxy[p], num_bytes, cudaMemcpyHostToDevice));
+            }
+            if (IFAULT != 6)
+            {
+               num_bytes = sizeof(int) * npsrc[p] * maxdim;
+               CUCHK(cudaMalloc((void **)&d_tpsrc[p], num_bytes));
+               CUCHK(cudaMemcpy(d_tpsrc[p], tpsrc[p], num_bytes, cudaMemcpyHostToDevice));
+            }
+         }
+      }
+      fflush(stdout);
+
+      d1 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      mu = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      lam = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      lam_mu = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      for (p = 0; p < ngrids; p++)
+      {
+         d1[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         mu[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         lam[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         lam_mu[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, 1);
+      }
 
-    qp = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    qs = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    if(NVE==1 || NVE==3)
-       for (p=0; p<ngrids; p++){
-       { 
-	  qp[p]   = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	  qs[p]   = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       }
-       tau  = Alloc3D(2, 2, 2);
-       tau1 = Alloc3D(2, 2, 2); 
-       tau2 = Alloc3D(2, 2, 2); 
-       weights = Alloc3D(2, 2, 2); 
-       coeff = Alloc1D(16); 
-       weights_sub(weights,coeff, EX, FAC);  
-    }
-    time_mesh -= gethrtime(); 
-
-    if(NVE==3){
-       sigma2 = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-       cohes = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-       phi = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-       yldfac = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-       neta = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-       for (p=0; p<ngrids; p++){
-	  sigma2[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	  cohes[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	  phi[p]    = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-
-	  yldfac[p] = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	  neta[p]   = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-
-	 // initialize
-	 for(i=0;i<nxt[p]+4+ngsl2;i++) 
-	   for(j=0;j<nyt[p]+4+ngsl2;j++)
-	     for(k=0;k<nzt[p]+2*align;k++){
-	       neta[p][i][j][k] = 0.;
-	       yldfac[p][i][j][k] = 1.;
-	     }
-       }
-    }
+      qp = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      qs = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      if (NVE == 1 || NVE == 3)
+      { /* NVE 1 or 3: one qp/qs volume per grid, plus a single set of 2x2x2 tables shared by all grids */
+         for (p = 0; p < ngrids; p++)
+         {
+            qp[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            qs[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         }
+         tau = Alloc3D(2, 2, 2); /* allocated once: doing this inside the grid loop leaked every copy but the last */
+         tau1 = Alloc3D(2, 2, 2);
+         tau2 = Alloc3D(2, 2, 2);
+         weights = Alloc3D(2, 2, 2);
+         coeff = Alloc1D(16);
+         weights_sub(weights, coeff, EX, FAC); /* fills weights/coeff; inputs do not depend on the grid index */
+      }
+      time_mesh -= gethrtime();
+
+      if (NVE == 3)
+      {
+         sigma2 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+         cohes = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+         phi = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+         yldfac = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+         neta = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+         for (p = 0; p < ngrids; p++)
+         {
+            sigma2[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            cohes[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            phi[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+
+            yldfac[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            neta[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+
+            // initialize
+            for (i = 0; i < nxt[p] + 4 + ngsl2; i++)
+               for (j = 0; j < nyt[p] + 4 + ngsl2; j++)
+                  for (k = 0; k < nzt[p] + 2 * align; k++)
+                  {
+                     neta[p][i][j][k] = 0.;
+                     yldfac[p][i][j][k] = 1.;
+                  }
+         }
+      }
 
+if (!usemms) {
 #if VERBOSE
-    if(rank==0) printf("Before inimesh\n");
+      if (rank == 0)
+         printf("Before inimesh\n");
 #endif
-    fflush(stdout);
-    vpe = (float**) calloc(ngrids, sizeof(float*));
-    vse = (float**) calloc(ngrids, sizeof(float*));
-    dde = (float**) calloc(ngrids, sizeof(float*));
-    for (p=0; p<ngrids; p++){
-       char INVEL2[52];
-       int corrected;
-       sprintf(INVEL2, "%s_%d", INVEL, p);
-       //if (rank==0) fprintf(stdout, "opening %s\n", INVEL2);
-       vpe[p] = (float*) calloc(2, sizeof(float));
-       vse[p] = (float*) calloc(2, sizeof(float));
-       dde[p] = (float*) calloc(2, sizeof(float));
-       inimesh(rank, MEDIASTART, d1[p], mu[p], lam[p], qp[p], qs[p], &taumax, &taumin, tau, 
-               weights,coeff, NVAR, FP, FAC, Q0, EX, 
-	       nxt[p], nyt[p], nzt[p], PX, PY, NX*grdfct[p], NY*grdfct[p], nzt[p], coord, MCW, IDYNA, NVE, 
-               SoCalQ, INVEL2, vse[p], vpe[p], dde[p]);
-       if (p > 0) {
-          corrected=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], d1[p], d1[p-1], p, p-1, "d1");
-          corrected+=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], mu[p], mu[p-1], p, p-1, "mu");
-          corrected+=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], lam[p], lam[p-1], p, p-1, "lam");
-          corrected+=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], qp[p], qp[p-1], p, p-1, "qp");
-          corrected+=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], qs[p], qs[p-1], p, p-1, "qs");
-          if (corrected > 0) fprintf(stdout, "Warning: Inconsistent material constants between mesh %d and %d corrected.\n", p-1, p);
-       }
-
+      fflush(stdout);
+
+    if(rank == 0){
+        printf("QSI=%f\n", QSI);
+        printf("QPQSR=%f\n", QPQSR);
+        printf("MAXVPVSR=%f\n", MAXVPVSR);
+        printf("VMIN=%f\n", VMIN);
+        printf("VMAX=%f\n", VMAX);
+        printf("DMIN=%f\n", DMIN);
+        fflush(stdout);
     }
-    fflush(stdout);
-
-    if (FOLLOWBATHY == 1) {
-       bathy = (int*) calloc((nxt[0]+4+ngsl2)*(nyt[0]+4+ngsl2), sizeof(int));
-       //for (i=0; i<nxt[0]+4+ngsl2; i++) bathy[i] = (int*) calloc(nyt[0]+4+ngsl2, sizeof(int));
-
-       FILE *bathyfid;
-       char bathyofname[200];
-       sprintf(bathyofname, "debug/bathy.%04d", rank);
-       bathyfid=fopen(bathyofname, "w");
-       for (i=0; i<nxt[0]+4+ngsl2; i++){
-          for (j=0; j<nyt[0]+4+ngsl2; j++){
-             for (k=nzt[0] + align - 1; k > align; k--){
-                //if (mu[0][i][j][k] < 1.e7) {
-                tmpvs=sqrt(1./(mu[0][i][j][k] * d1[0][i][j][k]));
-                if (tmpvs > 0.001f){
-                   //bathy[i][j] = k;
-                   int pos=j*(nxt[0]+4+ngsl2)+i;
-                   bathy[pos] = k;
-                   fprintf(bathyfid, "%d %d %d %e\n", i, j, k, tmpvs);
-                   break;
-       }}}} 
-       fclose(bathyfid);
-
-       num_bytes = sizeof(int)*(nxt[0]+4+ngsl2)*(nyt[0]+4+ngsl2);
-       CUCHK(cudaMalloc((void**) &d_bathy, num_bytes));
-       CUCHK(cudaMemcpy(d_bathy, bathy, num_bytes, cudaMemcpyHostToDevice));
-    } 
-
-    time_mesh += gethrtime();  
+
+      vpe = (float **)calloc(ngrids, sizeof(float *));
+      vse = (float **)calloc(ngrids, sizeof(float *));
+      dde = (float **)calloc(ngrids, sizeof(float *));
+      for (p = 0; p < ngrids; p++)
+      {
+         char INVEL2[52]; /* per-grid input file name: "<INVEL>_<p>" */
+         int corrected;
+         snprintf(INVEL2, sizeof(INVEL2), "%s_%d", INVEL, p); /* bounded write: a long INVEL is truncated rather than overrunning the buffer */
+         //if (rank==0) fprintf(stdout, "opening %s\n", INVEL2);
+         vpe[p] = (float *)calloc(2, sizeof(float));
+         vse[p] = (float *)calloc(2, sizeof(float));
+         dde[p] = (float *)calloc(2, sizeof(float));
+         inimesh(rank, MEDIASTART, d1[p], mu[p], lam[p], qp[p], qs[p], &taumax, &taumin, tau,
+                 weights, coeff, NVAR, FP, FAC, Q0, EX,
+                 nxt[p], nyt[p], nzt[p], PX, PY, NX * grdfct[p], NY * grdfct[p], nzt[p], coord, MCW, IDYNA, NVE,
+                 SoCalQ, INVEL2, QSI, QPQSR, MAXVPVSR, VMIN, VMAX, DMIN,  vse[p], vpe[p], dde[p]);
+         if (p > 0)
+         {
+            corrected = checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], d1[p], d1[p - 1], p, p - 1, "d1");
+            corrected += checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], mu[p], mu[p - 1], p, p - 1, "mu");
+            corrected += checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], lam[p], lam[p - 1], p, p - 1, "lam");
+            corrected += checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], qp[p], qp[p - 1], p, p - 1, "qp");
+            corrected += checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], qs[p], qs[p - 1], p, p - 1, "qs");
+            if (corrected > 0)
+               fprintf(stdout, "Warning: Inconsistent material constants between mesh %d and %d corrected.\n", p - 1, p);
+         }
+      }
+      fflush(stdout);
+
+      if (FOLLOWBATHY == 1)
+      {
+         bathy = (int *)calloc((nxt[0] + 4 + ngsl2) * (nyt[0] + 4 + ngsl2), sizeof(int));
+         //for (i=0; i<nxt[0]+4+ngsl2; i++) bathy[i] = (int*) calloc(nyt[0]+4+ngsl2, sizeof(int));
+
+         FILE *bathyfid;
+         char bathyofname[200];
+         sprintf(bathyofname, "debug/bathy.%04d", rank);
+         bathyfid = fopen(bathyofname, "w");
+         for (i = 0; i < nxt[0] + 4 + ngsl2; i++)
+         {
+            for (j = 0; j < nyt[0] + 4 + ngsl2; j++)
+            {
+               for (k = nzt[0] + align - 1; k > align; k--)
+               {
+                  //if (mu[0][i][j][k] < 1.e7) {
+                  tmpvs = sqrt(1. / (mu[0][i][j][k] * d1[0][i][j][k]));
+                  if (tmpvs > 0.001f)
+                  {
+                     //bathy[i][j] = k;
+                     int pos = j * (nxt[0] + 4 + ngsl2) + i;
+                     bathy[pos] = k;
+                     fprintf(bathyfid, "%d %d %d %e\n", i, j, k, tmpvs);
+                     break;
+                  }
+               }
+            }
+         }
+         fclose(bathyfid);
+
+         num_bytes = sizeof(int) * (nxt[0] + 4 + ngsl2) * (nyt[0] + 4 + ngsl2);
+         CUCHK(cudaMalloc((void **)&d_bathy, num_bytes));
+         CUCHK(cudaMemcpy(d_bathy, bathy, num_bytes, cudaMemcpyHostToDevice));
+      }
+
+      time_mesh += gethrtime();
 #if VERBOSE
-    if(rank==0) printf("After inimesh. Time elapsed (seconds): %lf\n", time_mesh);  
+      if (rank == 0)
+         printf("After inimesh. Time elapsed (seconds): %lf\n", time_mesh);
 #endif
-    fflush(stdout);
-    if(rank==0)
-      writeCHK(CHKFILE, NTISKP, DT, DH, nxt, nyt, nzt,
-	       nt, ARBC, NPC, NVE, FAC, Q0, EX, FP, vse, vpe, dde, ngrids);
-
-    for (p=0; p<ngrids; p++){
-       mediaswap(d1[p], mu[p], lam[p], qp[p], qs[p], rank, x_rank_L, x_rank_R, y_rank_F, y_rank_B, 
-          nxt[p], nyt[p], nzt[p], MCW, p);
-
-       for(i=xls[p];i<xre[p]+1;i++)
-	 for(j=yls[p];j<yre[p]+1;j++)
-	 {
-	    float t_xl, t_xl2m;
-	    t_xl             = 1.0/lam[p][i][j][nzt[p]+align-1];
-	    t_xl2m           = 2.0/mu[p][i][j][nzt[p]+align-1] + t_xl; 
-	    lam_mu[p][i][j][0]  = t_xl/t_xl2m;
-	 }
-
-       if(NVE==3){
-	 printf("%d) Computing initial stress\n",rank);
-	 inidrpr_hoekbrown_light(nxt[p], nyt[p], nzt[p], NVE, coord, DH[p], rank, mu[p], lam[p], d1[p],
-	     sigma2[p], cohes[p], phi[p], &fmajor, &fminor, strike, dip, MCW, p);
-	 rotation_matrix(strike, dip, Rz, RzT);
-       }
-    }
-    fflush(stdout);
-
-    /*set a zone without plastic yielding around source nodes*/
-    MPI_Barrier(MCW);
-    if ((NVE > 1) && (IFAULT < 4 || IFAULT == 5)){
-    fprintf(stdout, "removing plasticity from source nodes\n");
-    for (p=0; p<ngrids; p++){
-       for (j=0; j<npsrc[p]; j++){
-	  idx = tpsrc[p][j*maxdim]   + 1 + ngsl;
-	  idy = tpsrc[p][j*maxdim+1] + 1 + ngsl;
-	  idz = tpsrc[p][j*maxdim+2] + align - 1;
-	  int xi, yi, zi;
-	  int dox, doy, doz;
-	  for (xi=idx-1; xi<idx+2;xi++){
-	    for (yi=idy-2; yi<idy+2;yi++){ // because we are adding slip on two sides of the fault 
-	       for (zi=idz-1; zi<idz+2;zi++){
-		  dox=doy=doz=0;
-		  if ((xi>=0) && (xi < (nxt[0] + ngsl2 +1))) dox = 1;
-		  if ((yi>=0) && (yi < (nyt[0] + ngsl2 +1))) doy = 1;
-		  if ((zi>=0) && (yi < (nzt[0] + ngsl2 +1))) doz = 1;
-		  if ((dox && doy) && doz ) cohes[p][xi][yi][zi]=1.e18;
-	       }
-	     }
-	  } 
-       } 
-    }
-    fprintf(stdout, "done\n");
-    }
-    MPI_Barrier(MCW);
+      fflush(stdout);
+      if (rank == 0)
+         writeCHK(CHKFILE, NTISKP, DT, DH, nxt, nyt, nzt,
+                  nt, ARBC, NPC, NVE, FAC, Q0, EX, FP, vse, vpe, dde, ngrids);
+
+      for (p = 0; p < ngrids; p++)
+      {
+         mediaswap(d1[p], mu[p], lam[p], qp[p], qs[p], rank, x_rank_L, x_rank_R, y_rank_F, y_rank_B,
+                   nxt[p], nyt[p], nzt[p], MCW, p);
+
+         for (i = xls[p]; i < xre[p] + 1; i++)
+            for (j = yls[p]; j < yre[p] + 1; j++)
+            {
+               float t_xl, t_xl2m;
+               t_xl = 1.0 / lam[p][i][j][nzt[p] + align - 1];
+               t_xl2m = 2.0 / mu[p][i][j][nzt[p] + align - 1] + t_xl;
+               lam_mu[p][i][j][0] = t_xl / t_xl2m;
+            }
 
+         if (NVE == 3)
+         {
+            printf("%d) Computing initial stress\n", rank);
+            inidrpr_hoekbrown_light(nxt[p], nyt[p], nzt[p], NVE, coord, DH[p], rank, mu[p], lam[p], d1[p],
+                                    sigma2[p], cohes[p], phi[p], &fmajor, &fminor, strike, dip, MCW, p);
+            rotation_matrix(strike, dip, Rz, RzT);
+         }
+      }
+      fflush(stdout);
+
+      /*set a zone without plastic yielding around source nodes*/
+      MPI_Barrier(MCW);
+      if ((NVE > 1) && (IFAULT < 4 || IFAULT == 5))
+      {
+         fprintf(stdout, "removing plasticity from source nodes\n");
+         for (p = 0; p < ngrids; p++)
+         {
+            for (j = 0; j < npsrc[p]; j++)
+            {
+               idx = tpsrc[p][j * maxdim] + 1 + ngsl;
+               idy = tpsrc[p][j * maxdim + 1] + 1 + ngsl;
+               idz = tpsrc[p][j * maxdim + 2] + align - 1;
+               int xi, yi, zi;
+               int dox, doy, doz;
+               for (xi = idx - 1; xi < idx + 2; xi++)
+               {
+                  for (yi = idy - 2; yi < idy + 2; yi++)
+                  { // because we are adding slip on two sides of the fault
+                     for (zi = idz - 1; zi < idz + 2; zi++)
+                     {
+                        dox = doy = doz = 0; /* each flag is set only if that index is inside cohes[p] */
+                        if ((xi >= 0) && (xi < (nxt[p] + ngsl2 + 1))) /* extents of grid p, the array being written */
+                           dox = 1;
+                        if ((yi >= 0) && (yi < (nyt[p] + ngsl2 + 1)))
+                           doy = 1;
+                        if ((zi >= 0) && (zi < (nzt[p] + 2 * align))) /* test zi (was yi) against the allocated z extent */
+                           doz = 1;
+                        if ((dox && doy) && doz)
+                           cohes[p][xi][yi][zi] = 1.e18;
+                     }
+                  }
+               }
+            }
+         }
+         fprintf(stdout, "done\n");
+      }
 
-    vx1 = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    vx2 = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    ww = (Grid3Dww*) calloc(ngrids, sizeof(Grid3Dww));
-    wwo = (Grid3D*) calloc(ngrids, sizeof(Grid3D));
+// MMS ends
+}
+      MPI_Barrier(MCW);
 
-    d_lam_mu = (float**) calloc(ngrids, sizeof(float*));
+      vx1 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      vx2 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      ww = (Grid3Dww *)calloc(ngrids, sizeof(Grid3Dww));
+      wwo = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
 
-    for (p=0; p<ngrids; p++){
-       num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2);
-       CUCHK(cudaMalloc((void**)&d_lam_mu[p], num_bytes));
-       CUCHK(cudaMemcpy(d_lam_mu[p],&lam_mu[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
+      d_lam_mu = (float **)calloc(ngrids, sizeof(float *));
 
-       vx1[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       vx2[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       ww[p]   = Alloc3Dww(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align); 
-       wwo[p]   = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align); 
-    }
+      for (p = 0; p < ngrids; p++)
+      {
+         num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2);
+         CUCHK(cudaMalloc((void **)&d_lam_mu[p], num_bytes));
+         CUCHK(cudaMemcpy(d_lam_mu[p], &lam_mu[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
 
-    //fprintf(stdout, "sizeof Grid1D: %ld, sizeof Grid1D*: %ld\n", sizeof(Grid1D), sizeof(Grid1D*));
-    dcrjx = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    dcrjy = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    dcrjz = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    if((NPC==0) || (NPC == 2)){
-       for (p=0; p<ngrids; p++){
-	   dcrjx[p] = Alloc1D(nxt[p]+4+ngsl2);
-	   dcrjy[p] = Alloc1D(nyt[p]+4+ngsl2);
-	   dcrjz[p] = Alloc1D(nzt[p]+2*align);
-
-	   for(i=0;i<nxt[p]+4+ngsl2;i++)
-	      dcrjx[p][i]  = 1.0;
-	   for(j=0;j<nyt[p]+4+ngsl2;j++)
-	      dcrjy[p][j]  = 1.0;
-	   for(k=0;k<nzt[p]+2*align;k++)
-	      dcrjz[p][k]  = 1.0;
-
-           if (p == ngrids-1) islowest = 1;
-           else islowest = 0;
-	   inicrj(ARBC, coord, nxt[p], nyt[p], nzt[p], NX*grdfct[p], NY*grdfct[p], ND*grdfct[p], dcrjx[p], dcrjy[p], dcrjz[p], islowest, NPC);
-
-           /*DM: disable ABCs at bottom unless it's the lowest grid*/
-           //if (p < ngrids-1) for(k=0;k<nzt[p]+2*align;k++) dcrjz[p][k]  = 1.0;
-       }
-    }
+         vx1[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         vx2[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         ww[p] = Alloc3Dww(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         wwo[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+      }
+
+      //fprintf(stdout, "sizeof Grid1D: %ld, sizeof Grid1D*: %ld\n", sizeof(Grid1D), sizeof(Grid1D*));
+      dcrjx = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      dcrjy = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      dcrjz = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      if ((NPC == 0) || (NPC == 2))
+      {
+         for (p = 0; p < ngrids; p++)
+         {
+            dcrjx[p] = Alloc1D(nxt[p] + 4 + ngsl2);
+            dcrjy[p] = Alloc1D(nyt[p] + 4 + ngsl2);
+            dcrjz[p] = Alloc1D(nzt[p] + 2 * align);
+
+            for (i = 0; i < nxt[p] + 4 + ngsl2; i++)
+               dcrjx[p][i] = 1.0;
+            for (j = 0; j < nyt[p] + 4 + ngsl2; j++)
+               dcrjy[p][j] = 1.0;
+            for (k = 0; k < nzt[p] + 2 * align; k++)
+               dcrjz[p][k] = 1.0;
+
+            if (p == ngrids - 1)
+               islowest = 1;
+            else
+               islowest = 0;
+            inicrj(ARBC, coord, nxt[p], nyt[p], nzt[p], NX * grdfct[p], NY * grdfct[p], ND * grdfct[p], dcrjx[p], dcrjy[p], dcrjz[p], islowest, NPC);
+
+            /*DM: disable ABCs at bottom unless it's the lowest grid*/
+            //if (p < ngrids-1) for(k=0;k<nzt[p]+2*align;k++) dcrjz[p][k]  = 1.0;
+         }
+      }
 
-    if(NVE==1 || NVE==3)
-    {
-        //float dt1 = 1.0/DT;
-        for(i=0;i<2;i++)
-          for(j=0;j<2;j++)
-            for(k=0;k<2;k++)
+      if (NVE == 1 || NVE == 3)
+      {
+         //float dt1 = 1.0/DT;
+         for (i = 0; i < 2; i++)
+            for (j = 0; j < 2; j++)
+               for (k = 0; k < 2; k++)
+               {
+                  tauu = tau[i][j][k];
+                  tau2[i][j][k] = exp(-DT / tauu);
+                  tau1[i][j][k] = 0.5 * (1. - tau2[i][j][k]);
+               }
+
+         for (p = 0; p < ngrids; p++)
+         {
+            init_texture(nxt[p], nyt[p], nzt[p], tau1, tau2,
+                         vx1[p], vx2[p], weights, ww[p], wwo[p], xls[p], xre[p], yls[p], yre[p]);
+            if (p > 0)
             {
-               tauu          = tau[i][j][k];
-	       tau2[i][j][k] = exp(-DT/tauu);
-	       tau1[i][j][k] = 0.5*(1.-tau2[i][j][k]); 
-            }
-
-        for (p=0; p<ngrids; p++){
-	   init_texture(nxt[p], nyt[p], nzt[p], tau1, tau2, 
-		vx1[p], vx2[p], weights, ww[p],wwo[p], xls[p], xre[p], yls[p], yre[p]);  
- 	   if (p > 0) {
-             int corrected;
-	     corrected=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], vx1[p], vx1[p-1], p, p-1, "vx1");
-	     corrected+=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], vx2[p], vx2[p-1], p, p-1, "vx2");
-	     corrected+=checkmesh(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], wwo[p], wwo[p-1], p, p-1, "wwo");
-	     corrected+=checkmesh_ww(nxt[p], nyt[p], nzt[p], nxt[p-1], nyt[p-1], nzt[p-1], ww[p], ww[p-1], p, p-1, "ww");
-             if (corrected > 0) fprintf(stdout, "Warning: Inconsistent texture variables between mesh %d and %d corrected.\n", 
-                p-1, p);
-           }
-        }
-
-
-        Delloc3D(tau);
-        Delloc3D(tau1);
-        Delloc3D(tau2);
-    }
+               int corrected;
+               corrected = checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], vx1[p], vx1[p - 1], p, p - 1, "vx1");
+               corrected += checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], vx2[p], vx2[p - 1], p, p - 1, "vx2");
+               corrected += checkmesh(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], wwo[p], wwo[p - 1], p, p - 1, "wwo");
+               corrected += checkmesh_ww(nxt[p], nyt[p], nzt[p], nxt[p - 1], nyt[p - 1], nzt[p - 1], ww[p], ww[p - 1], p, p - 1, "ww");
+               if (corrected > 0)
+                  fprintf(stdout, "Warning: Inconsistent texture variables between mesh %d and %d corrected.\n",
+                          p - 1, p);
+            }
+         }
+
+         Delloc3D(tau);
+         Delloc3D(tau1);
+         Delloc3D(tau2);
+      }
 
 #if VERBOSE
-    if(rank==0) printf("Allocate device media pointers and copy.\n");
+      if (rank == 0)
+         printf("Allocate device media pointers and copy.\n");
 #endif
-    fflush(stdout);
-    d_d1 = (float**) calloc(ngrids, sizeof(float*));
-    d_lam = (float**) calloc(ngrids, sizeof(float*));
-    d_mu = (float**) calloc(ngrids, sizeof(float*));
-    d_qp = (float**) calloc(ngrids, sizeof(float*));
-    d_qs = (float**) calloc(ngrids, sizeof(float*));
-
-    d_vx1 = (float**) calloc(ngrids, sizeof(float*));
-    d_vx2 = (float**) calloc(ngrids, sizeof(float*));
-    d_ww = (int**) calloc(ngrids, sizeof(int*));
-    d_wwo = (float**) calloc(ngrids, sizeof(float*));
-    for (p=0; p<ngrids; p++){
-       num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-       CUCHK(cudaMalloc((void**)&d_d1[p], num_bytes));
-       CUCHK(cudaMemcpy(d_d1[p],&d1[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_lam[p], num_bytes));
-       CUCHK(cudaMemcpy(d_lam[p],&lam[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_mu[p], num_bytes));
-       CUCHK(cudaMemcpy(d_mu[p],&mu[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_qp[p], num_bytes));
-       CUCHK(cudaMemcpy(d_qp[p],&qp[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-
-       num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align); 
-       CUCHK(cudaMalloc((void**)&d_qs[p], num_bytes));
-       CUCHK(cudaMemcpy(d_qs[p],&qs[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_vx1[p], num_bytes));
-       CUCHK(cudaMemcpy(d_vx1[p],&vx1[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_vx2[p], num_bytes));
-       CUCHK(cudaMemcpy(d_vx2[p],&vx2[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       num_bytes = sizeof(int)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align); 
+      fflush(stdout);
+      d_d1 = (float **)calloc(ngrids, sizeof(float *));
+      d_lam = (float **)calloc(ngrids, sizeof(float *));
+      d_mu = (float **)calloc(ngrids, sizeof(float *));
+      d_qp = (float **)calloc(ngrids, sizeof(float *));
+      d_qs = (float **)calloc(ngrids, sizeof(float *));
+
+      d_vx1 = (float **)calloc(ngrids, sizeof(float *));
+      d_vx2 = (float **)calloc(ngrids, sizeof(float *));
+      d_ww = (int **)calloc(ngrids, sizeof(int *));
+      d_wwo = (float **)calloc(ngrids, sizeof(float *));
+      for (p = 0; p < ngrids; p++)
+      {
+         num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMalloc((void **)&d_d1[p], num_bytes));
+         CUCHK(cudaMemcpy(d_d1[p], &d1[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_lam[p], num_bytes));
+         CUCHK(cudaMemcpy(d_lam[p], &lam[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_mu[p], num_bytes));
+         CUCHK(cudaMemcpy(d_mu[p], &mu[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_qp[p], num_bytes));
+         CUCHK(cudaMemcpy(d_qp[p], &qp[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+
+         num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMalloc((void **)&d_qs[p], num_bytes));
+         CUCHK(cudaMemcpy(d_qs[p], &qs[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_vx1[p], num_bytes));
+         CUCHK(cudaMemcpy(d_vx1[p], &vx1[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_vx2[p], num_bytes));
+         CUCHK(cudaMemcpy(d_vx2[p], &vx2[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         num_bytes = sizeof(int) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
 #if VERBOSE
-       if (rank==0) fprintf(stdout, "Allocating d_ww and d_wwo, num_bytes=%ld\n", num_bytes);
+         if (rank == 0)
+            fprintf(stdout, "Allocating d_ww and d_wwo, num_bytes=%ld\n", num_bytes);
 #endif
-       CUCHK(cudaMalloc((void**)&d_ww[p], num_bytes)); 
-       CUCHK(cudaMemcpy(d_ww[p],&ww[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align); 
-       CUCHK(cudaMalloc((void**)&d_wwo[p], num_bytes));
-       CUCHK(cudaMemcpy(d_wwo[p],&wwo[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));  
-
-    }
-    num_bytes = sizeof(float)*(16);  
-    CUCHK(cudaMalloc((void**)&d_coeff, num_bytes)); 
-    CUCHK(cudaMemcpy(d_coeff,&coeff[0],num_bytes,cudaMemcpyHostToDevice));
-
-
-    if((NPC==0) || (NPC == 2)) {
-       d_dcrjx = (float**) calloc(ngrids, sizeof(float*));
-       d_dcrjy = (float**) calloc(ngrids, sizeof(float*));
-       d_dcrjz = (float**) calloc(ngrids, sizeof(float*));
-       for (p=0; p<ngrids; p++){
-	  num_bytes = sizeof(float)*(nxt[p]+4+ngsl2);
-	  CUCHK(cudaMalloc((void**)&d_dcrjx[p], num_bytes));
-	  CUCHK(cudaMemcpy(d_dcrjx[p],dcrjx[p],num_bytes,cudaMemcpyHostToDevice));
-	  num_bytes = sizeof(float)*(nyt[p]+4+ngsl2);
-	  CUCHK(cudaMalloc((void**)&d_dcrjy[p], num_bytes));
-	  CUCHK(cudaMemcpy(d_dcrjy[p],dcrjy[p],num_bytes,cudaMemcpyHostToDevice));
-	  num_bytes = sizeof(float)*(nzt[p]+2*align);
-	  CUCHK(cudaMalloc((void**)&d_dcrjz[p], num_bytes));
-	  CUCHK(cudaMemcpy(d_dcrjz[p],dcrjz[p],num_bytes,cudaMemcpyHostToDevice));
-       }
-    }
+         CUCHK(cudaMalloc((void **)&d_ww[p], num_bytes));
+         CUCHK(cudaMemcpy(d_ww[p], &ww[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMalloc((void **)&d_wwo[p], num_bytes));
+         CUCHK(cudaMemcpy(d_wwo[p], &wwo[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+      }
+      num_bytes = sizeof(float) * (16);
+      CUCHK(cudaMalloc((void **)&d_coeff, num_bytes));
+      CUCHK(cudaMemcpy(d_coeff, &coeff[0], num_bytes, cudaMemcpyHostToDevice));
+
+      if ((NPC == 0) || (NPC == 2))
+      {
+         d_dcrjx = (float **)calloc(ngrids, sizeof(float *));
+         d_dcrjy = (float **)calloc(ngrids, sizeof(float *));
+         d_dcrjz = (float **)calloc(ngrids, sizeof(float *));
+         for (p = 0; p < ngrids; p++)
+         {
+            num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2);
+            CUCHK(cudaMalloc((void **)&d_dcrjx[p], num_bytes));
+            CUCHK(cudaMemcpy(d_dcrjx[p], dcrjx[p], num_bytes, cudaMemcpyHostToDevice));
+            num_bytes = sizeof(float) * (nyt[p] + 4 + ngsl2);
+            CUCHK(cudaMalloc((void **)&d_dcrjy[p], num_bytes));
+            CUCHK(cudaMemcpy(d_dcrjy[p], dcrjy[p], num_bytes, cudaMemcpyHostToDevice));
+            num_bytes = sizeof(float) * (nzt[p] + 2 * align);
+            CUCHK(cudaMalloc((void **)&d_dcrjz[p], num_bytes));
+            CUCHK(cudaMemcpy(d_dcrjz[p], dcrjz[p], num_bytes, cudaMemcpyHostToDevice));
+         }
+      }
 
 #if VERBOSE
-    if(rank==0) printf("Allocate host velocity and stress pointers.\n");
+      if (rank == 0)
+         printf("Allocate host velocity and stress pointers.\n");
 #endif
-    fflush(stdout);
-    u1=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    v1=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    w1=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    xx=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    yy=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    zz=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    xy=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    yz=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    xz=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-
-    r1=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    r2=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    r3=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    r4=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    r5=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-    r6=(Grid3D*) calloc(ngrids, sizeof(Grid3D));
-
-    for (p=0; p<ngrids; p++){
-       u1[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       v1[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       w1[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       xx[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       yy[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       zz[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       xy[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       yz[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       xz[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       if(NVE==1 || NVE==3)
-       {
-	   r1[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	   r2[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	   r3[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	   r4[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	   r5[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-	   r6[p]  = Alloc3D(nxt[p]+4+ngsl2, nyt[p]+4+ngsl2, nzt[p]+2*align);
-       }
-    }
+      fflush(stdout);
+      u1 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      v1 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      w1 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      xx = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      yy = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      zz = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      xy = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      yz = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      xz = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+
+      r1 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      r2 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      r3 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      r4 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      r5 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+      r6 = (Grid3D *)calloc(ngrids, sizeof(Grid3D));
+
+      for (p = 0; p < ngrids; p++)
+      {
+         u1[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         v1[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         w1[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         xx[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         yy[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         zz[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         xy[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         yz[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         xz[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         if (NVE == 1 || NVE == 3)
+         {
+            r1[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            r2[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            r3[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            r4[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            r5[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+            r6[p] = Alloc3D(nxt[p] + 4 + ngsl2, nyt[p] + 4 + ngsl2, nzt[p] + 2 * align);
+         }
+      }
 
-    source_step = 1;
-    if (IFAULT < 4) {
-       for (p=0; p<ngrids; p++){
-	  if(rank==srcproc[p]) {
+      source_step = 1;
+      if (IFAULT < 4)
+      {
+         for (p = 0; p < ngrids; p++)
+         {
+            if (rank == srcproc[p])
+            {
 #if VERBOSE
-	     printf("%d) add initial src\n", rank);
-             fflush(stdout);
+               printf("%d) add initial src\n", rank);
+               fflush(stdout);
 #endif
-		addsrc(source_step, DH[p], DT, NST, npsrc[p], READ_STEP, maxdim, tpsrc[p], taxx[p], tayy[p], tazz[p], taxz[p], \
-		   tayz[p], taxy[p], xx[p], yy[p], zz[p], xy[p], yz[p], xz[p]);
-	  }
-       }
-    }
-    else if (IFAULT == 4) {
-       if(rank==srcproc[0]) {
- 	 frcvel(source_step, DH[0], DT, NST, npsrc[p], READ_STEP, fbc_tskp, maxdim, tpsrc[0], taxx[0], tayy[0], tazz[0], taxz[0], \
-     	    tayz[0], taxy[0], u1[0], v1[0], w1[0], rank);
-       }
-    }
+               addsrc(source_step, DH[p], DT, NST, npsrc[p], READ_STEP, maxdim, tpsrc[p], taxx[p], tayy[p], tazz[p], taxz[p],
+                      tayz[p], taxy[p], xx[p], yy[p], zz[p], xy[p], yz[p], xz[p]);
+            }
+         }
+      }
+      else if (IFAULT == 4)
+      {
+         if (rank == srcproc[0])
+         { /* no grid loop in this branch: p is left over from earlier loops, so every per-grid argument must use index 0 */
+            frcvel(source_step, DH[0], DT, NST, npsrc[0], READ_STEP, fbc_tskp, maxdim, tpsrc[0], taxx[0], tayy[0], tazz[0], taxz[0],
+                   tayz[0], taxy[0], u1[0], v1[0], w1[0], rank);
+         }
+      }
 
 #if VERBOSE
-    if(rank==0) printf("Allocate device velocity and stress pointers and copy.\n");
+      if (rank == 0)
+         printf("Allocate device velocity and stress pointers and copy.\n");
 #endif
-    
-    d_u1 = (float**) calloc(ngrids, sizeof(float*));
-    d_v1 = (float**) calloc(ngrids, sizeof(float*));
-    d_w1 = (float**) calloc(ngrids, sizeof(float*));
-    d_xx = (float**) calloc(ngrids, sizeof(float*));
-    d_yy = (float**) calloc(ngrids, sizeof(float*));
-    d_zz = (float**) calloc(ngrids, sizeof(float*));
-    d_xy = (float**) calloc(ngrids, sizeof(float*));
-    d_xz = (float**) calloc(ngrids, sizeof(float*));
-    d_yz = (float**) calloc(ngrids, sizeof(float*));
-
-    d_r1 = (float**) calloc(ngrids, sizeof(float*));
-    d_r2 = (float**) calloc(ngrids, sizeof(float*));
-    d_r3 = (float**) calloc(ngrids, sizeof(float*));
-    d_r4 = (float**) calloc(ngrids, sizeof(float*));
-    d_r5 = (float**) calloc(ngrids, sizeof(float*));
-    d_r6 = (float**) calloc(ngrids, sizeof(float*));
-
-    if (NVE==3){
-      d_sigma2 = (float**) calloc(ngrids, sizeof(float*));
-      d_yldfac = (float**) calloc(ngrids, sizeof(float*));
-      d_cohes = (float**) calloc(ngrids, sizeof(float*));
-      d_phi = (float**) calloc(ngrids, sizeof(float*));
-    }
-    d_neta = (float**) calloc(ngrids, sizeof(float*));  /*always needed for velbuffer kernel*/
-
-    for (p=0; p<ngrids; p++){
-       num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-       CUCHK(cudaMalloc((void**)&d_u1[p], num_bytes));
-       CUCHK(cudaMemcpy(d_u1[p],&u1[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_v1[p], num_bytes));
-       CUCHK(cudaMemcpy(d_v1[p],&v1[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_w1[p], num_bytes));
-       CUCHK(cudaMemcpy(d_w1[p],&w1[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_xx[p], num_bytes));
-       CUCHK(cudaMemcpy(d_xx[p],&xx[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_yy[p], num_bytes));
-       CUCHK(cudaMemcpy(d_yy[p],&yy[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_zz[p], num_bytes));
-       CUCHK(cudaMemcpy(d_zz[p],&zz[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_xy[p], num_bytes));
-       CUCHK(cudaMemcpy(d_xy[p],&xy[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_xz[p], num_bytes));
-       CUCHK(cudaMemcpy(d_xz[p],&xz[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       CUCHK(cudaMalloc((void**)&d_yz[p], num_bytes));
-       CUCHK(cudaMemcpy(d_yz[p],&yz[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       if(NVE==1 || NVE==3)
-       {
+
+      d_u1 = (float **)calloc(ngrids, sizeof(float *));
+      d_v1 = (float **)calloc(ngrids, sizeof(float *));
+      d_w1 = (float **)calloc(ngrids, sizeof(float *));
+      d_xx = (float **)calloc(ngrids, sizeof(float *));
+      d_yy = (float **)calloc(ngrids, sizeof(float *));
+      d_zz = (float **)calloc(ngrids, sizeof(float *));
+      d_xy = (float **)calloc(ngrids, sizeof(float *));
+      d_xz = (float **)calloc(ngrids, sizeof(float *));
+      d_yz = (float **)calloc(ngrids, sizeof(float *));
+
+      d_r1 = (float **)calloc(ngrids, sizeof(float *));
+      d_r2 = (float **)calloc(ngrids, sizeof(float *));
+      d_r3 = (float **)calloc(ngrids, sizeof(float *));
+      d_r4 = (float **)calloc(ngrids, sizeof(float *));
+      d_r5 = (float **)calloc(ngrids, sizeof(float *));
+      d_r6 = (float **)calloc(ngrids, sizeof(float *));
+
+      if (NVE == 3)
+      {
+         d_sigma2 = (float **)calloc(ngrids, sizeof(float *));
+         d_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_cohes = (float **)calloc(ngrids, sizeof(float *));
+         d_phi = (float **)calloc(ngrids, sizeof(float *));
+      }
+      d_neta = (float **)calloc(ngrids, sizeof(float *)); /*always needed for velbuffer kernel*/
+
+      for (p = 0; p < ngrids; p++)
+      {
+         num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMalloc((void **)&d_u1[p], num_bytes));
+         CUCHK(cudaMemcpy(d_u1[p], &u1[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_v1[p], num_bytes));
+         CUCHK(cudaMemcpy(d_v1[p], &v1[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_w1[p], num_bytes));
+         CUCHK(cudaMemcpy(d_w1[p], &w1[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_xx[p], num_bytes));
+         CUCHK(cudaMemcpy(d_xx[p], &xx[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_yy[p], num_bytes));
+         CUCHK(cudaMemcpy(d_yy[p], &yy[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_zz[p], num_bytes));
+         CUCHK(cudaMemcpy(d_zz[p], &zz[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_xy[p], num_bytes));
+         CUCHK(cudaMemcpy(d_xy[p], &xy[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_xz[p], num_bytes));
+         CUCHK(cudaMemcpy(d_xz[p], &xz[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         CUCHK(cudaMalloc((void **)&d_yz[p], num_bytes));
+         CUCHK(cudaMemcpy(d_yz[p], &yz[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         if (NVE == 1 || NVE == 3)
+         {
 #if VERBOSE
-	 if(rank==0) printf("Allocate additional device pointers (r) and copy.\n");
+            if (rank == 0)
+               printf("Allocate additional device pointers (r) and copy.\n");
 #endif
-	   CUCHK(cudaMalloc((void**)&d_r1[p], num_bytes));
-	   CUCHK(cudaMemcpy(d_r1[p],&r1[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	   CUCHK(cudaMalloc((void**)&d_r2[p], num_bytes));
-	   CUCHK(cudaMemcpy(d_r2[p],&r2[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	   CUCHK(cudaMalloc((void**)&d_r3[p], num_bytes));
-	   CUCHK(cudaMemcpy(d_r3[p],&r3[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	   CUCHK(cudaMalloc((void**)&d_r4[p], num_bytes));
-	   CUCHK(cudaMemcpy(d_r4[p],&r4[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	   CUCHK(cudaMalloc((void**)&d_r5[p], num_bytes));
-	   CUCHK(cudaMemcpy(d_r5[p],&r5[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	   CUCHK(cudaMalloc((void**)&d_r6[p], num_bytes));
-	   CUCHK(cudaMemcpy(d_r6[p],&r6[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-       }
-       if(NVE==3){
+            CUCHK(cudaMalloc((void **)&d_r1[p], num_bytes));
+            CUCHK(cudaMemcpy(d_r1[p], &r1[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_r2[p], num_bytes));
+            CUCHK(cudaMemcpy(d_r2[p], &r2[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_r3[p], num_bytes));
+            CUCHK(cudaMemcpy(d_r3[p], &r3[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_r4[p], num_bytes));
+            CUCHK(cudaMemcpy(d_r4[p], &r4[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_r5[p], num_bytes));
+            CUCHK(cudaMemcpy(d_r5[p], &r5[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_r6[p], num_bytes));
+            CUCHK(cudaMemcpy(d_r6[p], &r6[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+         }
+         if (NVE == 3)
+         {
 #if VERBOSE
-	 if(rank==0) printf("Allocate plasticity variables since NVE=3\n");
+            if (rank == 0)
+               printf("Allocate plasticity variables since NVE=3\n");
 #endif
-         fflush(stdout);
-	 num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-	 CUCHK(cudaMalloc((void**)&d_sigma2[p], num_bytes));
-	 CUCHK(cudaMemcpy(d_sigma2[p],&sigma2[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	 CUCHK(cudaMalloc((void**)&d_yldfac[p], num_bytes));
-	 CUCHK(cudaMemcpy(d_yldfac[p],&yldfac[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	 CUCHK(cudaMalloc((void**)&d_cohes[p], num_bytes));
-	 CUCHK(cudaMemcpy(d_cohes[p],&cohes[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	 CUCHK(cudaMalloc((void**)&d_phi[p], num_bytes));
-	 CUCHK(cudaMemcpy(d_phi[p],&phi[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-	 CUCHK(cudaMalloc((void**)&d_neta[p], num_bytes));
-	 CUCHK(cudaMemcpy(d_neta[p],&neta[p][0][0][0],num_bytes,cudaMemcpyHostToDevice));
-
-	 /*cudaMalloc((void**)&d_yldfac_L, msg_yldfac_size_x*sizeof(float));
+            fflush(stdout);
+            num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+            CUCHK(cudaMalloc((void **)&d_sigma2[p], num_bytes));
+            CUCHK(cudaMemcpy(d_sigma2[p], &sigma2[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_yldfac[p], num_bytes));
+            CUCHK(cudaMemcpy(d_yldfac[p], &yldfac[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_cohes[p], num_bytes));
+            CUCHK(cudaMemcpy(d_cohes[p], &cohes[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_phi[p], num_bytes));
+            CUCHK(cudaMemcpy(d_phi[p], &phi[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+            CUCHK(cudaMalloc((void **)&d_neta[p], num_bytes));
+            CUCHK(cudaMemcpy(d_neta[p], &neta[p][0][0][0], num_bytes, cudaMemcpyHostToDevice));
+
+            /*cudaMalloc((void**)&d_yldfac_L, msg_yldfac_size_x*sizeof(float));
 	 cudaMalloc((void**)&d_yldfac_R, msg_yldfac_size_x*sizeof(float));
 	 cudaMalloc((void**)&d_yldfac_F, msg_yldfac_size_y*sizeof(float));
 	 cudaMalloc((void**)&d_yldfac_B, msg_yldfac_size_y*sizeof(float));
@@ -1284,932 +1473,1069 @@ rank, READ_STEP, READ_STEP_GPU, NST, IFAULT);
 	 cudaMalloc((void**)&d_yldfac_FR, sizeof(float));
 	 cudaMalloc((void**)&d_yldfac_BL, sizeof(float));
 	 cudaMalloc((void**)&d_yldfac_BR, sizeof(float));*/
-       }
-       nel[p]=(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-    }
+         }
+         nel[p] = (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+      }
 
-    /*dump_variable(d_d1[0], nel[0], "d1", 0, 0, 0, rank, size);
+/*dump_variable(d_d1[0], nel[0], "d1", 0, 0, 0, rank, size);
     dump_variable(d_qs[0], nel[0], "qs", 0, 0, 0, rank, size);*/
 
-    //  variable initialization ends
-    #ifndef SEISMIO
-    Bufx = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    Bufy = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    Bufz = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-    if (NVE == 3) Bufeta = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-
-    d_Bufx = (float**) calloc(ngrids, sizeof(float*));
-    d_Bufy = (float**) calloc(ngrids, sizeof(float*));
-    d_Bufz = (float**) calloc(ngrids, sizeof(float*));
-    d_Bufeta = (float**) calloc(ngrids, sizeof(float*));
-
-    for (p=0; p<ngrids; p++){
-       if (grid_output[p]){
+//  variable initialization ends
+#ifndef SEISMIO
+      Bufx = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      Bufy = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      Bufz = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+      if (NVE == 3)
+         Bufeta = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+
+      d_Bufx = (float **)calloc(ngrids, sizeof(float *));
+      d_Bufy = (float **)calloc(ngrids, sizeof(float *));
+      d_Bufz = (float **)calloc(ngrids, sizeof(float *));
+      d_Bufeta = (float **)calloc(ngrids, sizeof(float *));
+
+      for (p = 0; p < ngrids; p++)
+      {
+         if (grid_output[p])
+         {
 #if VERBOSE
-	  if(rank==0) printf("Allocate buffers of #elements: %d\n",rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP);
+            if (rank == 0) /* widen to long so the argument matches %ld and the product cannot overflow int */
+               printf("Allocate buffers of #elements: %ld\n", (long)rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP);
+#endif
+            Bufx[p] = Alloc1D((long)rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP);
+            Bufy[p] = Alloc1D((long)rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP);
+            Bufz[p] = Alloc1D((long)rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP);
+            //  Allocate buffers for plasticity output
+            if (NVE == 3) /* element count widened to long, same as the Bufx/Bufy/Bufz allocations */
+               Bufeta[p] = Alloc1D((long)rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP);
+
+            num_bytes = sizeof(float) * rec_nxt[p] * rec_nyt[p] * rec_nzt[p];
+            CUCHK(cudaMallocHost((void **)&d_Bufx[p], num_bytes));
+            CUCHK(cudaMallocHost((void **)&d_Bufy[p], num_bytes));
+            CUCHK(cudaMallocHost((void **)&d_Bufz[p], num_bytes));
+            if (NVE == 3) /* allocate d_Bufeta[p]; this used to re-allocate d_Bufz[p], leaking it and leaving d_Bufeta[p] NULL */
+               CUCHK(cudaMallocHost((void **)&d_Bufeta[p], num_bytes));
+         }
+      }
 #endif
-	  Bufx[p]  = Alloc1D(rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP);
-	  Bufy[p]  = Alloc1D(rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP);
-	  Bufz[p]  = Alloc1D(rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP);
-	  //  Allocate buffers for plasticity output
-	  if (NVE == 3) Bufeta[p] = Alloc1D(rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP);
-
-          num_bytes = sizeof(float)*rec_nxt[p]*rec_nyt[p]*rec_nzt[p];
-          CUCHK(cudaMallocHost((void**)&d_Bufx[p], num_bytes));
-          CUCHK(cudaMallocHost((void**)&d_Bufy[p], num_bytes));
-          CUCHK(cudaMallocHost((void**)&d_Bufz[p], num_bytes));
-          if (NVE == 3) CUCHK(cudaMallocHost((void**)&d_Bufz[p], num_bytes));
-
-       }
-    }
-    #endif
-
-    SL_vel = (float**) calloc(ngrids, sizeof(float*));
-    SR_vel = (float**) calloc(ngrids, sizeof(float*));
-    RL_vel = (float**) calloc(ngrids, sizeof(float*));
-    RR_vel = (float**) calloc(ngrids, sizeof(float*));
-    SF_vel = (float**) calloc(ngrids, sizeof(float*));
-    SB_vel = (float**) calloc(ngrids, sizeof(float*));
-    RF_vel = (float**) calloc(ngrids, sizeof(float*));
-    RB_vel = (float**) calloc(ngrids, sizeof(float*));
-
-    d_f_u1 = (float**) calloc(ngrids, sizeof(float*));
-    d_f_v1 = (float**) calloc(ngrids, sizeof(float*));
-    d_f_w1 = (float**) calloc(ngrids, sizeof(float*));
-    d_b_u1 = (float**) calloc(ngrids, sizeof(float*));
-    d_b_v1 = (float**) calloc(ngrids, sizeof(float*));
-    d_b_w1 = (float**) calloc(ngrids, sizeof(float*));
-    for (p=0; p<ngrids; p++){
-       num_bytes = sizeof(float)*3*(ngsl)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-       CUCHK(cudaMallocHost((void**)&SL_vel[p], num_bytes));
-       CUCHK(cudaMallocHost((void**)&SR_vel[p], num_bytes));
-       CUCHK(cudaMallocHost((void**)&RL_vel[p], num_bytes));
-       CUCHK(cudaMallocHost((void**)&RR_vel[p], num_bytes));
-       num_bytes = sizeof(float)*3*(ngsl)*(nxt[p]+4+ngsl2)*(nzt[p]+2*align);
-       CUCHK(cudaMallocHost((void**)&SF_vel[p], num_bytes));
-       CUCHK(cudaMallocHost((void**)&SB_vel[p], num_bytes));
-       CUCHK(cudaMallocHost((void**)&RF_vel[p], num_bytes));
-       CUCHK(cudaMallocHost((void**)&RB_vel[p], num_bytes));
-       num_bytes = sizeof(float)*(ngsl)*(nxt[p]+4+ngsl2)*(nzt[p]+2*align);
-       CUCHK(cudaMalloc((void**)&d_f_u1[p], num_bytes));
-       CUCHK(cudaMalloc((void**)&d_f_v1[p], num_bytes));
-       CUCHK(cudaMalloc((void**)&d_f_w1[p], num_bytes));
-       CUCHK(cudaMalloc((void**)&d_b_u1[p], num_bytes));
-       CUCHK(cudaMalloc((void**)&d_b_v1[p], num_bytes));
-       CUCHK(cudaMalloc((void**)&d_b_w1[p], num_bytes));
- 
-       CUCHK(cudaMemset(d_f_u1[p], 0., num_bytes));
-       CUCHK(cudaMemset(d_f_v1[p], 0., num_bytes));
-       CUCHK(cudaMemset(d_f_w1[p], 0., num_bytes));
-       CUCHK(cudaMemset(d_b_u1[p], 0., num_bytes));
-       CUCHK(cudaMemset(d_b_v1[p], 0., num_bytes));
-       CUCHK(cudaMemset(d_b_w1[p], 0., num_bytes));
- 
-       msg_v_size_x[p] = 3*(ngsl)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-       msg_v_size_y[p] = 3*(ngsl)*(nxt[p]+4+ngsl2)*(nzt[p]+2*align);
-    }
 
-    SetDeviceConstValue(DH, DT, nxt, nyt, nzt, ngrids, fmajor, fminor, Rz, RzT);
-    print_const_H(ngrids);
-
-    CUCHK(cudaStreamCreate(&stream_1));
-    CUCHK(cudaStreamCreate(&stream_2));
-    //cudaStreamCreate(&stream_2b);
-    CUCHK(cudaStreamCreate(&stream_i));
-    CUCHK(cudaStreamCreate(&stream_i2));
-    CUCHK(cudaStreamCreate(&stream_o));
-//    Delloc3D(tau); 
-
-    /*Daniel - yield factor exchange*/
-    if (NVE == 3){
-       SL_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       SR_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       RL_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       RR_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       SF_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       SB_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       RF_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       RB_yldfac=(float**) calloc(ngrids, sizeof(float*));
-
-       d_SL_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_SR_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_RL_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_RR_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_SF_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_SB_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_RF_yldfac=(float**) calloc(ngrids, sizeof(float*));
-       d_RB_yldfac=(float**) calloc(ngrids, sizeof(float*));
-
-       yldfac_msg_size_x = (int*) calloc(ngrids, sizeof(int));
-       yldfac_msg_size_y = (int*) calloc(ngrids, sizeof(int));
-       for (p=0; p<ngrids; p++){
-	  yldfac_msg_size_x[p] = ngsl*(nyt[p]+ngsl2)*nzt[p];
-	  num_bytes2 = yldfac_msg_size_x[p]*sizeof(float);
-	  /*fprintf(stdout, "swp_msg_size_x=%d, num_bytes2=%d\n", swp_msg_size_x, num_bytes2);*/
-	  CUCHK(cudaMallocHost((void**)&SL_yldfac[p], num_bytes2));
-	  CUCHK(cudaMallocHost((void**)&SR_yldfac[p], num_bytes2));
-	  CUCHK(cudaMallocHost((void**)&RL_yldfac[p], num_bytes2));
-	  CUCHK(cudaMallocHost((void**)&RR_yldfac[p], num_bytes2));
-
-	  CUCHK(cudaMalloc((void**) &d_SL_yldfac[p], num_bytes2));
-	  CUCHK(cudaMalloc((void**) &d_SR_yldfac[p], num_bytes2));
-	  CUCHK(cudaMalloc((void**) &d_RL_yldfac[p], num_bytes2));
-	  CUCHK(cudaMalloc((void**) &d_RR_yldfac[p], num_bytes2));
-
-	  yldfac_msg_size_y[p] = nxt[p]*ngsl*nzt[p];
-	  num_bytes2 = yldfac_msg_size_y[p]*sizeof(float);
-	  CUCHK(cudaMallocHost((void**)&SF_yldfac[p], num_bytes2));
-	  CUCHK(cudaMallocHost((void**)&SB_yldfac[p], num_bytes2));
-	  CUCHK(cudaMallocHost((void**)&RF_yldfac[p], num_bytes2));
-	  CUCHK(cudaMallocHost((void**)&RB_yldfac[p], num_bytes2));
-
-	  CUCHK(cudaMalloc((void**) &d_SF_yldfac[p], num_bytes2));
-	  CUCHK(cudaMalloc((void**) &d_SB_yldfac[p], num_bytes2));
-	  CUCHK(cudaMalloc((void**) &d_RF_yldfac[p], num_bytes2));
-	  CUCHK(cudaMalloc((void**) &d_RB_yldfac[p], num_bytes2));
-
-       }
-    }
+      SL_vel = (float **)calloc(ngrids, sizeof(float *));
+      SR_vel = (float **)calloc(ngrids, sizeof(float *));
+      RL_vel = (float **)calloc(ngrids, sizeof(float *));
+      RR_vel = (float **)calloc(ngrids, sizeof(float *));
+      SF_vel = (float **)calloc(ngrids, sizeof(float *));
+      SB_vel = (float **)calloc(ngrids, sizeof(float *));
+      RF_vel = (float **)calloc(ngrids, sizeof(float *));
+      RB_vel = (float **)calloc(ngrids, sizeof(float *));
+
+      d_f_u1 = (float **)calloc(ngrids, sizeof(float *));
+      d_f_v1 = (float **)calloc(ngrids, sizeof(float *));
+      d_f_w1 = (float **)calloc(ngrids, sizeof(float *));
+      d_b_u1 = (float **)calloc(ngrids, sizeof(float *));
+      d_b_v1 = (float **)calloc(ngrids, sizeof(float *));
+      d_b_w1 = (float **)calloc(ngrids, sizeof(float *));
+      for (p = 0; p < ngrids; p++)
+      {
+         num_bytes = sizeof(float) * 3 * (ngsl) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMallocHost((void **)&SL_vel[p], num_bytes));
+         CUCHK(cudaMallocHost((void **)&SR_vel[p], num_bytes));
+         CUCHK(cudaMallocHost((void **)&RL_vel[p], num_bytes));
+         CUCHK(cudaMallocHost((void **)&RR_vel[p], num_bytes));
+         num_bytes = sizeof(float) * 3 * (ngsl) * (nxt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMallocHost((void **)&SF_vel[p], num_bytes));
+         CUCHK(cudaMallocHost((void **)&SB_vel[p], num_bytes));
+         CUCHK(cudaMallocHost((void **)&RF_vel[p], num_bytes));
+         CUCHK(cudaMallocHost((void **)&RB_vel[p], num_bytes));
+         num_bytes = sizeof(float) * (ngsl) * (nxt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         CUCHK(cudaMalloc((void **)&d_f_u1[p], num_bytes));
+         CUCHK(cudaMalloc((void **)&d_f_v1[p], num_bytes));
+         CUCHK(cudaMalloc((void **)&d_f_w1[p], num_bytes));
+         CUCHK(cudaMalloc((void **)&d_b_u1[p], num_bytes));
+         CUCHK(cudaMalloc((void **)&d_b_v1[p], num_bytes));
+         CUCHK(cudaMalloc((void **)&d_b_w1[p], num_bytes));
+
+         CUCHK(cudaMemset(d_f_u1[p], 0., num_bytes));
+         CUCHK(cudaMemset(d_f_v1[p], 0., num_bytes));
+         CUCHK(cudaMemset(d_f_w1[p], 0., num_bytes));
+         CUCHK(cudaMemset(d_b_u1[p], 0., num_bytes));
+         CUCHK(cudaMemset(d_b_v1[p], 0., num_bytes));
+         CUCHK(cudaMemset(d_b_w1[p], 0., num_bytes));
+
+         msg_v_size_x[p] = 3 * (ngsl) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+         msg_v_size_y[p] = 3 * (ngsl) * (nxt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+      }
+
+      SetDeviceConstValue(DH, DT, nxt, nyt, nzt, ngrids, fmajor, fminor, Rz, RzT);
+      print_const_H(ngrids);
+
+      CUCHK(cudaStreamCreate(&stream_1));
+      CUCHK(cudaStreamCreate(&stream_2));
+      //cudaStreamCreate(&stream_2b);
+      CUCHK(cudaStreamCreate(&stream_i));
+      CUCHK(cudaStreamCreate(&stream_i2));
+      CUCHK(cudaStreamCreate(&stream_o));
+      //    Delloc3D(tau);
+
+      /*Daniel - yield factor exchange*/
+      if (NVE == 3)
+      {
+         SL_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         SR_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         RL_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         RR_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         SF_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         SB_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         RF_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         RB_yldfac = (float **)calloc(ngrids, sizeof(float *));
+
+         d_SL_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_SR_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_RL_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_RR_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_SF_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_SB_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_RF_yldfac = (float **)calloc(ngrids, sizeof(float *));
+         d_RB_yldfac = (float **)calloc(ngrids, sizeof(float *));
+
+         yldfac_msg_size_x = (int *)calloc(ngrids, sizeof(int));
+         yldfac_msg_size_y = (int *)calloc(ngrids, sizeof(int));
+         for (p = 0; p < ngrids; p++)
+         {
+            yldfac_msg_size_x[p] = ngsl * (nyt[p] + ngsl2) * nzt[p];
+            num_bytes2 = yldfac_msg_size_x[p] * sizeof(float);
+            /*fprintf(stdout, "swp_msg_size_x=%d, num_bytes2=%d\n", swp_msg_size_x, num_bytes2);*/
+            CUCHK(cudaMallocHost((void **)&SL_yldfac[p], num_bytes2));
+            CUCHK(cudaMallocHost((void **)&SR_yldfac[p], num_bytes2));
+            CUCHK(cudaMallocHost((void **)&RL_yldfac[p], num_bytes2));
+            CUCHK(cudaMallocHost((void **)&RR_yldfac[p], num_bytes2));
+
+            CUCHK(cudaMalloc((void **)&d_SL_yldfac[p], num_bytes2));
+            CUCHK(cudaMalloc((void **)&d_SR_yldfac[p], num_bytes2));
+            CUCHK(cudaMalloc((void **)&d_RL_yldfac[p], num_bytes2));
+            CUCHK(cudaMalloc((void **)&d_RR_yldfac[p], num_bytes2));
+
+            yldfac_msg_size_y[p] = nxt[p] * ngsl * nzt[p];
+            num_bytes2 = yldfac_msg_size_y[p] * sizeof(float);
+            CUCHK(cudaMallocHost((void **)&SF_yldfac[p], num_bytes2));
+            CUCHK(cudaMallocHost((void **)&SB_yldfac[p], num_bytes2));
+            CUCHK(cudaMallocHost((void **)&RF_yldfac[p], num_bytes2));
+            CUCHK(cudaMallocHost((void **)&RB_yldfac[p], num_bytes2));
+
+            CUCHK(cudaMalloc((void **)&d_SF_yldfac[p], num_bytes2));
+            CUCHK(cudaMalloc((void **)&d_SB_yldfac[p], num_bytes2));
+            CUCHK(cudaMalloc((void **)&d_RF_yldfac[p], num_bytes2));
+            CUCHK(cudaMalloc((void **)&d_RB_yldfac[p], num_bytes2));
+         }
+      }
 
-    /* Daniel: overlap zone variable exchange for DM */
-    SL_swap=(float**) calloc(ngrids, sizeof(float*));
-    SR_swap=(float**) calloc(ngrids, sizeof(float*));
-    RL_swap=(float**) calloc(ngrids, sizeof(float*));
-    RR_swap=(float**) calloc(ngrids, sizeof(float*));
-    SF_swap=(float**) calloc(ngrids, sizeof(float*));
-    SB_swap=(float**) calloc(ngrids, sizeof(float*));
-    RF_swap=(float**) calloc(ngrids, sizeof(float*));
-    RB_swap=(float**) calloc(ngrids, sizeof(float*));
-
-    d_SL_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_SR_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_RL_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_RR_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_SF_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_SB_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_RF_swap=(float**) calloc(ngrids, sizeof(float*));
-    d_RB_swap=(float**) calloc(ngrids, sizeof(float*));
-
-    swp_msg_size_x = (int*) calloc(ngrids, sizeof(int));
-    swp_msg_size_y = (int*) calloc(ngrids, sizeof(int));
-
-    nswaplev = swaplevmax-swaplevmin+1;
-
-    for (p=0; p<ngrids; p++){
-       swp_msg_size_x[p] = 9*(2+ngsl+WWL)*(nyt[p]+4+ngsl2+2*WWL)*nswaplev;
-       num_bytes2 = swp_msg_size_x[p]*sizeof(float);
-       
-       CUCHK(cudaMallocHost((void**)&SL_swap[p], num_bytes2));
-       CUCHK(cudaMallocHost((void**)&SR_swap[p], num_bytes2));
-       CUCHK(cudaMallocHost((void**)&RL_swap[p], num_bytes2));
-       CUCHK(cudaMallocHost((void**)&RR_swap[p], num_bytes2));
-
-       CUCHK(cudaMalloc((void**) &d_SL_swap[p], num_bytes2));
-       CUCHK(cudaMalloc((void**) &d_SR_swap[p], num_bytes2));
-       CUCHK(cudaMalloc((void**) &d_RL_swap[p], num_bytes2));
-       CUCHK(cudaMalloc((void**) &d_RR_swap[p], num_bytes2));
-
-       for (k=0; k<(int)(num_bytes2/sizeof(float)); k++) SL_swap[p][k] = SR_swap[p][k] = RL_swap[p][k] = RR_swap[p][k] = 0.f;
-
-       //copy zero-allocated arrays to device if GPU arrays remain uninitialized otherwise
-       /*if (x_rank_L < 0) CUCHK(cudaMemcpy(d_RL_swap[p], RL_swap[p], num_bytes2, cudaMemcpyHostToDevice));
+      /* Daniel: overlap zone variable exchange for DM */
+      SL_swap = (float **)calloc(ngrids, sizeof(float *));
+      SR_swap = (float **)calloc(ngrids, sizeof(float *));
+      RL_swap = (float **)calloc(ngrids, sizeof(float *));
+      RR_swap = (float **)calloc(ngrids, sizeof(float *));
+      SF_swap = (float **)calloc(ngrids, sizeof(float *));
+      SB_swap = (float **)calloc(ngrids, sizeof(float *));
+      RF_swap = (float **)calloc(ngrids, sizeof(float *));
+      RB_swap = (float **)calloc(ngrids, sizeof(float *));
+
+      d_SL_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_SR_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_RL_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_RR_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_SF_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_SB_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_RF_swap = (float **)calloc(ngrids, sizeof(float *));
+      d_RB_swap = (float **)calloc(ngrids, sizeof(float *));
+
+      swp_msg_size_x = (int *)calloc(ngrids, sizeof(int));
+      swp_msg_size_y = (int *)calloc(ngrids, sizeof(int));
+
+      nswaplev = swaplevmax - swaplevmin + 1;
+
+      for (p = 0; p < ngrids; p++)
+      {
+         swp_msg_size_x[p] = 9 * (2 + ngsl + WWL) * (nyt[p] + 4 + ngsl2 + 2 * WWL) * nswaplev;
+         num_bytes2 = swp_msg_size_x[p] * sizeof(float);
+
+         CUCHK(cudaMallocHost((void **)&SL_swap[p], num_bytes2));
+         CUCHK(cudaMallocHost((void **)&SR_swap[p], num_bytes2));
+         CUCHK(cudaMallocHost((void **)&RL_swap[p], num_bytes2));
+         CUCHK(cudaMallocHost((void **)&RR_swap[p], num_bytes2));
+
+         CUCHK(cudaMalloc((void **)&d_SL_swap[p], num_bytes2));
+         CUCHK(cudaMalloc((void **)&d_SR_swap[p], num_bytes2));
+         CUCHK(cudaMalloc((void **)&d_RL_swap[p], num_bytes2));
+         CUCHK(cudaMalloc((void **)&d_RR_swap[p], num_bytes2));
+
+         for (k = 0; k < (int)(num_bytes2 / sizeof(float)); k++)
+            SL_swap[p][k] = SR_swap[p][k] = RL_swap[p][k] = RR_swap[p][k] = 0.f;
+
+         //copy zero-allocated arrays to device if GPU arrays remain uninitialized otherwise
+         /*if (x_rank_L < 0) CUCHK(cudaMemcpy(d_RL_swap[p], RL_swap[p], num_bytes2, cudaMemcpyHostToDevice));
        if (x_rank_R < 0) CUCHK(cudaMemcpy(d_RR_swap[p], RR_swap[p], num_bytes2, cudaMemcpyHostToDevice));*/
-       CUCHK(cudaMemset(d_SL_swap[p], 0., num_bytes2));
-       CUCHK(cudaMemset(d_SR_swap[p], 0., num_bytes2));
-       CUCHK(cudaMemset(d_RL_swap[p], 0., num_bytes2));
-       CUCHK(cudaMemset(d_RR_swap[p], 0., num_bytes2));
-
-       swp_msg_size_y[p] = 9*(nxt[p]+4+ngsl2)*(2+ngsl+WWL)*nswaplev;
-       num_bytes2 = swp_msg_size_y[p]*sizeof(float);
-       CUCHK(cudaMallocHost((void**)&SF_swap[p], num_bytes2));
-       CUCHK(cudaMallocHost((void**)&SB_swap[p], num_bytes2));
-       CUCHK(cudaMallocHost((void**)&RF_swap[p], num_bytes2));
-       CUCHK(cudaMallocHost((void**)&RB_swap[p], num_bytes2));
-
-       CUCHK(cudaMalloc((void**) &d_SF_swap[p], num_bytes2));
-       CUCHK(cudaMalloc((void**) &d_SB_swap[p], num_bytes2));
-       CUCHK(cudaMalloc((void**) &d_RF_swap[p], num_bytes2));
-       CUCHK(cudaMalloc((void**) &d_RB_swap[p], num_bytes2));
-
-       for (k=0; k<(int)(num_bytes2/sizeof(float)); k++) SF_swap[p][k] = SB_swap[p][k] = RF_swap[p][k] = RB_swap[p][k] = 0.f;
-
-       //copy zero-allocated arrays to device if GPU arrays remain uninitialized otherwise
-       /*if (y_rank_F < 0) cudaMemcpy(d_RF_swap[p], RF_swap[p], num_bytes2, cudaMemcpyHostToDevice);
+         CUCHK(cudaMemset(d_SL_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_SR_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_RL_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_RR_swap[p], 0., num_bytes2));
+
+         swp_msg_size_y[p] = 9 * (nxt[p] + 4 + ngsl2) * (2 + ngsl + WWL) * nswaplev;
+         num_bytes2 = swp_msg_size_y[p] * sizeof(float);
+         CUCHK(cudaMallocHost((void **)&SF_swap[p], num_bytes2));
+         CUCHK(cudaMallocHost((void **)&SB_swap[p], num_bytes2));
+         CUCHK(cudaMallocHost((void **)&RF_swap[p], num_bytes2));
+         CUCHK(cudaMallocHost((void **)&RB_swap[p], num_bytes2));
+
+         CUCHK(cudaMalloc((void **)&d_SF_swap[p], num_bytes2));
+         CUCHK(cudaMalloc((void **)&d_SB_swap[p], num_bytes2));
+         CUCHK(cudaMalloc((void **)&d_RF_swap[p], num_bytes2));
+         CUCHK(cudaMalloc((void **)&d_RB_swap[p], num_bytes2));
+
+         for (k = 0; k < (int)(num_bytes2 / sizeof(float)); k++)
+            SF_swap[p][k] = SB_swap[p][k] = RF_swap[p][k] = RB_swap[p][k] = 0.f;
+
+         //copy zero-allocated arrays to device if GPU arrays remain uninitialized otherwise
+         /*if (y_rank_F < 0) cudaMemcpy(d_RF_swap[p], RF_swap[p], num_bytes2, cudaMemcpyHostToDevice);
        if (y_rank_B < 0) cudaMemcpy(d_RB_swap[p], RB_swap[p], num_bytes2, cudaMemcpyHostToDevice);*/
-       CUCHK(cudaMemset(d_SF_swap[p], 0., num_bytes2));
-       CUCHK(cudaMemset(d_SB_swap[p], 0., num_bytes2));
-       CUCHK(cudaMemset(d_RF_swap[p], 0., num_bytes2));
-       CUCHK(cudaMemset(d_RB_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_SF_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_SB_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_RF_swap[p], 0., num_bytes2));
+         CUCHK(cudaMemset(d_RB_swap[p], 0., num_bytes2));
 
-       /*swp_msg_size_x_l = 9*(2+ngsl+WWL)*(nytl+4+ngsl2+2*WWL); 
+         /*swp_msg_size_x_l = 9*(2+ngsl+WWL)*(nytl+4+ngsl2+2*WWL); 
        swp_msg_size_y_l = 9*(nxtl+4+8*loop)*(2+4*loop+WWL)*1;*/
 
-       intlev[p] = nzt[p] + align - 3;
-    }
+         intlev[p] = nzt[p] + align - 3;
+      }
 
-            // Initialize grids
-            grids_t grids[MAXGRIDS];
-            int istopo = usetopo;
-            for (p = 0; p < ngrids; p++) {
-                    // Disable topography in grids below the top grid
-                    if (p > 0) istopo = 0;
-                    grids[p] = grids_init(nxt[p], nyt[p], nzt[p], coord[0],
-                                          coord[1], 0, istopo, DH[p]);
-            }
+      // Initialize grids
+      grids_t grids[MAXGRIDS];
+      int istopo = usetopo;
+      for (p = 0; p < ngrids; p++)
+      {
+         // Disable topography in grids below the top grid
+         if (p > 0)
+            istopo = 0;
+         grids[p] = grids_init(nxt[p], nyt[p], nzt[p], coord[0],
+                               coord[1], 0, istopo, DH[p]);
+      }
 
-            f_grid_t *metrics_f = NULL;
+      f_grid_t *metrics_f = NULL;
+      g_grid_t *metrics_g = NULL;
+      struct mapping *map = NULL;
+
+if (usemms) {
+        if (rank == 0) printf("METHOD OF MANUFACTURED SOLUTIONS ENABLED \n");
+        mms_init(MMSFILE, nxt, nyt, nzt, ngrids, 
+        d_d1, d_lam, d_mu, d_qp, d_qs, 
+        d_u1, d_v1, d_w1, d_xx, d_yy, d_zz, d_xy, d_xz, d_yz, coord[0], coord[1], rank, MCW, DH, DT);
+}
 
 #if TOPO
 
-            topo_t T = topo_init(usetopo, INTOPO, 
-                                 rank, 
-                                 x_rank_L, x_rank_R,
-                                 y_rank_F, y_rank_B, coord,
-                                 dim[0], dim[1],
-                                 nxt[0], nyt[0], nzt[0], 
-                                 DT, *DH,
-                                 stream_1, stream_2, stream_i
-                                 );
-            topo_bind(&T, d_d1[0], d_lam[0], d_mu[0], 
-                      d_qp[0], d_coeff, d_qs[0], d_vx1[0], d_vx2[0], d_ww[0],
-                      d_wwo[0], d_xx[0], d_yy[0], d_zz[0],
-                      d_xy[0], d_xz[0], d_yz[0], d_r1[0], d_r2[0], d_r3[0],
-                      d_r4[0], d_r5[0], d_r6[0], d_u1[0], d_v1[0], d_w1[0],
-                      d_f_u1[0], d_f_v1[0], d_f_w1[0], d_b_u1[0], d_b_v1[0],
-                      d_b_w1[0], d_dcrjx[0], d_dcrjy[0], d_dcrjz[0]);
-            topo_init_metrics(&T);
-
-            if (T.use) {
-                topo_init_grid(&T);
-                topo_init_geometry(&T);
-                topo_build(&T);
-                topo_set_constants(&T);
-            }
+      energy_t energy = energy_init(useenergy, rank, MCW, nt, DT, nxt[0], nyt[0], nzt[0], NTISKP);
+      topo_t T = topo_init(usetopo, INTOPO,
+                           rank,
+                           x_rank_L, x_rank_R,
+                           y_rank_F, y_rank_B, coord,
+                           dim[0], dim[1],
+                           nxt[0], nyt[0], nzt[0],
+                           DT, *DH, DHB, DHT,
+                           stream_1, stream_2, stream_i);
+      topo_bind(&T, d_d1[0], d_lam[0], d_mu[0],
+                d_qp[0], d_coeff, d_qs[0], d_vx1[0], d_vx2[0], d_ww[0],
+                d_wwo[0], d_xx[0], d_yy[0], d_zz[0],
+                d_xy[0], d_xz[0], d_yz[0], d_r1[0], d_r2[0], d_r3[0],
+                d_r4[0], d_r5[0], d_r6[0], d_u1[0], d_v1[0], d_w1[0],
+                d_f_u1[0], d_f_v1[0], d_f_w1[0], d_b_u1[0], d_b_v1[0],
+                d_b_w1[0], d_dcrjx[0], d_dcrjy[0], d_dcrjz[0]);
+      topo_init_metrics(&T);
+      map = &T.map;
+
+      if (T.use)
+      {
+         topo_init_grid(&T);
+         topo_init_geometry(&T);
+         topo_build(&T);
+         topo_set_constants(&T);
+
+        topo_write_geometry_vtk(&T, 1);
+      }
+
+#endif
 
+#if VERBOSE
+      if (rank == 0)
+         printf("Initialize source and receivers\n");
+      fflush(stdout);
+#endif
 
+      if (T.use)
+      {
+         metrics_f = &T.metrics_f;
+         metrics_g = &T.metrics_g;
+      }
 
+      if (usesourcefile)
+         sources_init(SOURCEFILE, grids, map, ngrids, metrics_f, metrics_g, MCW, rank,
+                      size_tot);
+      if (userecvfile)
+         receivers_init(RECVFILE, grids, map, ngrids, metrics_f, MCW, rank,
+                        size_tot);
+      if (useforcefile)
+         forces_init(FORCEFILE, grids, map, ngrids, metrics_f, metrics_g, MCW, rank,
+                     size_tot, (float*)d_d1[0], usetopo);
+      if (usesgtfile)
+      {
+         sgt_init(SGTFILE, grids, map, ngrids, metrics_f, MCW, rank,
+                  size_tot);
+         for (p = 0; p < ngrids; p++)
+         {
+            sgt_write_material_properties(d_d1[p], d_lam[p],
+                                          d_mu[p], p, rank);
+         }
+      }
+#if VERBOSE
+      if (rank == 0)
+         printf("done.\n");
+      fflush(stdout);
 #endif
 
+      size_t cmemfreeMin;
+      cudaMemGetInfo(&cmemfree, &cmemtotal);
+      if (sizeof(size_t) == 8)
+         MPI_Reduce(&cmemfree, &cmemfreeMin, 1, MPI_UINT64_T, MPI_MIN, 0, MCW);
+      else
+         MPI_Reduce(&cmemfree, &cmemfreeMin, 1, MPI_UINT32_T, MPI_MIN, 0, MCW);
+      if (rank == 0)
+         printf("CUDA MEMORY: free = %f GB \ttotal = %f GB \n",
+                cmemfreeMin / 1e9, cmemtotal / 1e9);
+
+      if (rank == 0) /* rank 0 re-queries its own device memory for the verbose report */
+      {
+         cudaMemGetInfo(&cmemfree, &cmemtotal);
 #if VERBOSE
-            if(rank == 0)printf("Initialize source and receivers\n");
-            fflush(stdout);
+         printf("CUDA MEMORY: Total=%zu\tAvailable=%zu\n", cmemtotal, cmemfree); /* size_t values require %zu, not %ld */
 #endif
+      }
+
+      for (p = 0; p < ngrids; p++)
+      {
+         receivers_write(d_u1[p], d_v1[p], d_w1[p], 0, nt, p);
+         sgt_write(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 0, nt, p);
+      }
+      sources_read(0);
+
+      if (rank == 0)
+         fchk = fopen(CHKFILE, "a+");
+
+      time_init = gethrtime() - time_init;
+      if (rank == 0)
+         printf("Initialization completed in %f s.\n", time_init);
+      //  Main Loop Starts
+      if (rank == 0)
+      {
+         printf(
+             "Time step \t Time \t\t Elapsed time \t sec per time step \t Time "
+             "steps per sec \t Percentage completed\n");
+         printf(
+             "-----------------------------------------------------------------"
+             "----------------------------------------------------\n");
+      }
+      if (((NPC == 0) || (NPC == 2)) && (NVE == 1 || NVE == 3))
+      {
+         time_un -= gethrtime();
+         //This loop has no overlapping because there is source input
+         for (cur_step = 1; cur_step <= nt; cur_step++)
+         {
+
+            forces_read(cur_step - 1);
+            if (T.use)
+               forces_add(d_u1[0], d_v1[0], d_w1[0], d_d1[0], cur_step - 1, DH[0], DT,
+                          &T.metrics_f, &T.metrics_g, 0);
+            else
+                forces_add_cartesian_velocity(d_u1[0], d_v1[0], d_w1[0], cur_step - 1, nxt[0], nyt[0], nzt[0], DH[0], DT, 0);
 
+             energy_zero(&energy, d_u1[0], d_v1[0], d_w1[0], d_xx[0], d_yy[0], d_zz[0], d_xy[0], d_xz[0], d_yz[0], 0);
+             energy_update_previous_solutions(&energy, d_u1[0], d_v1[0], d_w1[0], d_xx[0], d_yy[0], d_zz[0], d_xy[0], d_xz[0], d_yz[0]);
 
-            if (T.use) {
-                metrics_f = &T.metrics_f;
+            CUCHK(cudaStreamSynchronize(stream_i));
+            CUCHK(cudaStreamSynchronize(stream_1));
+            CUCHK(cudaStreamSynchronize(stream_2));
+            if (cur_step % 10 == 0 && rank == 0)
+            {
+               double elapsed = gethrtime() + time_un;
+               double step_s = elapsed / cur_step;
+               double pcomplete = 100 * (double)cur_step / (double)(nt - 1);
+               printf(
+                   "%ld  \t\t %4.3f s \t %4.3f s \t %3.5f s/step \t %5.3f step/s \t %2.2f %%  \n",
+                   cur_step, cur_step * DT, elapsed, step_s, 1.0 / step_s, pcomplete);
+
+               fflush(stdout);
+               fflush(stderr);
             }
+            CUCHK(cudaGetLastError());
+            //cerr=cudaGetLastError();
+            //
 
-            if (usesourcefile)
-                sources_init(SOURCEFILE, grids, ngrids, metrics_f, MCW, rank,
-                             size_tot);
-            if (userecvfile)
-                receivers_init(RECVFILE, grids, ngrids, metrics_f, MCW, rank,
-                               size_tot);
-            if (useforcefile)
-                forces_init(FORCEFILE, grids, ngrids, metrics_f, MCW, rank,
-                             size_tot);
-            if (usesgtfile) {
-                sgt_init(SGTFILE, grids, ngrids, metrics_f, MCW, rank,
-                             size_tot);
-                for (p = 0; p < ngrids; p++) {
-                        sgt_write_material_properties(d_d1[p], d_lam[p],
-                                                      d_mu[p], p);
-                }
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_u1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "u1", p, cur_step, 6, rank, size);
+               dump_nonzeros(d_v1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "v1", p, cur_step, 6, rank, size);
+               dump_nonzeros(d_w1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "w1", p, cur_step, 6, rank, size);
             }
-#if VERBOSE
-            if(rank == 0)printf("done.\n");
-            fflush(stdout);
+
+            if (cerr != cudaSuccess)
+               printf("CUDA ERROR! rank=%d before timestep: %s\n", rank, cudaGetErrorString(cerr));
+            //pre-post MPI Message
+
+            for (p = 0; p < ngrids; p++)
+            {
+               PostRecvMsg_Y(RF_vel[p], RB_vel[p], MCW, request_y[p], &count_y[p], msg_v_size_y[p], y_rank_F, y_rank_B, p);
+               //PostRecvMsg_X(RL_vel[p], RR_vel[p], MCW, request_x[p], &count_x[p], msg_v_size_x[p], x_rank_L, x_rank_R, p);
+               //velocity computation in y boundary, two ghost cell regions
+               if (!usetopo || p > 0)
+               {
+                  dvelcy_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p],
+                           d_xz[p], d_yz[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           d_d1[p], nxt[p], nzt[p], d_f_u1[p], d_f_v1[p], d_f_w1[p],
+                           stream_1, yfs[p], yfe[p], y_rank_F, p);
+               }
+               else
+               {
+#if TOPO
+                  topo_velocity_front_H(&T);
+#endif
+               }
+               Cpy2Host_VY(d_f_u1[p], d_f_v1[p], d_f_w1[p], SF_vel[p], nxt[p], nzt[p], stream_1, y_rank_F);
+
+               if (!usetopo || p > 0)
+               {
+                  dvelcy_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], d_d1[p], nxt[p], nzt[p],
+                           d_b_u1[p], d_b_v1[p], d_b_w1[p], stream_2, ybs[p], ybe[p], y_rank_B, p);
+               }
+               else
+               {
+#if TOPO
+                  topo_velocity_back_H(&T);
 #endif
+               }
 
-    size_t  cmemfreeMin;
-    cudaMemGetInfo(&cmemfree, &cmemtotal);
-    if(sizeof(size_t)==8) 
-      MPI_Reduce(&cmemfree, &cmemfreeMin, 1, MPI_UINT64_T, MPI_MIN, 0, MCW);
-    else 
-      MPI_Reduce(&cmemfree, &cmemfreeMin, 1, MPI_UINT32_T, MPI_MIN, 0, MCW);
-    if (rank == 0)
-            printf("CUDA MEMORY: free = %f GB \ttotal = %f GB \n",
-                   cmemfreeMin / 1e9, cmemtotal / 1e9);
+               Cpy2Host_VY(d_b_u1[p], d_b_v1[p], d_b_w1[p], SB_vel[p], nxt[p], nzt[p], stream_2, y_rank_B);
 
+               CUCHK(cudaStreamSynchronize(stream_1)); /*these fix sync issues, but not sure why*/
+               CUCHK(cudaStreamSynchronize(stream_2));
 
-    if(rank==0){
-      cudaMemGetInfo(&cmemfree, &cmemtotal);
-#if VERBOSE
-      printf("CUDA MEMORY: Total=%ld\tAvailable=%ld\n",cmemtotal,cmemfree);
+               //usleep(1);
+
+               if (!usetopo || p > 0)
+               {
+                  dvelcx_H_opt(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                               d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], d_d1[p], nyt[p], nzt[p], stream_i, xvs[p], xve[p], p, ngrids);
+               }
+               else
+               {
+#if TOPO
+                  topo_velocity_interior_H(&T);
 #endif
-    }
+               }
+            }
 
-    for (p=0; p<ngrids; p++){
-        receivers_write(d_u1[p], d_v1[p], d_w1[p], 0, nt, p);
-        sgt_write(d_xx[p], d_yy[p], d_zz[p], d_xy[0], d_xz[p], d_yz[p], 0, nt, p);
-    }
-    sources_read(0);
-    forces_read(0);
-
-    if(rank==0)
-      fchk = fopen(CHKFILE,"a+");
-
-    time_init = gethrtime() - time_init;
-    if (rank == 0) printf("Initialization completed in %f s.\n", time_init);
-    //  Main Loop Starts
-    if (rank == 0) {
-    printf(
-        "Time step \t Time \t\t Elapsed time \t sec per time step \t Time "
-        "steps per sec \t Percentage completed\n");
-    printf(
-        "-----------------------------------------------------------------"
-        "----------------------------------------------------\n");
-    }
-    if( ((NPC==0) || (NPC==2)) && (NVE==1 || NVE==3))
-    {
-       time_un  -= gethrtime();
-       //This loop has no loverlapping because there is source input
-       for(cur_step=1;cur_step<=nt;cur_step++)
-       {
-         //CUCHK(cudaDeviceSynchronize());
-         CUCHK(cudaStreamSynchronize(stream_i));
-         CUCHK(cudaStreamSynchronize(stream_1));
-         CUCHK(cudaStreamSynchronize(stream_2));
-         if (cur_step % 10 == 0 && rank == 0) {
-                 double elapsed = gethrtime() + time_un;
-                double step_s = elapsed / cur_step;
-                double pcomplete = 100 * (double) cur_step / (double) (nt - 1);
-                printf(
-                    "%ld  \t\t %4.3f s \t %4.3f s \t %3.5f s/step \t %5.3f step/s \t %2.2f %%  \n",
-                    cur_step, cur_step * DT, elapsed, step_s, 1.0/step_s, pcomplete);
-
-                fflush(stdout);
-                fflush(stderr);
-         }
-         CUCHK(cudaGetLastError());
-         //cerr=cudaGetLastError();
-         
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_u1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "u1", p, cur_step, 6, rank, size);
-	    dump_nonzeros(d_v1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "v1", p, cur_step, 6, rank, size);
-	    dump_nonzeros(d_w1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "w1", p, cur_step, 6, rank, size);
-         }
-         
-         if(cerr!=cudaSuccess) printf("CUDA ERROR! rank=%d before timestep: %s\n",rank,cudaGetErrorString(cerr));
-	 //pre-post MPI Message
-
-	 for (p=0; p<ngrids; p++){
-	    PostRecvMsg_Y(RF_vel[p], RB_vel[p], MCW, request_y[p], &count_y[p], msg_v_size_y[p], y_rank_F, y_rank_B, p);
-	    //PostRecvMsg_X(RL_vel[p], RR_vel[p], MCW, request_x[p], &count_x[p], msg_v_size_x[p], x_rank_L, x_rank_R, p);
-	    //velocity computation in y boundary, two ghost cell regions
-            if (!usetopo || p > 0) {
-	    dvelcy_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p],   d_yy[p],   d_zz[p],   d_xy[p],       
-                     d_xz[p], d_yz[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-		     d_d1[p], nxt[p],  nzt[p],  d_f_u1[p], d_f_v1[p], d_f_w1[p], 
-                     stream_1,   yfs[p],  yfe[p], y_rank_F, p);
-            } else {
-            #if TOPO
-                topo_velocity_front_H(&T);
-            #endif
-            }
-	    Cpy2Host_VY(d_f_u1[p], d_f_v1[p], d_f_w1[p],  SF_vel[p], nxt[p], nzt[p], stream_1, y_rank_F);
-
-            if (!usetopo || p > 0) {
-	    dvelcy_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], d_d1[p], nxt[p],  nzt[p],  
-                     d_b_u1[p], d_b_v1[p], d_b_w1[p], stream_2, ybs[p], ybe[p], y_rank_B, p);
-            } else {
-            #if TOPO
-                topo_velocity_back_H(&T);
-            #endif
-            }
-
-	    Cpy2Host_VY(d_b_u1[p], d_b_v1[p], d_b_w1[p], SB_vel[p], nxt[p], nzt[p], stream_2, y_rank_B);
-
-            CUCHK(cudaStreamSynchronize(stream_1)); /*these fix sync issues, but not sure why*/
-            CUCHK(cudaStreamSynchronize(stream_2)); 
-            //usleep(1);
-
-            if (!usetopo || p > 0) {
-	    dvelcx_H_opt(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], d_d1[p], nyt[p], nzt[p], stream_i, xvs[p],  xve[p], p, ngrids);
-            } else {
-            #if TOPO
-                topo_velocity_interior_H(&T);
-            #endif
-                }
-         }
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_u1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "u1", p, cur_step, 7, rank, size);
+               dump_nonzeros(d_v1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "v1", p, cur_step, 7, rank, size);
+               dump_nonzeros(d_w1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "w1", p, cur_step, 7, rank, size);
+            }
 
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_u1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "u1", p, cur_step, 7, rank, size);
-	    dump_nonzeros(d_v1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "v1", p, cur_step, 7, rank, size);
-	    dump_nonzeros(d_w1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "w1", p, cur_step, 7, rank, size);
-         }
+             energy_zero(&energy, d_u1[0], d_v1[0], d_w1[0], d_xx[0], d_yy[0], d_zz[0], d_xy[0], d_xz[0], d_yz[0], 0);
+            //MPI overlapping velocity computation
 
-         //MPI overlapping velocity computation
+            //velocity communication in y direction
+            CUCHK(cudaStreamSynchronize(stream_1));
+            for (p = 0; p < ngrids; p++)
+            {
+               PostSendMsg_Y(SF_vel[p], SB_vel[p], MCW, request_y[p], &count_y[p],
+                             msg_v_size_y[p], y_rank_F, y_rank_B, rank, Front, p);
+            }
+            CUCHK(cudaStreamSynchronize(stream_2));
+            for (p = 0; p < ngrids; p++)
+            {
+               PostSendMsg_Y(SF_vel[p], SB_vel[p], MCW, request_y[p], &count_y[p],
+                             msg_v_size_y[p], y_rank_F, y_rank_B, rank, Back, p);
+               MPI_Waitall(count_y[p], request_y[p], status_y[p]);
+               Cpy2Device_VY(d_u1[p], d_v1[p], d_w1[p], d_f_u1[p], d_f_v1[p], d_f_w1[p], d_b_u1[p], d_b_v1[p], d_b_w1[p],
+                             RF_vel[p], RB_vel[p], nxt[p], nyt[p],
+                             nzt[p], stream_1, stream_2, y_rank_F, y_rank_B, p);
+            }
 
-         //velocity communication in y direction
-         CUCHK(cudaStreamSynchronize(stream_1));
-	 for (p=0; p<ngrids; p++){
-	    PostSendMsg_Y(SF_vel[p], SB_vel[p], MCW, request_y[p], &count_y[p], 
-               msg_v_size_y[p], y_rank_F, y_rank_B, rank, Front, p);
-         }
-	 CUCHK(cudaStreamSynchronize(stream_2));
-	 for (p=0; p<ngrids; p++){
-	    PostSendMsg_Y(SF_vel[p], SB_vel[p], MCW, request_y[p], &count_y[p], 
-               msg_v_size_y[p], y_rank_F, y_rank_B, rank, Back, p);
-	    MPI_Waitall(count_y[p], request_y[p], status_y[p]);
-	    Cpy2Device_VY(d_u1[p], d_v1[p], d_w1[p], d_f_u1[p], d_f_v1[p], d_f_w1[p], d_b_u1[p], d_b_v1[p], d_b_w1[p],  
-                          RF_vel[p], RB_vel[p], nxt[p], nyt[p], 
-			  nzt[p], stream_1, stream_2, y_rank_F, y_rank_B, p);
-         }
+            CUCHK(cudaStreamSynchronize(stream_i));
+            CUCHK(cudaStreamSynchronize(stream_1));
+            CUCHK(cudaStreamSynchronize(stream_2));
+            /* DM: 2nd order velocity update */
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               dvelc2_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                        d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], d_d1[p], nxt[p], nyt[p], stream_i, p);
+            }
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_u1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "u1", p, cur_step, 0, rank, size);
+               dump_nonzeros(d_v1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "v1", p, cur_step, 0, rank, size);
+               dump_nonzeros(d_w1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "w1", p, cur_step, 0, rank, size);
+            }
 
-         CUCHK(cudaStreamSynchronize(stream_i));
-         CUCHK(cudaStreamSynchronize(stream_1));
-         CUCHK(cudaStreamSynchronize(stream_2));
-         /* DM: 2nd order velocity update */
-	 for (p=0; p<ngrids-1; p++){
-            dvelc2_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], d_d1[p], nxt[p], nyt[p], stream_i, p);
-         }
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_u1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "u1", p, cur_step, 0, rank, size);
-	    dump_nonzeros(d_v1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "v1", p, cur_step, 0, rank, size);
-	    dump_nonzeros(d_w1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "w1", p, cur_step, 0, rank, size);
-         }
+            CUCHK(cudaStreamSynchronize(stream_i));
+            /*swap transition zone data on coarse grid(s)*/
+            for (p = 1; p < ngrids; p++)
+            {
+               Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                   intlev[p], intlev[p], p);
+               CUCHK(cudaStreamSynchronize(stream_i));
+               PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
+               PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B,
+                             rank, Both, p);
+               MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
+               Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                     intlev[p], intlev[p], p);
+               Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                   intlev[p], intlev[p], p);
+               CUCHK(cudaStreamSynchronize(stream_i));
+               PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
+               PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L, x_rank_R,
+                             rank, Both, p);
+               MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
+               Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                     intlev[p], intlev[p], p);
+            }
 
-         CUCHK(cudaStreamSynchronize(stream_i));
-         /*swap transition zone data on coarse grid(s)*/
-	 for (p=1; p<ngrids; p++){
-            Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     intlev[p], intlev[p], p);
             CUCHK(cudaStreamSynchronize(stream_i));
-            PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
-            PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, 
-                     rank, Both, p);
-            MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
-            Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     intlev[p], intlev[p], p);
-            Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     intlev[p], intlev[p], p);
+            for (p = 1; p < ngrids; p++)
+            {
+               intp3d_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                        d_u1[p - 1], d_v1[p - 1], d_w1[p - 1], d_xx[p - 1], d_yy[p - 1], d_zz[p - 1], d_xy[p - 1], d_xz[p - 1], d_yz[p - 1],
+                        nxt[p], nyt[p], rank, stream_i, p);
+            }
             CUCHK(cudaStreamSynchronize(stream_i));
-            PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
-            PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, x_rank_R, 
-                     rank, Both, p);
-            MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
-            Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     intlev[p], intlev[p], p);
-         }
 
-         CUCHK(cudaStreamSynchronize(stream_i));
-	 for (p=1; p<ngrids; p++){
-            intp3d_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-                   d_u1[p-1], d_v1[p-1], d_w1[p-1], d_xx[p-1], d_yy[p-1], d_zz[p-1], d_xy[p-1], d_xz[p-1], d_yz[p-1],
-                   nxt[p], nyt[p], rank, stream_i, p);
-         }
-         CUCHK(cudaStreamSynchronize(stream_i));
-
-         for (p=0; p<ngrids; p++)
-            dump_nonzeros(d_w1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "w1", p, cur_step, 1, rank, size);
-
-         for (p=0; p<ngrids-1; p++){
-	    Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     swaplevmin, swaplevmax, p);
-	    CUCHK(cudaStreamSynchronize(stream_i));
-	    PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
-	    PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, 
-                     rank, Both, p);
-	    MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
-	    Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     swaplevmin, swaplevmax, p);
-
-	    Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     swaplevmin, swaplevmax, p);
-	    CUCHK(cudaStreamSynchronize(stream_i));
-	    swaparea_update_corners(SL_swap[p], SR_swap[p], RF_swap[p], RB_swap[p], nswaplev, WWL, nxt[p], nyt[p]);
-	    PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
-	    PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, 
-                     x_rank_R, rank, Both, p);
-	    MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
-	    Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     swaplevmin, swaplevmax, p);
-         }
+            for (p = 0; p < ngrids; p++)
+               dump_nonzeros(d_w1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "w1", p, cur_step, 1, rank, size);
 
-	 for (p=0; p<ngrids-1; p++){
-            swap_H(d_xx[p+1], d_yy[p+1], d_zz[p+1], d_xy[p+1], d_xz[p+1], d_yz[p+1], d_u1[p+1], d_v1[p+1], d_w1[p+1],
-                   d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_u1[p], d_v1[p], d_w1[p],
-                   nxt[p+1],  nyt[p+1], d_RL_swap[p], d_RR_swap[p], d_RF_swap[p], d_RB_swap[p], rank, stream_i, p);
-         }
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                   swaplevmin, swaplevmax, p);
+               CUCHK(cudaStreamSynchronize(stream_i));
+               PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
+               PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B,
+                             rank, Both, p);
+               MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
+               Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                     swaplevmin, swaplevmax, p);
+
+               Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                   swaplevmin, swaplevmax, p);
+               CUCHK(cudaStreamSynchronize(stream_i));
+               swaparea_update_corners(SL_swap[p], SR_swap[p], RF_swap[p], RB_swap[p], nswaplev, WWL, nxt[p], nyt[p]);
+               PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
+               PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L,
+                             x_rank_R, rank, Both, p);
+               MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
+               Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                     swaplevmin, swaplevmax, p);
+            }
 
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_u1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "u1", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_v1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "v1", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_w1[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "w1", p, cur_step, 2, rank, size);
-         }
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               swap_H(d_xx[p + 1], d_yy[p + 1], d_zz[p + 1], d_xy[p + 1], d_xz[p + 1], d_yz[p + 1], d_u1[p + 1], d_v1[p + 1], d_w1[p + 1],
+                      d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_u1[p], d_v1[p], d_w1[p],
+                      nxt[p + 1], nyt[p + 1], d_RL_swap[p], d_RR_swap[p], d_RF_swap[p], d_RB_swap[p], rank, stream_i, p);
+            }
 
-         CUCHK(cudaStreamSynchronize(stream_i));
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_u1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "u1", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_v1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "v1", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_w1[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "w1", p, cur_step, 2, rank, size);
+            }
 
-	 for (p=0; p<ngrids; p++){
-            dump_all_data(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xz[p], d_yz[p], d_xy[p], 
-                     nel[p], cur_step, 0, p, rank, size);
-         }
+             energy_zero(&energy, d_u1[0], d_v1[0], d_w1[0], d_xx[0], d_yy[0], d_zz[0], d_xy[0], d_xz[0], d_yz[0], 0);
 
-         if((rank==srcproc[0]) && (IFAULT == 4))
-         {
-            fprintf(stdout, "calling frcvel_H\n");
-            ++source_step;
-            frcvel_H(source_step, READ_STEP_GPU, maxdim, d_tpsrc[0], npsrc[0], fbc_tskp, stream_i, d_taxx[0], d_tayy[0], 
-                 d_tazz[0], d_taxz[0], d_tayz[0], d_taxy[0], d_u1[0], d_v1[0], d_w1[0], -1, -1, 0);
-         }
-         CUCHK(cudaStreamSynchronize(stream_i));
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_xx[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xx", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_yy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yy", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_zz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "zz", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_xy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xy", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_xz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xz", p, cur_step, 2, rank, size);
-	    dump_nonzeros(d_yz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yz", p, cur_step, 2, rank, size);
-         }
+            CUCHK(cudaStreamSynchronize(stream_i));
 
-         for (p=0; p<ngrids; p++){ 
-   	    PostRecvMsg_X(RL_vel[p], RR_vel[p], MCW, request_x[p], &count_x[p], msg_v_size_x[p], x_rank_L, x_rank_R, p);
-         }
-         if (NVE < 3){
-   	    //stress computation in full inside region
-   	    for (p=usetopo; p<ngrids; p++){
-               CUCHK(cudaStreamSynchronize(stream_i));
-	       dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-			d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
-			d_u1[p], d_v1[p], d_w1[p], d_lam[p],
-			d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-			nyt[p],  nzt[p],  stream_i, d_lam_mu[p],
-			d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
-			NX*grdfct[p], NPC,  coord[0], coord[1],   xss2[p],  xse2[p],
-			yls[p],  yre[p], p);
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_all_data(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xz[p], d_yz[p], d_xy[p],
+                             nel[p], cur_step, 0, p, rank, size);
+            }
+
+            if ((rank == srcproc[0]) && (IFAULT == 4))
+            {
+               fprintf(stdout, "calling frcvel_H\n");
+               ++source_step;
+               frcvel_H(source_step, READ_STEP_GPU, maxdim, d_tpsrc[0], npsrc[0], fbc_tskp, stream_i, d_taxx[0], d_tayy[0],
+                        d_tazz[0], d_taxz[0], d_tayz[0], d_taxy[0], d_u1[0], d_v1[0], d_w1[0], -1, -1, 0);
+            }
+            CUCHK(cudaStreamSynchronize(stream_i));
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_xx[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xx", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_yy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yy", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_zz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "zz", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_xy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xy", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_xz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xz", p, cur_step, 2, rank, size);
+               dump_nonzeros(d_yz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yz", p, cur_step, 2, rank, size);
+            }
+
+
+               if (usemms) {
+                       float t =  DT * (cur_step);
+                        for (p = 0; p < ngrids; p++) {
+                       mms_force_velocity(d_u1[p], d_v1[p], d_w1[p], nxt[p],
+                                          nyt[p], nzt[p], DH[p], coord[0],
+                                          coord[1], p, t + 0.5 * DT , DT);
+                        }
+
+                        p = ngrids - 1;
+                        //// Exact solution at bottom boundary
+                        //mms_exact_velocity(d_u1[p], d_v1[p], d_w1[p], 
+                        //nxt[p], nyt[p], nzt[p], coord[0], coord[1], p, 0, 0, 0,
+                        //4 + 2 * ngsl + nxt[p], 4 + 2 * ngsl + nyt[p], 8, DH[p], t);
+
+                        // Exact solution at top boundary
+                        //mms_exact_velocity(d_u1[p], d_v1[p], d_w1[p], 
+                        //    nxt[p], nyt[p], nzt[p], coord[0], coord[1],
+                        //    p, 50 , 50, 16, 4 + 2 * ngsl + nxt[p] - 50,
+                        //    4 + 2 * ngsl + nyt[p] - 50, nzt[p] - 16, DH[p], t, 0);
             }
 
+             
+            energy_zero(&energy, d_u1[0], d_v1[0], d_w1[0], d_xx[0], d_yy[0], d_zz[0], d_xy[0], d_xz[0], d_yz[0], 0);
+
+            for (p = 0; p < ngrids; p++)
+            {
+               PostRecvMsg_X(RL_vel[p], RR_vel[p], MCW, request_x[p], &count_x[p], msg_v_size_x[p], x_rank_L, x_rank_R, p);
+            }
+            if (NVE < 3)
+            {
+               //stress computation in full inside region
+               for (p = usetopo; p < ngrids; p++)
+               {
+                  CUCHK(cudaStreamSynchronize(stream_i));
+                  dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
+                           d_u1[p], d_v1[p], d_w1[p], d_lam[p],
+                           d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           nyt[p], nzt[p], stream_i, d_lam_mu[p],
+                           d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                           NX * grdfct[p], NPC, coord[0], coord[1], xss2[p], xse2[p],
+                           yls[p], yre[p], p);
+               }
+
 #if TOPO
-            topo_stress_interior_H(&T);
+               topo_stress_interior_H(&T);
 #endif
-         }
-         else {
-   	    //stress computation in part of the inside region
-   	    for (p=usetopo; p<ngrids; p++){
-	       dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-			d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
-			d_u1[p], d_v1[p], d_w1[p], d_lam[p],
-			d_mu[p], d_qp[p],d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-			nyt[p],  nzt[p],  stream_i, d_lam_mu[p],
-			d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
-			NX*grdfct[p], NPC,  coord[0], coord[1],   xss2[p],  xse2[p],
-			yls[p],  yls2[p]-1, p);
-	       dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-			d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
-			d_u1[p], d_v1[p], d_w1[p], d_lam[p],
-			d_mu[p], d_qp[p],d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-			nyt[p],  nzt[p],  stream_i2, d_lam_mu[p],
-			d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
-			NX*grdfct[p], NPC,  coord[0], coord[1],   xss2[p],  xse2[p],
-			yre2[p]+1, yre[p], p);
-           }
-         }
 
-         //dump_all_stresses(d_xx, d_yy, d_zz, d_xz, d_yz, d_xy, nel, 'u', cur_step, 0, rank, size);
+            }
+            else
+            {
+               //stress computation in part of the inside region
+               for (p = usetopo; p < ngrids; p++)
+               {
+                  dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
+                           d_u1[p], d_v1[p], d_w1[p], d_lam[p],
+                           d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           nyt[p], nzt[p], stream_i, d_lam_mu[p],
+                           d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                           NX * grdfct[p], NPC, coord[0], coord[1], xss2[p], xse2[p],
+                           yls[p], yls2[p] - 1, p);
+                  dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
+                           d_u1[p], d_v1[p], d_w1[p], d_lam[p],
+                           d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           nyt[p], nzt[p], stream_i2, d_lam_mu[p],
+                           d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                           NX * grdfct[p], NPC, coord[0], coord[1], xss2[p], xse2[p],
+                           yre2[p] + 1, yre[p], p);
+               }
+            }
 
-   	 for (p=0; p<ngrids; p++){
-	    Cpy2Host_VX(d_u1[p], d_v1[p], d_w1[p], SL_vel[p], nxt[p], nyt[p], nzt[p], stream_1, x_rank_L, Left);
-	    Cpy2Host_VX(d_u1[p], d_v1[p], d_w1[p], SR_vel[p], nxt[p], nyt[p], nzt[p], stream_2, x_rank_R, Right);
-         }
+            //dump_all_stresses(d_xx, d_yy, d_zz, d_xz, d_yz, d_xy, nel, 'u', cur_step, 0, rank, size);
 
-         //velocity communication in x direction
-         CUCHK(cudaStreamSynchronize(stream_1));
-   	 for (p=0; p<ngrids; p++){
-	    PostSendMsg_X(SL_vel[p], SR_vel[p], MCW, request_x[p], &count_x[p], 
-	       msg_v_size_x[p], x_rank_L, x_rank_R, rank, Left, p);
-         }
-         CUCHK(cudaStreamSynchronize(stream_2));
-   	 for (p=0; p<ngrids; p++){
-	    PostSendMsg_X(SL_vel[p], SR_vel[p], MCW, request_x[p], &count_x[p], 
-               msg_v_size_x[p], x_rank_L, x_rank_R, rank, Right, p);
-	    MPI_Waitall(count_x[p], request_x[p], status_x[p]);
-
-	    Cpy2Device_VX(d_u1[p], d_v1[p], d_w1[p], RL_vel[p], RR_vel[p], nxt[p], nyt[p], nzt[p], 
-               stream_1, stream_2, x_rank_L, x_rank_R);
-
-            if (!usetopo || p > 0) { 
-	    dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-		     d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
-		     d_u1[p], d_v1[p], d_w1[p], d_lam[p],
-		     d_mu[p], d_qp[p],d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-		     nyt[p],  nzt[p],  stream_1, d_lam_mu[p],
-		     d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
-		     NX*grdfct[p], NPC,  coord[0], coord[1],   xss1[p],  xse1[p],
-		     yls[p],  yre[p], p);
-	    dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-		     d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
-		     d_u1[p], d_v1[p], d_w1[p], d_lam[p],
-		     d_mu[p], d_qp[p],d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-		     nyt[p],  nzt[p],  stream_2, d_lam_mu[p],
-		     d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
-		     NX*grdfct[p], NPC,  coord[0], coord[1],   xss3[p],  xse3[p],
-		     yls[p],  yre[p], p);
-            } else {
-                    topo_stress_left_H(&T);
-                    topo_stress_right_H(&T);
+            for (p = 0; p < ngrids; p++)
+            {
+               Cpy2Host_VX(d_u1[p], d_v1[p], d_w1[p], SL_vel[p], nxt[p], nyt[p], nzt[p], stream_1, x_rank_L, Left);
+               Cpy2Host_VX(d_u1[p], d_v1[p], d_w1[p], SR_vel[p], nxt[p], nyt[p], nzt[p], stream_2, x_rank_R, Right);
             }
-         }
-         CUCHK(cudaDeviceSynchronize());
-         
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_xx[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xx", p, cur_step, 8, rank, size);
-	    dump_nonzeros(d_yy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yy", p, cur_step, 8, rank, size);
-	    dump_nonzeros(d_zz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "zz", p, cur_step, 8, rank, size);
-	    dump_nonzeros(d_xy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xy", p, cur_step, 8, rank, size);
-	    dump_nonzeros(d_xz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xz", p, cur_step, 8, rank, size);
-	    dump_nonzeros(d_yz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yz", p, cur_step, 8, rank, size);
-         }
 
-         for (p=0; p<ngrids-1; p++){
-            dstrqc2_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],    
-                      d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p], d_u1[p], d_v1[p], d_w1[p],
-                      d_lam[p], d_mu[p], d_qp[p], d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p], 
-                      nxt[p], nyt[p], stream_i, d_coeff, d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p], 
-                      xss1[p],  xse3[p], yls[p],  yre[p], p);
-	    /*print_nan_H(d_xx[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "xx");
+            //velocity communication in x direction
+            CUCHK(cudaStreamSynchronize(stream_1));
+            for (p = 0; p < ngrids; p++)
+            {
+               PostSendMsg_X(SL_vel[p], SR_vel[p], MCW, request_x[p], &count_x[p],
+                             msg_v_size_x[p], x_rank_L, x_rank_R, rank, Left, p);
+            }
+            CUCHK(cudaStreamSynchronize(stream_2));
+            for (p = 0; p < ngrids; p++)
+            {
+               PostSendMsg_X(SL_vel[p], SR_vel[p], MCW, request_x[p], &count_x[p],
+                             msg_v_size_x[p], x_rank_L, x_rank_R, rank, Right, p);
+               MPI_Waitall(count_x[p], request_x[p], status_x[p]);
+
+               Cpy2Device_VX(d_u1[p], d_v1[p], d_w1[p], RL_vel[p], RR_vel[p], nxt[p], nyt[p], nzt[p],
+                             stream_1, stream_2, x_rank_L, x_rank_R);
+
+               if (!usetopo || p > 0)
+               {
+                  dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
+                           d_u1[p], d_v1[p], d_w1[p], d_lam[p],
+                           d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           nyt[p], nzt[p], stream_1, d_lam_mu[p],
+                           d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                           NX * grdfct[p], NPC, coord[0], coord[1], xss1[p], xse1[p],
+                           yls[p], yre[p], p);
+                  dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
+                           d_u1[p], d_v1[p], d_w1[p], d_lam[p],
+                           d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           nyt[p], nzt[p], stream_2, d_lam_mu[p],
+                           d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                           NX * grdfct[p], NPC, coord[0], coord[1], xss3[p], xse3[p],
+                           yls[p], yre[p], p);
+               }
+               else
+               {
+                  topo_stress_left_H(&T);
+                  topo_stress_right_H(&T);
+               }
+            }
+            CUCHK(cudaDeviceSynchronize());
+
+
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_xx[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xx", p, cur_step, 8, rank, size);
+               dump_nonzeros(d_yy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yy", p, cur_step, 8, rank, size);
+               dump_nonzeros(d_zz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "zz", p, cur_step, 8, rank, size);
+               dump_nonzeros(d_xy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xy", p, cur_step, 8, rank, size);
+               dump_nonzeros(d_xz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xz", p, cur_step, 8, rank, size);
+               dump_nonzeros(d_yz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yz", p, cur_step, 8, rank, size);
+            }
+
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               dstrqc2_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                         d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p], d_u1[p], d_v1[p], d_w1[p],
+                         d_lam[p], d_mu[p], d_qp[p], d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                         nxt[p], nyt[p], stream_i, d_coeff, d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                         xss1[p], xse3[p], yls[p], yre[p], p);
+               /*print_nan_H(d_xx[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "xx");
 	    print_nan_H(d_yy[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "yy");
 	    print_nan_H(d_zz[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "zz");
 	    print_nan_H(d_xy[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "xy");
 	    print_nan_H(d_xz[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "xz");
 	    print_nan_H(d_yz[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), "yz");*/
-         }
+            }
 
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_xx[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xx", p, cur_step, 3, rank, size);
-	    dump_nonzeros(d_yy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yy", p, cur_step, 3, rank, size);
-	    dump_nonzeros(d_zz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "zz", p, cur_step, 3, rank, size);
-	    dump_nonzeros(d_xy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xy", p, cur_step, 3, rank, size);
-	    dump_nonzeros(d_xz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xz", p, cur_step, 3, rank, size);
-	    dump_nonzeros(d_yz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yz", p, cur_step, 3, rank, size);
-         }
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_xx[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xx", p, cur_step, 3, rank, size);
+               dump_nonzeros(d_yy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yy", p, cur_step, 3, rank, size);
+               dump_nonzeros(d_zz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "zz", p, cur_step, 3, rank, size);
+               dump_nonzeros(d_xy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xy", p, cur_step, 3, rank, size);
+               dump_nonzeros(d_xz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xz", p, cur_step, 3, rank, size);
+               dump_nonzeros(d_yz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yz", p, cur_step, 3, rank, size);
+            }
 
-         CUCHK(cudaDeviceSynchronize());
+            CUCHK(cudaDeviceSynchronize());
 
-         /*swap transition zone data on coarse grid(s)*/
-	 for (p=1; p<ngrids; p++){
-            Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     intlev[p], intlev[p], p);
-            cudaDeviceSynchronize();
-            PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
-            PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, 
-                     rank, Both, p);
-            MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
-            Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     intlev[p], intlev[p], p);
-            Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     intlev[p], intlev[p], p);
-            cudaDeviceSynchronize();
-            PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
-            PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, x_rank_R, 
-                     rank, Both, p);
-            MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
-            Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-                     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     intlev[p], intlev[p], p);
-         }
+            /* Swap transition-zone data on coarse grid(s): Y exchange, then X exchange; every device sync is error-checked via CUCHK. */
+            for (p = 1; p < ngrids; p++)
+            {
+               Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                   intlev[p], intlev[p], p);
+               CUCHK(cudaDeviceSynchronize());
+               PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
+               PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B,
+                             rank, Both, p);
+               MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
+               Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                     intlev[p], intlev[p], p);
+               Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                   intlev[p], intlev[p], p);
+               CUCHK(cudaDeviceSynchronize());
+               PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
+               PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L, x_rank_R,
+                             rank, Both, p);
+               MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
+               Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                     intlev[p], intlev[p], p);
+            }
 
-         sources_read(cur_step);
-         if (T.use) {
-                 forces_read(cur_step);
-                 sources_add_curvilinear(d_xx[0], d_yy[0], d_zz[0], d_xy[0],
-                                         d_xz[0], d_yz[0], cur_step, DH[0], DT,
-                                         &T.metrics_f, &T.metrics_g, 0);
-                 forces_add(d_u1[0], d_v1[0], d_w1[0], d_d1[0], cur_step, DH[0], DT,
-                            &T.metrics_f, &T.metrics_g, 0);
-                 for (p = 1; p < ngrids; p++) {
-                         sources_add_cartesian(d_xx[p], d_yy[p], d_zz[p],
-                                               d_xy[p], d_xz[p], d_yz[p],
-                                               cur_step, DH[p], DT, p);
-                }
-         } else {
-                for (p=0; p<ngrids; p++){
-                        sources_add_cartesian(d_xx[p], d_yy[p], d_zz[p],
-                                              d_xy[p], d_xz[p], d_yz[p],
-                                              cur_step, DH[p], DT, p);
-                }
-         }
-         //update source input
-         if ((IFAULT < 4) && (cur_step<NST)) {
-            CUCHK(cudaDeviceSynchronize());
-            ++source_step;
-            for (p=0; p<ngrids; p++){ 
-               if (rank==srcproc[p])
-	       addsrc_H(source_step, READ_STEP_GPU, maxdim, d_tpsrc[p], npsrc[p], stream_i, 
-			d_taxx[p], d_tayy[p], d_tazz[p], d_taxz[p], d_tayz[p], d_taxy[p],
-			d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_yz[p], d_xz[p], p);
+            sources_read(cur_step);
+            //forces_read(cur_step);
+            if (T.use)
+            {
+               sources_add_curvilinear(d_xx[0], d_yy[0], d_zz[0], d_xy[0],
+                                       d_xz[0], d_yz[0], cur_step, DH[0], DT,
+                                       &T.metrics_f, &T.metrics_g, 0);
+               //forces_add(d_u1[0], d_v1[0], d_w1[0], d_d1[0], cur_step, DH[0], DT,
+               //           &T.metrics_f, &T.metrics_g, 0);
+               for (p = 1; p < ngrids; p++)
+               {
+                  sources_add_cartesian(d_xx[p], d_yy[p], d_zz[p],
+                                        d_xy[p], d_xz[p], d_yz[p],
+                                        cur_step, DH[p], DT, p);
+               }
             }
-	    for (p=0; p<ngrids; p++){
-	       dump_nonzeros(d_xx[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xx", p, cur_step, 1, rank, size);
-	       dump_nonzeros(d_yy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yy", p, cur_step, 1, rank, size);
-	       dump_nonzeros(d_zz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "zz", p, cur_step, 1, rank, size);
-	       dump_nonzeros(d_xy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xy", p, cur_step, 1, rank, size);
-	       dump_nonzeros(d_xz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xz", p, cur_step, 1, rank, size);
-	       dump_nonzeros(d_yz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yz", p, cur_step, 1, rank, size);
-	    }
-         }
-         else if (IFAULT == 5) {
-            CUCHK(cudaDeviceSynchronize());
-            for (p=0; p<ngrids; p++){ 
-               if (rank==srcproc[p])
-	       addkinsrc_H(cur_step, maxdim, d_tpsrc[p], npsrc[p], stream_i, d_mu[p],
-			d_taxx[p], d_tayy[p], d_tazz[p], d_taxz[p], d_tayz[p], d_taxy[p],
-			d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_yz[p], d_xz[p], d_mom[p], d_srcfilt_d[p], p);
+            else
+            {
+               for (p = 0; p < ngrids; p++)
+               {
+                  sources_add_cartesian(d_xx[p], d_yy[p], d_zz[p],
+                                        d_xy[p], d_xz[p], d_yz[p],
+                                        cur_step, DH[p], DT, p);
+               }
             }
-         }
-         else if ((IFAULT == 6) && (cur_step<NST)) {
+            //update source input
+            if ((IFAULT < 4) && (cur_step < NST))
+            {
+               CUCHK(cudaDeviceSynchronize());
+               ++source_step;
+               for (p = 0; p < ngrids; p++)
+               {
+                  if (rank == srcproc[p])
+                     addsrc_H(source_step, READ_STEP_GPU, maxdim, d_tpsrc[p], npsrc[p], stream_i,
+                              d_taxx[p], d_tayy[p], d_tazz[p], d_taxz[p], d_tayz[p], d_taxy[p],
+                              d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_yz[p], d_xz[p], p);
+               }
+               for (p = 0; p < ngrids; p++)
+               {
+                  dump_nonzeros(d_xx[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xx", p, cur_step, 1, rank, size);
+                  dump_nonzeros(d_yy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yy", p, cur_step, 1, rank, size);
+                  dump_nonzeros(d_zz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "zz", p, cur_step, 1, rank, size);
+                  dump_nonzeros(d_xy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xy", p, cur_step, 1, rank, size);
+                  dump_nonzeros(d_xz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xz", p, cur_step, 1, rank, size);
+                  dump_nonzeros(d_yz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yz", p, cur_step, 1, rank, size);
+               }
+            }
+            else if (IFAULT == 5)
+            {
+               CUCHK(cudaDeviceSynchronize());
+               for (p = 0; p < ngrids; p++)
+               {
+                  if (rank == srcproc[p])
+                     addkinsrc_H(cur_step, maxdim, d_tpsrc[p], npsrc[p], stream_i, d_mu[p],
+                                 d_taxx[p], d_tayy[p], d_tazz[p], d_taxz[p], d_tayz[p], d_taxy[p],
+                                 d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_yz[p], d_xz[p], d_mom[p], d_srcfilt_d[p], p);
+               }
+            }
+            else if ((IFAULT == 6) && (cur_step < NST))
+            {
+               CUCHK(cudaDeviceSynchronize());
+               p = ngrids - 1;
+               addplanesrc_H(cur_step, maxdim, NST, stream_i, d_mu[p], d_lam[p], ND * grdfct[p], nxt[p], nyt[p],
+                             d_taxx[p], d_tayy[p], d_tazz[p],
+                             d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_yz[p], d_xz[p], p);
+            }
+
             CUCHK(cudaDeviceSynchronize());
-	    p=ngrids-1;
-	    addplanesrc_H(cur_step, maxdim, NST, stream_i, d_mu[p],d_lam[p], ND*grdfct[p], nxt[p], nyt[p],
-		     d_taxx[p], d_tayy[p], d_tazz[p],
-		     d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_yz[p], d_xz[p], p);
-         }
 
-         CUCHK(cudaDeviceSynchronize());
+            for (p = 1; p < ngrids; p++)
+            {
+               intp3d_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                        d_u1[p - 1], d_v1[p - 1], d_w1[p - 1], d_xx[p - 1], d_yy[p - 1], d_zz[p - 1], d_xy[p - 1], d_xz[p - 1], d_yz[p - 1],
+                        nxt[p], nyt[p], rank, stream_i, p);
+            }
+            CUCHK(cudaDeviceSynchronize());
 
-	 for (p=1; p<ngrids; p++){
-            intp3d_H(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-                   d_u1[p-1], d_v1[p-1], d_w1[p-1], d_xx[p-1], d_yy[p-1], d_zz[p-1], d_xy[p-1], d_xz[p-1], d_yz[p-1],
-                   nxt[p], nyt[p], rank, stream_i, p);
-         }
-         CUCHK(cudaDeviceSynchronize());
- 
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_xz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xz", p, cur_step, 4, rank, size);
-	    dump_nonzeros(d_yz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yz", p, cur_step, 4, rank, size);
-         }
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_xz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xz", p, cur_step, 4, rank, size);
+               dump_nonzeros(d_yz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yz", p, cur_step, 4, rank, size);
+            }
 
-         for (p=0; p<ngrids-1; p++){
-	    Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     swaplevmin, swaplevmax, p);
-	    cudaDeviceSynchronize();
-	    PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
-	    PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp+p, swp_msg_size_y[p], y_rank_F, y_rank_B, 
-                     rank, Both, p);
-	    MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
-	    Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B, 
-                     swaplevmin, swaplevmax, p);
-
-	    Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     swaplevmin, swaplevmax, p);
-	    cudaDeviceSynchronize();
-	    swaparea_update_corners(SL_swap[p], SR_swap[p], RF_swap[p], RB_swap[p], nswaplev, WWL, nxt[p], nyt[p]);
-	    PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
-	    PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp+p, swp_msg_size_x[p], x_rank_L, 
-                     x_rank_R, rank, Both, p);
-	    MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
-	    Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], 
-		     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R, 
-                     swaplevmin, swaplevmax, p);
-         }
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               Cpy2Host_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SF_swap[p], SB_swap[p], d_SF_swap[p], d_SB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                   swaplevmin, swaplevmax, p);
+               cudaDeviceSynchronize();
+               PostRecvMsg_Y(RF_swap[p], RB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B, p);
+               PostSendMsg_Y(SF_swap[p], SB_swap[p], MCW, request_y_swp[p], count_y_swp + p, swp_msg_size_y[p], y_rank_F, y_rank_B,
+                             rank, Both, p);
+               MPI_Waitall(count_y_swp[p], request_y_swp[p], status_y_swp[p]);
+               Cpy2Device_swaparea_Y(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RF_swap[p], RB_swap[p], d_RF_swap[p], d_RB_swap[p], nxt[p], stream_i, stream_i, y_rank_F, y_rank_B,
+                                     swaplevmin, swaplevmax, p);
+
+               Cpy2Host_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                   SL_swap[p], SR_swap[p], d_SL_swap[p], d_SR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                   swaplevmin, swaplevmax, p);
+               cudaDeviceSynchronize();
+               swaparea_update_corners(SL_swap[p], SR_swap[p], RF_swap[p], RB_swap[p], nswaplev, WWL, nxt[p], nyt[p]);
+               PostRecvMsg_X(RL_swap[p], RR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L, x_rank_R, p);
+               PostSendMsg_X(SL_swap[p], SR_swap[p], MCW, request_x_swp[p], count_x_swp + p, swp_msg_size_x[p], x_rank_L,
+                             x_rank_R, rank, Both, p);
+               MPI_Waitall(count_x_swp[p], request_x_swp[p], status_x_swp[p]);
+               Cpy2Device_swaparea_X(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                                     RL_swap[p], RR_swap[p], d_RL_swap[p], d_RR_swap[p], nyt[p], stream_i, stream_i, x_rank_L, x_rank_R,
+                                     swaplevmin, swaplevmax, p);
+            }
 
-         /*dump_local_variable(SF_swap[0], swp_msg_size_y[0], "SF_swap", 'h', cur_step, 4, rank, size);
+            /*dump_local_variable(SF_swap[0], swp_msg_size_y[0], "SF_swap", 'h', cur_step, 4, rank, size);
          dump_local_variable(RB_swap[0], swp_msg_size_y[0], "RB_swap", 'h', cur_step, 4, rank, size);*/
 
-	 for (p=0; p<ngrids-1; p++){
-            swap_H(d_xx[p+1], d_yy[p+1], d_zz[p+1], d_xy[p+1], d_xz[p+1], d_yz[p+1], d_u1[p+1], d_v1[p+1], d_w1[p+1],
-                   d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_u1[p], d_v1[p], d_w1[p],
-                   nxt[p+1],  nyt[p+1], d_RL_swap[p], d_RR_swap[p], d_RF_swap[p], d_RB_swap[p], rank, stream_i, p);
-         }
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               swap_H(d_xx[p + 1], d_yy[p + 1], d_zz[p + 1], d_xy[p + 1], d_xz[p + 1], d_yz[p + 1], d_u1[p + 1], d_v1[p + 1], d_w1[p + 1],
+                      d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_u1[p], d_v1[p], d_w1[p],
+                      nxt[p + 1], nyt[p + 1], d_RL_swap[p], d_RR_swap[p], d_RF_swap[p], d_RB_swap[p], rank, stream_i, p);
+            }
 
-         for (p=0; p<ngrids; p++){
-	    dump_nonzeros(d_xx[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xx", p, cur_step, 5, rank, size);
-	    dump_nonzeros(d_yy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yy", p, cur_step, 5, rank, size);
-	    dump_nonzeros(d_zz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "zz", p, cur_step, 5, rank, size);
-	    dump_nonzeros(d_xy[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xy", p, cur_step, 5, rank, size);
-	    dump_nonzeros(d_xz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "xz", p, cur_step, 5, rank, size);
-	    dump_nonzeros(d_yz[p], nxt[p]+4+8*loop, nyt[p]+4+8*loop, nzt[p]+2*align, "yz", p, cur_step, 5, rank, size);
-         }
+            for (p = 0; p < ngrids; p++)
+            {
+               dump_nonzeros(d_xx[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xx", p, cur_step, 5, rank, size);
+               dump_nonzeros(d_yy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yy", p, cur_step, 5, rank, size);
+               dump_nonzeros(d_zz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "zz", p, cur_step, 5, rank, size);
+               dump_nonzeros(d_xy[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xy", p, cur_step, 5, rank, size);
+               dump_nonzeros(d_xz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "xz", p, cur_step, 5, rank, size);
+               dump_nonzeros(d_yz[p], nxt[p] + 4 + 8 * loop, nyt[p] + 4 + 8 * loop, nzt[p] + 2 * align, "yz", p, cur_step, 5, rank, size);
+            }
 
-         CUCHK(cudaStreamSynchronize(stream_i));
+            CUCHK(cudaStreamSynchronize(stream_i));
 
-	 for (p=0; p<ngrids-1; p++){
-            dump_all_data(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xz[p], d_yz[p], d_xy[p], 
-                     nel[p], cur_step, 1, p, rank, size);
-         }
+            for (p = 0; p < ngrids - 1; p++)
+            {
+               dump_all_data(d_u1[p], d_v1[p], d_w1[p], d_xx[p], d_yy[p], d_zz[p], d_xz[p], d_yz[p], d_xy[p],
+                             nel[p], cur_step, 1, p, rank, size);
+            }
 
-         // plasticity related calls:
-         if(NVE==3){
-           CUCHK(cudaDeviceSynchronize());
-
-           //cudaStreamSynchronize(stream_i);
-           for (p=0; p<ngrids; p++){
-	      PostRecvMsg_Y(RF_yldfac[p], RB_yldfac[p], MCW, request_y_yldfac[p], &count_y_yldfac[p], 
-		 yldfac_msg_size_y[p], y_rank_F, y_rank_B, p);
-	      PostRecvMsg_X(RL_yldfac[p], RR_yldfac[p], MCW, request_x_yldfac[p], &count_x_yldfac[p], 
-		 yldfac_msg_size_x[p], x_rank_L, x_rank_R, p);
-
-	      //yield factor computation, front and back
-	      drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
-		 d_sigma2[p], d_yldfac[p],d_cohes[p], d_phi[p], d_neta[p], 
-		 nzt[p], xlsp[p], xrep[p], ylsp[p], ylsp[p]+ngsl, stream_1, p);
-	      drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
-		 d_sigma2[p], d_yldfac[p],d_cohes[p], d_phi[p], d_neta[p], 
-		 nzt[p], xlsp[p], xrep[p], yrep[p]-ngsl, yrep[p], stream_2, p);
-	      update_yldfac_buffer_y_H(d_yldfac[p], d_SF_yldfac[p], d_SB_yldfac[p], nxt[p], nzt[p], 
-		 stream_1, stream_2, y_rank_F, y_rank_B, p);
-           }
-           CUCHK(cudaStreamSynchronize(stream_1));
-           CUCHK(cudaStreamSynchronize(stream_2));
-
-           for (p=0; p<ngrids; p++){
-	      Cpy2Host_yldfac_Y(d_yldfac[p],  SF_yldfac[p], SB_yldfac[p], d_SF_yldfac[p], d_SB_yldfac[p], 
-		   nxt[p], nzt[p], stream_1, stream_2, y_rank_F, y_rank_B, p);
-           }
-
-	   /*compute Stress in remaining part of inner region*/
-           for (p=0; p<ngrids; p++){
-	      dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
-			d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
-			d_u1[p], d_v1[p], d_w1[p], d_lam[p],
-			d_mu[p], d_qp[p],d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
-			nyt[p],  nzt[p],  stream_i, d_lam_mu[p],
-			d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
-			NX*grdfct[p], NPC,  coord[0], coord[1],   xss2[p],  xse2[p],
-			yls2[p], yre2[p], p);
-           }
- 
-           CUCHK(cudaStreamSynchronize(stream_1));
-           CUCHK(cudaStreamSynchronize(stream_2));
-           for (p=0; p<ngrids; p++){
-	      PostSendMsg_Y(SF_yldfac[p], SB_yldfac[p], MCW, request_y_yldfac[p], &count_y_yldfac[p], 
-		 yldfac_msg_size_y[p], y_rank_F, y_rank_B, rank, Both, p);
-	      MPI_Waitall(count_y_yldfac[p], request_y_yldfac[p], status_y_yldfac[p]);
-
-	      //cudaStreamSynchronize(stream_i);
-	      //left and right
-	      drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
-		 d_sigma2[p], d_yldfac[p],d_cohes[p], d_phi[p], d_neta[p], 
-		 nzt[p], xlsp[p], xlsp[p]+ngsl, ylsp[p], yrep[p], stream_1, p);
-	      drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
-		 d_sigma2[p], d_yldfac[p],d_cohes[p], d_phi[p], d_neta[p], 
-		 nzt[p], xrep[p]-ngsl, xrep[p], ylsp[p], yrep[p], stream_2, p);
-
-	      Cpy2Device_yldfac_Y(d_yldfac[p], RF_yldfac[p], RB_yldfac[p], d_RF_yldfac[p], d_RB_yldfac[p], 
-                         nxt[p], nzt[p], stream_1, stream_2, y_rank_F, y_rank_B, p);
-	      update_yldfac_buffer_x_H(d_yldfac[p], d_SL_yldfac[p], d_SR_yldfac[p], nyt[p], nzt[p], 
-                 stream_1, stream_2, x_rank_L, x_rank_R, p);
-	   }
-
-           CUCHK(cudaDeviceSynchronize());
-           
-           for (p=0; p<ngrids; p++){
-	      Cpy2Host_yldfac_X(d_yldfac[p],  SL_yldfac[p], SR_yldfac[p], d_SL_yldfac[p], d_SR_yldfac[p], 
-		   nyt[p], nzt[p], stream_1, stream_2, x_rank_L, x_rank_R, p);
-
-	      //compute yield factor in inside of subdomain
-	      drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
-		 d_sigma2[p], d_yldfac[p],d_cohes[p], d_phi[p], d_neta[p], 
-		 nzt[p], xlsp[p]+ngsl, xrep[p]-ngsl, ylsp[p]+ngsl, yrep[p]-ngsl, stream_i, p); /*xrep-ngsl*/
-           }
-
-           //dump_variable(d_yldfac, nel, "yldfac", 'u', cur_step, 0, rank, size);
-
-           //dump_variable(d_yldfac, nel, "yldfac", 'u', cur_step, 1, rank, size);
-
-           CUCHK(cudaStreamSynchronize(stream_1));
-           CUCHK(cudaStreamSynchronize(stream_2));
-           //cudaStreamSynchronize(stream_2b);
-           //cudaDeviceSynchronize();
-           for (p=0; p<ngrids; p++){
-	      PostSendMsg_X(SL_yldfac[p], SR_yldfac[p], MCW, request_x_yldfac[p], &count_x_yldfac[p], 
-		 yldfac_msg_size_x[p], x_rank_L, x_rank_R, rank, Both, p);
-	      MPI_Waitall(count_x_yldfac[p], request_x_yldfac[p], status_x_yldfac[p]);
-	      Cpy2Device_yldfac_X(d_yldfac[p], RL_yldfac[p], RR_yldfac[p], d_RL_yldfac[p], d_RR_yldfac[p], 
-		 nyt[p], nzt[p], stream_1, stream_2, x_rank_L, x_rank_R, p);
-           }
-
-           //wait until all streams have completed, including stream_i working on the inside part
-           CUCHK(cudaDeviceSynchronize());
-           //dump_variable(d_yldfac, nel, "yldfac", 'u', cur_step, 2, rank, size);
-
-           for (p=0; p<ngrids; p++){
-	      drprecpc_app_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p],
-		 d_sigma2[p], d_yldfac[p], 
-		 nzt[p], xlsp[p], xrep[p], ylsp[p], yrep[p], stream_i, p);
-           }
-         }
+            // plasticity related calls:
+            if (NVE == 3)
+            {
+               CUCHK(cudaDeviceSynchronize());
+
+               //cudaStreamSynchronize(stream_i);
+               for (p = 0; p < ngrids; p++)
+               {
+                  PostRecvMsg_Y(RF_yldfac[p], RB_yldfac[p], MCW, request_y_yldfac[p], &count_y_yldfac[p],
+                                yldfac_msg_size_y[p], y_rank_F, y_rank_B, p);
+                  PostRecvMsg_X(RL_yldfac[p], RR_yldfac[p], MCW, request_x_yldfac[p], &count_x_yldfac[p],
+                                yldfac_msg_size_x[p], x_rank_L, x_rank_R, p);
+
+                  //yield factor computation, front and back
+                  drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
+                                      d_sigma2[p], d_yldfac[p], d_cohes[p], d_phi[p], d_neta[p],
+                                      nzt[p], xlsp[p], xrep[p], ylsp[p], ylsp[p] + ngsl, stream_1, p);
+                  drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
+                                      d_sigma2[p], d_yldfac[p], d_cohes[p], d_phi[p], d_neta[p],
+                                      nzt[p], xlsp[p], xrep[p], yrep[p] - ngsl, yrep[p], stream_2, p);
+                  update_yldfac_buffer_y_H(d_yldfac[p], d_SF_yldfac[p], d_SB_yldfac[p], nxt[p], nzt[p],
+                                           stream_1, stream_2, y_rank_F, y_rank_B, p);
+               }
+               CUCHK(cudaStreamSynchronize(stream_1));
+               CUCHK(cudaStreamSynchronize(stream_2));
+
+               for (p = 0; p < ngrids; p++)
+               {
+                  Cpy2Host_yldfac_Y(d_yldfac[p], SF_yldfac[p], SB_yldfac[p], d_SF_yldfac[p], d_SB_yldfac[p],
+                                    nxt[p], nzt[p], stream_1, stream_2, y_rank_F, y_rank_B, p);
+               }
+
+               /*compute Stress in remaining part of inner region*/
+               for (p = 0; p < ngrids; p++)
+               {
+                  dstrqc_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                           d_r1[p], d_r2[p], d_r3[p], d_r4[p], d_r5[p], d_r6[p],
+                           d_u1[p], d_v1[p], d_w1[p], d_lam[p],
+                           d_mu[p], d_qp[p], d_coeff, d_qs[p], d_dcrjx[p], d_dcrjy[p], d_dcrjz[p],
+                           nyt[p], nzt[p], stream_i, d_lam_mu[p],
+                           d_vx1[p], d_vx2[p], d_ww[p], d_wwo[p],
+                           NX * grdfct[p], NPC, coord[0], coord[1], xss2[p], xse2[p],
+                           yls2[p], yre2[p], p);
+               }
+
+               CUCHK(cudaStreamSynchronize(stream_1));
+               CUCHK(cudaStreamSynchronize(stream_2));
+               for (p = 0; p < ngrids; p++)
+               {
+                  PostSendMsg_Y(SF_yldfac[p], SB_yldfac[p], MCW, request_y_yldfac[p], &count_y_yldfac[p],
+                                yldfac_msg_size_y[p], y_rank_F, y_rank_B, rank, Both, p);
+                  MPI_Waitall(count_y_yldfac[p], request_y_yldfac[p], status_y_yldfac[p]);
+
+                  //cudaStreamSynchronize(stream_i);
+                  //left and right
+                  drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
+                                      d_sigma2[p], d_yldfac[p], d_cohes[p], d_phi[p], d_neta[p],
+                                      nzt[p], xlsp[p], xlsp[p] + ngsl, ylsp[p], yrep[p], stream_1, p);
+                  drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
+                                      d_sigma2[p], d_yldfac[p], d_cohes[p], d_phi[p], d_neta[p],
+                                      nzt[p], xrep[p] - ngsl, xrep[p], ylsp[p], yrep[p], stream_2, p);
+
+                  Cpy2Device_yldfac_Y(d_yldfac[p], RF_yldfac[p], RB_yldfac[p], d_RF_yldfac[p], d_RB_yldfac[p],
+                                      nxt[p], nzt[p], stream_1, stream_2, y_rank_F, y_rank_B, p);
+                  update_yldfac_buffer_x_H(d_yldfac[p], d_SL_yldfac[p], d_SR_yldfac[p], nyt[p], nzt[p],
+                                           stream_1, stream_2, x_rank_L, x_rank_R, p);
+               }
+
+               CUCHK(cudaDeviceSynchronize());
+
+               for (p = 0; p < ngrids; p++)
+               {
+                  Cpy2Host_yldfac_X(d_yldfac[p], SL_yldfac[p], SR_yldfac[p], d_SL_yldfac[p], d_SR_yldfac[p],
+                                    nyt[p], nzt[p], stream_1, stream_2, x_rank_L, x_rank_R, p);
+
+                  //compute yield factor in inside of subdomain
+                  drprecpc_calc_H_opt(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p], d_d1[p],
+                                      d_sigma2[p], d_yldfac[p], d_cohes[p], d_phi[p], d_neta[p],
+                                      nzt[p], xlsp[p] + ngsl, xrep[p] - ngsl, ylsp[p] + ngsl, yrep[p] - ngsl, stream_i, p); /*xrep-ngsl*/
+               }
+
+               //dump_variable(d_yldfac, nel, "yldfac", 'u', cur_step, 0, rank, size);
+
+               //dump_variable(d_yldfac, nel, "yldfac", 'u', cur_step, 1, rank, size);
+
+               CUCHK(cudaStreamSynchronize(stream_1));
+               CUCHK(cudaStreamSynchronize(stream_2));
+               //cudaStreamSynchronize(stream_2b);
+               //cudaDeviceSynchronize();
+               for (p = 0; p < ngrids; p++)
+               {
+                  PostSendMsg_X(SL_yldfac[p], SR_yldfac[p], MCW, request_x_yldfac[p], &count_x_yldfac[p],
+                                yldfac_msg_size_x[p], x_rank_L, x_rank_R, rank, Both, p);
+                  MPI_Waitall(count_x_yldfac[p], request_x_yldfac[p], status_x_yldfac[p]);
+                  Cpy2Device_yldfac_X(d_yldfac[p], RL_yldfac[p], RR_yldfac[p], d_RL_yldfac[p], d_RR_yldfac[p],
+                                      nyt[p], nzt[p], stream_1, stream_2, x_rank_L, x_rank_R, p);
+               }
+
+               //wait until all streams have completed, including stream_i working on the inside part
+               CUCHK(cudaDeviceSynchronize());
+               //dump_variable(d_yldfac, nel, "yldfac", 'u', cur_step, 2, rank, size);
+
+               for (p = 0; p < ngrids; p++)
+               {
+                  drprecpc_app_H(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p], d_mu[p],
+                                 d_sigma2[p], d_yldfac[p],
+                                 nzt[p], xlsp[p], xrep[p], ylsp[p], yrep[p], stream_i, p);
+               }
+            }
 
-         //update source input
-         /*if ((IFAULT < 4) && (cur_step<NST)) {
+            //update source input
+            /*if ((IFAULT < 4) && (cur_step<NST)) {
             CUCHK(cudaDeviceSynchronize());
             ++source_step;
             for (p=0; p<ngrids; p++){ 
@@ -2229,66 +2555,117 @@ rank, READ_STEP, READ_STEP_GPU, NST, IFAULT);
             }
          }*/
 
-         CUCHK(cudaDeviceSynchronize());
-         if (cur_step < -1){
-            for (p=0; p<ngrids; p++)
-   	       //print_nonzero_H(d_xx[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), p);
-   	       print_nonzero_mat_H(d_xx[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), p, 
-                 d_d1[p], d_mu[p], d_lam[p], d_qp[p], d_qs[p], rank);
-         }
- 
-         //apply free surface boundary conditions (Daniel)
-         CUCHK(cudaDeviceSynchronize());
+            CUCHK(cudaDeviceSynchronize());
+            if (cur_step < -1)
+            {
+               for (p = 0; p < ngrids; p++)
+                  //print_nonzero_H(d_xx[p], (nxt[p]+4+ngsl2), (nyt[p]+4+ngsl2), (nzt[p]+2*align), p);
+                  print_nonzero_mat_H(d_xx[p], (nxt[p] + 4 + ngsl2), (nyt[p] + 4 + ngsl2), (nzt[p] + 2 * align), p,
+                                      d_d1[p], d_mu[p], d_lam[p], d_qp[p], d_qs[p], rank);
+            }
 
-         fstr_H(d_zz[0], d_xz[0], d_yz[0], stream_i, xls[0], xre[0], yls[0], yre[0]);
-         CUCHK(cudaDeviceSynchronize());
+            //apply free surface boundary conditions (Daniel)
+            CUCHK(cudaDeviceSynchronize());
 
-         for (p=0; p<ngrids; p++) {
-                receivers_write(d_u1[p], d_v1[p], d_w1[p], cur_step, nt, p);
-                sgt_write(d_xx[p], d_yy[p], d_zz[p], d_xy[0], d_xz[p], d_yz[p],
-                          cur_step, nt, p);
-         }
+            fstr_H(d_zz[0], d_xz[0], d_yz[0], stream_i, xls[0], xre[0], yls[0], yre[0]);
+            
+
+               if (usemms) {
+                       float t = DT * (cur_step - 1) + 0.5 * DT;
+                       for (p = 0; p < ngrids; p++) {
+                               mms_force_stress(d_xx[p], d_yy[p], d_zz[p],
+                                                d_xy[p], d_xz[p], d_yz[p],
+                                                nxt[p], nyt[p], nzt[p], DH[p],
+                                                coord[0], coord[1], p,
+                                                t + 0.5 * DT, DT);
+                       }
+
+                        p = ngrids - 1;
+
+                        // Exact solution at bottom boundary
+                        //mms_exact_stress(
+                        //    d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p],
+                        //    d_yz[p], nxt[p], nyt[p], nzt[p], coord[0], coord[1],
+                        //    p, 2 + ngsl, 2 + ngsl, 8, 2 + ngsl + nxt[p],
+                        //    2 + ngsl + nyt[p], nzt[p] - 8, DH[p], t, 0);
+
+                        //// Exact solution at top boundary
+                        //mms_exact_stress(
+                        //    d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p],
+                        //    d_yz[p], nxt[p], nyt[p], nzt[p], coord[0], coord[1],
+                        //    p, 0 , 0, nzt[p] - 8, 4 + 2 * ngsl + nxt[p],
+                        //    4 + 2 * ngsl + nyt[p], nzt[p], DH[p], t);
+
+                        //mms_exact_stress(
+                        //    d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p],
+                        //    d_yz[p], nxt[p], nyt[p], nzt[p], coord[0], coord[1],
+                        //    p, 50 , 50, 16, 4 + 2 * ngsl + nxt[p] - 50,
+                        //    4 + 2 * ngsl + nyt[p] - 20, nzt[p] - 16, DH[p], t, 0);
+
+                        CUCHK(cudaDeviceSynchronize());
+               }
+            CUCHK(cudaDeviceSynchronize());
+
+            for (p = 0; p < ngrids; p++)
+            {
+               receivers_write(d_u1[p], d_v1[p], d_w1[p], cur_step, nt, p);
+               sgt_write(d_xx[p], d_yy[p], d_zz[p], d_xy[p], d_xz[p], d_yz[p],
+                         cur_step, nt, p);
+            }
 
-         if(cur_step%NTISKP == 0){
-          #ifndef SEISMIO
+#if TOPO
+   energy_rate(&energy, cur_step, d_u1[0], d_v1[0], d_w1[0], d_xx[0], d_yy[0], d_zz[0], d_xy[0], d_xz[0], d_yz[0], d_d1[0], d_mu[0], d_lam[0], &T.metrics_f, &T.metrics_g, nxt[0], nyt[0], nzt[0], rank, MCW);
+#endif
 
-          for (p=0; p<ngrids; p++){
-             if (grid_output[p]){
-		if(!rank) time_gpuio_tmp = -gethrtime();
-                velbuffer_H(d_u1[p], d_v1[p], d_w1[p], d_neta[p], 
-                    d_Bufx[p], d_Bufy[p], d_Bufz[p], d_Bufeta[p], NVE,
-                    rec_nbgx[p], rec_nedx[p], NSKPX[p], rec_nbgy[p], rec_nedy[p], NSKPY[p], 
-                    rec_nbgz[p], rec_nedz[p], NSKPZ[p], rec_nxt[p], rec_nyt[p], rec_nzt[p], 
-                    stream_i, FOLLOWBATHY, d_bathy, p);
-		cudaStreamSynchronize(stream_i);
+//#define TOPO_USE_VTK 1
+//            if (cur_step % 10 == 0)
+//            topo_write_vtk(&T, cur_step, 1);
 
-		/*num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
+            if (cur_step % NTISKP == 0)
+            {
+#ifndef SEISMIO
+
+               for (p = 0; p < ngrids; p++)
+               {
+                  if (grid_output[p])
+                  {
+                     if (!rank)
+                        time_gpuio_tmp = -gethrtime();
+                     velbuffer_H(d_u1[p], d_v1[p], d_w1[p], d_neta[p],
+                                 d_Bufx[p], d_Bufy[p], d_Bufz[p], d_Bufeta[p], NVE,
+                                 rec_nbgx[p], rec_nedx[p], NSKPX[p], rec_nbgy[p], rec_nedy[p], NSKPY[p],
+                                 rec_nbgz[p], rec_nedz[p], NSKPZ[p], rec_nxt[p], rec_nyt[p], rec_nzt[p],
+                                 stream_i, FOLLOWBATHY, d_bathy, p);
+                     cudaStreamSynchronize(stream_i);
+
+                     /*num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
 		CUCHK(cudaMemcpy(&u1[p][0][0][0],d_u1[p],num_bytes,cudaMemcpyDeviceToHost));
 		CUCHK(cudaMemcpy(&v1[p][0][0][0],d_v1[p],num_bytes,cudaMemcpyDeviceToHost));
 		CUCHK(cudaMemcpy(&w1[p][0][0][0],d_w1[p],num_bytes,cudaMemcpyDeviceToHost));
 		//added for plasticity
 		if (NVE == 3) 
 		   CUCHK(cudaMemcpy(&neta[p][0][0][0],d_neta[p],num_bytes,cudaMemcpyDeviceToHost));*/
-		if(!rank){
-		  //cudaStreamSynchronize(stream_i);
-		  time_gpuio_tmp += gethrtime();
-		  time_gpuio += time_gpuio_tmp;
+                     if (!rank)
+                     {
+                        //cudaStreamSynchronize(stream_i);
+                        time_gpuio_tmp += gethrtime();
+                        time_gpuio += time_gpuio_tmp;
 #if VERBOSE
-		  printf("Output data buffered on GPU in (sec): %lf\n",time_gpuio_tmp);
+                        printf("Output data buffered on GPU in (sec): %lf\n", time_gpuio_tmp);
 #endif
-		  //printf("Output data copied to host in (sec): %lf\n",time_gpuio_tmp);
-		  time_gpuio_tmp = -gethrtime();
-		}
-		idtmp = ((cur_step/NTISKP+WRITE_STEP-1)%WRITE_STEP);
-		idtmp = idtmp*rec_nxt[p]*rec_nyt[p]*rec_nzt[p];
-                num_bytes = sizeof(float)*rec_nxt[p]*rec_nyt[p]*rec_nzt[p];
-                CUCHK(cudaStreamSynchronize(stream_o));
-                CUCHK(cudaMemcpyAsync(Bufx[p]+idtmp,d_Bufx[p],num_bytes,cudaMemcpyDeviceToHost, stream_o));
-		CUCHK(cudaMemcpyAsync(Bufy[p]+idtmp,d_Bufy[p],num_bytes,cudaMemcpyDeviceToHost, stream_o));
-		CUCHK(cudaMemcpyAsync(Bufz[p]+idtmp,d_Bufz[p],num_bytes,cudaMemcpyDeviceToHost, stream_o));
-                if (NVE==3) 
-                   CUCHK(cudaMemcpyAsync(Bufeta[p]+idtmp,d_Bufeta[p],num_bytes,cudaMemcpyDeviceToHost, stream_o))
-		/*tmpInd = idtmp;
+                        //printf("Output data copied to host in (sec): %lf\n",time_gpuio_tmp);
+                        time_gpuio_tmp = -gethrtime();
+                     }
+                     idtmp = ((cur_step / NTISKP + WRITE_STEP - 1) % WRITE_STEP);
+                     idtmp = idtmp * rec_nxt[p] * rec_nyt[p] * rec_nzt[p];
+                     num_bytes = sizeof(float) * rec_nxt[p] * rec_nyt[p] * rec_nzt[p];
+                     CUCHK(cudaStreamSynchronize(stream_o));
+                     CUCHK(cudaMemcpyAsync(Bufx[p] + idtmp, d_Bufx[p], num_bytes, cudaMemcpyDeviceToHost, stream_o));
+                     CUCHK(cudaMemcpyAsync(Bufy[p] + idtmp, d_Bufy[p], num_bytes, cudaMemcpyDeviceToHost, stream_o));
+                     CUCHK(cudaMemcpyAsync(Bufz[p] + idtmp, d_Bufz[p], num_bytes, cudaMemcpyDeviceToHost, stream_o));
+                     if (NVE == 3)
+                        CUCHK(cudaMemcpyAsync(Bufeta[p] + idtmp, d_Bufeta[p], num_bytes, cudaMemcpyDeviceToHost, stream_o))
+                     /*tmpInd = idtmp;
 		for(i=2+ngsl + rec_nbgx[p]; i<=2+ngsl + rec_nedx[p]; i+=NSKPX[p])
 		  for(j=2+ngsl + rec_nbgy[p]; j<=2+ngsl + rec_nedy[p]; j+=NSKPY[p])
 		    for(k=rec_nbgz[p]; k<=rec_nedz[p]; k=k+NSKPZ[p])
@@ -2319,490 +2696,537 @@ rank, READ_STEP, READ_STEP_GPU, NST, IFAULT);
 
 		      tmpInd++;
 		    }*/
-		if(!rank){
-		   time_gpuio_tmp += gethrtime();
-		   time_gpuio += time_gpuio_tmp;
+                     if (!rank)
+                     {
+                        time_gpuio_tmp += gethrtime();
+                        time_gpuio += time_gpuio_tmp;
 #if VERBOSE
-		   printf("Output data copied to host in (sec): %lf\n",time_gpuio_tmp);
+                        printf("Output data copied to host in (sec): %lf\n", time_gpuio_tmp);
 #endif
-		   //printf("Output data buffered in (sec): %lf\n",time_gpuio_tmp);
-		}
-
-		if((cur_step/NTISKP)%WRITE_STEP == 0){
-		  CUCHK(cudaDeviceSynchronize());
-                  #ifndef NOBGIO
-                  outsize=rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP;
-                  time(&time1);
-                  MPI_Send(Bufx[p], outsize, MPI_FLOAT, rank+2*size, MPIRANKIO+30, MPI_COMM_WORLD);
-                  time(&time2);
-                  if (rank==0 && p==0)
-                     fprintf(stdout, "Wait time for sending output (): %5.f seconds.\n", difftime(time2, time1));
-                  MPI_Send(Bufy[p], outsize, MPI_FLOAT, rank+2*size, MPIRANKIO+31, MPI_COMM_WORLD);
-                  MPI_Send(Bufz[p], outsize, MPI_FLOAT, rank+2*size, MPIRANKIO+32, MPI_COMM_WORLD);
-                  if (NVE ==3) 
-                    MPI_Send(Bufeta[p], outsize, MPI_FLOAT, rank+2*size, MPIRANKIO+33, MPI_COMM_WORLD);
-                  #else
-		  sprintf(filename, "%s_%1d_%07ld", filenamebasex, p, cur_step);
-		  err = MPI_File_open(MCW,filename,MPI_MODE_CREATE|MPI_MODE_WRONLY,MPI_INFO_NULL,&fh);
-                  //error_check(err, "MPI_File_open X");
-		  err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
-                  //error_check(err, "MPI_File_set_view X");
-		  err = MPI_File_write_all(fh, Bufx[p], rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP, MPI_FLOAT, &filestatus);
-                  //error_check(err, "MPI_File_write X");
-
-		  err = MPI_File_close(&fh);
-                  //error_check(err, "MPI_File_close X");
-
-		  sprintf(filename, "%s_%1d_%07ld", filenamebasey, p, cur_step);
-		  err = MPI_File_open(MCW,filename,MPI_MODE_CREATE|MPI_MODE_WRONLY,MPI_INFO_NULL,&fh);
-		  err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
-		  err = MPI_File_write_all(fh, Bufy[p], rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP, MPI_FLOAT, &filestatus);
-		  err = MPI_File_close(&fh);
-		  sprintf(filename, "%s_%1d_%07ld", filenamebasez, p, cur_step);
-		  err = MPI_File_open(MCW,filename,MPI_MODE_CREATE|MPI_MODE_WRONLY,MPI_INFO_NULL,&fh);
-		  err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
-		  err = MPI_File_write_all(fh, Bufz[p], rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP, MPI_FLOAT, &filestatus);
-		  err = MPI_File_close(&fh);
-		  //saves the plastic shear work
-		  if (NVE == 3) {
-		     sprintf(filename, "%s_%1d_%07ld", filenamebaseeta, p, cur_step);
-		     err = MPI_File_open(MCW,filename,MPI_MODE_CREATE|MPI_MODE_WRONLY,MPI_INFO_NULL,&fh);
-		     err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
-		     err = MPI_File_write_all(fh, Bufeta[p], rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*WRITE_STEP, MPI_FLOAT, &filestatus);
-		     err = MPI_File_close(&fh);
-		  }
-                  #endif
-		}
-          }
-          //else 
-            //cudaDeviceSynchronize();
-          #else
-          for (p=0; p<ngrids; p++){
-	     num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-	     if(!rank && p==0) time_gpuio_tmp = -gethrtime();
-	     CUCHK(cudaMemcpy(&u1[p][0][0][0],d_u1[p],num_bytes,cudaMemcpyDeviceToHost));
-	     CUCHK(cudaMemcpy(&v1[p][0][0][0],d_v1[p],num_bytes,cudaMemcpyDeviceToHost));
-	     CUCHK(cudaMemcpy(&w1[p][0][0][0],d_w1[p],num_bytes,cudaMemcpyDeviceToHost));
-	     //added for plasticity
-	     if (NVE == 3) 
-		CUCHK(cudaMemcpy(&neta[p][0][0][0],d_neta[p],num_bytes,cudaMemcpyDeviceToHost));
-
-	     num_bytes = sizeof(float)*(nxt[p])*(nyt[p])*(nzt[p]);
-             Bufx[0]=(float*) malloc(num_bytes);
-             Bufy[0]=(float*) malloc(num_bytes);
-             Bufz[0]=(float*) malloc(num_bytes);
-             Bufeta[0]=(float*) malloc(num_bytes);
-
-	     tmpInd = 0;
-	     for(k=nzt[p]+align-1; k>=align; k--) {
-		for(j=2+ngsl; j<2+ngsl+nyt[p]; j++) {
-		   for(i=2+ngsl; i<2+ngsl+nxt[p]; i++) {
-		      Bufx[0][tmpInd] = u1[p][i][j][k];
-		      Bufy[0][tmpInd] = v1[p][i][j][k];
-		      Bufz[0][tmpInd] = w1[p][i][j][k];
-		      if (NVE == 3) Bufeta[0][tmpInd] = neta[p][i][j][k];
-		      tmpInd++;
-		   }
-		}
-	     }
- 
-             /*seism_write(&seism_filex[p], &u1[p][0][0][0], &err);
+                        //printf("Output data buffered in (sec): %lf\n",time_gpuio_tmp);
+                     }
+
+                     if ((cur_step / NTISKP) % WRITE_STEP == 0)
+                     {
+                        CUCHK(cudaDeviceSynchronize());
+#ifndef NOBGIO
+                        outsize = rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP;
+                        time(&time1);
+                        MPI_Send(Bufx[p], outsize, MPI_FLOAT, rank + 2 * size, MPIRANKIO + 30, MPI_COMM_WORLD);
+                        time(&time2);
+                        if (rank == 0 && p == 0)
+                           fprintf(stdout, "Wait time for sending output (): %5.f seconds.\n", difftime(time2, time1));
+                        MPI_Send(Bufy[p], outsize, MPI_FLOAT, rank + 2 * size, MPIRANKIO + 31, MPI_COMM_WORLD);
+                        MPI_Send(Bufz[p], outsize, MPI_FLOAT, rank + 2 * size, MPIRANKIO + 32, MPI_COMM_WORLD);
+                        if (NVE == 3)
+                           MPI_Send(Bufeta[p], outsize, MPI_FLOAT, rank + 2 * size, MPIRANKIO + 33, MPI_COMM_WORLD);
+#else
+                        sprintf(filename, "%s_%1d_%07ld", filenamebasex, p, cur_step);
+                        err = MPI_File_open(MCW, filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+                        //error_check(err, "MPI_File_open X");
+                        err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
+                        //error_check(err, "MPI_File_set_view X");
+                        err = MPI_File_write_all(fh, Bufx[p], rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP, MPI_FLOAT, &filestatus);
+                        //error_check(err, "MPI_File_write X");
+
+                        err = MPI_File_close(&fh);
+                        //error_check(err, "MPI_File_close X");
+
+                        sprintf(filename, "%s_%1d_%07ld", filenamebasey, p, cur_step);
+                        err = MPI_File_open(MCW, filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+                        err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
+                        err = MPI_File_write_all(fh, Bufy[p], rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP, MPI_FLOAT, &filestatus);
+                        err = MPI_File_close(&fh);
+                        sprintf(filename, "%s_%1d_%07ld", filenamebasez, p, cur_step);
+                        err = MPI_File_open(MCW, filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+                        err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
+                        err = MPI_File_write_all(fh, Bufz[p], rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP, MPI_FLOAT, &filestatus);
+                        err = MPI_File_close(&fh);
+                        //saves the plastic shear work
+                        if (NVE == 3)
+                        {
+                           sprintf(filename, "%s_%1d_%07ld", filenamebaseeta, p, cur_step);
+                           err = MPI_File_open(MCW, filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+                           err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype[p], "native", MPI_INFO_NULL);
+                           err = MPI_File_write_all(fh, Bufeta[p], rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * WRITE_STEP, MPI_FLOAT, &filestatus);
+                           err = MPI_File_close(&fh);
+                        }
+#endif
+                     }
+                  }
+//else
+//cudaDeviceSynchronize();
+#else
+            for (p = 0; p < ngrids; p++)
+            {
+               num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+               if (!rank && p == 0)
+                  time_gpuio_tmp = -gethrtime();
+               CUCHK(cudaMemcpy(&u1[p][0][0][0], d_u1[p], num_bytes, cudaMemcpyDeviceToHost));
+               CUCHK(cudaMemcpy(&v1[p][0][0][0], d_v1[p], num_bytes, cudaMemcpyDeviceToHost));
+               CUCHK(cudaMemcpy(&w1[p][0][0][0], d_w1[p], num_bytes, cudaMemcpyDeviceToHost));
+               //added for plasticity
+               if (NVE == 3)
+                  CUCHK(cudaMemcpy(&neta[p][0][0][0], d_neta[p], num_bytes, cudaMemcpyDeviceToHost));
+
+               num_bytes = sizeof(float) * (nxt[p]) * (nyt[p]) * (nzt[p]);
+               Bufx[0] = (float *)malloc(num_bytes);
+               Bufy[0] = (float *)malloc(num_bytes);
+               Bufz[0] = (float *)malloc(num_bytes);
+               Bufeta[0] = (float *)malloc(num_bytes);
+
+               tmpInd = 0;
+               for (k = nzt[p] + align - 1; k >= align; k--)
+               {
+                  for (j = 2 + ngsl; j < 2 + ngsl + nyt[p]; j++)
+                  {
+                     for (i = 2 + ngsl; i < 2 + ngsl + nxt[p]; i++)
+                     {
+                        Bufx[0][tmpInd] = u1[p][i][j][k];
+                        Bufy[0][tmpInd] = v1[p][i][j][k];
+                        Bufz[0][tmpInd] = w1[p][i][j][k];
+                        if (NVE == 3)
+                           Bufeta[0][tmpInd] = neta[p][i][j][k];
+                        tmpInd++;
+                     }
+                  }
+               }
+
+               /*seism_write(&seism_filex[p], &u1[p][0][0][0], &err);
              seism_write(&seism_filey[p], &v1[p][0][0][0], &err);
              seism_write(&seism_filez[p], &w1[p][0][0][0], &err);
              if (NVE == 3) seism_write(&seism_fileeta[p], &neta[p][0][0][0], &err);*/
 
-             seism_write(&seism_filex[p], Bufx[0], &err);
-             seism_write(&seism_filey[p], Bufy[0], &err);
-             seism_write(&seism_filez[p], Bufz[0], &err);
-             if (NVE == 3) seism_write(&seism_fileeta[p], &neta[p][0][0][0], &err);
-             
-             free(Bufx[0]);
-             free(Bufy[0]);
-             free(Bufz[0]);
-             if (NVE == 3) free(Bufeta[0]);
-          }
-          #endif
-
-          // write-statistics to chk file:
-          if(rank==0){
-            if (NPC < 2){ /* for periodic BCs, ND may be larger than nxt - Daniel */
-	       i = ND+2+ngsl;
-	       j = i;
-            }
-            else i = j = 2+ngsl;
-            
-            k = nzt[0]+align-1-ND;
-            fprintf(fchk,"%ld :\t%e\t%e\t%e\n",cur_step,u1[0][i][j][k],v1[0][i][j][k],w1[0][i][j][k]);
-            fflush(fchk);
-          }
+               seism_write(&seism_filex[p], Bufx[0], &err);
+               seism_write(&seism_filey[p], Bufy[0], &err);
+               seism_write(&seism_filez[p], Bufz[0], &err);
+               if (NVE == 3)
+                  seism_write(&seism_fileeta[p], &neta[p][0][0][0], &err);
+
+               free(Bufx[0]);
+               free(Bufy[0]);
+               free(Bufz[0]);
+               if (NVE == 3)
+                  free(Bufeta[0]);
+            }
+#endif
+
+                  // write-statistics to chk file:
+                  if (rank == 0)
+                  {
+                     if (NPC < 2)
+                     { /* for periodic BCs, ND may be larger than nxt - Daniel */
+                        i = ND + 2 + ngsl;
+                        j = i;
+                     }
+                     else
+                        i = j = 2 + ngsl;
+
+                     k = nzt[0] + align - 1 - ND;
+                     fprintf(fchk, "%ld :\t%e\t%e\t%e\n", cur_step, u1[0][i][j][k], v1[0][i][j][k], w1[0][i][j][k]);
+                     fflush(fchk);
+                  }
+               }
+            }
+            //else
+            //cudaDeviceSynchronize();
+
+            if ((cur_step < (NST * fbc_tskp) - 1) && (IFAULT >= 2) &&
+                ((cur_step + 1) % (READ_STEP_GPU * fbc_tskp) == 0))
+            {
+               printf("%d) Read new source from CPU.\n", rank);
+               if ((cur_step + 1) % (READ_STEP * fbc_tskp) == 0)
+               {
+                  printf("%d) Read new source from file.\n", rank);
+                  if (IFAULT == 2)
+                     for (p = 0; p < ngrids; p++)
+                     {
+                        if (rank == srcproc[p])
+                        {
+                           sprintf(insrcgrid, "%s_%d", INSRC, p);
+                           sprintf(insrc_i2_grid, "%s_%d", INSRC_I2, p);
+                           read_src_ifault_2(rank, READ_STEP,
+                                             insrcgrid, insrc_i2_grid,
+                                             maxdim, coord, NZ[p],
+                                             nxt[p], nyt[p], nzt[p],
+                                             npsrc + p, srcproc + p,
+                                             tpsrc + p, taxx + p, tayy + p, tazz + p,
+                                             taxz + p, tayz + p, taxy + p, (cur_step + 1) / READ_STEP + 1);
+                        }
+                     }
+                  else if ((IFAULT == 4) && (rank == srcproc[0]))
+                  {
+                     read_src_ifault_4(rank, READ_STEP,
+                                       INSRC, maxdim, coord, NZ[0],
+                                       nxt[0], nyt[0], nzt[0],
+                                       npsrc, srcproc,
+                                       tpsrc, taxx, tayy, tazz, cur_step + 2,
+                                       fbc_ext, fbc_off, fbc_pmask, fbc_extl, fbc_dim,
+                                       &fbc_seismio, &fbc_tskp, NST, size);
+                  }
+               }
+               if (rank == srcproc[0])
+                  printf("%d) SOURCE: taxx,yy,zz:%e,%e,%e\n", rank,
+                         taxx[0][cur_step % READ_STEP], tayy[0][cur_step % READ_STEP], tazz[0][cur_step % READ_STEP]);
+               // Synchronous copy!
+
+               for (p = 0; p < ngrids; p++)
+               {
+                  if (rank == srcproc[p])
+                     Cpy2Device_source(npsrc[p], READ_STEP_GPU,
+                                       (cur_step + 1) % (READ_STEP * fbc_tskp) / fbc_tskp,
+                                       taxx[p], tayy[p], tazz[p],
+                                       taxz[p], tayz[p], taxy[p],
+                                       d_taxx[p], d_tayy[p], d_tazz[p],
+                                       d_taxz[p], d_tayz[p], d_taxy[p], IFAULT);
+               }
+               source_step = 0;
+            }
          }
+         time_un += gethrtime();
+         CUCHK(cudaDeviceSynchronize());
+      }
+
+      if (IFAULT == 5)
+      {
+         for (p = 0; p < ngrids; p++)
+         {
+            if (rank == srcproc[p])
+            {
+               num_bytes = npsrc[p] * sizeof(float);
+               fprintf(stdout, "num_bytes=%ld\n", num_bytes);
+               CUCHK(cudaMemcpy(mom[p], d_mom[p], num_bytes, cudaMemcpyDeviceToHost));
+               for (n = 0; n < npsrc[p]; n++)
+               {
+                  /*fprintf(stdout, "mom[%d]=%e\n", n, mom[p][n]);*/
+                  tmom += mom[p][n];
+               }
+            }
          }
-         //else
-          //cudaDeviceSynchronize();
-
-          if((cur_step<(NST*fbc_tskp)-1) && (IFAULT >= 2) && 
-                   ((cur_step+1)%(READ_STEP_GPU*fbc_tskp)== 0)){
-            printf("%d) Read new source from CPU.\n",rank);
-            if((cur_step+1)%(READ_STEP*fbc_tskp) == 0){
-	       printf("%d) Read new source from file.\n",rank);
-	       if (IFAULT == 2) 
-		  for (p=0; p<ngrids; p++){
-		     if (rank==srcproc[p]) {
-			sprintf(insrcgrid, "%s_%d", INSRC, p);
-			sprintf(insrc_i2_grid, "%s_%d", INSRC_I2, p);
-			read_src_ifault_2(rank, READ_STEP,
-			  insrcgrid, insrc_i2_grid,
-			  maxdim, coord, NZ[p],
-			  nxt[p], nyt[p], nzt[p],
-			  npsrc+p, srcproc+p,
-			  tpsrc+p, taxx+p, tayy+p, tazz+p,
-			  taxz+p, tayz+p, taxy+p, (cur_step+1)/READ_STEP+1);
-		     }
-		 }
-	       else if ((IFAULT == 4) && (rank==srcproc[0])) {
-		  read_src_ifault_4(rank, READ_STEP,
-		    INSRC, maxdim, coord, NZ[0],
-		    nxt[0], nyt[0], nzt[0],
-		    npsrc, srcproc,
-		    tpsrc, taxx, tayy, tazz, cur_step+2,
-		    fbc_ext, fbc_off, fbc_pmask, fbc_extl, fbc_dim, 
-		    &fbc_seismio, &fbc_tskp, NST, size);
-	       }
-            }
-            if (rank==srcproc[0]) printf("%d) SOURCE: taxx,yy,zz:%e,%e,%e\n",rank,
-                taxx[0][cur_step%READ_STEP],tayy[0][cur_step%READ_STEP],tazz[0][cur_step%READ_STEP]);
-            // Synchronous copy!
-            
-            for (p=0; p<ngrids; p++){
-               if (rank == srcproc[p])
-		  Cpy2Device_source(npsrc[p], READ_STEP_GPU,
-		    (cur_step+1) % (READ_STEP*fbc_tskp) / fbc_tskp,
-		    taxx[p], tayy[p], tazz[p],
-		    taxz[p], tayz[p], taxy[p],
-		    d_taxx[p], d_tayy[p], d_tazz[p],
-		    d_taxz[p], d_tayz[p], d_taxy[p], IFAULT);
-            }
-            source_step = 0;
-          }       
-       }
-       time_un += gethrtime();
-       CUCHK(cudaDeviceSynchronize());
-    } 
-
-    if (IFAULT == 5){
-       for (p=0; p<ngrids; p++){
-	  if (rank==srcproc[p]) {
-	     num_bytes = npsrc[p]*sizeof(float);
-	     fprintf(stdout, "num_bytes=%ld\n", num_bytes);
-	     CUCHK(cudaMemcpy(mom[p], d_mom[p], num_bytes, cudaMemcpyDeviceToHost));
-	     for (n=0; n<npsrc[p]; n++) {
-		   /*fprintf(stdout, "mom[%d]=%e\n", n, mom[p][n]);*/
-		   tmom += mom[p][n];
-		}
-	     }
-       }
-       fprintf(stdout, "rank %d: moment=%e\n", rank, tmom);
-       MPI_Allreduce(&tmom, &gmom, 1, MPI_FLOAT, MPI_SUM, MCW);
-       mag= 2./3. * (log10f(gmom) - 9.1);
-       if (rank==0) fprintf(stdout, "Total M0=%e, Mw=%4.1f\n", gmom, mag);
-       //if (rank==0) fprintf(stdout, "moment of source node 19132: %e\n", mom[0][19132]);
-    }
+         fprintf(stdout, "rank %d: moment=%e\n", rank, tmom);
+         MPI_Allreduce(&tmom, &gmom, 1, MPI_FLOAT, MPI_SUM, MCW);
+         mag = 2. / 3. * (log10f(gmom) - 9.1);
+         if (rank == 0)
+            fprintf(stdout, "Total M0=%e, Mw=%4.1f\n", gmom, mag);
+         //if (rank==0) fprintf(stdout, "moment of source node 19132: %e\n", mom[0][19132]);
+      }
 
-    if(rank==0){
-      fprintf(fchk,"END\n");
-      fclose(fchk);
-    }
+      if (rank == 0)
+      {
+         fprintf(fchk, "END\n");
+         fclose(fchk);
+      }
 
-    #ifdef SEISMIO
-    for (p=0; p<ngrids; p++){
-       seism_file_close(seism_filex+p, &err);
-       seism_file_close(seism_filey+p, &err);
-       seism_file_close(seism_filez+p, &err);
-       seism_file_close(seism_fileeta+p, &err);
-    }
-    #endif
-
-    //This should save the final plastic strain tensor at the end of the simulation
-    if (NVE == 3){
-      #ifndef SEISMIO
-      Bufeta = (Grid1D*) calloc(ngrids, sizeof(Grid1D));
-      for (p=0; p<ngrids; p++){
-	 fprintf(stdout, "copying plastic strain back to CPU\n");
-	 num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-	 CUCHK(cudaMemcpy(&neta[p][0][0][0],d_neta[p],num_bytes,cudaMemcpyDeviceToHost));
-	 tmpInd = 0;
-
-	 rec_NZ[p] = (NEDZ_EP-NBGZ[p])/NSKPZ[p]+1;
-	 calcRecordingPoints(&rec_nbgx[p], &rec_nedx[p], &rec_nbgy[p], &rec_nedy[p], 
-	   &rec_nbgz[p], &rec_nedz[p], &rec_nxt[p], &rec_nyt[p], &rec_nzt[p], &displacement[p],
-	   (long int)nxt[p],(long int)nyt[p],(long int)nzt[p], rec_NX[p], rec_NY[p], rec_NZ[p], 
-	   NBGX[p],NEDX[p],NSKPX[p], NBGY[p],NEDY[p],NSKPY[p], NBGZ[p],NEDZ_EP,NSKPZ[p], coord);
-	printf("%d = (%d,%d)) NX,NY,NZ=%d,%d,%d\nnxt,nyt,nzt=%d,%d,%d\nrec_N=(%d,%d,%d)\nrec_nxt,=%d,%d,%d\nNBGX,SKP,END=(%d:%d:%d),(%d:%d:%d),(%d:%d:%d)\nrec_nbg,ed=(%d,%d),(%d,%d),(%d,%d)\ndisp=%ld\n",
-	   rank,coord[0],coord[1],NX*grdfct[p],NY*grdfct[p],nzt[p],nxt[p],nyt[p],nzt[p],
-	   rec_NX[p], rec_NY[p], rec_NZ[p], rec_nxt[p], rec_nyt[p], rec_nzt[p],
-	   NBGX[p],NSKPX[2],NEDX[2],NBGY[2],NSKPY[2],NEDY[2],NBGZ[2],NSKPZ[2],NEDZ_EP,
-	   rec_nbgx[p],rec_nedx[p],rec_nbgy[p],rec_nedy[p],rec_nbgz[p],rec_nedz[p],(long int)displacement[p]);
-
-
-	 /*this should save the final plastic strain down to NEDZ_EP grip points*/
-	 Bufeta2[p] = Alloc1D(rec_nxt[p]*rec_nyt[p]*rec_nzt[p]);
-
-	 for(k=nzt[p]+align-1 - rec_nbgz[p]; k>=nzt[p]+align-1 - rec_nedz[p]; k=k-NSKPZ[p])
-	   for(j=2+ngsl + rec_nbgy[p]; j<=2+ngsl + rec_nedy[p]; j=j+NSKPY[p])
-	     for(i=2+ngsl + rec_nbgx[p]; i<=2+ngsl + rec_nedx[p]; i=i+NSKPX[p]) {
-	       if (tmpInd >= (rec_nxt[p]*rec_nyt[p]*rec_nzt[p])) 
-		  fprintf(stdout, "tmpind=%ld (allocated %d)\n", tmpInd, (rec_nxt[p]*rec_nyt[p]*rec_nzt[p]));
-	       Bufeta2[p][tmpInd] = neta[p][i][j][k];
-	       tmpInd++;
-	     }
-
-	 MPI_Datatype filetype2;
-
-
-       maxNX_NY_NZ_WS = (maxNX_NY_NZ_WS>rec_NZ[p]?maxNX_NY_NZ_WS:rec_NZ[p]);
-       int ones2[maxNX_NY_NZ_WS];
-       MPI_Aint dispArray2[maxNX_NY_NZ_WS];
-       for(i=0;i<maxNX_NY_NZ_WS;++i){
-	 ones2[i] = 1;
-       }
-      
-       err = MPI_Type_contiguous(rec_nxt[p], MPI_FLOAT, &filetype2);
-       err = MPI_Type_commit(&filetype2);
-       for(i=0;i<rec_nyt[p];i++){
-	 dispArray2[i] = sizeof(float);
-	 dispArray2[i] = dispArray2[i]*rec_NX[p]*i;
-       }
-       err = MPI_Type_create_hindexed(rec_nyt[p], ones2, dispArray2, filetype2, &filetype2);
-       err = MPI_Type_commit(&filetype2);
-       for(i=0;i<rec_nzt[p];i++){
-	 dispArray2[i] = sizeof(float);
-	 dispArray2[i] = dispArray2[i]*rec_NY[p]*rec_NX[p]*i;
-       }
-       err = MPI_Type_create_hindexed(rec_nzt[p], ones2, dispArray2, filetype2, &filetype2);
-       err = MPI_Type_commit(&filetype2);
-       MPI_Type_size(filetype2, &tmpSize);
+#ifdef SEISMIO
+      for (p = 0; p < ngrids; p++)
+      {
+         seism_file_close(seism_filex + p, &err);
+         seism_file_close(seism_filey + p, &err);
+         seism_file_close(seism_filez + p, &err);
+         seism_file_close(seism_fileeta + p, &err);
+      }
+#endif
+
+      //This should save the final plastic strain tensor at the end of the simulation
+      if (NVE == 3)
+      {
+#ifndef SEISMIO
+         Bufeta = (Grid1D *)calloc(ngrids, sizeof(Grid1D));
+         for (p = 0; p < ngrids; p++)
+         {
+            fprintf(stdout, "copying plastic strain back to CPU\n");
+            num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+            CUCHK(cudaMemcpy(&neta[p][0][0][0], d_neta[p], num_bytes, cudaMemcpyDeviceToHost));
+            tmpInd = 0;
+
+            rec_NZ[p] = (NEDZ_EP - NBGZ[p]) / NSKPZ[p] + 1;
+            calcRecordingPoints(&rec_nbgx[p], &rec_nedx[p], &rec_nbgy[p], &rec_nedy[p],
+                                &rec_nbgz[p], &rec_nedz[p], &rec_nxt[p], &rec_nyt[p], &rec_nzt[p], &displacement[p],
+                                (long int)nxt[p], (long int)nyt[p], (long int)nzt[p], rec_NX[p], rec_NY[p], rec_NZ[p],
+                                NBGX[p], NEDX[p], NSKPX[p], NBGY[p], NEDY[p], NSKPY[p], NBGZ[p], NEDZ_EP, NSKPZ[p], coord);
+            printf("%d = (%d,%d)) NX,NY,NZ=%d,%d,%d\nnxt,nyt,nzt=%d,%d,%d\nrec_N=(%d,%d,%d)\nrec_nxt,=%d,%d,%d\nNBGX,SKP,END=(%d:%d:%d),(%d:%d:%d),(%d:%d:%d)\nrec_nbg,ed=(%d,%d),(%d,%d),(%d,%d)\ndisp=%ld\n",
+                   rank, coord[0], coord[1], NX * grdfct[p], NY * grdfct[p], nzt[p], nxt[p], nyt[p], nzt[p],
+                   rec_NX[p], rec_NY[p], rec_NZ[p], rec_nxt[p], rec_nyt[p], rec_nzt[p],
+                   NBGX[p], NSKPX[2], NEDX[2], NBGY[2], NSKPY[2], NEDY[2], NBGZ[2], NSKPZ[2], NEDZ_EP,
+                   rec_nbgx[p], rec_nedx[p], rec_nbgy[p], rec_nedy[p], rec_nbgz[p], rec_nedz[p], (long int)displacement[p]);
+
+            /*this should save the final plastic strain down to NEDZ_EP grid points*/
+            Bufeta2[p] = Alloc1D(rec_nxt[p] * rec_nyt[p] * rec_nzt[p]);
+
+            for (k = nzt[p] + align - 1 - rec_nbgz[p]; k >= nzt[p] + align - 1 - rec_nedz[p]; k = k - NSKPZ[p])
+               for (j = 2 + ngsl + rec_nbgy[p]; j <= 2 + ngsl + rec_nedy[p]; j = j + NSKPY[p])
+                  for (i = 2 + ngsl + rec_nbgx[p]; i <= 2 + ngsl + rec_nedx[p]; i = i + NSKPX[p])
+                  {
+                     if (tmpInd >= (rec_nxt[p] * rec_nyt[p] * rec_nzt[p]))
+                        fprintf(stdout, "tmpind=%ld (allocated %d)\n", tmpInd, (rec_nxt[p] * rec_nyt[p] * rec_nzt[p]));
+                     Bufeta2[p][tmpInd] = neta[p][i][j][k];
+                     tmpInd++;
+                  }
+
+            MPI_Datatype filetype2;
+
+            maxNX_NY_NZ_WS = (maxNX_NY_NZ_WS > rec_NZ[p] ? maxNX_NY_NZ_WS : rec_NZ[p]);
+            int ones2[maxNX_NY_NZ_WS];
+            MPI_Aint dispArray2[maxNX_NY_NZ_WS];
+            for (i = 0; i < maxNX_NY_NZ_WS; ++i)
+            {
+               ones2[i] = 1;
+            }
+
+            err = MPI_Type_contiguous(rec_nxt[p], MPI_FLOAT, &filetype2);
+            err = MPI_Type_commit(&filetype2);
+            for (i = 0; i < rec_nyt[p]; i++)
+            {
+               dispArray2[i] = sizeof(float);
+               dispArray2[i] = dispArray2[i] * rec_NX[p] * i;
+            }
+            err = MPI_Type_create_hindexed(rec_nyt[p], ones2, dispArray2, filetype2, &filetype2);
+            err = MPI_Type_commit(&filetype2);
+            for (i = 0; i < rec_nzt[p]; i++)
+            {
+               dispArray2[i] = sizeof(float);
+               dispArray2[i] = dispArray2[i] * rec_NY[p] * rec_NX[p] * i;
+            }
+            err = MPI_Type_create_hindexed(rec_nzt[p], ones2, dispArray2, filetype2, &filetype2);
+            err = MPI_Type_commit(&filetype2);
+            MPI_Type_size(filetype2, &tmpSize);
 #if VERBOSE
-       printf("filetype size (supposedly=rec_nxt*rec_nyt*rec_nzt*4=%ld) =%d\n", 
-                  rec_nxt[p]*rec_nyt[p]*rec_nzt[p]*sizeof(float),tmpSize);
+            printf("filetype size (supposedly=rec_nxt*rec_nyt*rec_nzt*4=%ld) =%d\n",
+                   rec_nxt[p] * rec_nyt[p] * rec_nzt[p] * sizeof(float), tmpSize);
 #endif
 
-	 sprintf(filename, "Finaleta_%d_%07ld", p, cur_step);
-	 err = MPI_File_open(MCW,filename,MPI_MODE_CREATE|MPI_MODE_WRONLY,MPI_INFO_NULL,&fh);
-	 err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype2, "native", MPI_INFO_NULL);
-	 if (err != MPI_SUCCESS) {
-	    fprintf(stderr, "MPI error in MPI_File_set_view():\n");
-	    char errstr[200];
-	    int strlen;
-	    MPI_Error_string(err, errstr, &strlen);
-	    fprintf(stderr, "MPI error in MPI_File_set_view(): %s\n", errstr);
-	 }
-	 err = MPI_File_write_all(fh, Bufeta2, rec_nxt[p]*rec_nyt[p]*rec_nzt[p], MPI_FLOAT, &filestatus);
-	 if (err != MPI_SUCCESS) {
-	    char errstr[200];
-	    int strlen;
-	    MPI_Error_string(err, errstr, &strlen);
-	    fprintf(stderr, "MPI error in MPI_File_write_all(): %s\n", errstr);
-	 }
-	 err = MPI_File_close(&fh);
-      }
-      #else
-      for (p=0; p<ngrids; p++){
-	 nx=NX*grdfct[p];
-	 ny=NX*grdfct[p];
-         num_bytes = sizeof(float)*(nxt[p]+4+ngsl2)*(nyt[p]+4+ngsl2)*(nzt[p]+2*align);
-         CUCHK(cudaMemcpy(&neta[p][0][0][0],d_neta[p],num_bytes,cudaMemcpyDeviceToHost));
-	 seism_createRegularGrid(&one, &nx, &one, &one, &ny, &one,
-				 &one, nzt+p, &one, seism_regGridID+p, &err);
-
-	 sprintf(filenamebaseep,"%s/EP_%d",OUT, p);
-
-	 seism_file_open(filenamebaseep, "w", &one, "float", seism_regGridID+p, seism_fileep+p, &err);
-         seism_write(&seism_fileep[p], &neta[p][0][0][0], &err);
-         seism_file_close(seism_fileep+p, &err);
+            sprintf(filename, "Finaleta_%d_%07ld", p, cur_step);
+            err = MPI_File_open(MCW, filename, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
+            err = MPI_File_set_view(fh, displacement[p], MPI_FLOAT, filetype2, "native", MPI_INFO_NULL);
+            if (err != MPI_SUCCESS)
+            {
+               fprintf(stderr, "MPI error in MPI_File_set_view():\n");
+               char errstr[200];
+               int strlen;
+               MPI_Error_string(err, errstr, &strlen);
+               fprintf(stderr, "MPI error in MPI_File_set_view(): %s\n", errstr);
+            }
+            err = MPI_File_write_all(fh, Bufeta2, rec_nxt[p] * rec_nyt[p] * rec_nzt[p], MPI_FLOAT, &filestatus);
+            if (err != MPI_SUCCESS)
+            {
+               char errstr[200];
+               int strlen;
+               MPI_Error_string(err, errstr, &strlen);
+               fprintf(stderr, "MPI error in MPI_File_write_all(): %s\n", errstr);
+            }
+            err = MPI_File_close(&fh);
+         }
+#else
+   for (p = 0; p < ngrids; p++)
+   {
+      nx = NX * grdfct[p];
+      ny = NX * grdfct[p];
+      num_bytes = sizeof(float) * (nxt[p] + 4 + ngsl2) * (nyt[p] + 4 + ngsl2) * (nzt[p] + 2 * align);
+      CUCHK(cudaMemcpy(&neta[p][0][0][0], d_neta[p], num_bytes, cudaMemcpyDeviceToHost));
+      seism_createRegularGrid(&one, &nx, &one, &one, &ny, &one,
+                              &one, nzt + p, &one, seism_regGridID + p, &err);
+
+      sprintf(filenamebaseep, "%s/EP_%d", OUT, p);
+
+      seism_file_open(filenamebaseep, "w", &one, "float", seism_regGridID + p, seism_fileep + p, &err);
+      seism_write(&seism_fileep[p], &neta[p][0][0][0], &err);
+      seism_file_close(seism_fileep + p, &err);
+   }
+#endif
       }
-      #endif
-
-    }
 
 #if TOPO
-    topo_free(&T);
-    receivers_finalize();
-    sources_finalize();
+      topo_free(&T);
+      receivers_finalize();
+      sources_finalize();
+      energy_output(&energy, ENERGYFILE);
+      energy_free(&energy);
 #endif
-    cudaStreamDestroy(stream_1);
-    //cudaStreamDestroy(stream_1b);
-    cudaStreamDestroy(stream_2);
-    //cudaStreamDestroy(stream_2b);
-    cudaStreamDestroy(stream_i);
-    cudaStreamDestroy(stream_i2);
-    cudaStreamDestroy(stream_o);
-    for (p=0; p<ngrids; p++){
-       cudaFreeHost(SL_vel[p]);
-       cudaFreeHost(SR_vel[p]);
-       cudaFreeHost(RL_vel[p]);
-       cudaFreeHost(RR_vel[p]);
-       cudaFreeHost(SF_vel[p]);
-       cudaFreeHost(SB_vel[p]);
-       cudaFreeHost(RF_vel[p]);
-       cudaFreeHost(RB_vel[p]);
-       if(NVE==3){
-	 cudaFreeHost(SL_yldfac[p]);
-	 cudaFreeHost(SR_yldfac[p]);
-	 cudaFreeHost(RL_yldfac[p]);
-	 cudaFreeHost(RR_yldfac[p]);
-	 cudaFreeHost(SF_yldfac[p]);
-	 cudaFreeHost(SB_yldfac[p]);
-	 cudaFreeHost(RF_yldfac[p]);
-	 cudaFreeHost(RB_yldfac[p]);
-
-	 cudaFree(d_SL_yldfac[p]);
-	 cudaFree(d_SR_yldfac[p]);
-	 cudaFree(d_RL_yldfac[p]);
-	 cudaFree(d_RR_yldfac[p]);
-	 cudaFree(d_SF_yldfac[p]);
-	 cudaFree(d_SB_yldfac[p]);
-	 cudaFree(d_RF_yldfac[p]);
-	 cudaFree(d_RB_yldfac[p]);
-
-       }
-    }
-    GFLOPS  = 1.0;
-    if (NVE < 2) GFLOPS  = GFLOPS*307.0*(xre - xls)*(yre-yls)*nzt[0];
-    else GFLOPS  = GFLOPS*511.0*(xre - xls)*(yre-yls)*nzt[0];
-    GFLOPS  = GFLOPS/(1000*1000*1000);
-    time_un = time_un/(cur_step-READ_STEP);
-    GFLOPS  = GFLOPS/time_un;
-    MPI_Allreduce( &GFLOPS, &GFLOPS_SUM, 1, MPI_DOUBLE, MPI_SUM, MCW );
+      cudaStreamDestroy(stream_1);
+      //cudaStreamDestroy(stream_1b);
+      cudaStreamDestroy(stream_2);
+      //cudaStreamDestroy(stream_2b);
+      cudaStreamDestroy(stream_i);
+      cudaStreamDestroy(stream_i2);
+      cudaStreamDestroy(stream_o);
+      for (p = 0; p < ngrids; p++)
+      {
+         cudaFreeHost(SL_vel[p]);
+         cudaFreeHost(SR_vel[p]);
+         cudaFreeHost(RL_vel[p]);
+         cudaFreeHost(RR_vel[p]);
+         cudaFreeHost(SF_vel[p]);
+         cudaFreeHost(SB_vel[p]);
+         cudaFreeHost(RF_vel[p]);
+         cudaFreeHost(RB_vel[p]);
+         if (NVE == 3)
+         {
+            cudaFreeHost(SL_yldfac[p]);
+            cudaFreeHost(SR_yldfac[p]);
+            cudaFreeHost(RL_yldfac[p]);
+            cudaFreeHost(RR_yldfac[p]);
+            cudaFreeHost(SF_yldfac[p]);
+            cudaFreeHost(SB_yldfac[p]);
+            cudaFreeHost(RF_yldfac[p]);
+            cudaFreeHost(RB_yldfac[p]);
+
+            cudaFree(d_SL_yldfac[p]);
+            cudaFree(d_SR_yldfac[p]);
+            cudaFree(d_RL_yldfac[p]);
+            cudaFree(d_RR_yldfac[p]);
+            cudaFree(d_SF_yldfac[p]);
+            cudaFree(d_SB_yldfac[p]);
+            cudaFree(d_RF_yldfac[p]);
+            cudaFree(d_RB_yldfac[p]);
+         }
+      }
+      GFLOPS = 1.0;
+      if (NVE < 2)
+         GFLOPS = GFLOPS * 307.0 * (xre - xls) * (yre - yls) * nzt[0];
+      else
+         GFLOPS = GFLOPS * 511.0 * (xre - xls) * (yre - yls) * nzt[0];
+      GFLOPS = GFLOPS / (1000 * 1000 * 1000);
+      time_un = time_un / (cur_step - READ_STEP);
+      GFLOPS = GFLOPS / time_un;
+      MPI_Allreduce(&GFLOPS, &GFLOPS_SUM, 1, MPI_DOUBLE, MPI_SUM, MCW);
 #if VERBOSE
-    if(rank==0)
-    {
-        printf("GPU benchmark size (fine grid) NX=%d, NY=%d, NZ=%d, ReadStep=%d\n", NX, NY, NZ[0], READ_STEP);
-    	printf("GPU computing flops=%1.18f GFLOPS, time = %1.18f secs per timestep\n", GFLOPS_SUM, time_un);
-    }	
+      if (rank == 0)
+      {
+         printf("GPU benchmark size (fine grid) NX=%d, NY=%d, NZ=%d, ReadStep=%d\n", NX, NY, NZ[0], READ_STEP);
+         printf("GPU computing flops=%1.18f GFLOPS, time = %1.18f secs per timestep\n", GFLOPS_SUM, time_un);
+      }
 #endif
-//  Main Loop Ends
- 
-//  program ends, free all memories
-    for (p=0; p<ngrids; p++){
-       Delloc3D(u1[p]);
-       Delloc3D(v1[p]);
-       Delloc3D(w1[p]); 
-       Delloc3D(xx[p]);
-       Delloc3D(yy[p]);
-       Delloc3D(zz[p]);
-       Delloc3D(xy[p]);
-       Delloc3D(yz[p]);
-       Delloc3D(xz[p]);
-       Delloc3D(vx1[p]);
-       Delloc3D(vx2[p]);
-       Delloc3Dww(ww[p]);
-       Delloc3D(wwo[p]); 
-
-       cudaFree(d_u1[p]);
-       cudaFree(d_v1[p]);
-       cudaFree(d_w1[p]);
-       cudaFree(d_f_u1[p]);
-       cudaFree(d_f_v1[p]);
-       cudaFree(d_f_w1[p]);
-       cudaFree(d_b_u1[p]);
-       cudaFree(d_b_v1[p]);
-       cudaFree(d_b_w1[p]);
-       cudaFree(d_xx[p]);
-       cudaFree(d_yy[p]);
-       cudaFree(d_zz[p]);
-       cudaFree(d_xy[p]);
-       cudaFree(d_yz[p]);
-       cudaFree(d_xz[p]);
-       cudaFree(d_vx1[p]);
-       cudaFree(d_vx2[p]);
-
-       if(NVE==1 || NVE==3)
-       {
-	  Delloc3D(r1[p]);
-	  Delloc3D(r2[p]);
-	  Delloc3D(r3[p]);
-	  Delloc3D(r4[p]);
-	  Delloc3D(r5[p]);
-	  Delloc3D(r6[p]);
-	  cudaFree(d_r1[p]);
-	  cudaFree(d_r2[p]);
-	  cudaFree(d_r3[p]);
-	  cudaFree(d_r4[p]);
-	  cudaFree(d_r5[p]);
-	  cudaFree(d_r6[p]);
-
-	  Delloc3D(qp[p]);
-	  Delloc3D(qs[p]);
-	  cudaFree(d_qp[p]);
-	  cudaFree(d_qs[p]);
-       }
-       if(NVE==3){
-	 Delloc3D(sigma2[p]);
-	 Delloc3D(cohes[p]);
-	 Delloc3D(phi[p]);
-	 Delloc3D(yldfac[p]);
-	 Delloc3D(neta[p]);
-       }
-
-       if((NPC==0) || (NPC==2))
-       {
-	   Delloc1D(dcrjx[p]);
-	   Delloc1D(dcrjy[p]);
-	   Delloc1D(dcrjz[p]);
-	   cudaFree(d_dcrjx[p]);
-	   cudaFree(d_dcrjy[p]);
-	   cudaFree(d_dcrjz[p]);
-       }
-
-       Delloc3D(d1[p]);
-       Delloc3D(mu[p]);
-       Delloc3D(lam[p]);
-       Delloc3D(lam_mu[p]);
-       cudaFree(d_d1[p]);
-       cudaFree(d_mu[p]);
-       cudaFree(d_lam[p]);
-       cudaFree(d_lam_mu[p]);
-    }
+      //  Main Loop Ends
+
+      //  program ends, free all memories
+      for (p = 0; p < ngrids; p++)
+      {
+         Delloc3D(u1[p]);
+         Delloc3D(v1[p]);
+         Delloc3D(w1[p]);
+         Delloc3D(xx[p]);
+         Delloc3D(yy[p]);
+         Delloc3D(zz[p]);
+         Delloc3D(xy[p]);
+         Delloc3D(yz[p]);
+         Delloc3D(xz[p]);
+         Delloc3D(vx1[p]);
+         Delloc3D(vx2[p]);
+         Delloc3Dww(ww[p]);
+         Delloc3D(wwo[p]);
+
+         cudaFree(d_u1[p]);
+         cudaFree(d_v1[p]);
+         cudaFree(d_w1[p]);
+         cudaFree(d_f_u1[p]);
+         cudaFree(d_f_v1[p]);
+         cudaFree(d_f_w1[p]);
+         cudaFree(d_b_u1[p]);
+         cudaFree(d_b_v1[p]);
+         cudaFree(d_b_w1[p]);
+         cudaFree(d_xx[p]);
+         cudaFree(d_yy[p]);
+         cudaFree(d_zz[p]);
+         cudaFree(d_xy[p]);
+         cudaFree(d_yz[p]);
+         cudaFree(d_xz[p]);
+         cudaFree(d_vx1[p]);
+         cudaFree(d_vx2[p]);
+
+         if (NVE == 1 || NVE == 3)
+         {
+            Delloc3D(r1[p]);
+            Delloc3D(r2[p]);
+            Delloc3D(r3[p]);
+            Delloc3D(r4[p]);
+            Delloc3D(r5[p]);
+            Delloc3D(r6[p]);
+            cudaFree(d_r1[p]);
+            cudaFree(d_r2[p]);
+            cudaFree(d_r3[p]);
+            cudaFree(d_r4[p]);
+            cudaFree(d_r5[p]);
+            cudaFree(d_r6[p]);
+
+            Delloc3D(qp[p]);
+            Delloc3D(qs[p]);
+            cudaFree(d_qp[p]);
+            cudaFree(d_qs[p]);
+         }
+         if (NVE == 3)
+         {
+            Delloc3D(sigma2[p]);
+            Delloc3D(cohes[p]);
+            Delloc3D(phi[p]);
+            Delloc3D(yldfac[p]);
+            Delloc3D(neta[p]);
+         }
 
-    if(NVE==1 || NVE==3) {
-       Delloc1D(coeff);  
-       cudaFree(d_coeff);
-    }
+         if ((NPC == 0) || (NPC == 2))
+         {
+            Delloc1D(dcrjx[p]);
+            Delloc1D(dcrjy[p]);
+            Delloc1D(dcrjz[p]);
+            cudaFree(d_dcrjx[p]);
+            cudaFree(d_dcrjy[p]);
+            cudaFree(d_dcrjz[p]);
+         }
 
-    for (p=0; p<ngrids; p++){
-       if(rank==srcproc[p]) {
-	  Delloc1D(taxx[p]);
-	  Delloc1D(tayy[p]);
-	  Delloc1D(tazz[p]);
-	  Delloc1D(taxz[p]);
-	  Delloc1D(tayz[p]);
-	  Delloc1D(taxy[p]); 
-	  cudaFree(d_taxx[p]);
-	  cudaFree(d_tayy[p]);
-	  cudaFree(d_tazz[p]);
-	  cudaFree(d_taxz[p]);
-	  cudaFree(d_tayz[p]);
-	  cudaFree(d_taxy[p]);
-	  Delloc1P(tpsrc[p]);
-	  cudaFree(d_tpsrc[p]);
-       }
-    }
-    MPI_Comm_free( &MC1 );
+         Delloc3D(d1[p]);
+         Delloc3D(mu[p]);
+         Delloc3D(lam[p]);
+         Delloc3D(lam_mu[p]);
+         cudaFree(d_d1[p]);
+         cudaFree(d_mu[p]);
+         cudaFree(d_lam[p]);
+         cudaFree(d_lam_mu[p]);
+      }
 
-    #ifndef NOBGIO
-    } /* end of if (rank < size) */
+      if (NVE == 1 || NVE == 3)
+      {
+         Delloc1D(coeff);
+         cudaFree(d_coeff);
+      }
 
-    else if (ranktype==1) {
-       if (IFAULT == 4) background_velocity_reader(rank, size, NST, READ_STEP, MCS);
-    }
-    else {
-       nt = (int)(TMAX/DT) + 1;
-       nout= nt / (WRITE_STEP * NTISKP);
-       background_output_writer(rank, size, nout, WRITE_STEP, NTISKP, ngrids, OUT, MCI, NVE);
-    }
-    #endif
+      for (p = 0; p < ngrids; p++)
+      {
+         if (rank == srcproc[p])
+         {
+            Delloc1D(taxx[p]);
+            Delloc1D(tayy[p]);
+            Delloc1D(tazz[p]);
+            Delloc1D(taxz[p]);
+            Delloc1D(tayz[p]);
+            Delloc1D(taxy[p]);
+            cudaFree(d_taxx[p]);
+            cudaFree(d_tayy[p]);
+            cudaFree(d_tazz[p]);
+            cudaFree(d_taxz[p]);
+            cudaFree(d_tayz[p]);
+            cudaFree(d_taxy[p]);
+            Delloc1P(tpsrc[p]);
+            cudaFree(d_tpsrc[p]);
+         }
+      }
+      MPI_Comm_free(&MC1);
+
+#ifndef NOBGIO
+   } /* end of if (rank < size) */
+
+   else if (ranktype == 1)
+   {
+      if (IFAULT == 4)
+         background_velocity_reader(rank, size, NST, READ_STEP, MCS);
+   }
+   else
+   {
+      nt = (int)(TMAX / DT) + 1;
+      nout = nt / (WRITE_STEP * NTISKP);
+      background_output_writer(rank, size, nout, WRITE_STEP, NTISKP, ngrids, OUT, MCI, NVE);
+   }
+#endif
 
-    MPI_Barrier(MCT);
-    MPI_Finalize();
-    return (0);
+   MPI_Barrier(MCT);
+   MPI_Finalize();
+   return (0);
 }
diff --git a/src/buffers/CMakeLists.txt b/src/buffers/CMakeLists.txt
index 6b002a6..e9de39e 100644
--- a/src/buffers/CMakeLists.txt
+++ b/src/buffers/CMakeLists.txt
@@ -1,14 +1,13 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/buffers/buffer.h     
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h
-    ${AWP_MINI_SOURCE_DIR}/include/test/test.h
+    ${AWP_SOURCE_DIR}/include/buffers/buffer.h     
+    ${AWP_SOURCE_DIR}/include/test/test.h
     )
 
 add_library(buffers buffer.c ${HEADERS})
 
 target_include_directories(buffers
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
        )
 
 target_link_libraries(buffers
diff --git a/src/buffers/buffer.c b/src/buffers/buffer.c
index 5d9d13e..dbc7f4b 100644
--- a/src/buffers/buffer.c
+++ b/src/buffers/buffer.c
@@ -99,7 +99,7 @@ void buffer_copy_to_device(buffer_t *buffer, size_t step)
                         buffer->d_buffer_bytes, cudaMemcpyHostToDevice));
 }
 
-void buffer_copy_to_host(buffer_t *buffer, int step)
+void buffer_copy_to_host(buffer_t *buffer, size_t step)
 {
        if (!buffer_is_device_full(buffer, step)) return;
 
diff --git a/src/checksum/CMakeLists.txt b/src/checksum/CMakeLists.txt
index c8d2e22..4e49e37 100644
--- a/src/checksum/CMakeLists.txt
+++ b/src/checksum/CMakeLists.txt
@@ -1,5 +1,5 @@
  set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/checksum/checksum.h     )
+    ${AWP_SOURCE_DIR}/include/checksum/checksum.h     )
  
 add_library(checksum checksum.c md5/md5.c)
 
@@ -7,6 +7,6 @@ target_link_libraries(checksum)
 
 target_include_directories(checksum
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/checksum
-        ${AWP_MINI_SOURCE_DIR}/include/checksum/md5
+        ${AWP_SOURCE_DIR}/include/checksum
+        ${AWP_SOURCE_DIR}/include/checksum/md5
        )
diff --git a/src/debug/debug.md b/src/debug/debug.md
index ddcef8c..6e7e44e 100644
--- a/src/debug/debug.md
+++ b/src/debug/debug.md
@@ -3,6 +3,68 @@ These notes describe a set of scripts that are helpful for both debugging and fu
 developing AWP. Before you use any of these scripts, please commit your work beforehand so
 that you can easily revert the changes in case something goes wrong.
 
+## Debugging
+It is not uncommon for segmentation faults to occur when running AWP. Sometimes,
+these are caused by user errors and other times they are caused by bugs in the
+program. In either case, these errors can be time-consuming to identify without
+a systematic approach and proper tools. This guide is meant to show you one
+effective way of catching segmentation faults using the gdb debugger.
+
+If you can read this document,
+then chances are that you have access to a sufficiently recent version of AWP
+that enables gdb to attach to one of your runs.
+
+1. Search for `GDB_ATTACH`
+at the top of `pmcl3d.c` and uncomment the line. If you cannot find this macro,
+go ahead and copy and paste the following block of code after the `MPI_Init,
+MPI_Comm_rank, MPI_Comm_size` calls.
+
+```C
+    if ( rank == 0) {
+        volatile int i = 0;
+        printf("Process ID %d is ready for attach\n", getpid());
+        fflush(stdout);
+        while (0 == i)
+            sleep(5);
+    }
+
+```
+
+2. Compile AWP in debug mode:
+```
+$ cd build
+$ make clean
+$ cmake -DCMAKE_BUILD_TYPE=Debug ..
+$ make
+
+```
+3. Launch an interactive job and load gdb. On Summit, see the user guide for
+   launching interactive jobs: https://docs.olcf.ornl.gov/systems/summit_user_guide.html#interactive-jobs
+   To use gdb on Summit, you need to load it: `module load gdb`.
+4. Run AWP: Once your interactive job has started, call AWP with its usual input
+   arguments associated with your particular run. Put `&` at the end of the
+   command to spawn it in a background process so that you regain control of the
+   terminal. For example:
+   ```
+        jsrun -n 4 -a 3 -c 3 -g 1 -r 4 -d cyclic pmcl3d [ARGS]&
+   ```
+   After a while, you should see:
+   ```
+        Process ID 23969 is ready for attach
+   ```
+5. Run gdb: `gdb --pid PID`, but replace PID with the process ID displayed in the
+   previous step. Press `n` followed by the return key until you see
+   ```
+   while (0 == i)
+   ```
+   Run the command:
+   `set var i = 1`
+   This command will cause gdb to modify the variable `i` and therefore exit the
+   while loop. Next, you can proceed using gdb as you please. To simply go to
+   the next error, type `c`; gdb should then run until the error occurs and tell
+   you which statement in the source code caused the error.
+
+
 ## Memory issues
 We have found that certain bugs in AWP are due to memory errors. Tools such as `valgrind`
 and `cuda-memcheck` are excellent for reporting many memory related issues. You are highly
diff --git a/src/functions/CMakeLists.txt b/src/functions/CMakeLists.txt
deleted file mode 100644
index d3333c4..0000000
--- a/src/functions/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/functions/functions.h     
-    ${AWP_MINI_SOURCE_DIR}/include/functions/random.h     
-    ${AWP_MINI_SOURCE_DIR}/include/functions/norm.h     
-    ${AWP_MINI_SOURCE_DIR}/include/grid/grid_3d.h
-    )
-
-add_library(functions functions.c random.c norm.c ${HEADERS})
-
-target_include_directories(functions
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-       )
-target_link_libraries(functions grid m)
-
-
diff --git a/src/grid/CMakeLists.txt b/src/grid/CMakeLists.txt
index a565be2..36fed1d 100644
--- a/src/grid/CMakeLists.txt
+++ b/src/grid/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/grid/grid_3d.h
-    ${AWP_MINI_SOURCE_DIR}/include/grid/shift.h
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h
+    ${AWP_SOURCE_DIR}/include/grid/grid_3d.h
+    ${AWP_SOURCE_DIR}/include/grid/shift.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h
     )
 
 add_library(grid
@@ -14,5 +14,5 @@ target_link_libraries(grid
 
 target_include_directories(grid
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
diff --git a/src/grid/grid_3d.c b/src/grid/grid_3d.c
index 94886c3..d173e72 100644
--- a/src/grid/grid_3d.c
+++ b/src/grid/grid_3d.c
@@ -101,6 +101,15 @@ grid3_t grid_init_metric_grid(const int3_t size, const int3_t shift,
                          gridspacing);
 }
 
+grid3_t grid_init_full_grid(const int3_t size, const int3_t shift,
+                         const int3_t coordinate, const int3_t boundary1,
+                         const int3_t boundary2,
+                         const _prec gridspacing)
+{       /* Forwards every argument unchanged to grid_init and returns its result. */
+        return grid_init(size, shift, coordinate, boundary1, boundary2, ngsl + 2, /* sixth argument fixed to ngsl + 2 (presumably the padding; confirm against grid_init) */
+                         gridspacing);
+}
+
 //FIXME: remove this function. It should be replaced by "grid_init"
 fcn_grid_t fcn_init_grid(const int3_t size, const int3_t shift,
                          const int3_t coordinate, const int padding,
@@ -255,11 +264,11 @@ grid1_t grid_grid1_z(const grid3_t grid)
         return grid1;
 }
 
-int grid_fill1(prec *out, const grid1_t grid)
+int grid_fill1(prec *out, const grid1_t grid, const int isxdir)
 {
         _prec h = grid.gridspacing;
         for (int i = 0; i < grid.size; ++i) {
-                out[i] = h * (i + grid.id * (grid.size - 2 * grid.padding) - 0.5 * grid.shift - grid.padding);
+                out[i] = h * (i + grid.id * (grid.size - 2 * grid.padding) - 0.5 * grid.shift + isxdir * grid.shift - grid.padding);
         }
 
         if (grid.shift && grid.boundary1) {
@@ -277,6 +286,17 @@ int grid_fill1(prec *out, const grid1_t grid)
         return grid.size;
 }
 
+int grid_fill_y_dm(prec *out, const grid1_t grid, const int blocknum) { // Fills out[] via grid_fill1 (isxdir = 0); for blocknum > 0, also adds gridspacing / 3 to every entry.
+    int count = grid_fill1(out, grid, 0); // grid_fill1's return value, passed back to the caller unchanged
+    if (blocknum > 0) {
+        _prec h = grid.gridspacing / 3.0; // constant offset added to each of the grid.size entries
+        for (int i = 0; i < grid.size; ++i) {
+            out[i] += h;
+        }
+    }
+    return count;
+}
+
 
 int grid_in_bounds1(const _prec *x, const _prec q, const grid1_t grid)
 {
@@ -298,34 +318,22 @@ int grid_in_bounds1(const _prec *x, const _prec q, const grid1_t grid)
         return SUCCESS;
 }
 
-int grid_in_bounds_ext1(const _prec *x, const _prec q, const grid1_t grid)
-{
-        _prec h = grid.gridspacing;
-        if ( q - (x[0] - h / 2) < 0 ) {
-                return ERR_OUT_OF_BOUNDS_LOWER;
-        }
-        if ( q - (x[grid.size - 1] + h / 2) >= 0) {
-                return ERR_OUT_OF_BOUNDS_UPPER;
-        }
-        return SUCCESS;
-}
-
 int grid_fill_x(prec *out, const fcn_grid_t grid)
 {
         grid1_t grid1 = grid_grid1_x(grid);
-        return grid_fill1(out, grid1);
+        return grid_fill1(out, grid1, 1);
 }
 
 int grid_fill_y(prec *out, const fcn_grid_t grid)
 {
         grid1_t grid1 = grid_grid1_y(grid);
-        return grid_fill1(out, grid1);
+        return grid_fill1(out, grid1, 0);
 }
 
 int grid_fill_z(prec *out, const fcn_grid_t grid)
 {
         grid1_t grid1 = grid_grid1_z(grid);
-        return grid_fill1(out, grid1);
+        return grid_fill1(out, grid1, 0);
 }
 
 int grid_fill3_x(_prec *out, const _prec *x, const grid3_t grid)
@@ -394,3 +402,51 @@ double grid_reduce3(const _prec *in, const grid3_t grid)
         return out;
 }
 
+_prec grid_overlap(const _prec h) { // Returns 7.0 * h (h is presumably a grid spacing; confirm with callers).
+    return 7.0 * h;
+}
+_prec grid_height(const int nz, const _prec h, const int istopo) { // Returns (nz - 2) * h when istopo == 1, otherwise (nz - 1) * h.
+    return istopo == 1 ? (nz - 2) * h : (nz - 1) * h;
+}
+void global_to_local(_prec *zloc, int *block_index, const _prec z,
+                     const _prec h, const int *nz, const int num_grids,
+                     const int istopo) {
+    // Map z onto a block: walk the grids in order, adding each block's
+    // height to z. Outputs: *zloc = shifted coordinate, *block_index = index
+    // of the last grid visited (-1 if num_grids <= 0). nz[i] must be >= 7.
+    _prec z0 = z;
+    int bi = -1;      // block index is integral; keep it out of floating point
+    _prec hloc = h;   // h scaled for the current grid (tripled after each grid)
+    _prec H = 0.0;
+    // Go from top grid to bottom grid
+    for (int i = 0; i < num_grids; ++i ) {
+        if (i > 0)
+            z0 -= grid_overlap(hloc / 3);
+
+        // Check minimum number of grid points per block
+        assert(nz[i] >= 7);
+
+        // Only the first grid's height depends on istopo.
+        H = i == 0 ? grid_height(nz[i], hloc, istopo) : grid_height(nz[i], hloc, 0);
+
+        z0 += H;
+        hloc *= 3;
+        bi = i;
+
+        // Check if the coordinate touches the last two grid points, if so, push it to the next grid
+        if (z0 > 0 && z0 < grid_overlap(hloc / 3) ) {
+            continue;
+        }
+
+        if (z0 > 0) break;
+
+    }
+
+    // Check if the mapping succeeded or not
+    if (z0 < 0) {
+        printf("WARNING: Failed to map z=%g to a block.\n", z);
+    }
+
+    *zloc = z0;
+    *block_index = bi;
+}
diff --git a/src/grid/shift.c b/src/grid/shift.c
index 88c06bd..9f3cd80 100644
--- a/src/grid/shift.c
+++ b/src/grid/shift.c
@@ -3,14 +3,14 @@
 
 void shift_node(int *shift)
 {
-        shift[0] = 1;
+        shift[0] = 0;
         shift[1] = 0;
         shift[2] = 0;
 }
 
  void shift_u1(int *shift)
 {
-        shift[0] = 1;
+        shift[0] = 0;
         shift[1] = 1;
         shift[2] = 1;
 }
@@ -18,56 +18,56 @@ void shift_node(int *shift)
 
  void shift_u2(int *shift)
 {
-        shift[0] = 0;
+        shift[0] = 1;
         shift[1] = 0;
         shift[2] = 1;
 }
 
  void shift_u3(int *shift)
 {
-        shift[0] = 0;
+        shift[0] = 1;
         shift[1] = 1;
         shift[2] = 0;
 }
 
  void shift_xx(int *shift)
 {
-        shift[0] = 0;
+        shift[0] = 1;
         shift[1] = 1;
         shift[2] = 1;
 }
 
  void shift_yy(int *shift)
 {
-        shift[0] = 0;
+        shift[0] = 1;
         shift[1] = 1;
         shift[2] = 1;
 }
 
  void shift_zz(int *shift)
 {
-        shift[0] = 0;
+        shift[0] = 1;
         shift[1] = 1;
         shift[2] = 1;
 }
 
  void shift_xy(int *shift)
 {
-        shift[0] = 1;
+        shift[0] = 0;
         shift[1] = 0;
         shift[2] = 1;
 }
 
  void shift_xz(int *shift)
 {
-        shift[0] = 1;
+        shift[0] = 0;
         shift[1] = 1;
         shift[2] = 0;
 }
 
  void shift_yz(int *shift)
 {
-        shift[0] = 0;
+        shift[0] = 1;
         shift[1] = 0;
         shift[2] = 0;
 }
@@ -75,25 +75,25 @@ void shift_node(int *shift)
 
  int3_t grid_node(void)
 {
-        int3_t out = {.x = 1, .y = 0, .z = 0};
+        int3_t out = {.x = 0, .y = 0, .z = 0};
         return out;
 }
 
  int3_t grid_u1(void)
 {
-        int3_t out = {.x = 1, .y = 1, .z = 1};
+        int3_t out = {.x = 0, .y = 1, .z = 1};
         return out;
 }
 
  int3_t grid_u2(void)
 {
-        int3_t out = {.x = 0, .y = 0, .z = 1};
+        int3_t out = {.x = 1, .y = 0, .z = 1};
         return out;
 }
 
  int3_t grid_u3(void)
 {
-        int3_t out = {.x = 0, .y = 1, .z = 0};
+        int3_t out = {.x = 1, .y = 1, .z = 0};
         return out;
 }
 
@@ -115,37 +115,37 @@ void shift_node(int *shift)
 
  int3_t grid_xx(void)
 {
-        int3_t out = {.x = 0, .y = 1, .z = 1};
+        int3_t out = {.x = 1, .y = 1, .z = 1};
         return out;
 }
 
  int3_t grid_yy(void)
 {
-        int3_t out = {.x = 0, .y = 1, .z = 1};
+        int3_t out = {.x = 1, .y = 1, .z = 1};
         return out;
 }
 
  int3_t grid_zz(void)
 {
-        int3_t out = {.x = 0, .y = 1, .z = 1};
+        int3_t out = {.x = 1, .y = 1, .z = 1};
         return out;
 }
 
  int3_t grid_xy(void)
 {
-        int3_t out = {.x = 1, .y = 0, .z = 1};
+        int3_t out = {.x = 0, .y = 0, .z = 1};
         return out;
 }
 
  int3_t grid_xz(void)
 {
-        int3_t out = {.x = 1, .y = 1, .z = 0};
+        int3_t out = {.x = 0, .y = 1, .z = 0};
         return out;
 }
 
  int3_t grid_yz(void)
 {
-        int3_t out = {.x = 0, .y = 0, .z = 0};
+        int3_t out = {.x = 1, .y = 0, .z = 0};
         return out;
 }
 
diff --git a/src/interpolation/CMakeLists.txt b/src/interpolation/CMakeLists.txt
index 1410879..ad94ad5 100644
--- a/src/interpolation/CMakeLists.txt
+++ b/src/interpolation/CMakeLists.txt
@@ -1,9 +1,9 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h
-    ${AWP_MINI_SOURCE_DIR}/include/grid/grid_3d.h
-    ${AWP_MINI_SOURCE_DIR}/include/interpolation/interpolation.h
-    ${AWP_MINI_SOURCE_DIR}/include/interpolation/interpolation.cuh
-    ${AWP_MINI_SOURCE_DIR}/include/interpolation/lagrange.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h
+    ${AWP_SOURCE_DIR}/include/grid/grid_3d.h
+    ${AWP_SOURCE_DIR}/include/interpolation/interpolation.h
+    ${AWP_SOURCE_DIR}/include/interpolation/interpolation.cuh
+    ${AWP_SOURCE_DIR}/include/interpolation/lagrange.h
     )
 
 add_library(interpolation
@@ -14,5 +14,5 @@ target_link_libraries(interpolation grid)
 
 target_include_directories(interpolation
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
diff --git a/src/interpolation/interpolation.c b/src/interpolation/interpolation.c
index dfef26b..a898de7 100644
--- a/src/interpolation/interpolation.c
+++ b/src/interpolation/interpolation.c
@@ -49,7 +49,7 @@ int interp_grid_argnearest(int *nearest, const prec *x, const prec q, grid1_t
 int interp_argnearest_range(int *first, int *last,
                             const int lower, const int upper,
                             const int nearest,
-                            const int n, const prec query)
+                            const int n)
 {
         int err = 0;
         int inearest = nearest;
@@ -106,19 +106,11 @@ int interp_lagrange1_coef(prec *xloc, prec *l, int *first, const prec *x,
                 q = x[n-1];
         }
 
-        if (deg % 2 == 1) {
-                if (x[nearest] - query > 0) {
-                        lower = (int)ceil((double)deg * 0.5);
-                        upper = (int)floor((double)deg * 0.5);
-                } else {
-                        lower = (int)floor((double)deg * 0.5);
-                        upper = (int)ceil((double)deg * 0.5);
-                }
-
+        lower = interp_get_lower(x[nearest], query, deg);
+        upper = interp_get_upper(x[nearest], query, deg);
 
-        }
-        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n,
-                                       q);
+        err |=
+            interp_argnearest_range(&lower, &upper, lower, upper, nearest, n);
         for (int j = 0; j < deg + 1; ++j) {
                 xloc[j] = x[lower + j];
         }
@@ -170,3 +162,30 @@ int interp_lagrange3(prec *out, const prec *in, const prec *x, const prec *y,
         return err;
 }
 
+int interp_get_lower(const prec xnearest, const prec query, const int deg) { // Returns ceil(deg / 2), except floor(deg / 2) when deg % 2 == 1 and xnearest - query is not > 0.
+        int lower = (int)ceil((double)deg * 0.5); // default; kept as-is when deg % 2 != 1
+        if (deg % 2 == 1) {
+                if (xnearest - query > 0) { // xnearest is greater than the query
+                        lower = (int)ceil((double)deg * 0.5);
+                } else {
+                        lower = (int)floor((double)deg * 0.5);
+                }
+        }
+        return lower; // presumably the count of stencil points below the nearest node; confirm against interp_argnearest_range
+}
+
+int interp_get_upper(const prec xnearest, const prec query, const int deg) { // Returns ceil(deg / 2), except floor(deg / 2) when deg % 2 == 1 and xnearest - query > 0.
+        int upper = (int)ceil((double)deg * 0.5); // default; kept as-is when deg % 2 != 1
+        if (deg % 2 == 1) {
+                if (xnearest - query > 0) { // xnearest is greater than the query
+                        upper = (int)floor((double)deg * 0.5);
+                } else {
+                        upper = (int)ceil((double)deg * 0.5);
+                }
+                // The floor/ceil choice here is the opposite of the one in
+                // interp_get_lower for the same arguments.
+        }
+        return upper;
+}
+
+
diff --git a/src/interpolation/interpolation.cu b/src/interpolation/interpolation.cu
index 5d893f1..e6a63c6 100644
--- a/src/interpolation/interpolation.cu
+++ b/src/interpolation/interpolation.cu
@@ -4,7 +4,7 @@
 #include <stdint.h>
 
 #include <awp/error.h>
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <interpolation/interpolation.h>
 #include <grid/grid_3d.h>
 #include <interpolation/interpolation.cuh>
diff --git a/src/mpi/CMakeLists.txt b/src/mpi/CMakeLists.txt
index 8fe9fb5..9fa7d45 100644
--- a/src/mpi/CMakeLists.txt
+++ b/src/mpi/CMakeLists.txt
@@ -1,8 +1,7 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/mpi/partition.h
-    ${AWP_MINI_SOURCE_DIR}/include/mpi/distribute.h
-    ${AWP_MINI_SOURCE_DIR}/include/mpi/io.h
-    ${AWP_MINI_SOURCE_DIR}/include/test/test.h
+    ${AWP_SOURCE_DIR}/include/mpi/distribute.h
+    ${AWP_SOURCE_DIR}/include/mpi/io.h
+    ${AWP_SOURCE_DIR}/include/test/test.h
     )
 
 add_library(mpi
@@ -10,5 +9,5 @@ add_library(mpi
 
 target_include_directories(mpi 
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
diff --git a/src/mpi/distribute.c b/src/mpi/distribute.c
index ede1e91..141f60f 100644
--- a/src/mpi/distribute.c
+++ b/src/mpi/distribute.c
@@ -1,54 +1,142 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
+#include <assert.h>
 
 #include <mpi/distribute.h>
 #include <awp/error.h>
+#include <grid/grid_3d.h>
+#include <topography/grids.h>
+#include <topography/sources/source.h>
+#include <interpolation/interpolation.h>
 #include <test/test.h>
 
+int grid_in_bounds_output(const prec *x, const size_t mx, const prec q, const prec h)
+{
+        /* Output points are accepted on [lo, hi): the nodes x[2 + ngsl] and
+         * x[mx - 3 - ngsl], each pushed outwards by half a grid spacing. */
+        const prec lo = x[2 + ngsl] - h / 2.0;
+        const prec hi = x[mx - 3 - ngsl] + h / 2.0;
+        int status = SUCCESS;
+
+        if (q - lo < 0) {
+                status = ERR_OUT_OF_BOUNDS_LOWER;
+        } else if (q - hi >= 0) {
+                status = ERR_OUT_OF_BOUNDS_UPPER;
+        }
+        return status;
+}
+
+int grid_in_bounds_input(const prec *x, const int mx, const prec q, const prec h)
+{
+        // Decides whether an input (moment tensor / force) at q belongs to this subdomain.
+        // Inputs that fall in the overlap zone belong to both processes. The force kernels
+        // have guard statements that make sure that no forces are applied outside the actual
+        // compute region. For the moment tensor source, it doesn't matter that points outside
+        // the compute region are modified.
+
+        if (q - x[0] < h / 2) {
+                return ERR_OUT_OF_BOUNDS_LOWER;
+        }
+        /* Accepted range is x[0] + h / 2 <= q < x[mx - 1] + h / 2 (upper edge exclusive). */
+        return (q - x[mx - 1] >= h / 2) ? ERR_OUT_OF_BOUNDS_UPPER
+                                         : SUCCESS;
+}
+
+/* Returns 1 if (qx, qy) passes both the x and the y bounds test selected by `st`
+ * (input test for MOMENT_TENSOR/FORCE, output test for RECEIVER/SGT), else 0. */
+int dist_indices_in_bounds(const prec qx, const prec qy,
+                           const prec *x, const size_t mx,
+                           const prec *y, const size_t my,
+                           const prec hx, const prec hy,
+                           const enum source_type st) {
+        int inbounds_x, inbounds_y;
+        switch (st) {
+            case MOMENT_TENSOR:
+            case FORCE:
+                        inbounds_x = grid_in_bounds_input(x, mx, qx, hx);
+                        inbounds_y = grid_in_bounds_input(y, my, qy, hy);
+                        break;
+            case RECEIVER:
+            case SGT:
+                        inbounds_x = grid_in_bounds_output(x, mx, qx, hx);
+                        inbounds_y = grid_in_bounds_output(y, my, qy, hy);
+                        break;
+            default:
+                /* Never report a point as in bounds for an unrecognised type. */
+                fprintf(stderr, "Unknown source type passed to %s:%s!\n",
+                        __FILE__, __func__);
+                return 0;
+        }
+        return inbounds_x == SUCCESS && inbounds_y == SUCCESS;
+}
+
+/* Distributes indices based on which part of space they belong to.
+
+        indices: (output) indices of the query points (qx[i], qy[i]) that lie in `grid`.
+        nidx: DIST_COUNT: (output) number of matching query points.
+              DIST_INSERT_INDICES: (input) position in `indices` at which writing starts.
+        qx: Array containing query points  (x-coordinate)
+        qy: Array containing query points  (y-coordinate)
+        n: Number of query points (length of qx, qy)
+        grid: The grid to conduct the search for
+        grid_numbers: Array that contains the grid number that each query point belongs to (in the
+                      z-direction)
+        grid_number: Number of `grid`; only query points with this grid number are selected
+        st: Source type; selects input (MOMENT_TENSOR, FORCE) or output (RECEIVER, SGT) bounds
+        mode: Choose between counting indices in current partition (DIST_COUNT), or populate
+              index array (DIST_INSERT_INDICES)
+*/
 int dist_indices(int **indices, size_t *nidx, const prec *qx, const prec *qy,
-                 const size_t n, grid3_t grid) 
+                 const size_t n, 
+                 const grid3_t grid, const int *grid_numbers,
+                 const int grid_number, const enum source_type st, const enum dist_options mode)
 {
+
+        size_t nlocal = 0;
+
         grid1_t grid_x = grid_grid1_x(grid);
         grid1_t grid_y = grid_grid1_y(grid);
+        size_t mx = grid_x.size;
+        size_t my = grid_y.size;
+        prec hx = grid_x.gridspacing;
+        prec hy = grid_y.gridspacing;
 
         prec *x = malloc(sizeof(x) * grid_x.size);
         prec *y = malloc(sizeof(y) * grid_y.size);
 
-        grid_fill1(x, grid_x);
-        grid_fill1(y, grid_y);
-
-
-        size_t nlocal = 0; 
+        grid_fill1(x, grid_x, 1);
+        grid_fill_y_dm(y, grid_y, grid_number);
 
-        //FIXME: Add checks for PML region, and boundary regions
-
-        for (size_t i = 0; i < n; ++i) {
-                int inbounds_x = grid_in_bounds_ext1(x, qx[i], grid_x);
-                int inbounds_y = grid_in_bounds_ext1(y, qy[i], grid_y);
-                if (inbounds_x == SUCCESS && inbounds_y == SUCCESS) {
-                        nlocal++;
-                }
-        }
-        
-        *nidx = nlocal;
-        *indices = malloc(sizeof(indices) * nlocal);
-        int *idx = *indices;
-
-        int j = 0;
-        for (size_t i = 0; i < n; ++i) {
-                int inbounds_x = grid_in_bounds_ext1(x, qx[i], grid_x);
-                int inbounds_y = grid_in_bounds_ext1(y, qy[i], grid_y);
-                if (inbounds_x == SUCCESS && inbounds_y == SUCCESS) {
-                        idx[j] = i;
-                        j++;
+        size_t j = *nidx;
+        for (size_t i = 0; i < n; ++i)
+        {
+                if (dist_indices_in_bounds(qx[i], qy[i], x, mx, y, my, hx, hy, st) &&
+                    grid_numbers[i] == grid_number)
+                {
+                        switch (mode)
+                        {
+                        case DIST_COUNT:
+                                nlocal++;
+                                break;
+                        case DIST_INSERT_INDICES:
+                                (*indices)[j] = i;
+                                j++;
+                        }
                 }
         }
 
         free(x);
         free(y);
-        
-        return SUCCESS;
-}
 
+        switch (mode)
+        {
+        case DIST_COUNT:
+                *nidx = nlocal;
+                break;
+        case DIST_INSERT_INDICES:
+                break;
+        }
 
+        return SUCCESS;
+}
diff --git a/src/mpi/io.c b/src/mpi/io.c
index cee175f..dbc9af4 100644
--- a/src/mpi/io.c
+++ b/src/mpi/io.c
@@ -9,9 +9,11 @@ mpi_io_t mpi_io_init(MPI_Comm comm, int rank, MPI_Aint num_elements)
         mpi_io_t out = {
             .comm = comm, .rank = rank, .num_elements = num_elements};
         MPICHK2(MPI_Allreduce(&out.num_elements, &out.total_num_bytes, 1,
-               MPI_AINT, MPI_SUM, out.comm), rank);
+                              MPI_AINT, MPI_SUM, out.comm),
+                rank);
         MPICHK2(MPI_Exscan(&out.num_elements, &out.offset, 1, MPI_OFFSET,
-                          MPI_SUM, out.comm), rank);
+                           MPI_SUM, out.comm),
+                rank);
         out.offset *= sizeof(prec);
         out.total_num_bytes *= sizeof(prec);
         out.rank = rank;
@@ -24,14 +26,14 @@ void mpi_io_write(mpi_io_t *m, prec *data, const char *filename)
         MPI_File fh;
         MPI_Status filestatus;
         MPICHK2(MPI_File_open(m->comm, filename,
-                             MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL,
-                             &fh),
-               m->rank);
+                              MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL,
+                              &fh),
+                m->rank);
         MPICHK2(MPI_File_write_at_all(fh,
-                                     m->offset,
-                                     data, m->num_elements, MPI_PREC,
-                                     &filestatus),
-               m->rank);
+                                      m->offset,
+                                      data, m->num_elements, MPI_PREC,
+                                      &filestatus),
+                m->rank);
         m->offset += m->total_num_bytes;
         MPICHK2(MPI_File_close(&fh), m->rank);
 }
@@ -41,36 +43,42 @@ void mpi_io_read(mpi_io_t *m, prec *data, const char *filename)
         MPI_File fh;
         MPI_Status filestatus;
         MPICHK2(MPI_File_open(m->comm, filename, MPI_MODE_RDONLY, MPI_INFO_NULL,
-                             &fh),
-               m->rank);
+                              &fh),
+                m->rank);
         MPICHK2(MPI_File_read_at_all(fh,
                                      m->offset,
                                      data, m->num_elements, MPI_PREC,
                                      &filestatus),
-               m->rank);
+                m->rank);
         m->offset += m->total_num_bytes;
         MPICHK2(MPI_File_close(&fh), m->rank);
 }
 
 mpi_io_idx_t mpi_io_idx_init(MPI_Comm comm, int rank, int *indices,
-                int *blocklen, size_t num_blocks, size_t num_writes)
+                             int *blocklen, size_t num_blocks, size_t num_writes)
 {
         mpi_io_idx_t out = {.comm = comm, .rank = rank};
-        int *offsets = malloc(sizeof(offsets) * num_blocks);
+        /* Byte displacements for MPI_Type_create_hindexed, hence MPI_Aint. */
+        MPI_Aint *offsets = malloc(sizeof(*offsets) * num_blocks);
         out.num_bytes = 0;
         out.num_elements = 0;
         out.offset = 0;
-        for (size_t i = 0; i < num_blocks; ++i) {
-                offsets[i] = indices[i] * num_writes;
+        int size;
+        MPICHK2(MPI_Type_size(MPI_PREC, &size), rank);
+        for (size_t i = 0; i < num_blocks; ++i)
+        {
+                offsets[i] = indices[i] * num_writes * size;
                 out.num_elements += blocklen[i];
         }
         out.num_writes = num_writes;
         out.current_write = 0;
         out.num_bytes = blocklen[0] * sizeof(prec);
-
-        MPICHK2(MPI_Type_indexed(num_blocks, blocklen, offsets, MPI_PREC,
-                                &out.dtype),
-               rank);
+        /* hindexed takes displacements in bytes, not in multiples of MPI_PREC. */
+        MPICHK2(MPI_Type_create_hindexed(num_blocks,
+                             blocklen,
+                             offsets,
+                             MPI_PREC,
+                             &out.dtype), rank);
         MPI_Type_commit(&out.dtype);
         free(offsets);
         return out;
@@ -81,18 +89,19 @@ void mpi_io_idx_write(mpi_io_idx_t *m, prec *data, const char *filename)
         MPI_File fh;
         MPI_Status filestatus;
         MPICHK2(MPI_File_open(m->comm, filename,
-                             MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL,
-                             &fh),
-               m->rank);
+                              MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL,
+                              &fh),
+                m->rank);
         MPICHK2(MPI_File_set_view(fh, m->offset, MPI_PREC, m->dtype, "native",
-                                 MPI_INFO_NULL),
-               m->rank);
+                                  MPI_INFO_NULL),
+                m->rank);
         MPICHK2(MPI_File_write_all(fh, data, m->num_elements, MPI_PREC,
-                                  &filestatus),
-               m->rank);
+                                   &filestatus),
+                m->rank);
         m->offset += m->num_bytes;
         m->current_write++;
-        if (m->current_write == m->num_writes) {
+        if (m->current_write == m->num_writes)
+        {
                 m->current_write = 0;
                 m->offset = 0;
         }
@@ -104,16 +113,17 @@ void mpi_io_idx_read(mpi_io_idx_t *m, prec *data, const char *filename)
         MPI_File fh;
         MPI_Status filestatus;
         MPICHK2(MPI_File_open(m->comm, filename, MPI_MODE_RDONLY, MPI_INFO_NULL,
-                             &fh),
-               m->rank);
+                              &fh),
+                m->rank);
         MPICHK2(MPI_File_set_view(fh, m->offset, MPI_PREC, m->dtype, "native",
-                                 MPI_INFO_NULL),
-               m->rank);
+                                  MPI_INFO_NULL),
+                m->rank);
         MPICHK2(MPI_File_read_all(fh, data, m->num_elements, MPI_PREC,
                                   &filestatus),
-               m->rank);
+                m->rank);
         m->offset += m->num_bytes;
-        if (m->current_write == m->num_writes) {
+        if (m->current_write == m->num_writes)
+        {
                 m->current_write = 0;
                 m->offset = 0;
         }
@@ -124,4 +134,3 @@ void mpi_io_idx_finalize(mpi_io_idx_t *m)
 {
         MPICHK2(MPI_Type_free(&m->dtype), m->rank);
 }
-
diff --git a/src/readers/CMakeLists.txt b/src/readers/CMakeLists.txt
index b7fa82d..bacc39d 100644
--- a/src/readers/CMakeLists.txt
+++ b/src/readers/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h
-    ${AWP_MINI_SOURCE_DIR}/include/readers/input.h
-    ${AWP_MINI_SOURCE_DIR}/include/readers/version.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h
+    ${AWP_SOURCE_DIR}/include/readers/input.h
+    ${AWP_SOURCE_DIR}/include/readers/version.h
     )
 
 add_library(readers
@@ -10,7 +10,7 @@ add_library(readers
 
 target_include_directories(readers
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 target_link_libraries(readers 
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index 8e3ad2a..b0f298c 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/test/grid_check.h
-    ${AWP_MINI_SOURCE_DIR}/include/test/test.h
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h
+    ${AWP_SOURCE_DIR}/include/test/grid_check.h
+    ${AWP_SOURCE_DIR}/include/test/test.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h
     )
 
 add_library(testing
@@ -17,7 +17,7 @@ target_link_libraries(testing
 
 target_include_directories(testing
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 
diff --git a/src/test/check.c b/src/test/check.c
index 4047c6b..d8f22c3 100644
--- a/src/test/check.c
+++ b/src/test/check.c
@@ -29,7 +29,7 @@ int chk_infl(const size_t *a, const size_t *b, const size_t n)
 {
         int err = 0;
         for (size_t i = 0; i < n; ++i) {
-                int diff = abs(a[i] - b[i]);
+                int diff = abs((int)a[i] - (int)b[i]);
                 err = diff > err ? diff : err;
         }
         return err;
diff --git a/src/topography/CMakeLists.txt b/src/topography/CMakeLists.txt
index 78a9150..83111fe 100644
--- a/src/topography/CMakeLists.txt
+++ b/src/topography/CMakeLists.txt
@@ -5,32 +5,24 @@ add_subdirectory(readers)
 add_subdirectory(geometry)
 add_subdirectory(sources)
 add_subdirectory(receivers)
+add_subdirectory(functions)
 
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/topography.h     
-    ${AWP_MINI_SOURCE_DIR}/include/grid/grid_3d.h
-    ${AWP_MINI_SOURCE_DIR}/include/grid/shift.h
-    ${AWP_MINI_SOURCE_DIR}/include/topography/geometry.h
-    ${AWP_MINI_SOURCE_DIR}/include/topography/geometry/geometry.h
-    ${AWP_MINI_SOURCE_DIR}/include/topography/readers/serial_reader.h
-    ${AWP_MINI_SOURCE_DIR}/include/test/test.h     
-    ${AWP_MINI_SOURCE_DIR}/include/vtk/vtk.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/metrics/metrics.h
-    ${AWP_MINI_SOURCE_DIR}/include/topography/host.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h     
+    ${AWP_SOURCE_DIR}/include/topography/topography.h     
+    ${AWP_SOURCE_DIR}/include/grid/grid_3d.h
+    ${AWP_SOURCE_DIR}/include/grid/shift.h
+    ${AWP_SOURCE_DIR}/include/topography/geometry.h
+    ${AWP_SOURCE_DIR}/include/topography/geometry/geometry.h
+    ${AWP_SOURCE_DIR}/include/topography/mms.cuh
+    ${AWP_SOURCE_DIR}/include/topography/readers/serial_reader.h
+    ${AWP_SOURCE_DIR}/include/test/test.h     
+    ${AWP_SOURCE_DIR}/include/vtk/vtk.h     
+    ${AWP_SOURCE_DIR}/include/topography/metrics/metrics.h
+    ${AWP_SOURCE_DIR}/include/topography/host.h
     )
 
-set(UNOPT_HEADERS
-    ${HEADERS}
-    ${AWP_MINI_SOURCE_DIR}/include/topography/topography.cuh     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/unoptimized.cuh     
-    )
-
-set(OPT_HEADERS
-    ${HEADERS}
-    ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/optimized_velocity.cuh     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/optimized_stress.cuh     
-    )
+add_library(mapping mapping.c)
 
 set(LIBRARIES
     ${MPI_C_LIBRARIES} 
@@ -43,57 +35,31 @@ set(LIBRARIES
     topography_receivers
     readers
     error
+    nvToolsExt
+    nvToolsExt
+    functions
     )
 
-set(UNOPT_LIBRARIES ${LIBRARIES} unoptimized_kernels)
-set(OPT_LIBRARIES ${LIBRARIES} optimized_kernels nvToolsExt)
-set(OPT_ATTENUATION_LIBRARIES ${LIBRARIES} optimized_attenuation_kernels nvToolsExt)
+set(TOPOGRAPHY_LIBRARIES
+    topography.c velocity.cu stress.cu
+    geometry.c host.c grids.c mms.cu energy.cu
+    )
 
+add_library(topography ${TOPOGRAPHY_LIBRARIES})
 
-# Unoptimized version
-add_library(topography
-        topography.c topography.cu geometry.c host.c grids.c
-        ${UNOPT_HEADERS}
-        )
+add_library(topography_no_bc ${TOPOGRAPHY_LIBRARIES})
 
-target_include_directories(topography
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-       )
+target_link_libraries(topography ${LIBRARIES})
 
-target_link_libraries(topography 
-        ${UNOPT_LIBRARIES}
-        )
+target_compile_definitions(topography_no_bc PUBLIC -DAPPLY_BC=0)
+target_link_libraries(topography_no_bc ${LIBRARIES})
 
-# Optimized version
-add_library(opt_topography
-        topography.c opt_topography.cu velocity.cu stress.cu geometry.c host.c
-        grids.c
-        ${OPT_HEADERS}
-        )
-
-target_include_directories(opt_topography
+target_include_directories(topography
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
        )
 
-target_link_libraries(opt_topography 
-        ${OPT_LIBRARIES} 
-        )
-
-
-# Work in progress that takes attenuation into account
-add_library(opt_topography_attenuation
-        topography.c opt_topography.cu velocity.cu stress_attenuation.cu
-        geometry.c host.c grids.c
-        ${OPT_HEADERS}
-        )
-
-target_include_directories(opt_topography_attenuation
+target_include_directories(mapping
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
        )
-
-target_link_libraries(opt_topography_attenuation
-        ${OPT_ATTENUATION_LIBRARIES} 
-        )
diff --git a/src/topography/energy.cu b/src/topography/energy.cu
new file mode 100644
index 0000000..35c038c
--- /dev/null
+++ b/src/topography/energy.cu
@@ -0,0 +1,181 @@
+#include <topography/energy.cuh>
+
+// Adds each block's share of the kinetic and strain energy rates to *kinetic_rate
+// and *strain_rate. The kernel only accumulates, so both must be zero at launch.
+__global__ void energy_kernel(
+    double *kinetic_rate, double *strain_rate, const float *vxp,
+    const float *vyp, const float *vzp, const float *xxp, const float *yyp,
+    const float *zzp, const float *xyp, const float *xzp, const float *yzp,
+    const float *vx, const float *vy, const float *vz, const float *xx,
+    const float *yy, const float *zz, const float *xy, const float *xz,
+    const float *yz, const float *f, const float *f_1, const float *f_2,
+    const float *f_c, const float *g3, const float *g3_c,
+    const float *rho, const float *mui, const float *lami,
+    const int nx, const int ny, const int nz) {
+    int idz = threadIdx.x + blockDim.x * blockIdx.x;
+    int idy = threadIdx.y + blockDim.y * blockIdx.y;
+
+    // Threads outside the grid must still reach the full-mask warp shuffles and
+    // __syncthreads() below, so they are redirected to cell (0, 0) and their sums
+    // are discarded after the loop. The range test is recorded BEFORE clamping.
+    const bool in_range = idz < nz && idy < ny;
+    if (!in_range) {
+        idz = 0;
+        idy = 0;
+    }
+
+    int my = ny + 4 + 2 * ngsl;
+    int mz = nz + 2 * align;
+
+    int line = mz;
+    int slice = my * mz;
+    int offset = idz + align + line * (2 + ngsl + idy) + slice * (2 + ngsl);
+
+    int fline = 2 * align + 4 + ny + 2 * ngsl;
+    int f_offset = 2 + ngsl + idy + align + fline * (2 + ngsl);
+    int g_offset = align + idz;
+
+    double kinetic_E = 0.0;
+    double strain_E = 0.0;
+
+    float Hz_hat = 1.0f;
+    float Hz = 1.0f;
+
+    const float hhzr[5] = {0.3445563972099920, 0.4372900885645984,
+                           1.3056954965124901, 0.9124580177129197,
+                           1.0000000000000000};
+    const float hzr[5] = {0.0000000000000000, 0.2812150147607664,
+                          1.4480216223843674, 0.6769783776156325,
+                          1.0937849852392336};
+
+    int k = nz - idz - 1;
+    if (k < 5 && k >= 0) {
+        Hz_hat = hhzr[k];
+        Hz = hzr[k];
+    }
+
+    int pos = offset;
+    int fpos = f_offset;
+    int gpos = g_offset;
+
+    for (int i = 0; i < nx; ++i) {
+        float Jx = f_1[fpos] * g3_c[gpos];
+        float Jy = f_2[fpos] * g3_c[gpos];
+        float Jz = f_c[fpos] * g3[gpos];
+        float Jxx = f_c[fpos] * g3_c[gpos];
+        float Jxy = f[fpos] * g3_c[gpos];
+        float Jxz = f_1[fpos] * g3[gpos];
+        float Jyz = f_2[fpos] * g3[gpos];
+
+        float rhox = 0.25f * (rho[pos - 1] + rho[pos - line - 1] + rho[pos] + rho[pos - line]);
+        float rhoy = 0.25f * (rho[pos - 1] + rho[pos + slice - 1] + rho[pos] + rho[pos + slice]);
+        float rhoz = 0.25f * (rho[pos] + rho[pos + slice] + rho[pos - line] + rho[pos + slice - line]);
+
+        float muixy = 0.5f * (mui[pos] + mui[pos-1]);
+        float muixz = 0.5f * (mui[pos] + mui[pos-line]);
+        float muiyz = 0.5f * (mui[pos] + mui[pos+slice]);
+        float lamixx = 
+            (lami[pos - 1] + lami[pos - 1 + slice] + lami[pos - 1 + slice - line] +
+             lami[pos - line - 1] + lami[pos] + lami[pos + slice] +
+             lami[pos + slice - line] + lami[pos - line]) / 8.f;
+        float lam = 1.0f / lamixx;
+        float muixx =
+            (mui[pos - 1] + mui[pos - 1 + slice] + mui[pos - 1 + slice - line] +
+             mui[pos - line - 1] + mui[pos] + mui[pos + slice] +
+             mui[pos + slice - line] + mui[pos - line]) / 8.f;
+        float mu = 1.0f / muixx;
+        float lam_mu = 0.5f * lam / (mu * (3.0f * lam + 2.0f * mu));
+        float trace =
+            (xx[pos] - xxp[pos]) + (yy[pos] - yyp[pos]) + (zz[pos] - zzp[pos]);
+
+        double exx = 0.5f * muixx * (xx[pos] - xxp[pos]) - lam_mu * trace;
+        double eyy = 0.5f * muixx * (yy[pos] - yyp[pos]) - lam_mu * trace;
+        double ezz = 0.5f * muixx * (zz[pos] - zzp[pos]) - lam_mu * trace;
+        double exy = 0.5f * muixy * (xy[pos] - xyp[pos]);
+        double exz = 0.5f * muixz * (xz[pos] - xzp[pos]);
+        double eyz = 0.5f * muiyz * (yz[pos] - yzp[pos]);
+
+        kinetic_E += 0.5f * Jx * Hz_hat * vx[pos] * rhox * (vx[pos] - vxp[pos]) +
+                     0.5f * Jy * Hz_hat * vy[pos] * rhoy * (vy[pos] - vyp[pos]) +
+                     0.5f * Jz * Hz * vz[pos] * rhoz * (vz[pos] - vzp[pos]);
+        strain_E +=
+            0.5f * xxp[pos] * Jxx * Hz_hat * exx +
+            0.5f * yyp[pos] * Jxx * Hz_hat * eyy +
+            0.5f * zzp[pos] * Jxx * Hz_hat * ezz +
+            1.0f * xyp[pos] * Jxy * Hz_hat * exy +
+            1.0f * xzp[pos] * Jxz * Hz * exz + 
+            1.0f * yzp[pos] * Jyz * Hz * eyz;
+        pos += slice;
+        fpos += fline;
+    }
+
+    if (!in_range) {
+        kinetic_E = 0;
+        strain_E = 0;
+    }
+
+    __shared__ double spartial_kinetic[1024];
+    __shared__ double spartial_strain[1024];
+    // Warp reduction; relies on blockDim.x == 32 (as launched by energy_rate) so lane 0 is threadIdx.x == 0.
+    double kin = kinetic_E;
+    double str = strain_E;
+    for (int i = 16; i > 0; i /= 2) {
+        kin += __shfl_down_sync(0xffffffff, kin, i);
+        str += __shfl_down_sync(0xffffffff, str, i);
+    }
+
+    if (threadIdx.x == 0) {
+        spartial_kinetic[threadIdx.y] = kin;
+        spartial_strain[threadIdx.y] = str;
+    }
+    __syncthreads();
+
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+        double block_strain_rate = 0.0;
+        double block_kinetic_rate = 0.0;
+        for (int i = 0; i < blockDim.y; ++i) {
+            block_kinetic_rate += spartial_kinetic[i];
+            block_strain_rate += spartial_strain[i];
+        }
+
+        atomicAdd(strain_rate, block_strain_rate);
+        atomicAdd(kinetic_rate, block_kinetic_rate);
+    }
+}
+
+// Runs energy_kernel for time step `step`, sums the results onto rank 0 of e->comm and
+// stores them in slot e->index. No-op unless e->use is set and step % e->stride == 0.
+void energy_rate(energy_t *e, const int step, const float *d_vx, const float *d_vy,
+                 const float *d_vz, const float *d_xx, const float *d_yy,
+                 const float *d_zz, const float *d_xy, const float *d_xz,
+                 const float *d_yz, const float *d_rho, const float *d_mui,
+                 const float *d_lami,
+                 const f_grid_t *metrics_f,
+                 const g_grid_t *metrics_g,
+                 const int nx, const int ny, const int nz)
+{
+    if (!e->use || e->index >= e->num_steps || step % e->stride != 0) return;
+    double out_kinetic[1] = {0.0};
+    double out_strain[1] = {0.0};
+    CUCHK(cudaMemset(e->kinetic_rate, 0, sizeof(double)));
+    CUCHK(cudaMemset(e->strain_rate, 0, sizeof(double)));
+
+    // NOTE(review): z uses (nz - 4) where y uses ceil-division (ny - 1) — confirm all nz cells are covered.
+    dim3 threads (32, 4, 1);
+    dim3 blocks ( (nz - 4) / threads.x  + 1 , (ny - 1) / threads.y + 1, 1);
+    energy_kernel<<<blocks, threads>>>(e->kinetic_rate, e->strain_rate, e->d_vxp, e->d_vyp, e->d_vzp, e->d_xxp, e->d_yyp, e->d_zzp, e->d_xyp, e->d_xzp, e->d_yzp, d_vx, d_vy, d_vz, d_xx, d_yy, d_zz, d_xy, d_xz, d_yz, metrics_f->d_f, metrics_f->d_f_1, metrics_f->d_f_2, metrics_f->d_f_c, metrics_g->d_g3, metrics_g->d_g3_c, d_rho, d_mui, d_lami, nx, ny, nz);
+    CUCHK(cudaGetLastError());
+    CUCHK(cudaMemcpy(out_kinetic, e->kinetic_rate, sizeof(double), cudaMemcpyDeviceToHost));
+    CUCHK(cudaMemcpy(out_strain, e->strain_rate, sizeof(double), cudaMemcpyDeviceToHost));
+    double sum_kinetic, sum_strain;
+    MPICHK(MPI_Reduce(out_kinetic, &sum_kinetic, 1, MPI_DOUBLE, MPI_SUM, 0, e->comm));
+    MPICHK(MPI_Reduce(out_strain, &sum_strain, 1, MPI_DOUBLE, MPI_SUM, 0, e->comm));
+    // Every rank advances the slot so the guard above trips on all ranks at the same call;
+    // otherwise rank 0 alone would skip the MPI_Reduce collectives the other ranks enter.
+    if (e->rank != 0) { e->index++; return; }
+    e->kinetic_energy_rate[e->index] = sum_kinetic;
+    e->strain_energy_rate[e->index] = sum_strain;
+    e->time[e->index] = (double)step * e->dt;
+    printf("step = %d t = %g, kinetic energy rate = %g strain energy rate = %g , sum = %g \n", step, e->time[e->index], sum_kinetic, sum_strain, sum_kinetic + sum_strain);
+    e->index++;
+}
diff --git a/src/topography/functions/CMakeLists.txt b/src/topography/functions/CMakeLists.txt
new file mode 100644
index 0000000..0ff2744
--- /dev/null
+++ b/src/topography/functions/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Builds the `functions` library from functions.c, random.c and norm.c.
+set(HEADERS
+    ${AWP_SOURCE_DIR}/include/functions/functions.h     
+    ${AWP_SOURCE_DIR}/include/functions/random.h     
+    ${AWP_SOURCE_DIR}/include/functions/norm.h     
+    ${AWP_SOURCE_DIR}/include/grid/grid_3d.h
+    )
+add_library(functions functions.c random.c norm.c ${HEADERS})
+# PUBLIC: targets that link `functions` also get the project include/ directory.
+target_include_directories(functions
+        PUBLIC
+        ${AWP_SOURCE_DIR}/include/
+       )
+target_link_libraries(functions grid m)
+
+
diff --git a/src/functions/functions.c b/src/topography/functions/functions.c
similarity index 98%
rename from src/functions/functions.c
rename to src/topography/functions/functions.c
index 34b2db0..7b961c6 100644
--- a/src/functions/functions.c
+++ b/src/topography/functions/functions.c
@@ -201,11 +201,12 @@ void fcn_poly(_prec *out,
         const int   ny = (int)args[10];
         const int   rx = (int)args[11];
         const int   ry = (int)args[12];
+        const _prec xshift = s0 == 1 ? 1.0 : 0.0;
         for (int i = i0; i < in; ++i) {
         for (int j = j0; j < jn; ++j) {
         for (int k = k0; k < kn; ++k) {
                 int pos = k + j*line + i*slice; 
-                out[pos] =  a0*pow(i + rx*nx - 0.5*s0, p0)
+                out[pos] =  a0*pow(i + rx*nx - 0.5*s0 + xshift, p0)
                           + a1*pow(j + ry*ny - 0.5*s1, p1)
                           + a2*pow(k         - 0.5*s2, p2);
         }
diff --git a/src/functions/norm.c b/src/topography/functions/norm.c
similarity index 100%
rename from src/functions/norm.c
rename to src/topography/functions/norm.c
diff --git a/src/functions/random.c b/src/topography/functions/random.c
similarity index 86%
rename from src/functions/random.c
rename to src/topography/functions/random.c
index 37370be..804cc56 100644
--- a/src/functions/random.c
+++ b/src/topography/functions/random.c
@@ -1,6 +1,6 @@
 #include <stdlib.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <functions/random.h>
 
 _prec randomf(void){
diff --git a/src/topography/geometry.c b/src/topography/geometry.c
index 85fe370..01b030a 100644
--- a/src/topography/geometry.c
+++ b/src/topography/geometry.c
@@ -5,16 +5,40 @@
 
 #include <awp/definitions.h>
 #include <vtk/vtk.h>
-#include <utils/copy.h>
 #include <grid/shift.h>
 #include <topography/topography.h>
 #include <topography/geometry.h>
 #include <topography/geometry/geometry.h>
 
+//#define TOPO_USE_VTK 1
+/* Copy `input` to `output` byte by byte; returns 0 on success, -1 on any error. */
+int copyfile(const char *output, const char *input);
+int copyfile(const char *output, const char *input)
+{
+        FILE *fin = fopen(input, "r");
+        if (fin == NULL) {
+                fprintf(stderr, "Cannot open file %s. \n", input);
+                return -1;
+        }
+        /* Open (and truncate) the destination only once the source is readable. */
+        FILE *fout = fopen(output, "w");
+        if (fout == NULL) {
+                fprintf(stderr, "Cannot write to file %s. \n", output);
+                fclose(fin);
+                return -1;
+        }
+        int status = 0;
+        int ch; /* int, not char, so EOF is distinguishable from byte 0xFF */
+        while (status == 0 && (ch = fgetc(fin)) != EOF)
+                status = fputc(ch, fout) == EOF ? -1 : 0;
+        fclose(fin);
+        if (fclose(fout) != 0) status = -1;
+        return status;
+}
+
 void topo_init_grid(topo_t *T)
 {
         if (!T->use) return;
-        //FIXME: Handle proper grid initialization
         geom_cartesian_topography(&T->metrics_f);
         geom_no_grid_stretching(&T->metrics_g);
 
@@ -26,9 +50,9 @@ void topo_init_grid(topo_t *T)
         T->y1 = malloc(sizeof(T->y1) * y1_grid.size);
         T->z1 = malloc(sizeof(T->z1) * z1_grid.size);
 
-        grid_fill1(T->x1, x1_grid);
-        grid_fill1(T->y1, y1_grid);
-        grid_fill1(T->z1, z1_grid);
+        grid_fill1(T->x1, x1_grid, 1);
+        grid_fill1(T->y1, y1_grid, 0);
+        grid_fill1(T->z1, z1_grid, 0);
 }
 
 void topo_init_gaussian_hill_and_canyon_xz(topo_t *T, const _prec3_t hill_width,
@@ -73,17 +97,17 @@ void topo_write_geometry_vtk(topo_t *T, const int mode)
         _prec *y = malloc(T->topography_grid.num_bytes);
         _prec *z = malloc(T->topography_grid.num_bytes);
 
-        grid_fill3_x(x, T->x1, T->stress_grid);
-        grid_fill3_y(y, T->y1, T->stress_grid);
-        grid_fill3_z(z, T->z1, T->stress_grid);
+        grid_fill3_x(x, T->x1, T->velocity_grid);
+        grid_fill3_y(y, T->y1, T->velocity_grid);
+        grid_fill3_z(z, T->z1, T->velocity_grid);
 
-        geom_mapping_z(z, T->stress_grid, grid_node(), &T->metrics_f,
+        geom_mapping_z(z, T->velocity_grid, grid_node(), &T->metrics_f,
                        &T->metrics_g);
 
         char vtk_file[256];
 
         mkdir("vtk", 0700);
-        sprintf(vtk_file, "vtk/geometry_%d%d.vtk", T->coord[0], T->coord[0]);
+        sprintf(vtk_file, "vtk/geometry_%d%d.vtk", T->coord[0], T->coord[1]);
         switch (mode) {
                 case 0:
                 vtk_write_grid(vtk_file, x, y, z, T->velocity_grid);
@@ -109,10 +133,10 @@ void topo_write_vtk(topo_t *T, const int step, int mode)
         char vtk_vz[256];
         char geom_file[256];
         mkdir("vtk", 0700);
-        sprintf(vtk_vx, "vtk/vx_%d%d_%04d.vtk", T->coord[0], T->coord[0], step);
-        sprintf(vtk_vy, "vtk/vy_%d%d_%04d.vtk", T->coord[0], T->coord[0], step);
-        sprintf(vtk_vz, "vtk/vz_%d%d_%04d.vtk", T->coord[0], T->coord[0], step);
-        sprintf(geom_file, "vtk/geometry_%d%d.vtk", T->coord[0], T->coord[0]);
+        sprintf(vtk_vx, "vtk/vx_%d%d_%04d.vtk", T->coord[0], T->coord[1], step);
+        sprintf(vtk_vy, "vtk/vy_%d%d_%04d.vtk", T->coord[0], T->coord[1], step);
+        sprintf(vtk_vz, "vtk/vz_%d%d_%04d.vtk", T->coord[0], T->coord[1], step);
+        sprintf(geom_file, "vtk/geometry_%d%d.vtk", T->coord[0], T->coord[1]);
         copyfile(vtk_vx, geom_file);
         copyfile(vtk_vy, geom_file);
         copyfile(vtk_vz, geom_file);
@@ -139,5 +163,7 @@ void topo_write_vtk(topo_t *T, const int step, int mode)
         free(vx);
         free(vy);
         free(vz);
+
+        printf("Wrote: %s \n", vtk_vx);
 }
 
diff --git a/src/topography/geometry/CMakeLists.txt b/src/topography/geometry/CMakeLists.txt
index 5f9b901..621cd93 100644
--- a/src/topography/geometry/CMakeLists.txt
+++ b/src/topography/geometry/CMakeLists.txt
@@ -1,14 +1,14 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/geometry/geometry.h
-    ${AWP_MINI_SOURCE_DIR}/include/topography/metrics/metrics.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h     
+    ${AWP_SOURCE_DIR}/include/topography/geometry/geometry.h
+    ${AWP_SOURCE_DIR}/include/topography/metrics/metrics.h
     )
 
 add_library(geometry geometry.c)
 
 target_include_directories(geometry
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
        )
-target_link_libraries(geometry grid functions)
+target_link_libraries(geometry PUBLIC grid functions mapping)  # geometry.c includes topography/mapping.h
 
diff --git a/src/topography/geometry/geometry.c b/src/topography/geometry/geometry.c
index f51d08c..9b2cab1 100644
--- a/src/topography/geometry/geometry.c
+++ b/src/topography/geometry/geometry.c
@@ -6,6 +6,7 @@
 #include <grid/shift.h>
 #include <topography/geometry/geometry.h>
 #include <topography/metrics/metrics.h>
+#include <topography/mapping.h>
 #include <grid/grid_3d.h>
 #include <functions/functions.h>
 #include <test/test.h>
@@ -37,8 +38,36 @@ void geom_no_grid_stretching(g_grid_t *metrics_g)
         grid1.shift = grid_node().z;
         grid1.boundary1 = 0;
         grid1.boundary2 = 1;
-        grid_fill1(&metrics_g->g[grid1.alignment], grid1);
+        grid_fill1(&metrics_g->g[grid1.alignment], grid1, 0);
+        // Shift grid vector so that the internal coordinate system places z = 0 at the first grid
+        // point immediately above the DM overlap zone
+        for (int i = 0; i < grid1.size; ++i) {
+                metrics_g->g[i + grid1.alignment] -= MAPPING_START_POINT * grid1.gridspacing;
+        }
+}
 
+// Fill metrics_g->g with a stretched vertical grid; r below spans [0, 1] and scales by block_height.
+void geom_grid_stretching(g_grid_t *metrics_g, const struct mapping *map, const _prec block_height)
+{
+        fcn_grid_t grid = metrics_grid_g(metrics_g);
+        grid1_t grid1 = grid_grid1_z(grid);
+        grid1.shift = grid_node().z;
+        grid1.boundary1 = 0;
+        grid1.boundary2 = 1;
+        grid_fill1(&metrics_g->g[grid1.alignment], grid1, 0);
+        // Below MAPPING_START_POINT: shift so that z = 0 lies just above the DM overlap zone
+        for (int i = 0; i < MAPPING_START_POINT; ++i) {
+                metrics_g->g[i + grid1.alignment] -= MAPPING_START_POINT * grid1.gridspacing;
+        }
+        // From MAPPING_START_POINT up: evaluate the mapping at r = 0, h, ..., 1 (h is loop-invariant)
+        const double h = 1.0 / (grid1.size - 2 - MAPPING_START_POINT);
+        for (int i = MAPPING_START_POINT; i < grid1.size - 1; ++i) {
+                double r = (i - MAPPING_START_POINT) * h;
+                metrics_g->g[i + grid1.alignment] = block_height * map_eval(r, map);
+        }
+        // assign ghost point
+        metrics_g->g[grid1.size - 1 + grid1.alignment] =
+            metrics_g->g[grid1.size - 2 + grid1.alignment];
 }
 
 void geom_gaussian(f_grid_t *metrics_f, const _prec *x, const _prec *y,
@@ -56,7 +85,7 @@ void geom_gaussian(f_grid_t *metrics_f, const _prec *x, const _prec *y,
         prec xm = x[last];
         prec ym = y[last];
 
-        // Grid spacing in vertical direction for a grid satsifying:
+        // Grid spacing in vertical direction for a grid satisfying:
         // 0 <= z' < =1
         // This normalization constant is used so that the user can specify
         // block dimension using physical units.
@@ -199,7 +228,7 @@ void geom_custom(const f_grid_t *metrics_f, const grid3_t grid, const int px,
         // 0 <= z' < =1
         // This normalization constant is used so that the user can specify
         // block dimension using physical units.
-        _prec normalize = 1.0 / grid.gridspacing / (grid.size.z - 2);
+        _prec normalize = 1.0 / grid.gridspacing / (grid.size.z - 2 - MAPPING_START_POINT);
 
         for (int i = 0; i < len_x; ++i) {
         for (int j = 0; j < len_y; ++j) {
@@ -209,24 +238,6 @@ void geom_custom(const f_grid_t *metrics_f, const grid3_t grid, const int px,
         }
 }
 
-void geom_ramp(_prec *out, const fcn_grid_t grid, const f_grid_t *metrics_f,
-                   const _prec *x, const _prec *y, const _prec3_t ramp) {
-        int len_x = metrics_f->bounds_x[1] - metrics_f->bounds_x[0];
-        int len_y = metrics_f->bounds_y[1] - metrics_f->bounds_y[0];
-
-        int off_x = metrics_f->offset[0] + metrics_f->bounds_x[0];
-        int off_y = metrics_f->offset[1] + metrics_f->bounds_y[0];
-        for (int i = 0; i < len_x; ++i) {
-        for (int j = 0; j < len_y; ++j) {
-                int f_pos = (off_y + j) + (off_x + i) * metrics_f->slice;
-                int pos = grid.offset1.z + (grid.offset1.y + j) * grid.line +
-                          (grid.offset1.x + i) * grid.slice;
-                metrics_f->f[f_pos] = ramp.x * x[pos] + ramp.y * y[pos];
-        }
-        }
-
-}
-
 void geom_mapping_z(_prec *out, const fcn_grid_t grid, const int3_t shift,
                     const f_grid_t *metrics_f,
                     const g_grid_t *metrics_g) {
diff --git a/src/topography/grids.c b/src/topography/grids.c
index 3d21776..5bbb7db 100644
--- a/src/topography/grids.c
+++ b/src/topography/grids.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <topography/grids.h>
 #include <test/test.h>
 #include <grid/grid_3d.h>
@@ -15,7 +15,6 @@ grids_t grids_init(const int nx, const int ny, const int nz, const int coord_x,
         int3_t size = {.x = nx, .y = ny, .z = nz};
         int3_t coord = {.x = coord_x, .y = coord_y, .z = 0};
 
-        //FIXME: Adjust depending on grid type: DM, topography, free surface.
         int3_t bnd1 = {0, 0, 0};
         int3_t bnd2 = {0, 0, topography};
 
@@ -24,20 +23,20 @@ grids_t grids_init(const int nx, const int ny, const int nz, const int coord_x,
         prec h = gridspacing;
 
         // velocity grids
-        grids.x = grid_init(size, grid_x(), coord, bnd1, bnd2, 0, h);
-        grids.y = grid_init(size, grid_y(), coord, bnd1, bnd2, 0, h);
-        grids.z = grid_init(size, grid_z(), coord, bnd1, bnd2, 0, h);
+        grids.x = grid_init(size, grid_x(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.y = grid_init(size, grid_y(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.z = grid_init(size, grid_z(), coord, bnd1, bnd2, 2 + ngsl, h);
 
         // stress grids
-        grids.xx = grid_init(size, grid_xx(), coord, bnd1, bnd2, ngsl / 2, h);
-        grids.yy = grid_init(size, grid_yy(), coord, bnd1, bnd2, ngsl / 2, h);
-        grids.zz = grid_init(size, grid_zz(), coord, bnd1, bnd2, ngsl / 2, h);
-        grids.xy = grid_init(size, grid_xy(), coord, bnd1, bnd2, ngsl / 2, h);
-        grids.xz = grid_init(size, grid_xz(), coord, bnd1, bnd2, ngsl / 2, h);
-        grids.yz = grid_init(size, grid_yz(), coord, bnd1, bnd2, ngsl / 2, h);
+        grids.xx = grid_init(size, grid_xx(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.yy = grid_init(size, grid_yy(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.zz = grid_init(size, grid_zz(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.xy = grid_init(size, grid_xy(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.xz = grid_init(size, grid_xz(), coord, bnd1, bnd2, 2 + ngsl, h);
+        grids.yz = grid_init(size, grid_yz(), coord, bnd1, bnd2, 2 + ngsl, h);
 
         // Material and topography grid
-        grids.node = grid_init(size, grid_node(), coord, bnd1, bnd2, 0, h);
+        grids.node = grid_init(size, grid_node(), coord, bnd1, bnd2, 2 + ngsl, h);
 
         return grids;
 }
@@ -46,7 +45,7 @@ void grids_finalize(grids_t *grids)
 {
 }
 
-void grid_data_init(grid_data_t *grid_data, const grid3_t grid)
+void grid_data_init(grid_data_t *grid_data, const grid3_t grid, const int block_number)
 {
         grid1_t xgrid = grid_grid1_x(grid);
         grid1_t ygrid = grid_grid1_y(grid);
@@ -54,9 +53,9 @@ void grid_data_init(grid_data_t *grid_data, const grid3_t grid)
         grid_data->x = malloc(sizeof grid_data->x * xgrid.size);
         grid_data->y = malloc(sizeof grid_data->y * ygrid.size);
         grid_data->z = malloc(sizeof grid_data->z * zgrid.size);
-        grid_fill1(grid_data->x, xgrid);
-        grid_fill1(grid_data->y, ygrid);
-        grid_fill1(grid_data->z, zgrid);
+        grid_fill1(grid_data->x, xgrid, 1);
+        grid_fill_y_dm(grid_data->y, ygrid, block_number);
+        grid_fill1(grid_data->z, zgrid, 0);
 }
 
 void grid_data_free(grid_data_t *grid_data)
@@ -78,6 +77,15 @@ grid3_t grids_select(const enum grid_types grid_type, const grids_t *grids)
                 case Z:
                         return grids->z;
                         break;
+                case SX:
+                        return grids->x;
+                        break;
+                case SY:
+                        return grids->y;
+                        break;
+                case SZ:
+                        return grids->z;
+                        break;
                 case XX:
                         return grids->xx;
                         break;
@@ -107,3 +115,35 @@ grid3_t grids_select(const enum grid_types grid_type, const grids_t *grids)
 
 }
 
+const char *grid_typename(const enum grid_types gt) {
+        // Printable label for every grid type known to this function.
+        static const struct {
+                enum grid_types type;
+                const char *label;
+        } labels[] = {
+                {X, "X"},
+                {Y, "Y"},
+                {Z, "Z"},
+                {SX, "SX"},
+                {SY, "SY"},
+                {SZ, "SZ"},
+                {XX, "XX"},
+                {XY, "XY"},
+                {YY, "YY"},
+                {ZZ, "ZZ"},
+                {XZ, "XZ"},
+                {YZ, "YZ"},
+                {NODE, "NODE"},
+        };
+        const int num_labels = (int)(sizeof labels / sizeof labels[0]);
+
+        // Linear search over the small table above.
+        for (int i = 0; i < num_labels; ++i) {
+                if (labels[i].type == gt) {
+                        return labels[i].label;
+                }
+        }
+        // Value outside the table: return an empty string.
+        return "";
+}
+
diff --git a/src/topography/host.c b/src/topography/host.c
index c7839e1..6fea3ac 100644
--- a/src/topography/host.c
+++ b/src/topography/host.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <cuda_runtime.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <topography/host.h>
 
 void topo_h_malloc(topo_t *host)
diff --git a/src/topography/initializations/CMakeLists.txt b/src/topography/initializations/CMakeLists.txt
index 6c72f1d..f30a5d4 100644
--- a/src/topography/initializations/CMakeLists.txt
+++ b/src/topography/initializations/CMakeLists.txt
@@ -1,11 +1,11 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/topography.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/initializations/constant.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/initializations/random.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/initializations/cerjan.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/initializations/linear.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/initializations/quadratic.h     
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h     
+    ${AWP_SOURCE_DIR}/include/topography/topography.h     
+    ${AWP_SOURCE_DIR}/include/topography/initializations/constant.h     
+    ${AWP_SOURCE_DIR}/include/topography/initializations/random.h     
+    ${AWP_SOURCE_DIR}/include/topography/initializations/cerjan.h     
+    ${AWP_SOURCE_DIR}/include/topography/initializations/linear.h     
+    ${AWP_SOURCE_DIR}/include/topography/initializations/quadratic.h     
     )
 
 add_library(topography_initializations
@@ -14,5 +14,5 @@ add_library(topography_initializations
 
 target_include_directories(topography_initializations
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
diff --git a/src/topography/kernels/CMakeLists.txt b/src/topography/kernels/CMakeLists.txt
index e2bb9f6..e69de29 100644
--- a/src/topography/kernels/CMakeLists.txt
+++ b/src/topography/kernels/CMakeLists.txt
@@ -1,43 +0,0 @@
-set(HEADERS 
-   ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h 
-   )
-
-set(UNOPT_HEADERS 
-   ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/unoptimized.cuh
-   ) 
-
-set(OPT_HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/topography/opt_topography.cuh     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/optimized_launch_config.cuh     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/optimized_velocity.cuh     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/kernels/optimized_stress.cuh     
-    )
-
-add_library(unoptimized_kernels
-        unoptimized.cu 
-        ${HEADERS} ${UNOPT_HEADERS})
-
-target_include_directories(unoptimized_kernels
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-        )
-
-add_library(optimized_kernels
-        optimized_velocity.cu 
-        optimized_stress.cu 
-        ${HEADERS} ${OPT_HEADERS})
-
-target_include_directories(optimized_kernels
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-        )
-
-add_library(optimized_attenuation_kernels
-        optimized_velocity.cu 
-        stress_attenuation.cu 
-        ${HEADERS} ${OPT_HEADERS})
-
-target_include_directories(optimized_attenuation_kernels
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-        )
diff --git a/src/topography/kernels/optimized.cu b/src/topography/kernels/optimized.cu
deleted file mode 100644
index 7038ed4..0000000
--- a/src/topography/kernels/optimized.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <topography/kernels/optimized.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
-
-__global__ void
-dtopo_str_110(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-              const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-              const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-              const float *__restrict__ f_1, const float *__restrict__ f_2,
-              const float *__restrict__ f_c, const float *__restrict__ g,
-              const float *__restrict__ g3, const float *__restrict__ g3_c,
-              const float *__restrict__ g_c, const float *__restrict__ lami,
-              const float *__restrict__ mui, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej) {
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ngsl + ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= 6)
-    return;
-  for (int i = bi; i < ei; ++i) {
-    text
-  }
-}
-
-__global__ void
-dtopo_str_111(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-              const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-              const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-              const float *__restrict__ f_1, const float *__restrict__ f_2,
-              const float *__restrict__ f_c, const float *__restrict__ g,
-              const float *__restrict__ g3, const float *__restrict__ g3_c,
-              const float *__restrict__ g_c, const float *__restrict__ lami,
-              const float *__restrict__ mui, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej) {
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ngsl + ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= nz - 12)
-    return;
-  for (int i = bi; i < ei; ++i) {
-    text
-  }
-}
-
-__global__ void
-dtopo_str_112(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-              const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-              const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-              const float *__restrict__ f_1, const float *__restrict__ f_2,
-              const float *__restrict__ f_c, const float *__restrict__ g,
-              const float *__restrict__ g3, const float *__restrict__ g3_c,
-              const float *__restrict__ g_c, const float *__restrict__ lami,
-              const float *__restrict__ mui, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej) {
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ngsl + ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= 6)
-    return;
-  for (int i = bi; i < ei; ++i) {
-    text
-  }
-}
-
-__global__ void dtopo_init_material_111(float *__restrict__ lami,
-                                        float *__restrict__ mui,
-                                        float *__restrict__ rho, const int nx,
-                                        const int ny, const int nz) {
-  const int i = threadIdx.z + blockIdx.z * blockDim.z;
-  if (i >= nx)
-    return;
-  const int j = threadIdx.y + blockIdx.y * blockDim.y;
-  if (j >= ny)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= nz)
-    return;
-#define _lami(i, j, k) lami[(i)*ny * nz + (j)*nz + (k)]
-#define _mui(i, j, k) mui[(i)*ny * nz + (j)*nz + (k)]
-#define _rho(i, j, k) rho[(i)*ny * nz + (j)*nz + (k)]
-  _rho(i, j, k) = 1.0;
-  _lami(i, j, k) = 1.0;
-  _mui(i, j, k) = 1.0;
-#undef _lami
-#undef _mui
-#undef _rho
-}
diff --git a/src/topography/kernels/optimized_stress.cu b/src/topography/kernels/optimized_stress.cu
deleted file mode 100644
index 5ad1030..0000000
--- a/src/topography/kernels/optimized_stress.cu
+++ /dev/null
@@ -1,2433 +0,0 @@
-#include <topography/kernels/optimized_launch_config.cuh>
-#include <topography/kernels/optimized_stress.cuh>
-#include <stdio.h>
-
-__global__ void dtopo_str_110(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej) {
-  const float phz4l[6][7] = {
-      {0.8338228784688313, 0.1775123316429260, 0.1435067013076542,
-       -0.1548419114194114, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {0.1813404047323969, 1.1246711188154426, -0.2933634518280757,
-       -0.0126480717197637, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {-0.1331142706282399, 0.7930714675884345, 0.3131998767078508,
-       0.0268429263319546, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {0.0969078556633046, -0.1539344946680898, 0.4486491202844389,
-       0.6768738207821733, -0.0684963020618270, 0.0000000000000000,
-       0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, -0.0625000000000000,
-       0.5625000000000000, 0.5625000000000000, -0.0625000000000000,
-       0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       -0.0625000000000000, 0.5625000000000000, 0.5625000000000000,
-       -0.0625000000000000}};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
-                        0.5625000000000000, -0.0625000000000000};
-  const float dhz4l[6][7] = {
-      {-1.4511412472637157, 1.8534237417911470, -0.3534237417911469,
-       -0.0488587527362844, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {-0.8577143189081458, 0.5731429567244373, 0.4268570432755628,
-       -0.1422856810918542, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {-0.1674548505882877, -0.4976354482351368, 0.4976354482351368,
-       0.1674548505882877, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {0.1027061113405124, -0.2624541326469860, -0.8288742701021167,
-       1.0342864927831414, -0.0456642013745513, 0.0000000000000000,
-       0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0416666666666667,
-       -1.1250000000000000, 1.1250000000000000, -0.0416666666666667,
-       0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0416666666666667, -1.1250000000000000, 1.1250000000000000,
-       -0.0416666666666667}};
-  const float phdz4l[6][9] = {
-      {-1.5373923010673116, 1.0330083346742178, 0.6211677623382129,
-       0.0454110758451345, -0.1680934225988761, 0.0058985508086226,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {-0.8713921425924011, 0.1273679143938725, 0.9297550647681330,
-       -0.1912595577524762, 0.0050469052908678, 0.0004818158920039,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {-0.0563333965151294, -0.3996393739211770, -0.0536007135209481,
-       0.5022638816465500, 0.0083321572725344, -0.0010225549618299,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {-0.0132930497153990, 0.0706942590708847, -0.5596445380498725,
-       -0.1434031863528334, 0.7456356868769503, -0.1028431844156395,
-       0.0028540125859095, 0.0000000000000000, 0.0000000000000000},
-      {-0.0025849423769932, 0.0492307522105194, -0.0524552477068130,
-       -0.5317248489238559, -0.0530169938441241, 0.6816971139746001,
-       -0.0937500000000000, 0.0026041666666667, 0.0000000000000000},
-      {-0.0009619461344193, -0.0035553215968974, 0.0124936029037323,
-       0.0773639466787397, -0.6736586580761996, -0.0002232904416222,
-       0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
-                        0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float dz4l[6][8] = {
-      {-1.7779989465546748, 1.3337480247900155, 0.7775013168066564,
-       -0.3332503950419969, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {-0.4410217341392059, -0.1730842484889890, 0.4487228323259926,
-       0.1653831503022022, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.1798793213882701, -0.2757257254150788, -0.9597948548284453,
-       1.1171892610431817, -0.0615480021879277, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0153911381507088, 0.0568851455503591, -0.1998976464597171,
-       -0.8628231468598346, 1.0285385292191949, -0.0380940196007109,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0416666666666667, -1.1250000000000000, 1.1250000000000000,
-       -0.0416666666666667, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, 0.0416666666666667, -1.1250000000000000,
-       1.1250000000000000, -0.0416666666666667}};
-  const float pdhz4l[6][9] = {
-      {-1.5886075042755416, 2.2801810182668110, -0.8088980291471827,
-       0.1316830205960989, -0.0143585054401857, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {-0.4823226655921296, -0.0574614517751294, 0.5663203488781653,
-       -0.0309656800624243, 0.0044294485515179, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.0174954311279016, -0.4325508330649350, -0.3111668377093504,
-       0.8538512002386446, -0.1314757107290064, 0.0038467501367455,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.1277481742492071, -0.2574468839590017, -0.4155794781917712,
-       0.0115571196122084, 0.6170517361659126, -0.0857115441015996,
-       0.0023808762250444, 0.0000000000000000, 0.0000000000000000},
-      {-0.0064191319587820, 0.0164033832904366, 0.0752421418813823,
-       -0.6740179057989464, 0.0002498459192428, 0.6796875000000000,
-       -0.0937500000000000, 0.0026041666666667, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, -0.0026041666666667,
-       0.0937500000000000, -0.6796875000000000, -0.0000000000000000,
-       0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ngsl + ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= 6)
-    return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_1(i, j)                                                             \
-  f_1[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_2(i, j)                                                             \
-  f_2[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_c(i, j)                                                             \
-  f_c[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
-#define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _lami(i, j, k)                                                         \
-  lami[(k) + align +                                                           \
-       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +             \
-       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _mui(i, j, k)                                                          \
-  mui[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s11(i, j, k)                                                          \
-  s11[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s12(i, j, k)                                                          \
-  s12[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s33(i, j, k)                                                          \
-  s33[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u3(i, j, k)                                                           \
-  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-  for (int i = bi; i < ei; ++i) {
-    float Jii = _f_c(i, j) * _g3_c(k);
-    Jii = 1.0 * 1.0 / Jii;
-    float J12i = _f(i, j) * _g3_c(k);
-    J12i = 1.0 * 1.0 / J12i;
-    float J13i = _f_1(i, j) * _g3(k);
-    J13i = 1.0 * 1.0 / J13i;
-    float J23i = _f_2(i, j) * _g3(k);
-    J23i = 1.0 * 1.0 / J23i;
-    float lam =
-        nu * 1.0 /
-        (phz4l[k][0] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 0) + px4[0] * _lami(i - 1, j, 0) +
-                   px4[2] * _lami(i + 1, j, 0) + px4[3] * _lami(i + 2, j, 0)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 0) +
-                         px4[0] * _lami(i - 1, j - 2, 0) +
-                         px4[2] * _lami(i + 1, j - 2, 0) +
-                         px4[3] * _lami(i + 2, j - 2, 0)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 0) +
-                         px4[0] * _lami(i - 1, j - 1, 0) +
-                         px4[2] * _lami(i + 1, j - 1, 0) +
-                         px4[3] * _lami(i + 2, j - 1, 0)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 0) +
-                         px4[0] * _lami(i - 1, j + 1, 0) +
-                         px4[2] * _lami(i + 1, j + 1, 0) +
-                         px4[3] * _lami(i + 2, j + 1, 0))) +
-         phz4l[k][1] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 1) + px4[0] * _lami(i - 1, j, 1) +
-                   px4[2] * _lami(i + 1, j, 1) + px4[3] * _lami(i + 2, j, 1)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 1) +
-                         px4[0] * _lami(i - 1, j - 2, 1) +
-                         px4[2] * _lami(i + 1, j - 2, 1) +
-                         px4[3] * _lami(i + 2, j - 2, 1)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 1) +
-                         px4[0] * _lami(i - 1, j - 1, 1) +
-                         px4[2] * _lami(i + 1, j - 1, 1) +
-                         px4[3] * _lami(i + 2, j - 1, 1)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 1) +
-                         px4[0] * _lami(i - 1, j + 1, 1) +
-                         px4[2] * _lami(i + 1, j + 1, 1) +
-                         px4[3] * _lami(i + 2, j + 1, 1))) +
-         phz4l[k][2] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 2) + px4[0] * _lami(i - 1, j, 2) +
-                   px4[2] * _lami(i + 1, j, 2) + px4[3] * _lami(i + 2, j, 2)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 2) +
-                         px4[0] * _lami(i - 1, j - 2, 2) +
-                         px4[2] * _lami(i + 1, j - 2, 2) +
-                         px4[3] * _lami(i + 2, j - 2, 2)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 2) +
-                         px4[0] * _lami(i - 1, j - 1, 2) +
-                         px4[2] * _lami(i + 1, j - 1, 2) +
-                         px4[3] * _lami(i + 2, j - 1, 2)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 2) +
-                         px4[0] * _lami(i - 1, j + 1, 2) +
-                         px4[2] * _lami(i + 1, j + 1, 2) +
-                         px4[3] * _lami(i + 2, j + 1, 2))) +
-         phz4l[k][3] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 3) + px4[0] * _lami(i - 1, j, 3) +
-                   px4[2] * _lami(i + 1, j, 3) + px4[3] * _lami(i + 2, j, 3)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 3) +
-                         px4[0] * _lami(i - 1, j - 2, 3) +
-                         px4[2] * _lami(i + 1, j - 2, 3) +
-                         px4[3] * _lami(i + 2, j - 2, 3)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 3) +
-                         px4[0] * _lami(i - 1, j - 1, 3) +
-                         px4[2] * _lami(i + 1, j - 1, 3) +
-                         px4[3] * _lami(i + 2, j - 1, 3)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 3) +
-                         px4[0] * _lami(i - 1, j + 1, 3) +
-                         px4[2] * _lami(i + 1, j + 1, 3) +
-                         px4[3] * _lami(i + 2, j + 1, 3))) +
-         phz4l[k][4] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 4) + px4[0] * _lami(i - 1, j, 4) +
-                   px4[2] * _lami(i + 1, j, 4) + px4[3] * _lami(i + 2, j, 4)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 4) +
-                         px4[0] * _lami(i - 1, j - 2, 4) +
-                         px4[2] * _lami(i + 1, j - 2, 4) +
-                         px4[3] * _lami(i + 2, j - 2, 4)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 4) +
-                         px4[0] * _lami(i - 1, j - 1, 4) +
-                         px4[2] * _lami(i + 1, j - 1, 4) +
-                         px4[3] * _lami(i + 2, j - 1, 4)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 4) +
-                         px4[0] * _lami(i - 1, j + 1, 4) +
-                         px4[2] * _lami(i + 1, j + 1, 4) +
-                         px4[3] * _lami(i + 2, j + 1, 4))) +
-         phz4l[k][5] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 5) + px4[0] * _lami(i - 1, j, 5) +
-                   px4[2] * _lami(i + 1, j, 5) + px4[3] * _lami(i + 2, j, 5)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 5) +
-                         px4[0] * _lami(i - 1, j - 2, 5) +
-                         px4[2] * _lami(i + 1, j - 2, 5) +
-                         px4[3] * _lami(i + 2, j - 2, 5)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 5) +
-                         px4[0] * _lami(i - 1, j - 1, 5) +
-                         px4[2] * _lami(i + 1, j - 1, 5) +
-                         px4[3] * _lami(i + 2, j - 1, 5)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 5) +
-                         px4[0] * _lami(i - 1, j + 1, 5) +
-                         px4[2] * _lami(i + 1, j + 1, 5) +
-                         px4[3] * _lami(i + 2, j + 1, 5))) +
-         phz4l[k][6] *
-             (phy4[2] *
-                  (px4[1] * _lami(i, j, 6) + px4[0] * _lami(i - 1, j, 6) +
-                   px4[2] * _lami(i + 1, j, 6) + px4[3] * _lami(i + 2, j, 6)) +
-              phy4[0] * (px4[1] * _lami(i, j - 2, 6) +
-                         px4[0] * _lami(i - 1, j - 2, 6) +
-                         px4[2] * _lami(i + 1, j - 2, 6) +
-                         px4[3] * _lami(i + 2, j - 2, 6)) +
-              phy4[1] * (px4[1] * _lami(i, j - 1, 6) +
-                         px4[0] * _lami(i - 1, j - 1, 6) +
-                         px4[2] * _lami(i + 1, j - 1, 6) +
-                         px4[3] * _lami(i + 2, j - 1, 6)) +
-              phy4[3] * (px4[1] * _lami(i, j + 1, 6) +
-                         px4[0] * _lami(i - 1, j + 1, 6) +
-                         px4[2] * _lami(i + 1, j + 1, 6) +
-                         px4[3] * _lami(i + 2, j + 1, 6))));
-    float twomu =
-        2 * nu * 1.0 /
-        (phz4l[k][0] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 0) + px4[0] * _mui(i - 1, j, 0) +
-                   px4[2] * _mui(i + 1, j, 0) + px4[3] * _mui(i + 2, j, 0)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 0) + px4[0] * _mui(i - 1, j - 2, 0) +
-                   px4[2] * _mui(i + 1, j - 2, 0) +
-                   px4[3] * _mui(i + 2, j - 2, 0)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 0) + px4[0] * _mui(i - 1, j - 1, 0) +
-                   px4[2] * _mui(i + 1, j - 1, 0) +
-                   px4[3] * _mui(i + 2, j - 1, 0)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 0) + px4[0] * _mui(i - 1, j + 1, 0) +
-                   px4[2] * _mui(i + 1, j + 1, 0) +
-                   px4[3] * _mui(i + 2, j + 1, 0))) +
-         phz4l[k][1] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 1) + px4[0] * _mui(i - 1, j, 1) +
-                   px4[2] * _mui(i + 1, j, 1) + px4[3] * _mui(i + 2, j, 1)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 1) + px4[0] * _mui(i - 1, j - 2, 1) +
-                   px4[2] * _mui(i + 1, j - 2, 1) +
-                   px4[3] * _mui(i + 2, j - 2, 1)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 1) + px4[0] * _mui(i - 1, j - 1, 1) +
-                   px4[2] * _mui(i + 1, j - 1, 1) +
-                   px4[3] * _mui(i + 2, j - 1, 1)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 1) + px4[0] * _mui(i - 1, j + 1, 1) +
-                   px4[2] * _mui(i + 1, j + 1, 1) +
-                   px4[3] * _mui(i + 2, j + 1, 1))) +
-         phz4l[k][2] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 2) + px4[0] * _mui(i - 1, j, 2) +
-                   px4[2] * _mui(i + 1, j, 2) + px4[3] * _mui(i + 2, j, 2)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 2) + px4[0] * _mui(i - 1, j - 2, 2) +
-                   px4[2] * _mui(i + 1, j - 2, 2) +
-                   px4[3] * _mui(i + 2, j - 2, 2)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 2) + px4[0] * _mui(i - 1, j - 1, 2) +
-                   px4[2] * _mui(i + 1, j - 1, 2) +
-                   px4[3] * _mui(i + 2, j - 1, 2)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 2) + px4[0] * _mui(i - 1, j + 1, 2) +
-                   px4[2] * _mui(i + 1, j + 1, 2) +
-                   px4[3] * _mui(i + 2, j + 1, 2))) +
-         phz4l[k][3] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 3) + px4[0] * _mui(i - 1, j, 3) +
-                   px4[2] * _mui(i + 1, j, 3) + px4[3] * _mui(i + 2, j, 3)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 3) + px4[0] * _mui(i - 1, j - 2, 3) +
-                   px4[2] * _mui(i + 1, j - 2, 3) +
-                   px4[3] * _mui(i + 2, j - 2, 3)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 3) + px4[0] * _mui(i - 1, j - 1, 3) +
-                   px4[2] * _mui(i + 1, j - 1, 3) +
-                   px4[3] * _mui(i + 2, j - 1, 3)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 3) + px4[0] * _mui(i - 1, j + 1, 3) +
-                   px4[2] * _mui(i + 1, j + 1, 3) +
-                   px4[3] * _mui(i + 2, j + 1, 3))) +
-         phz4l[k][4] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 4) + px4[0] * _mui(i - 1, j, 4) +
-                   px4[2] * _mui(i + 1, j, 4) + px4[3] * _mui(i + 2, j, 4)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 4) + px4[0] * _mui(i - 1, j - 2, 4) +
-                   px4[2] * _mui(i + 1, j - 2, 4) +
-                   px4[3] * _mui(i + 2, j - 2, 4)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 4) + px4[0] * _mui(i - 1, j - 1, 4) +
-                   px4[2] * _mui(i + 1, j - 1, 4) +
-                   px4[3] * _mui(i + 2, j - 1, 4)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 4) + px4[0] * _mui(i - 1, j + 1, 4) +
-                   px4[2] * _mui(i + 1, j + 1, 4) +
-                   px4[3] * _mui(i + 2, j + 1, 4))) +
-         phz4l[k][5] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 5) + px4[0] * _mui(i - 1, j, 5) +
-                   px4[2] * _mui(i + 1, j, 5) + px4[3] * _mui(i + 2, j, 5)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 5) + px4[0] * _mui(i - 1, j - 2, 5) +
-                   px4[2] * _mui(i + 1, j - 2, 5) +
-                   px4[3] * _mui(i + 2, j - 2, 5)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 5) + px4[0] * _mui(i - 1, j - 1, 5) +
-                   px4[2] * _mui(i + 1, j - 1, 5) +
-                   px4[3] * _mui(i + 2, j - 1, 5)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 5) + px4[0] * _mui(i - 1, j + 1, 5) +
-                   px4[2] * _mui(i + 1, j + 1, 5) +
-                   px4[3] * _mui(i + 2, j + 1, 5))) +
-         phz4l[k][6] *
-             (phy4[2] *
-                  (px4[1] * _mui(i, j, 6) + px4[0] * _mui(i - 1, j, 6) +
-                   px4[2] * _mui(i + 1, j, 6) + px4[3] * _mui(i + 2, j, 6)) +
-              phy4[0] *
-                  (px4[1] * _mui(i, j - 2, 6) + px4[0] * _mui(i - 1, j - 2, 6) +
-                   px4[2] * _mui(i + 1, j - 2, 6) +
-                   px4[3] * _mui(i + 2, j - 2, 6)) +
-              phy4[1] *
-                  (px4[1] * _mui(i, j - 1, 6) + px4[0] * _mui(i - 1, j - 1, 6) +
-                   px4[2] * _mui(i + 1, j - 1, 6) +
-                   px4[3] * _mui(i + 2, j - 1, 6)) +
-              phy4[3] *
-                  (px4[1] * _mui(i, j + 1, 6) + px4[0] * _mui(i - 1, j + 1, 6) +
-                   px4[2] * _mui(i + 1, j + 1, 6) +
-                   px4[3] * _mui(i + 2, j + 1, 6))));
-    float mu12 = nu * 1.0 /
-                 (phz4l[k][0] * _mui(i, j, 0) + phz4l[k][1] * _mui(i, j, 1) +
-                  phz4l[k][2] * _mui(i, j, 2) + phz4l[k][3] * _mui(i, j, 3) +
-                  phz4l[k][4] * _mui(i, j, 4) + phz4l[k][5] * _mui(i, j, 5) +
-                  phz4l[k][6] * _mui(i, j, 6));
-    float mu13 = nu * 1.0 /
-                 (phy4[2] * _mui(i, j, k) + phy4[0] * _mui(i, j - 2, k) +
-                  phy4[1] * _mui(i, j - 1, k) + phy4[3] * _mui(i, j + 1, k));
-    float mu23 = nu * 1.0 /
-                 (px4[1] * _mui(i, j, k) + px4[0] * _mui(i - 1, j, k) +
-                  px4[2] * _mui(i + 1, j, k) + px4[3] * _mui(i + 2, j, k));
-    float div =
-        dhy4[2] * _u2(i, j, k) + dhy4[0] * _u2(i, j - 2, k) +
-        dhy4[1] * _u2(i, j - 1, k) + dhy4[3] * _u2(i, j + 1, k) +
-        dx4[1] * _u1(i, j, k) + dx4[0] * _u1(i - 1, j, k) +
-        dx4[2] * _u1(i + 1, j, k) + dx4[3] * _u1(i + 2, j, k) +
-        Jii * (dhz4l[k][0] * _u3(i, j, 0) + dhz4l[k][1] * _u3(i, j, 1) +
-               dhz4l[k][2] * _u3(i, j, 2) + dhz4l[k][3] * _u3(i, j, 3) +
-               dhz4l[k][4] * _u3(i, j, 4) + dhz4l[k][5] * _u3(i, j, 5) +
-               dhz4l[k][6] * _u3(i, j, 6)) -
-        Jii * _g_c(k) *
-            (phy4[2] * _f2_2(i, j) *
-                 (phdz4l[k][0] * _u2(i, j, 0) + phdz4l[k][1] * _u2(i, j, 1) +
-                  phdz4l[k][2] * _u2(i, j, 2) + phdz4l[k][3] * _u2(i, j, 3) +
-                  phdz4l[k][4] * _u2(i, j, 4) + phdz4l[k][5] * _u2(i, j, 5) +
-                  phdz4l[k][6] * _u2(i, j, 6) + phdz4l[k][7] * _u2(i, j, 7) +
-                  phdz4l[k][8] * _u2(i, j, 8)) +
-             phy4[0] * _f2_2(i, j - 2) *
-                 (phdz4l[k][0] * _u2(i, j - 2, 0) +
-                  phdz4l[k][1] * _u2(i, j - 2, 1) +
-                  phdz4l[k][2] * _u2(i, j - 2, 2) +
-                  phdz4l[k][3] * _u2(i, j - 2, 3) +
-                  phdz4l[k][4] * _u2(i, j - 2, 4) +
-                  phdz4l[k][5] * _u2(i, j - 2, 5) +
-                  phdz4l[k][6] * _u2(i, j - 2, 6) +
-                  phdz4l[k][7] * _u2(i, j - 2, 7) +
-                  phdz4l[k][8] * _u2(i, j - 2, 8)) +
-             phy4[1] * _f2_2(i, j - 1) *
-                 (phdz4l[k][0] * _u2(i, j - 1, 0) +
-                  phdz4l[k][1] * _u2(i, j - 1, 1) +
-                  phdz4l[k][2] * _u2(i, j - 1, 2) +
-                  phdz4l[k][3] * _u2(i, j - 1, 3) +
-                  phdz4l[k][4] * _u2(i, j - 1, 4) +
-                  phdz4l[k][5] * _u2(i, j - 1, 5) +
-                  phdz4l[k][6] * _u2(i, j - 1, 6) +
-                  phdz4l[k][7] * _u2(i, j - 1, 7) +
-                  phdz4l[k][8] * _u2(i, j - 1, 8)) +
-             phy4[3] * _f2_2(i, j + 1) *
-                 (phdz4l[k][0] * _u2(i, j + 1, 0) +
-                  phdz4l[k][1] * _u2(i, j + 1, 1) +
-                  phdz4l[k][2] * _u2(i, j + 1, 2) +
-                  phdz4l[k][3] * _u2(i, j + 1, 3) +
-                  phdz4l[k][4] * _u2(i, j + 1, 4) +
-                  phdz4l[k][5] * _u2(i, j + 1, 5) +
-                  phdz4l[k][6] * _u2(i, j + 1, 6) +
-                  phdz4l[k][7] * _u2(i, j + 1, 7) +
-                  phdz4l[k][8] * _u2(i, j + 1, 8))) -
-        Jii * _g_c(k) *
-            (px4[1] * _f1_1(i, j) *
-                 (phdz4l[k][0] * _u1(i, j, 0) + phdz4l[k][1] * _u1(i, j, 1) +
-                  phdz4l[k][2] * _u1(i, j, 2) + phdz4l[k][3] * _u1(i, j, 3) +
-                  phdz4l[k][4] * _u1(i, j, 4) + phdz4l[k][5] * _u1(i, j, 5) +
-                  phdz4l[k][6] * _u1(i, j, 6) + phdz4l[k][7] * _u1(i, j, 7) +
-                  phdz4l[k][8] * _u1(i, j, 8)) +
-             px4[0] * _f1_1(i - 1, j) *
-                 (phdz4l[k][0] * _u1(i - 1, j, 0) +
-                  phdz4l[k][1] * _u1(i - 1, j, 1) +
-                  phdz4l[k][2] * _u1(i - 1, j, 2) +
-                  phdz4l[k][3] * _u1(i - 1, j, 3) +
-                  phdz4l[k][4] * _u1(i - 1, j, 4) +
-                  phdz4l[k][5] * _u1(i - 1, j, 5) +
-                  phdz4l[k][6] * _u1(i - 1, j, 6) +
-                  phdz4l[k][7] * _u1(i - 1, j, 7) +
-                  phdz4l[k][8] * _u1(i - 1, j, 8)) +
-             px4[2] * _f1_1(i + 1, j) *
-                 (phdz4l[k][0] * _u1(i + 1, j, 0) +
-                  phdz4l[k][1] * _u1(i + 1, j, 1) +
-                  phdz4l[k][2] * _u1(i + 1, j, 2) +
-                  phdz4l[k][3] * _u1(i + 1, j, 3) +
-                  phdz4l[k][4] * _u1(i + 1, j, 4) +
-                  phdz4l[k][5] * _u1(i + 1, j, 5) +
-                  phdz4l[k][6] * _u1(i + 1, j, 6) +
-                  phdz4l[k][7] * _u1(i + 1, j, 7) +
-                  phdz4l[k][8] * _u1(i + 1, j, 8)) +
-             px4[3] * _f1_1(i + 2, j) *
-                 (phdz4l[k][0] * _u1(i + 2, j, 0) +
-                  phdz4l[k][1] * _u1(i + 2, j, 1) +
-                  phdz4l[k][2] * _u1(i + 2, j, 2) +
-                  phdz4l[k][3] * _u1(i + 2, j, 3) +
-                  phdz4l[k][4] * _u1(i + 2, j, 4) +
-                  phdz4l[k][5] * _u1(i + 2, j, 5) +
-                  phdz4l[k][6] * _u1(i + 2, j, 6) +
-                  phdz4l[k][7] * _u1(i + 2, j, 7) +
-                  phdz4l[k][8] * _u1(i + 2, j, 8)));
-    float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
-    _s11(i, j, k) =
-        (a * _s11(i, j, k) + lam * div +
-         twomu * (dx4[1] * _u1(i, j, k) + dx4[0] * _u1(i - 1, j, k) +
-                  dx4[2] * _u1(i + 1, j, k) + dx4[3] * _u1(i + 2, j, k)) -
-         twomu * Jii * _g_c(k) *
-             (px4[1] * _f1_1(i, j) *
-                  (phdz4l[k][0] * _u1(i, j, 0) + phdz4l[k][1] * _u1(i, j, 1) +
-                   phdz4l[k][2] * _u1(i, j, 2) + phdz4l[k][3] * _u1(i, j, 3) +
-                   phdz4l[k][4] * _u1(i, j, 4) + phdz4l[k][5] * _u1(i, j, 5) +
-                   phdz4l[k][6] * _u1(i, j, 6) + phdz4l[k][7] * _u1(i, j, 7) +
-                   phdz4l[k][8] * _u1(i, j, 8)) +
-              px4[0] * _f1_1(i - 1, j) *
-                  (phdz4l[k][0] * _u1(i - 1, j, 0) +
-                   phdz4l[k][1] * _u1(i - 1, j, 1) +
-                   phdz4l[k][2] * _u1(i - 1, j, 2) +
-                   phdz4l[k][3] * _u1(i - 1, j, 3) +
-                   phdz4l[k][4] * _u1(i - 1, j, 4) +
-                   phdz4l[k][5] * _u1(i - 1, j, 5) +
-                   phdz4l[k][6] * _u1(i - 1, j, 6) +
-                   phdz4l[k][7] * _u1(i - 1, j, 7) +
-                   phdz4l[k][8] * _u1(i - 1, j, 8)) +
-              px4[2] * _f1_1(i + 1, j) *
-                  (phdz4l[k][0] * _u1(i + 1, j, 0) +
-                   phdz4l[k][1] * _u1(i + 1, j, 1) +
-                   phdz4l[k][2] * _u1(i + 1, j, 2) +
-                   phdz4l[k][3] * _u1(i + 1, j, 3) +
-                   phdz4l[k][4] * _u1(i + 1, j, 4) +
-                   phdz4l[k][5] * _u1(i + 1, j, 5) +
-                   phdz4l[k][6] * _u1(i + 1, j, 6) +
-                   phdz4l[k][7] * _u1(i + 1, j, 7) +
-                   phdz4l[k][8] * _u1(i + 1, j, 8)) +
-              px4[3] * _f1_1(i + 2, j) *
-                  (phdz4l[k][0] * _u1(i + 2, j, 0) +
-                   phdz4l[k][1] * _u1(i + 2, j, 1) +
-                   phdz4l[k][2] * _u1(i + 2, j, 2) +
-                   phdz4l[k][3] * _u1(i + 2, j, 3) +
-                   phdz4l[k][4] * _u1(i + 2, j, 4) +
-                   phdz4l[k][5] * _u1(i + 2, j, 5) +
-                   phdz4l[k][6] * _u1(i + 2, j, 6) +
-                   phdz4l[k][7] * _u1(i + 2, j, 7) +
-                   phdz4l[k][8] * _u1(i + 2, j, 8)))) *
-        f_dcrj;
-    _s22(i, j, k) =
-        (a * _s22(i, j, k) + lam * div +
-         twomu * (dhy4[2] * _u2(i, j, k) + dhy4[0] * _u2(i, j - 2, k) +
-                  dhy4[1] * _u2(i, j - 1, k) + dhy4[3] * _u2(i, j + 1, k)) -
-         twomu * Jii * _g_c(k) *
-             (phy4[2] * _f2_2(i, j) *
-                  (phdz4l[k][0] * _u2(i, j, 0) + phdz4l[k][1] * _u2(i, j, 1) +
-                   phdz4l[k][2] * _u2(i, j, 2) + phdz4l[k][3] * _u2(i, j, 3) +
-                   phdz4l[k][4] * _u2(i, j, 4) + phdz4l[k][5] * _u2(i, j, 5) +
-                   phdz4l[k][6] * _u2(i, j, 6) + phdz4l[k][7] * _u2(i, j, 7) +
-                   phdz4l[k][8] * _u2(i, j, 8)) +
-              phy4[0] * _f2_2(i, j - 2) *
-                  (phdz4l[k][0] * _u2(i, j - 2, 0) +
-                   phdz4l[k][1] * _u2(i, j - 2, 1) +
-                   phdz4l[k][2] * _u2(i, j - 2, 2) +
-                   phdz4l[k][3] * _u2(i, j - 2, 3) +
-                   phdz4l[k][4] * _u2(i, j - 2, 4) +
-                   phdz4l[k][5] * _u2(i, j - 2, 5) +
-                   phdz4l[k][6] * _u2(i, j - 2, 6) +
-                   phdz4l[k][7] * _u2(i, j - 2, 7) +
-                   phdz4l[k][8] * _u2(i, j - 2, 8)) +
-              phy4[1] * _f2_2(i, j - 1) *
-                  (phdz4l[k][0] * _u2(i, j - 1, 0) +
-                   phdz4l[k][1] * _u2(i, j - 1, 1) +
-                   phdz4l[k][2] * _u2(i, j - 1, 2) +
-                   phdz4l[k][3] * _u2(i, j - 1, 3) +
-                   phdz4l[k][4] * _u2(i, j - 1, 4) +
-                   phdz4l[k][5] * _u2(i, j - 1, 5) +
-                   phdz4l[k][6] * _u2(i, j - 1, 6) +
-                   phdz4l[k][7] * _u2(i, j - 1, 7) +
-                   phdz4l[k][8] * _u2(i, j - 1, 8)) +
-              phy4[3] * _f2_2(i, j + 1) *
-                  (phdz4l[k][0] * _u2(i, j + 1, 0) +
-                   phdz4l[k][1] * _u2(i, j + 1, 1) +
-                   phdz4l[k][2] * _u2(i, j + 1, 2) +
-                   phdz4l[k][3] * _u2(i, j + 1, 3) +
-                   phdz4l[k][4] * _u2(i, j + 1, 4) +
-                   phdz4l[k][5] * _u2(i, j + 1, 5) +
-                   phdz4l[k][6] * _u2(i, j + 1, 6) +
-                   phdz4l[k][7] * _u2(i, j + 1, 7) +
-                   phdz4l[k][8] * _u2(i, j + 1, 8)))) *
-        f_dcrj;
-    _s33(i, j, k) =
-        (a * _s33(i, j, k) + lam * div +
-         twomu * Jii *
-             (dhz4l[k][0] * _u3(i, j, 0) + dhz4l[k][1] * _u3(i, j, 1) +
-              dhz4l[k][2] * _u3(i, j, 2) + dhz4l[k][3] * _u3(i, j, 3) +
-              dhz4l[k][4] * _u3(i, j, 4) + dhz4l[k][5] * _u3(i, j, 5) +
-              dhz4l[k][6] * _u3(i, j, 6))) *
-        f_dcrj;
-    _s12(i, j, k) =
-        (a * _s12(i, j, k) +
-         mu12 * (dhx4[2] * _u2(i, j, k) + dhx4[0] * _u2(i - 2, j, k) +
-                 dhx4[1] * _u2(i - 1, j, k) + dhx4[3] * _u2(i + 1, j, k) +
-                 dy4[1] * _u1(i, j, k) + dy4[0] * _u1(i, j - 1, k) +
-                 dy4[2] * _u1(i, j + 1, k) + dy4[3] * _u1(i, j + 2, k) -
-                 J12i * _g_c(k) *
-                     (phx4[2] * _f1_2(i, j) *
-                          (phdz4l[k][0] * _u2(i, j, 0) +
-                           phdz4l[k][1] * _u2(i, j, 1) +
-                           phdz4l[k][2] * _u2(i, j, 2) +
-                           phdz4l[k][3] * _u2(i, j, 3) +
-                           phdz4l[k][4] * _u2(i, j, 4) +
-                           phdz4l[k][5] * _u2(i, j, 5) +
-                           phdz4l[k][6] * _u2(i, j, 6) +
-                           phdz4l[k][7] * _u2(i, j, 7) +
-                           phdz4l[k][8] * _u2(i, j, 8)) +
-                      phx4[0] * _f1_2(i - 2, j) *
-                          (phdz4l[k][0] * _u2(i - 2, j, 0) +
-                           phdz4l[k][1] * _u2(i - 2, j, 1) +
-                           phdz4l[k][2] * _u2(i - 2, j, 2) +
-                           phdz4l[k][3] * _u2(i - 2, j, 3) +
-                           phdz4l[k][4] * _u2(i - 2, j, 4) +
-                           phdz4l[k][5] * _u2(i - 2, j, 5) +
-                           phdz4l[k][6] * _u2(i - 2, j, 6) +
-                           phdz4l[k][7] * _u2(i - 2, j, 7) +
-                           phdz4l[k][8] * _u2(i - 2, j, 8)) +
-                      phx4[1] * _f1_2(i - 1, j) *
-                          (phdz4l[k][0] * _u2(i - 1, j, 0) +
-                           phdz4l[k][1] * _u2(i - 1, j, 1) +
-                           phdz4l[k][2] * _u2(i - 1, j, 2) +
-                           phdz4l[k][3] * _u2(i - 1, j, 3) +
-                           phdz4l[k][4] * _u2(i - 1, j, 4) +
-                           phdz4l[k][5] * _u2(i - 1, j, 5) +
-                           phdz4l[k][6] * _u2(i - 1, j, 6) +
-                           phdz4l[k][7] * _u2(i - 1, j, 7) +
-                           phdz4l[k][8] * _u2(i - 1, j, 8)) +
-                      phx4[3] * _f1_2(i + 1, j) *
-                          (phdz4l[k][0] * _u2(i + 1, j, 0) +
-                           phdz4l[k][1] * _u2(i + 1, j, 1) +
-                           phdz4l[k][2] * _u2(i + 1, j, 2) +
-                           phdz4l[k][3] * _u2(i + 1, j, 3) +
-                           phdz4l[k][4] * _u2(i + 1, j, 4) +
-                           phdz4l[k][5] * _u2(i + 1, j, 5) +
-                           phdz4l[k][6] * _u2(i + 1, j, 6) +
-                           phdz4l[k][7] * _u2(i + 1, j, 7) +
-                           phdz4l[k][8] * _u2(i + 1, j, 8))) -
-                 J12i * _g_c(k) *
-                     (py4[1] * _f2_1(i, j) *
-                          (phdz4l[k][0] * _u1(i, j, 0) +
-                           phdz4l[k][1] * _u1(i, j, 1) +
-                           phdz4l[k][2] * _u1(i, j, 2) +
-                           phdz4l[k][3] * _u1(i, j, 3) +
-                           phdz4l[k][4] * _u1(i, j, 4) +
-                           phdz4l[k][5] * _u1(i, j, 5) +
-                           phdz4l[k][6] * _u1(i, j, 6) +
-                           phdz4l[k][7] * _u1(i, j, 7) +
-                           phdz4l[k][8] * _u1(i, j, 8)) +
-                      py4[0] * _f2_1(i, j - 1) *
-                          (phdz4l[k][0] * _u1(i, j - 1, 0) +
-                           phdz4l[k][1] * _u1(i, j - 1, 1) +
-                           phdz4l[k][2] * _u1(i, j - 1, 2) +
-                           phdz4l[k][3] * _u1(i, j - 1, 3) +
-                           phdz4l[k][4] * _u1(i, j - 1, 4) +
-                           phdz4l[k][5] * _u1(i, j - 1, 5) +
-                           phdz4l[k][6] * _u1(i, j - 1, 6) +
-                           phdz4l[k][7] * _u1(i, j - 1, 7) +
-                           phdz4l[k][8] * _u1(i, j - 1, 8)) +
-                      py4[2] * _f2_1(i, j + 1) *
-                          (phdz4l[k][0] * _u1(i, j + 1, 0) +
-                           phdz4l[k][1] * _u1(i, j + 1, 1) +
-                           phdz4l[k][2] * _u1(i, j + 1, 2) +
-                           phdz4l[k][3] * _u1(i, j + 1, 3) +
-                           phdz4l[k][4] * _u1(i, j + 1, 4) +
-                           phdz4l[k][5] * _u1(i, j + 1, 5) +
-                           phdz4l[k][6] * _u1(i, j + 1, 6) +
-                           phdz4l[k][7] * _u1(i, j + 1, 7) +
-                           phdz4l[k][8] * _u1(i, j + 1, 8)) +
-                      py4[3] * _f2_1(i, j + 2) *
-                          (phdz4l[k][0] * _u1(i, j + 2, 0) +
-                           phdz4l[k][1] * _u1(i, j + 2, 1) +
-                           phdz4l[k][2] * _u1(i, j + 2, 2) +
-                           phdz4l[k][3] * _u1(i, j + 2, 3) +
-                           phdz4l[k][4] * _u1(i, j + 2, 4) +
-                           phdz4l[k][5] * _u1(i, j + 2, 5) +
-                           phdz4l[k][6] * _u1(i, j + 2, 6) +
-                           phdz4l[k][7] * _u1(i, j + 2, 7) +
-                           phdz4l[k][8] * _u1(i, j + 2, 8))))) *
-        f_dcrj;
-    _s13(i, j, k) =
-        (a * _s13(i, j, k) +
-         mu13 *
-             (dhx4[2] * _u3(i, j, k) + dhx4[0] * _u3(i - 2, j, k) +
-              dhx4[1] * _u3(i - 1, j, k) + dhx4[3] * _u3(i + 1, j, k) +
-              J13i * (dz4l[k][0] * _u1(i, j, 0) + dz4l[k][1] * _u1(i, j, 1) +
-                      dz4l[k][2] * _u1(i, j, 2) + dz4l[k][3] * _u1(i, j, 3) +
-                      dz4l[k][4] * _u1(i, j, 4) + dz4l[k][5] * _u1(i, j, 5) +
-                      dz4l[k][6] * _u1(i, j, 6) + dz4l[k][7] * _u1(i, j, 7)) -
-              J13i * _g(k) *
-                  (phx4[2] * _f1_c(i, j) *
-                       (pdhz4l[k][0] * _u3(i, j, 0) +
-                        pdhz4l[k][1] * _u3(i, j, 1) +
-                        pdhz4l[k][2] * _u3(i, j, 2) +
-                        pdhz4l[k][3] * _u3(i, j, 3) +
-                        pdhz4l[k][4] * _u3(i, j, 4) +
-                        pdhz4l[k][5] * _u3(i, j, 5) +
-                        pdhz4l[k][6] * _u3(i, j, 6) +
-                        pdhz4l[k][7] * _u3(i, j, 7) +
-                        pdhz4l[k][8] * _u3(i, j, 8)) +
-                   phx4[0] * _f1_c(i - 2, j) *
-                       (pdhz4l[k][0] * _u3(i - 2, j, 0) +
-                        pdhz4l[k][1] * _u3(i - 2, j, 1) +
-                        pdhz4l[k][2] * _u3(i - 2, j, 2) +
-                        pdhz4l[k][3] * _u3(i - 2, j, 3) +
-                        pdhz4l[k][4] * _u3(i - 2, j, 4) +
-                        pdhz4l[k][5] * _u3(i - 2, j, 5) +
-                        pdhz4l[k][6] * _u3(i - 2, j, 6) +
-                        pdhz4l[k][7] * _u3(i - 2, j, 7) +
-                        pdhz4l[k][8] * _u3(i - 2, j, 8)) +
-                   phx4[1] * _f1_c(i - 1, j) *
-                       (pdhz4l[k][0] * _u3(i - 1, j, 0) +
-                        pdhz4l[k][1] * _u3(i - 1, j, 1) +
-                        pdhz4l[k][2] * _u3(i - 1, j, 2) +
-                        pdhz4l[k][3] * _u3(i - 1, j, 3) +
-                        pdhz4l[k][4] * _u3(i - 1, j, 4) +
-                        pdhz4l[k][5] * _u3(i - 1, j, 5) +
-                        pdhz4l[k][6] * _u3(i - 1, j, 6) +
-                        pdhz4l[k][7] * _u3(i - 1, j, 7) +
-                        pdhz4l[k][8] * _u3(i - 1, j, 8)) +
-                   phx4[3] * _f1_c(i + 1, j) *
-                       (pdhz4l[k][0] * _u3(i + 1, j, 0) +
-                        pdhz4l[k][1] * _u3(i + 1, j, 1) +
-                        pdhz4l[k][2] * _u3(i + 1, j, 2) +
-                        pdhz4l[k][3] * _u3(i + 1, j, 3) +
-                        pdhz4l[k][4] * _u3(i + 1, j, 4) +
-                        pdhz4l[k][5] * _u3(i + 1, j, 5) +
-                        pdhz4l[k][6] * _u3(i + 1, j, 6) +
-                        pdhz4l[k][7] * _u3(i + 1, j, 7) +
-                        pdhz4l[k][8] * _u3(i + 1, j, 8))))) *
-        f_dcrj;
-    _s23(i, j, k) =
-        (a * _s23(i, j, k) +
-         mu23 *
-             (dy4[1] * _u3(i, j, k) + dy4[0] * _u3(i, j - 1, k) +
-              dy4[2] * _u3(i, j + 1, k) + dy4[3] * _u3(i, j + 2, k) +
-              J23i * (dz4l[k][0] * _u2(i, j, 0) + dz4l[k][1] * _u2(i, j, 1) +
-                      dz4l[k][2] * _u2(i, j, 2) + dz4l[k][3] * _u2(i, j, 3) +
-                      dz4l[k][4] * _u2(i, j, 4) + dz4l[k][5] * _u2(i, j, 5) +
-                      dz4l[k][6] * _u2(i, j, 6) + dz4l[k][7] * _u2(i, j, 7)) -
-              J23i * _g(k) *
-                  (py4[1] * _f2_c(i, j) *
-                       (pdhz4l[k][0] * _u3(i, j, 0) +
-                        pdhz4l[k][1] * _u3(i, j, 1) +
-                        pdhz4l[k][2] * _u3(i, j, 2) +
-                        pdhz4l[k][3] * _u3(i, j, 3) +
-                        pdhz4l[k][4] * _u3(i, j, 4) +
-                        pdhz4l[k][5] * _u3(i, j, 5) +
-                        pdhz4l[k][6] * _u3(i, j, 6) +
-                        pdhz4l[k][7] * _u3(i, j, 7) +
-                        pdhz4l[k][8] * _u3(i, j, 8)) +
-                   py4[0] * _f2_c(i, j - 1) *
-                       (pdhz4l[k][0] * _u3(i, j - 1, 0) +
-                        pdhz4l[k][1] * _u3(i, j - 1, 1) +
-                        pdhz4l[k][2] * _u3(i, j - 1, 2) +
-                        pdhz4l[k][3] * _u3(i, j - 1, 3) +
-                        pdhz4l[k][4] * _u3(i, j - 1, 4) +
-                        pdhz4l[k][5] * _u3(i, j - 1, 5) +
-                        pdhz4l[k][6] * _u3(i, j - 1, 6) +
-                        pdhz4l[k][7] * _u3(i, j - 1, 7) +
-                        pdhz4l[k][8] * _u3(i, j - 1, 8)) +
-                   py4[2] * _f2_c(i, j + 1) *
-                       (pdhz4l[k][0] * _u3(i, j + 1, 0) +
-                        pdhz4l[k][1] * _u3(i, j + 1, 1) +
-                        pdhz4l[k][2] * _u3(i, j + 1, 2) +
-                        pdhz4l[k][3] * _u3(i, j + 1, 3) +
-                        pdhz4l[k][4] * _u3(i, j + 1, 4) +
-                        pdhz4l[k][5] * _u3(i, j + 1, 5) +
-                        pdhz4l[k][6] * _u3(i, j + 1, 6) +
-                        pdhz4l[k][7] * _u3(i, j + 1, 7) +
-                        pdhz4l[k][8] * _u3(i, j + 1, 8)) +
-                   py4[3] * _f2_c(i, j + 2) *
-                       (pdhz4l[k][0] * _u3(i, j + 2, 0) +
-                        pdhz4l[k][1] * _u3(i, j + 2, 1) +
-                        pdhz4l[k][2] * _u3(i, j + 2, 2) +
-                        pdhz4l[k][3] * _u3(i, j + 2, 3) +
-                        pdhz4l[k][4] * _u3(i, j + 2, 4) +
-                        pdhz4l[k][5] * _u3(i, j + 2, 5) +
-                        pdhz4l[k][6] * _u3(i, j + 2, 6) +
-                        pdhz4l[k][7] * _u3(i, j + 2, 7) +
-                        pdhz4l[k][8] * _u3(i, j + 2, 8))))) *
-        f_dcrj;
-  }
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
-#undef _f_1
-#undef _f_2
-#undef _f_c
-#undef _g
-#undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _lami
-#undef _mui
-#undef _s11
-#undef _s12
-#undef _s13
-#undef _s22
-#undef _s23
-#undef _s33
-#undef _u1
-#undef _u2
-#undef _u3
-}
-
-__global__ void dtopo_str_111(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej) {
-  const float phz4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
-                        0.5625000000000000, -0.0625000000000000};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
-                          -0.6796875000000000, -0.0000000000000000,
-                          0.6796875000000000,  -0.0937500000000000,
-                          0.0026041666666667};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
-                        0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
-                          -0.6796875000000000, -0.0000000000000000,
-                          0.6796875000000000,  -0.0937500000000000,
-                          0.0026041666666667};
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ngsl + ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= nz - 12)
-    return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_1(i, j)                                                             \
-  f_1[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_2(i, j)                                                             \
-  f_2[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_c(i, j)                                                             \
-  f_c[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
-#define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _lami(i, j, k)                                                         \
-  lami[(k) + align +                                                           \
-       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +             \
-       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _mui(i, j, k)                                                          \
-  mui[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s11(i, j, k)                                                          \
-  s11[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s12(i, j, k)                                                          \
-  s12[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s33(i, j, k)                                                          \
-  s33[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u3(i, j, k)                                                           \
-  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-  for (int i = bi; i < ei; ++i) {
-    float Jii = _f_c(i, j) * _g3_c(k + 6);
-    Jii = 1.0 * 1.0 / Jii;
-    float J12i = _f(i, j) * _g3_c(k + 6);
-    J12i = 1.0 * 1.0 / J12i;
-    float J13i = _f_1(i, j) * _g3(k + 6);
-    J13i = 1.0 * 1.0 / J13i;
-    float J23i = _f_2(i, j) * _g3(k + 6);
-    J23i = 1.0 * 1.0 / J23i;
-    float lam = nu * 1.0 /
-                (phz4[0] * (phy4[2] * (px4[1] * _lami(i, j, k + 4) +
-                                       px4[0] * _lami(i - 1, j, k + 4) +
-                                       px4[2] * _lami(i + 1, j, k + 4) +
-                                       px4[3] * _lami(i + 2, j, k + 4)) +
-                            phy4[0] * (px4[1] * _lami(i, j - 2, k + 4) +
-                                       px4[0] * _lami(i - 1, j - 2, k + 4) +
-                                       px4[2] * _lami(i + 1, j - 2, k + 4) +
-                                       px4[3] * _lami(i + 2, j - 2, k + 4)) +
-                            phy4[1] * (px4[1] * _lami(i, j - 1, k + 4) +
-                                       px4[0] * _lami(i - 1, j - 1, k + 4) +
-                                       px4[2] * _lami(i + 1, j - 1, k + 4) +
-                                       px4[3] * _lami(i + 2, j - 1, k + 4)) +
-                            phy4[3] * (px4[1] * _lami(i, j + 1, k + 4) +
-                                       px4[0] * _lami(i - 1, j + 1, k + 4) +
-                                       px4[2] * _lami(i + 1, j + 1, k + 4) +
-                                       px4[3] * _lami(i + 2, j + 1, k + 4))) +
-                 phz4[1] * (phy4[2] * (px4[1] * _lami(i, j, k + 5) +
-                                       px4[0] * _lami(i - 1, j, k + 5) +
-                                       px4[2] * _lami(i + 1, j, k + 5) +
-                                       px4[3] * _lami(i + 2, j, k + 5)) +
-                            phy4[0] * (px4[1] * _lami(i, j - 2, k + 5) +
-                                       px4[0] * _lami(i - 1, j - 2, k + 5) +
-                                       px4[2] * _lami(i + 1, j - 2, k + 5) +
-                                       px4[3] * _lami(i + 2, j - 2, k + 5)) +
-                            phy4[1] * (px4[1] * _lami(i, j - 1, k + 5) +
-                                       px4[0] * _lami(i - 1, j - 1, k + 5) +
-                                       px4[2] * _lami(i + 1, j - 1, k + 5) +
-                                       px4[3] * _lami(i + 2, j - 1, k + 5)) +
-                            phy4[3] * (px4[1] * _lami(i, j + 1, k + 5) +
-                                       px4[0] * _lami(i - 1, j + 1, k + 5) +
-                                       px4[2] * _lami(i + 1, j + 1, k + 5) +
-                                       px4[3] * _lami(i + 2, j + 1, k + 5))) +
-                 phz4[2] * (phy4[2] * (px4[1] * _lami(i, j, k + 6) +
-                                       px4[0] * _lami(i - 1, j, k + 6) +
-                                       px4[2] * _lami(i + 1, j, k + 6) +
-                                       px4[3] * _lami(i + 2, j, k + 6)) +
-                            phy4[0] * (px4[1] * _lami(i, j - 2, k + 6) +
-                                       px4[0] * _lami(i - 1, j - 2, k + 6) +
-                                       px4[2] * _lami(i + 1, j - 2, k + 6) +
-                                       px4[3] * _lami(i + 2, j - 2, k + 6)) +
-                            phy4[1] * (px4[1] * _lami(i, j - 1, k + 6) +
-                                       px4[0] * _lami(i - 1, j - 1, k + 6) +
-                                       px4[2] * _lami(i + 1, j - 1, k + 6) +
-                                       px4[3] * _lami(i + 2, j - 1, k + 6)) +
-                            phy4[3] * (px4[1] * _lami(i, j + 1, k + 6) +
-                                       px4[0] * _lami(i - 1, j + 1, k + 6) +
-                                       px4[2] * _lami(i + 1, j + 1, k + 6) +
-                                       px4[3] * _lami(i + 2, j + 1, k + 6))) +
-                 phz4[3] * (phy4[2] * (px4[1] * _lami(i, j, k + 7) +
-                                       px4[0] * _lami(i - 1, j, k + 7) +
-                                       px4[2] * _lami(i + 1, j, k + 7) +
-                                       px4[3] * _lami(i + 2, j, k + 7)) +
-                            phy4[0] * (px4[1] * _lami(i, j - 2, k + 7) +
-                                       px4[0] * _lami(i - 1, j - 2, k + 7) +
-                                       px4[2] * _lami(i + 1, j - 2, k + 7) +
-                                       px4[3] * _lami(i + 2, j - 2, k + 7)) +
-                            phy4[1] * (px4[1] * _lami(i, j - 1, k + 7) +
-                                       px4[0] * _lami(i - 1, j - 1, k + 7) +
-                                       px4[2] * _lami(i + 1, j - 1, k + 7) +
-                                       px4[3] * _lami(i + 2, j - 1, k + 7)) +
-                            phy4[3] * (px4[1] * _lami(i, j + 1, k + 7) +
-                                       px4[0] * _lami(i - 1, j + 1, k + 7) +
-                                       px4[2] * _lami(i + 1, j + 1, k + 7) +
-                                       px4[3] * _lami(i + 2, j + 1, k + 7))));
-    float twomu = 2 * nu * 1.0 /
-                  (phz4[0] * (phy4[2] * (px4[1] * _mui(i, j, k + 4) +
-                                         px4[0] * _mui(i - 1, j, k + 4) +
-                                         px4[2] * _mui(i + 1, j, k + 4) +
-                                         px4[3] * _mui(i + 2, j, k + 4)) +
-                              phy4[0] * (px4[1] * _mui(i, j - 2, k + 4) +
-                                         px4[0] * _mui(i - 1, j - 2, k + 4) +
-                                         px4[2] * _mui(i + 1, j - 2, k + 4) +
-                                         px4[3] * _mui(i + 2, j - 2, k + 4)) +
-                              phy4[1] * (px4[1] * _mui(i, j - 1, k + 4) +
-                                         px4[0] * _mui(i - 1, j - 1, k + 4) +
-                                         px4[2] * _mui(i + 1, j - 1, k + 4) +
-                                         px4[3] * _mui(i + 2, j - 1, k + 4)) +
-                              phy4[3] * (px4[1] * _mui(i, j + 1, k + 4) +
-                                         px4[0] * _mui(i - 1, j + 1, k + 4) +
-                                         px4[2] * _mui(i + 1, j + 1, k + 4) +
-                                         px4[3] * _mui(i + 2, j + 1, k + 4))) +
-                   phz4[1] * (phy4[2] * (px4[1] * _mui(i, j, k + 5) +
-                                         px4[0] * _mui(i - 1, j, k + 5) +
-                                         px4[2] * _mui(i + 1, j, k + 5) +
-                                         px4[3] * _mui(i + 2, j, k + 5)) +
-                              phy4[0] * (px4[1] * _mui(i, j - 2, k + 5) +
-                                         px4[0] * _mui(i - 1, j - 2, k + 5) +
-                                         px4[2] * _mui(i + 1, j - 2, k + 5) +
-                                         px4[3] * _mui(i + 2, j - 2, k + 5)) +
-                              phy4[1] * (px4[1] * _mui(i, j - 1, k + 5) +
-                                         px4[0] * _mui(i - 1, j - 1, k + 5) +
-                                         px4[2] * _mui(i + 1, j - 1, k + 5) +
-                                         px4[3] * _mui(i + 2, j - 1, k + 5)) +
-                              phy4[3] * (px4[1] * _mui(i, j + 1, k + 5) +
-                                         px4[0] * _mui(i - 1, j + 1, k + 5) +
-                                         px4[2] * _mui(i + 1, j + 1, k + 5) +
-                                         px4[3] * _mui(i + 2, j + 1, k + 5))) +
-                   phz4[2] * (phy4[2] * (px4[1] * _mui(i, j, k + 6) +
-                                         px4[0] * _mui(i - 1, j, k + 6) +
-                                         px4[2] * _mui(i + 1, j, k + 6) +
-                                         px4[3] * _mui(i + 2, j, k + 6)) +
-                              phy4[0] * (px4[1] * _mui(i, j - 2, k + 6) +
-                                         px4[0] * _mui(i - 1, j - 2, k + 6) +
-                                         px4[2] * _mui(i + 1, j - 2, k + 6) +
-                                         px4[3] * _mui(i + 2, j - 2, k + 6)) +
-                              phy4[1] * (px4[1] * _mui(i, j - 1, k + 6) +
-                                         px4[0] * _mui(i - 1, j - 1, k + 6) +
-                                         px4[2] * _mui(i + 1, j - 1, k + 6) +
-                                         px4[3] * _mui(i + 2, j - 1, k + 6)) +
-                              phy4[3] * (px4[1] * _mui(i, j + 1, k + 6) +
-                                         px4[0] * _mui(i - 1, j + 1, k + 6) +
-                                         px4[2] * _mui(i + 1, j + 1, k + 6) +
-                                         px4[3] * _mui(i + 2, j + 1, k + 6))) +
-                   phz4[3] * (phy4[2] * (px4[1] * _mui(i, j, k + 7) +
-                                         px4[0] * _mui(i - 1, j, k + 7) +
-                                         px4[2] * _mui(i + 1, j, k + 7) +
-                                         px4[3] * _mui(i + 2, j, k + 7)) +
-                              phy4[0] * (px4[1] * _mui(i, j - 2, k + 7) +
-                                         px4[0] * _mui(i - 1, j - 2, k + 7) +
-                                         px4[2] * _mui(i + 1, j - 2, k + 7) +
-                                         px4[3] * _mui(i + 2, j - 2, k + 7)) +
-                              phy4[1] * (px4[1] * _mui(i, j - 1, k + 7) +
-                                         px4[0] * _mui(i - 1, j - 1, k + 7) +
-                                         px4[2] * _mui(i + 1, j - 1, k + 7) +
-                                         px4[3] * _mui(i + 2, j - 1, k + 7)) +
-                              phy4[3] * (px4[1] * _mui(i, j + 1, k + 7) +
-                                         px4[0] * _mui(i - 1, j + 1, k + 7) +
-                                         px4[2] * _mui(i + 1, j + 1, k + 7) +
-                                         px4[3] * _mui(i + 2, j + 1, k + 7))));
-    float mu12 = nu * 1.0 /
-                 (phz4[0] * _mui(i, j, k + 4) + phz4[1] * _mui(i, j, k + 5) +
-                  phz4[2] * _mui(i, j, k + 6) + phz4[3] * _mui(i, j, k + 7));
-    float mu13 =
-        nu * 1.0 /
-        (phy4[2] * _mui(i, j, k + 6) + phy4[0] * _mui(i, j - 2, k + 6) +
-         phy4[1] * _mui(i, j - 1, k + 6) + phy4[3] * _mui(i, j + 1, k + 6));
-    float mu23 =
-        nu * 1.0 /
-        (px4[1] * _mui(i, j, k + 6) + px4[0] * _mui(i - 1, j, k + 6) +
-         px4[2] * _mui(i + 1, j, k + 6) + px4[3] * _mui(i + 2, j, k + 6));
-    float div =
-        dhy4[2] * _u2(i, j, k + 6) + dhy4[0] * _u2(i, j - 2, k + 6) +
-        dhy4[1] * _u2(i, j - 1, k + 6) + dhy4[3] * _u2(i, j + 1, k + 6) +
-        dx4[1] * _u1(i, j, k + 6) + dx4[0] * _u1(i - 1, j, k + 6) +
-        dx4[2] * _u1(i + 1, j, k + 6) + dx4[3] * _u1(i + 2, j, k + 6) +
-        Jii * (dhz4[0] * _u3(i, j, k + 4) + dhz4[1] * _u3(i, j, k + 5) +
-               dhz4[2] * _u3(i, j, k + 6) + dhz4[3] * _u3(i, j, k + 7)) -
-        Jii * _g_c(k + 6) *
-            (phy4[2] * _f2_2(i, j) *
-                 (phdz4[0] * _u2(i, j, k + 3) + phdz4[1] * _u2(i, j, k + 4) +
-                  phdz4[2] * _u2(i, j, k + 5) + phdz4[3] * _u2(i, j, k + 6) +
-                  phdz4[4] * _u2(i, j, k + 7) + phdz4[5] * _u2(i, j, k + 8) +
-                  phdz4[6] * _u2(i, j, k + 9)) +
-             phy4[0] * _f2_2(i, j - 2) *
-                 (phdz4[0] * _u2(i, j - 2, k + 3) +
-                  phdz4[1] * _u2(i, j - 2, k + 4) +
-                  phdz4[2] * _u2(i, j - 2, k + 5) +
-                  phdz4[3] * _u2(i, j - 2, k + 6) +
-                  phdz4[4] * _u2(i, j - 2, k + 7) +
-                  phdz4[5] * _u2(i, j - 2, k + 8) +
-                  phdz4[6] * _u2(i, j - 2, k + 9)) +
-             phy4[1] * _f2_2(i, j - 1) *
-                 (phdz4[0] * _u2(i, j - 1, k + 3) +
-                  phdz4[1] * _u2(i, j - 1, k + 4) +
-                  phdz4[2] * _u2(i, j - 1, k + 5) +
-                  phdz4[3] * _u2(i, j - 1, k + 6) +
-                  phdz4[4] * _u2(i, j - 1, k + 7) +
-                  phdz4[5] * _u2(i, j - 1, k + 8) +
-                  phdz4[6] * _u2(i, j - 1, k + 9)) +
-             phy4[3] * _f2_2(i, j + 1) *
-                 (phdz4[0] * _u2(i, j + 1, k + 3) +
-                  phdz4[1] * _u2(i, j + 1, k + 4) +
-                  phdz4[2] * _u2(i, j + 1, k + 5) +
-                  phdz4[3] * _u2(i, j + 1, k + 6) +
-                  phdz4[4] * _u2(i, j + 1, k + 7) +
-                  phdz4[5] * _u2(i, j + 1, k + 8) +
-                  phdz4[6] * _u2(i, j + 1, k + 9))) -
-        Jii * _g_c(k + 6) *
-            (px4[1] * _f1_1(i, j) *
-                 (phdz4[0] * _u1(i, j, k + 3) + phdz4[1] * _u1(i, j, k + 4) +
-                  phdz4[2] * _u1(i, j, k + 5) + phdz4[3] * _u1(i, j, k + 6) +
-                  phdz4[4] * _u1(i, j, k + 7) + phdz4[5] * _u1(i, j, k + 8) +
-                  phdz4[6] * _u1(i, j, k + 9)) +
-             px4[0] * _f1_1(i - 1, j) *
-                 (phdz4[0] * _u1(i - 1, j, k + 3) +
-                  phdz4[1] * _u1(i - 1, j, k + 4) +
-                  phdz4[2] * _u1(i - 1, j, k + 5) +
-                  phdz4[3] * _u1(i - 1, j, k + 6) +
-                  phdz4[4] * _u1(i - 1, j, k + 7) +
-                  phdz4[5] * _u1(i - 1, j, k + 8) +
-                  phdz4[6] * _u1(i - 1, j, k + 9)) +
-             px4[2] * _f1_1(i + 1, j) *
-                 (phdz4[0] * _u1(i + 1, j, k + 3) +
-                  phdz4[1] * _u1(i + 1, j, k + 4) +
-                  phdz4[2] * _u1(i + 1, j, k + 5) +
-                  phdz4[3] * _u1(i + 1, j, k + 6) +
-                  phdz4[4] * _u1(i + 1, j, k + 7) +
-                  phdz4[5] * _u1(i + 1, j, k + 8) +
-                  phdz4[6] * _u1(i + 1, j, k + 9)) +
-             px4[3] * _f1_1(i + 2, j) *
-                 (phdz4[0] * _u1(i + 2, j, k + 3) +
-                  phdz4[1] * _u1(i + 2, j, k + 4) +
-                  phdz4[2] * _u1(i + 2, j, k + 5) +
-                  phdz4[3] * _u1(i + 2, j, k + 6) +
-                  phdz4[4] * _u1(i + 2, j, k + 7) +
-                  phdz4[5] * _u1(i + 2, j, k + 8) +
-                  phdz4[6] * _u1(i + 2, j, k + 9)));
-    float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k + 6);
-    _s11(i, j, k + 6) =
-        (a * _s11(i, j, k + 6) + lam * div +
-         twomu *
-             (dx4[1] * _u1(i, j, k + 6) + dx4[0] * _u1(i - 1, j, k + 6) +
-              dx4[2] * _u1(i + 1, j, k + 6) + dx4[3] * _u1(i + 2, j, k + 6)) -
-         twomu * Jii * _g_c(k + 6) *
-             (px4[1] * _f1_1(i, j) *
-                  (phdz4[0] * _u1(i, j, k + 3) + phdz4[1] * _u1(i, j, k + 4) +
-                   phdz4[2] * _u1(i, j, k + 5) + phdz4[3] * _u1(i, j, k + 6) +
-                   phdz4[4] * _u1(i, j, k + 7) + phdz4[5] * _u1(i, j, k + 8) +
-                   phdz4[6] * _u1(i, j, k + 9)) +
-              px4[0] * _f1_1(i - 1, j) *
-                  (phdz4[0] * _u1(i - 1, j, k + 3) +
-                   phdz4[1] * _u1(i - 1, j, k + 4) +
-                   phdz4[2] * _u1(i - 1, j, k + 5) +
-                   phdz4[3] * _u1(i - 1, j, k + 6) +
-                   phdz4[4] * _u1(i - 1, j, k + 7) +
-                   phdz4[5] * _u1(i - 1, j, k + 8) +
-                   phdz4[6] * _u1(i - 1, j, k + 9)) +
-              px4[2] * _f1_1(i + 1, j) *
-                  (phdz4[0] * _u1(i + 1, j, k + 3) +
-                   phdz4[1] * _u1(i + 1, j, k + 4) +
-                   phdz4[2] * _u1(i + 1, j, k + 5) +
-                   phdz4[3] * _u1(i + 1, j, k + 6) +
-                   phdz4[4] * _u1(i + 1, j, k + 7) +
-                   phdz4[5] * _u1(i + 1, j, k + 8) +
-                   phdz4[6] * _u1(i + 1, j, k + 9)) +
-              px4[3] * _f1_1(i + 2, j) *
-                  (phdz4[0] * _u1(i + 2, j, k + 3) +
-                   phdz4[1] * _u1(i + 2, j, k + 4) +
-                   phdz4[2] * _u1(i + 2, j, k + 5) +
-                   phdz4[3] * _u1(i + 2, j, k + 6) +
-                   phdz4[4] * _u1(i + 2, j, k + 7) +
-                   phdz4[5] * _u1(i + 2, j, k + 8) +
-                   phdz4[6] * _u1(i + 2, j, k + 9)))) *
-        f_dcrj;
-    _s22(i, j, k + 6) =
-        (a * _s22(i, j, k + 6) + lam * div +
-         twomu *
-             (dhy4[2] * _u2(i, j, k + 6) + dhy4[0] * _u2(i, j - 2, k + 6) +
-              dhy4[1] * _u2(i, j - 1, k + 6) + dhy4[3] * _u2(i, j + 1, k + 6)) -
-         twomu * Jii * _g_c(k + 6) *
-             (phy4[2] * _f2_2(i, j) *
-                  (phdz4[0] * _u2(i, j, k + 3) + phdz4[1] * _u2(i, j, k + 4) +
-                   phdz4[2] * _u2(i, j, k + 5) + phdz4[3] * _u2(i, j, k + 6) +
-                   phdz4[4] * _u2(i, j, k + 7) + phdz4[5] * _u2(i, j, k + 8) +
-                   phdz4[6] * _u2(i, j, k + 9)) +
-              phy4[0] * _f2_2(i, j - 2) *
-                  (phdz4[0] * _u2(i, j - 2, k + 3) +
-                   phdz4[1] * _u2(i, j - 2, k + 4) +
-                   phdz4[2] * _u2(i, j - 2, k + 5) +
-                   phdz4[3] * _u2(i, j - 2, k + 6) +
-                   phdz4[4] * _u2(i, j - 2, k + 7) +
-                   phdz4[5] * _u2(i, j - 2, k + 8) +
-                   phdz4[6] * _u2(i, j - 2, k + 9)) +
-              phy4[1] * _f2_2(i, j - 1) *
-                  (phdz4[0] * _u2(i, j - 1, k + 3) +
-                   phdz4[1] * _u2(i, j - 1, k + 4) +
-                   phdz4[2] * _u2(i, j - 1, k + 5) +
-                   phdz4[3] * _u2(i, j - 1, k + 6) +
-                   phdz4[4] * _u2(i, j - 1, k + 7) +
-                   phdz4[5] * _u2(i, j - 1, k + 8) +
-                   phdz4[6] * _u2(i, j - 1, k + 9)) +
-              phy4[3] * _f2_2(i, j + 1) *
-                  (phdz4[0] * _u2(i, j + 1, k + 3) +
-                   phdz4[1] * _u2(i, j + 1, k + 4) +
-                   phdz4[2] * _u2(i, j + 1, k + 5) +
-                   phdz4[3] * _u2(i, j + 1, k + 6) +
-                   phdz4[4] * _u2(i, j + 1, k + 7) +
-                   phdz4[5] * _u2(i, j + 1, k + 8) +
-                   phdz4[6] * _u2(i, j + 1, k + 9)))) *
-        f_dcrj;
-    _s33(i, j, k + 6) =
-        (a * _s33(i, j, k + 6) + lam * div +
-         twomu * Jii *
-             (dhz4[0] * _u3(i, j, k + 4) + dhz4[1] * _u3(i, j, k + 5) +
-              dhz4[2] * _u3(i, j, k + 6) + dhz4[3] * _u3(i, j, k + 7))) *
-        f_dcrj;
-    _s12(i, j, k + 6) =
-        (a * _s12(i, j, k + 6) +
-         mu12 *
-             (dhx4[2] * _u2(i, j, k + 6) + dhx4[0] * _u2(i - 2, j, k + 6) +
-              dhx4[1] * _u2(i - 1, j, k + 6) + dhx4[3] * _u2(i + 1, j, k + 6) +
-              dy4[1] * _u1(i, j, k + 6) + dy4[0] * _u1(i, j - 1, k + 6) +
-              dy4[2] * _u1(i, j + 1, k + 6) + dy4[3] * _u1(i, j + 2, k + 6) -
-              J12i * _g_c(k + 6) *
-                  (phx4[2] * _f1_2(i, j) *
-                       (phdz4[0] * _u2(i, j, k + 3) +
-                        phdz4[1] * _u2(i, j, k + 4) +
-                        phdz4[2] * _u2(i, j, k + 5) +
-                        phdz4[3] * _u2(i, j, k + 6) +
-                        phdz4[4] * _u2(i, j, k + 7) +
-                        phdz4[5] * _u2(i, j, k + 8) +
-                        phdz4[6] * _u2(i, j, k + 9)) +
-                   phx4[0] * _f1_2(i - 2, j) *
-                       (phdz4[0] * _u2(i - 2, j, k + 3) +
-                        phdz4[1] * _u2(i - 2, j, k + 4) +
-                        phdz4[2] * _u2(i - 2, j, k + 5) +
-                        phdz4[3] * _u2(i - 2, j, k + 6) +
-                        phdz4[4] * _u2(i - 2, j, k + 7) +
-                        phdz4[5] * _u2(i - 2, j, k + 8) +
-                        phdz4[6] * _u2(i - 2, j, k + 9)) +
-                   phx4[1] * _f1_2(i - 1, j) *
-                       (phdz4[0] * _u2(i - 1, j, k + 3) +
-                        phdz4[1] * _u2(i - 1, j, k + 4) +
-                        phdz4[2] * _u2(i - 1, j, k + 5) +
-                        phdz4[3] * _u2(i - 1, j, k + 6) +
-                        phdz4[4] * _u2(i - 1, j, k + 7) +
-                        phdz4[5] * _u2(i - 1, j, k + 8) +
-                        phdz4[6] * _u2(i - 1, j, k + 9)) +
-                   phx4[3] * _f1_2(i + 1, j) *
-                       (phdz4[0] * _u2(i + 1, j, k + 3) +
-                        phdz4[1] * _u2(i + 1, j, k + 4) +
-                        phdz4[2] * _u2(i + 1, j, k + 5) +
-                        phdz4[3] * _u2(i + 1, j, k + 6) +
-                        phdz4[4] * _u2(i + 1, j, k + 7) +
-                        phdz4[5] * _u2(i + 1, j, k + 8) +
-                        phdz4[6] * _u2(i + 1, j, k + 9))) -
-              J12i * _g_c(k + 6) *
-                  (py4[1] * _f2_1(i, j) *
-                       (phdz4[0] * _u1(i, j, k + 3) +
-                        phdz4[1] * _u1(i, j, k + 4) +
-                        phdz4[2] * _u1(i, j, k + 5) +
-                        phdz4[3] * _u1(i, j, k + 6) +
-                        phdz4[4] * _u1(i, j, k + 7) +
-                        phdz4[5] * _u1(i, j, k + 8) +
-                        phdz4[6] * _u1(i, j, k + 9)) +
-                   py4[0] * _f2_1(i, j - 1) *
-                       (phdz4[0] * _u1(i, j - 1, k + 3) +
-                        phdz4[1] * _u1(i, j - 1, k + 4) +
-                        phdz4[2] * _u1(i, j - 1, k + 5) +
-                        phdz4[3] * _u1(i, j - 1, k + 6) +
-                        phdz4[4] * _u1(i, j - 1, k + 7) +
-                        phdz4[5] * _u1(i, j - 1, k + 8) +
-                        phdz4[6] * _u1(i, j - 1, k + 9)) +
-                   py4[2] * _f2_1(i, j + 1) *
-                       (phdz4[0] * _u1(i, j + 1, k + 3) +
-                        phdz4[1] * _u1(i, j + 1, k + 4) +
-                        phdz4[2] * _u1(i, j + 1, k + 5) +
-                        phdz4[3] * _u1(i, j + 1, k + 6) +
-                        phdz4[4] * _u1(i, j + 1, k + 7) +
-                        phdz4[5] * _u1(i, j + 1, k + 8) +
-                        phdz4[6] * _u1(i, j + 1, k + 9)) +
-                   py4[3] * _f2_1(i, j + 2) *
-                       (phdz4[0] * _u1(i, j + 2, k + 3) +
-                        phdz4[1] * _u1(i, j + 2, k + 4) +
-                        phdz4[2] * _u1(i, j + 2, k + 5) +
-                        phdz4[3] * _u1(i, j + 2, k + 6) +
-                        phdz4[4] * _u1(i, j + 2, k + 7) +
-                        phdz4[5] * _u1(i, j + 2, k + 8) +
-                        phdz4[6] * _u1(i, j + 2, k + 9))))) *
-        f_dcrj;
-    _s13(i, j, k + 6) =
-        (a * _s13(i, j, k + 6) +
-         mu13 *
-             (dhx4[2] * _u3(i, j, k + 6) + dhx4[0] * _u3(i - 2, j, k + 6) +
-              dhx4[1] * _u3(i - 1, j, k + 6) + dhx4[3] * _u3(i + 1, j, k + 6) +
-              J13i * (dz4[0] * _u1(i, j, k + 5) + dz4[1] * _u1(i, j, k + 6) +
-                      dz4[2] * _u1(i, j, k + 7) + dz4[3] * _u1(i, j, k + 8)) -
-              J13i * _g(k + 6) *
-                  (phx4[2] * _f1_c(i, j) *
-                       (pdhz4[0] * _u3(i, j, k + 3) +
-                        pdhz4[1] * _u3(i, j, k + 4) +
-                        pdhz4[2] * _u3(i, j, k + 5) +
-                        pdhz4[3] * _u3(i, j, k + 6) +
-                        pdhz4[4] * _u3(i, j, k + 7) +
-                        pdhz4[5] * _u3(i, j, k + 8) +
-                        pdhz4[6] * _u3(i, j, k + 9)) +
-                   phx4[0] * _f1_c(i - 2, j) *
-                       (pdhz4[0] * _u3(i - 2, j, k + 3) +
-                        pdhz4[1] * _u3(i - 2, j, k + 4) +
-                        pdhz4[2] * _u3(i - 2, j, k + 5) +
-                        pdhz4[3] * _u3(i - 2, j, k + 6) +
-                        pdhz4[4] * _u3(i - 2, j, k + 7) +
-                        pdhz4[5] * _u3(i - 2, j, k + 8) +
-                        pdhz4[6] * _u3(i - 2, j, k + 9)) +
-                   phx4[1] * _f1_c(i - 1, j) *
-                       (pdhz4[0] * _u3(i - 1, j, k + 3) +
-                        pdhz4[1] * _u3(i - 1, j, k + 4) +
-                        pdhz4[2] * _u3(i - 1, j, k + 5) +
-                        pdhz4[3] * _u3(i - 1, j, k + 6) +
-                        pdhz4[4] * _u3(i - 1, j, k + 7) +
-                        pdhz4[5] * _u3(i - 1, j, k + 8) +
-                        pdhz4[6] * _u3(i - 1, j, k + 9)) +
-                   phx4[3] * _f1_c(i + 1, j) *
-                       (pdhz4[0] * _u3(i + 1, j, k + 3) +
-                        pdhz4[1] * _u3(i + 1, j, k + 4) +
-                        pdhz4[2] * _u3(i + 1, j, k + 5) +
-                        pdhz4[3] * _u3(i + 1, j, k + 6) +
-                        pdhz4[4] * _u3(i + 1, j, k + 7) +
-                        pdhz4[5] * _u3(i + 1, j, k + 8) +
-                        pdhz4[6] * _u3(i + 1, j, k + 9))))) *
-        f_dcrj;
-    _s23(i, j, k + 6) =
-        (a * _s23(i, j, k + 6) +
-         mu23 *
-             (dy4[1] * _u3(i, j, k + 6) + dy4[0] * _u3(i, j - 1, k + 6) +
-              dy4[2] * _u3(i, j + 1, k + 6) + dy4[3] * _u3(i, j + 2, k + 6) +
-              J23i * (dz4[0] * _u2(i, j, k + 5) + dz4[1] * _u2(i, j, k + 6) +
-                      dz4[2] * _u2(i, j, k + 7) + dz4[3] * _u2(i, j, k + 8)) -
-              J23i * _g(k + 6) *
-                  (py4[1] * _f2_c(i, j) *
-                       (pdhz4[0] * _u3(i, j, k + 3) +
-                        pdhz4[1] * _u3(i, j, k + 4) +
-                        pdhz4[2] * _u3(i, j, k + 5) +
-                        pdhz4[3] * _u3(i, j, k + 6) +
-                        pdhz4[4] * _u3(i, j, k + 7) +
-                        pdhz4[5] * _u3(i, j, k + 8) +
-                        pdhz4[6] * _u3(i, j, k + 9)) +
-                   py4[0] * _f2_c(i, j - 1) *
-                       (pdhz4[0] * _u3(i, j - 1, k + 3) +
-                        pdhz4[1] * _u3(i, j - 1, k + 4) +
-                        pdhz4[2] * _u3(i, j - 1, k + 5) +
-                        pdhz4[3] * _u3(i, j - 1, k + 6) +
-                        pdhz4[4] * _u3(i, j - 1, k + 7) +
-                        pdhz4[5] * _u3(i, j - 1, k + 8) +
-                        pdhz4[6] * _u3(i, j - 1, k + 9)) +
-                   py4[2] * _f2_c(i, j + 1) *
-                       (pdhz4[0] * _u3(i, j + 1, k + 3) +
-                        pdhz4[1] * _u3(i, j + 1, k + 4) +
-                        pdhz4[2] * _u3(i, j + 1, k + 5) +
-                        pdhz4[3] * _u3(i, j + 1, k + 6) +
-                        pdhz4[4] * _u3(i, j + 1, k + 7) +
-                        pdhz4[5] * _u3(i, j + 1, k + 8) +
-                        pdhz4[6] * _u3(i, j + 1, k + 9)) +
-                   py4[3] * _f2_c(i, j + 2) *
-                       (pdhz4[0] * _u3(i, j + 2, k + 3) +
-                        pdhz4[1] * _u3(i, j + 2, k + 4) +
-                        pdhz4[2] * _u3(i, j + 2, k + 5) +
-                        pdhz4[3] * _u3(i, j + 2, k + 6) +
-                        pdhz4[4] * _u3(i, j + 2, k + 7) +
-                        pdhz4[5] * _u3(i, j + 2, k + 8) +
-                        pdhz4[6] * _u3(i, j + 2, k + 9))))) *
-        f_dcrj;
-  }
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
-#undef _f_1
-#undef _f_2
-#undef _f_c
-#undef _g
-#undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _lami
-#undef _mui
-#undef _s11
-#undef _s12
-#undef _s13
-#undef _s22
-#undef _s23
-#undef _s33
-#undef _u1
-#undef _u2
-#undef _u3
-}
-
-__global__ void dtopo_str_112(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej) {
-  const float phz4r[6][8] = {
-      {0.0000000000000000, 0.8338228784688313, 0.1775123316429260,
-       0.1435067013076542, -0.1548419114194114, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.1813404047323969, 1.1246711188154426,
-       -0.2933634518280757, -0.0126480717197637, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, -0.1331142706282399, 0.7930714675884345,
-       0.3131998767078508, 0.0268429263319546, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.0969078556633046, -0.1539344946680898,
-       0.4486491202844389, 0.6768738207821733, -0.0684963020618270,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       -0.0625000000000000, 0.5625000000000000, 0.5625000000000000,
-       -0.0625000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, -0.0625000000000000, 0.5625000000000000,
-       0.5625000000000000, -0.0625000000000000}};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
-                        0.5625000000000000, -0.0625000000000000};
-  const float dhz4r[6][8] = {
-      {0.0000000000000000, 1.4511412472637157, -1.8534237417911470,
-       0.3534237417911469, 0.0488587527362844, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.8577143189081458, -0.5731429567244373,
-       -0.4268570432755628, 0.1422856810918542, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.1674548505882877, 0.4976354482351368,
-       -0.4976354482351368, -0.1674548505882877, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, -0.1027061113405124, 0.2624541326469860,
-       0.8288742701021167, -1.0342864927831414, 0.0456642013745513,
-       0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
-       0.0416666666666667, 0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, -0.0416666666666667, 1.1250000000000000,
-       -1.1250000000000000, 0.0416666666666667}};
-  const float phdz4r[6][9] = {
-      {1.5373923010673116, -1.0330083346742178, -0.6211677623382129,
-       -0.0454110758451345, 0.1680934225988761, -0.0058985508086226,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.8713921425924012, -0.1273679143938725, -0.9297550647681331,
-       0.1912595577524762, -0.0050469052908678, -0.0004818158920039,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.0563333965151294, 0.3996393739211770, 0.0536007135209481,
-       -0.5022638816465500, -0.0083321572725344, 0.0010225549618299,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.0132930497153990, -0.0706942590708847, 0.5596445380498726,
-       0.1434031863528334, -0.7456356868769503, 0.1028431844156395,
-       -0.0028540125859095, 0.0000000000000000, 0.0000000000000000},
-      {0.0025849423769932, -0.0492307522105194, 0.0524552477068130,
-       0.5317248489238559, 0.0530169938441240, -0.6816971139746001,
-       0.0937500000000000, -0.0026041666666667, 0.0000000000000000},
-      {0.0009619461344193, 0.0035553215968974, -0.0124936029037323,
-       -0.0773639466787397, 0.6736586580761996, 0.0002232904416222,
-       -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
-                         0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
-                        0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
-                        1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
-                         1.1250000000000000, -0.0416666666666667};
-  const float dz4r[6][7] = {
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {1.7779989465546748, -1.3337480247900155, -0.7775013168066564,
-       0.3332503950419969, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {0.4410217341392059, 0.1730842484889890, -0.4487228323259926,
-       -0.1653831503022022, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000},
-      {-0.1798793213882701, 0.2757257254150788, 0.9597948548284453,
-       -1.1171892610431817, 0.0615480021879277, 0.0000000000000000,
-       0.0000000000000000},
-      {-0.0153911381507088, -0.0568851455503591, 0.1998976464597171,
-       0.8628231468598346, -1.0285385292191949, 0.0380940196007109,
-       0.0000000000000000},
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
-       0.0416666666666667}};
-  const float pdhz4r[6][9] = {
-      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 1.5886075042755419, -2.2801810182668114,
-       0.8088980291471826, -0.1316830205960989, 0.0143585054401857,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, 0.4823226655921295, 0.0574614517751295,
-       -0.5663203488781653, 0.0309656800624243, -0.0044294485515179,
-       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, -0.0174954311279016, 0.4325508330649349,
-       0.3111668377093504, -0.8538512002386446, 0.1314757107290064,
-       -0.0038467501367455, 0.0000000000000000, 0.0000000000000000},
-      {0.0000000000000000, -0.1277481742492071, 0.2574468839590017,
-       0.4155794781917712, -0.0115571196122084, -0.6170517361659126,
-       0.0857115441015996, -0.0023808762250444, 0.0000000000000000},
-      {0.0000000000000000, 0.0064191319587820, -0.0164033832904366,
-       -0.0752421418813823, 0.6740179057989464, -0.0002498459192428,
-       -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ngsl + ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= 6)
-    return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_1(i, j)                                                             \
-  f_1[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_2(i, j)                                                             \
-  f_2[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_c(i, j)                                                             \
-  f_c[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
-#define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _lami(i, j, k)                                                         \
-  lami[(k) + align +                                                           \
-       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +             \
-       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _mui(i, j, k)                                                          \
-  mui[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s11(i, j, k)                                                          \
-  s11[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s12(i, j, k)                                                          \
-  s12[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s33(i, j, k)                                                          \
-  s33[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u3(i, j, k)                                                           \
-  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-  for (int i = bi; i < ei; ++i) {
-    float Jii = _f_c(i, j) * _g3_c(nz - 1 - k);
-    Jii = 1.0 * 1.0 / Jii;
-    float J12i = _f(i, j) * _g3_c(nz - 1 - k);
-    J12i = 1.0 * 1.0 / J12i;
-    float J13i = _f_1(i, j) * _g3(nz - 1 - k);
-    J13i = 1.0 * 1.0 / J13i;
-    float J23i = _f_2(i, j) * _g3(nz - 1 - k);
-    J23i = 1.0 * 1.0 / J23i;
-    float lam =
-        nu * 1.0 /
-        (phz4r[k][7] * (phy4[2] * (px4[1] * _lami(i, j, nz - 8) +
-                                   px4[0] * _lami(i - 1, j, nz - 8) +
-                                   px4[2] * _lami(i + 1, j, nz - 8) +
-                                   px4[3] * _lami(i + 2, j, nz - 8)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 8) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 8) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 8) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 8)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 8) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 8) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 8) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 8)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 8) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 8) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 8) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 8))) +
-         phz4r[k][6] * (phy4[2] * (px4[1] * _lami(i, j, nz - 7) +
-                                   px4[0] * _lami(i - 1, j, nz - 7) +
-                                   px4[2] * _lami(i + 1, j, nz - 7) +
-                                   px4[3] * _lami(i + 2, j, nz - 7)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 7) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 7) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 7) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 7)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 7) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 7) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 7) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 7)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 7) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 7) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 7) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 7))) +
-         phz4r[k][5] * (phy4[2] * (px4[1] * _lami(i, j, nz - 6) +
-                                   px4[0] * _lami(i - 1, j, nz - 6) +
-                                   px4[2] * _lami(i + 1, j, nz - 6) +
-                                   px4[3] * _lami(i + 2, j, nz - 6)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 6) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 6) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 6) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 6)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 6) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 6) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 6) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 6)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 6) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 6) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 6) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 6))) +
-         phz4r[k][4] * (phy4[2] * (px4[1] * _lami(i, j, nz - 5) +
-                                   px4[0] * _lami(i - 1, j, nz - 5) +
-                                   px4[2] * _lami(i + 1, j, nz - 5) +
-                                   px4[3] * _lami(i + 2, j, nz - 5)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 5) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 5) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 5) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 5)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 5) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 5) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 5) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 5)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 5) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 5) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 5) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 5))) +
-         phz4r[k][3] * (phy4[2] * (px4[1] * _lami(i, j, nz - 4) +
-                                   px4[0] * _lami(i - 1, j, nz - 4) +
-                                   px4[2] * _lami(i + 1, j, nz - 4) +
-                                   px4[3] * _lami(i + 2, j, nz - 4)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 4) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 4) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 4) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 4)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 4) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 4) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 4) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 4)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 4) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 4) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 4) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 4))) +
-         phz4r[k][2] * (phy4[2] * (px4[1] * _lami(i, j, nz - 3) +
-                                   px4[0] * _lami(i - 1, j, nz - 3) +
-                                   px4[2] * _lami(i + 1, j, nz - 3) +
-                                   px4[3] * _lami(i + 2, j, nz - 3)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 3) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 3) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 3) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 3)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 3) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 3) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 3) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 3)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 3) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 3) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 3) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 3))) +
-         phz4r[k][1] * (phy4[2] * (px4[1] * _lami(i, j, nz - 2) +
-                                   px4[0] * _lami(i - 1, j, nz - 2) +
-                                   px4[2] * _lami(i + 1, j, nz - 2) +
-                                   px4[3] * _lami(i + 2, j, nz - 2)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 2) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 2) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 2) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 2)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 2) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 2) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 2) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 2)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 2) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 2) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 2) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 2))) +
-         phz4r[k][0] * (phy4[2] * (px4[1] * _lami(i, j, nz - 1) +
-                                   px4[0] * _lami(i - 1, j, nz - 1) +
-                                   px4[2] * _lami(i + 1, j, nz - 1) +
-                                   px4[3] * _lami(i + 2, j, nz - 1)) +
-                        phy4[0] * (px4[1] * _lami(i, j - 2, nz - 1) +
-                                   px4[0] * _lami(i - 1, j - 2, nz - 1) +
-                                   px4[2] * _lami(i + 1, j - 2, nz - 1) +
-                                   px4[3] * _lami(i + 2, j - 2, nz - 1)) +
-                        phy4[1] * (px4[1] * _lami(i, j - 1, nz - 1) +
-                                   px4[0] * _lami(i - 1, j - 1, nz - 1) +
-                                   px4[2] * _lami(i + 1, j - 1, nz - 1) +
-                                   px4[3] * _lami(i + 2, j - 1, nz - 1)) +
-                        phy4[3] * (px4[1] * _lami(i, j + 1, nz - 1) +
-                                   px4[0] * _lami(i - 1, j + 1, nz - 1) +
-                                   px4[2] * _lami(i + 1, j + 1, nz - 1) +
-                                   px4[3] * _lami(i + 2, j + 1, nz - 1))));
-    float twomu =
-        2 * nu * 1.0 /
-        (phz4r[k][7] * (phy4[2] * (px4[1] * _mui(i, j, nz - 8) +
-                                   px4[0] * _mui(i - 1, j, nz - 8) +
-                                   px4[2] * _mui(i + 1, j, nz - 8) +
-                                   px4[3] * _mui(i + 2, j, nz - 8)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 8) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 8) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 8) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 8)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 8) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 8) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 8) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 8)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 8) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 8) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 8) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 8))) +
-         phz4r[k][6] * (phy4[2] * (px4[1] * _mui(i, j, nz - 7) +
-                                   px4[0] * _mui(i - 1, j, nz - 7) +
-                                   px4[2] * _mui(i + 1, j, nz - 7) +
-                                   px4[3] * _mui(i + 2, j, nz - 7)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 7) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 7) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 7) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 7)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 7) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 7) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 7) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 7)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 7) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 7) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 7) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 7))) +
-         phz4r[k][5] * (phy4[2] * (px4[1] * _mui(i, j, nz - 6) +
-                                   px4[0] * _mui(i - 1, j, nz - 6) +
-                                   px4[2] * _mui(i + 1, j, nz - 6) +
-                                   px4[3] * _mui(i + 2, j, nz - 6)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 6) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 6) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 6) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 6)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 6) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 6) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 6) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 6)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 6) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 6) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 6) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 6))) +
-         phz4r[k][4] * (phy4[2] * (px4[1] * _mui(i, j, nz - 5) +
-                                   px4[0] * _mui(i - 1, j, nz - 5) +
-                                   px4[2] * _mui(i + 1, j, nz - 5) +
-                                   px4[3] * _mui(i + 2, j, nz - 5)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 5) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 5) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 5) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 5)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 5) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 5) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 5) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 5)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 5) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 5) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 5) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 5))) +
-         phz4r[k][3] * (phy4[2] * (px4[1] * _mui(i, j, nz - 4) +
-                                   px4[0] * _mui(i - 1, j, nz - 4) +
-                                   px4[2] * _mui(i + 1, j, nz - 4) +
-                                   px4[3] * _mui(i + 2, j, nz - 4)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 4) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 4) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 4) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 4)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 4) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 4) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 4) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 4)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 4) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 4) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 4) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 4))) +
-         phz4r[k][2] * (phy4[2] * (px4[1] * _mui(i, j, nz - 3) +
-                                   px4[0] * _mui(i - 1, j, nz - 3) +
-                                   px4[2] * _mui(i + 1, j, nz - 3) +
-                                   px4[3] * _mui(i + 2, j, nz - 3)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 3) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 3) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 3) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 3)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 3) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 3) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 3) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 3)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 3) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 3) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 3) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 3))) +
-         phz4r[k][1] * (phy4[2] * (px4[1] * _mui(i, j, nz - 2) +
-                                   px4[0] * _mui(i - 1, j, nz - 2) +
-                                   px4[2] * _mui(i + 1, j, nz - 2) +
-                                   px4[3] * _mui(i + 2, j, nz - 2)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 2) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 2) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 2) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 2)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 2) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 2) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 2) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 2)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 2) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 2) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 2) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 2))) +
-         phz4r[k][0] * (phy4[2] * (px4[1] * _mui(i, j, nz - 1) +
-                                   px4[0] * _mui(i - 1, j, nz - 1) +
-                                   px4[2] * _mui(i + 1, j, nz - 1) +
-                                   px4[3] * _mui(i + 2, j, nz - 1)) +
-                        phy4[0] * (px4[1] * _mui(i, j - 2, nz - 1) +
-                                   px4[0] * _mui(i - 1, j - 2, nz - 1) +
-                                   px4[2] * _mui(i + 1, j - 2, nz - 1) +
-                                   px4[3] * _mui(i + 2, j - 2, nz - 1)) +
-                        phy4[1] * (px4[1] * _mui(i, j - 1, nz - 1) +
-                                   px4[0] * _mui(i - 1, j - 1, nz - 1) +
-                                   px4[2] * _mui(i + 1, j - 1, nz - 1) +
-                                   px4[3] * _mui(i + 2, j - 1, nz - 1)) +
-                        phy4[3] * (px4[1] * _mui(i, j + 1, nz - 1) +
-                                   px4[0] * _mui(i - 1, j + 1, nz - 1) +
-                                   px4[2] * _mui(i + 1, j + 1, nz - 1) +
-                                   px4[3] * _mui(i + 2, j + 1, nz - 1))));
-    float mu12 =
-        nu * 1.0 /
-        (phz4r[k][7] * _mui(i, j, nz - 8) + phz4r[k][6] * _mui(i, j, nz - 7) +
-         phz4r[k][5] * _mui(i, j, nz - 6) + phz4r[k][4] * _mui(i, j, nz - 5) +
-         phz4r[k][3] * _mui(i, j, nz - 4) + phz4r[k][2] * _mui(i, j, nz - 3) +
-         phz4r[k][1] * _mui(i, j, nz - 2) + phz4r[k][0] * _mui(i, j, nz - 1));
-    float mu13 = nu * 1.0 /
-                 (phy4[2] * _mui(i, j, nz - 1 - k) +
-                  phy4[0] * _mui(i, j - 2, nz - 1 - k) +
-                  phy4[1] * _mui(i, j - 1, nz - 1 - k) +
-                  phy4[3] * _mui(i, j + 1, nz - 1 - k));
-    float mu23 =
-        nu * 1.0 /
-        (px4[1] * _mui(i, j, nz - 1 - k) + px4[0] * _mui(i - 1, j, nz - 1 - k) +
-         px4[2] * _mui(i + 1, j, nz - 1 - k) +
-         px4[3] * _mui(i + 2, j, nz - 1 - k));
-    float div =
-        dhy4[2] * _u2(i, j, nz - 1 - k) + dhy4[0] * _u2(i, j - 2, nz - 1 - k) +
-        dhy4[1] * _u2(i, j - 1, nz - 1 - k) +
-        dhy4[3] * _u2(i, j + 1, nz - 1 - k) + dx4[1] * _u1(i, j, nz - 1 - k) +
-        dx4[0] * _u1(i - 1, j, nz - 1 - k) +
-        dx4[2] * _u1(i + 1, j, nz - 1 - k) +
-        dx4[3] * _u1(i + 2, j, nz - 1 - k) +
-        Jii *
-            (dhz4r[k][7] * _u3(i, j, nz - 8) + dhz4r[k][6] * _u3(i, j, nz - 7) +
-             dhz4r[k][5] * _u3(i, j, nz - 6) + dhz4r[k][4] * _u3(i, j, nz - 5) +
-             dhz4r[k][3] * _u3(i, j, nz - 4) + dhz4r[k][2] * _u3(i, j, nz - 3) +
-             dhz4r[k][1] * _u3(i, j, nz - 2) +
-             dhz4r[k][0] * _u3(i, j, nz - 1)) -
-        Jii * _g_c(nz - 1 - k) *
-            (phy4[2] * _f2_2(i, j) *
-                 (phdz4r[k][8] * _u2(i, j, nz - 9) +
-                  phdz4r[k][7] * _u2(i, j, nz - 8) +
-                  phdz4r[k][6] * _u2(i, j, nz - 7) +
-                  phdz4r[k][5] * _u2(i, j, nz - 6) +
-                  phdz4r[k][4] * _u2(i, j, nz - 5) +
-                  phdz4r[k][3] * _u2(i, j, nz - 4) +
-                  phdz4r[k][2] * _u2(i, j, nz - 3) +
-                  phdz4r[k][1] * _u2(i, j, nz - 2) +
-                  phdz4r[k][0] * _u2(i, j, nz - 1)) +
-             phy4[0] * _f2_2(i, j - 2) *
-                 (phdz4r[k][8] * _u2(i, j - 2, nz - 9) +
-                  phdz4r[k][7] * _u2(i, j - 2, nz - 8) +
-                  phdz4r[k][6] * _u2(i, j - 2, nz - 7) +
-                  phdz4r[k][5] * _u2(i, j - 2, nz - 6) +
-                  phdz4r[k][4] * _u2(i, j - 2, nz - 5) +
-                  phdz4r[k][3] * _u2(i, j - 2, nz - 4) +
-                  phdz4r[k][2] * _u2(i, j - 2, nz - 3) +
-                  phdz4r[k][1] * _u2(i, j - 2, nz - 2) +
-                  phdz4r[k][0] * _u2(i, j - 2, nz - 1)) +
-             phy4[1] * _f2_2(i, j - 1) *
-                 (phdz4r[k][8] * _u2(i, j - 1, nz - 9) +
-                  phdz4r[k][7] * _u2(i, j - 1, nz - 8) +
-                  phdz4r[k][6] * _u2(i, j - 1, nz - 7) +
-                  phdz4r[k][5] * _u2(i, j - 1, nz - 6) +
-                  phdz4r[k][4] * _u2(i, j - 1, nz - 5) +
-                  phdz4r[k][3] * _u2(i, j - 1, nz - 4) +
-                  phdz4r[k][2] * _u2(i, j - 1, nz - 3) +
-                  phdz4r[k][1] * _u2(i, j - 1, nz - 2) +
-                  phdz4r[k][0] * _u2(i, j - 1, nz - 1)) +
-             phy4[3] * _f2_2(i, j + 1) *
-                 (phdz4r[k][8] * _u2(i, j + 1, nz - 9) +
-                  phdz4r[k][7] * _u2(i, j + 1, nz - 8) +
-                  phdz4r[k][6] * _u2(i, j + 1, nz - 7) +
-                  phdz4r[k][5] * _u2(i, j + 1, nz - 6) +
-                  phdz4r[k][4] * _u2(i, j + 1, nz - 5) +
-                  phdz4r[k][3] * _u2(i, j + 1, nz - 4) +
-                  phdz4r[k][2] * _u2(i, j + 1, nz - 3) +
-                  phdz4r[k][1] * _u2(i, j + 1, nz - 2) +
-                  phdz4r[k][0] * _u2(i, j + 1, nz - 1))) -
-        Jii * _g_c(nz - 1 - k) *
-            (px4[1] * _f1_1(i, j) *
-                 (phdz4r[k][8] * _u1(i, j, nz - 9) +
-                  phdz4r[k][7] * _u1(i, j, nz - 8) +
-                  phdz4r[k][6] * _u1(i, j, nz - 7) +
-                  phdz4r[k][5] * _u1(i, j, nz - 6) +
-                  phdz4r[k][4] * _u1(i, j, nz - 5) +
-                  phdz4r[k][3] * _u1(i, j, nz - 4) +
-                  phdz4r[k][2] * _u1(i, j, nz - 3) +
-                  phdz4r[k][1] * _u1(i, j, nz - 2) +
-                  phdz4r[k][0] * _u1(i, j, nz - 1)) +
-             px4[0] * _f1_1(i - 1, j) *
-                 (phdz4r[k][8] * _u1(i - 1, j, nz - 9) +
-                  phdz4r[k][7] * _u1(i - 1, j, nz - 8) +
-                  phdz4r[k][6] * _u1(i - 1, j, nz - 7) +
-                  phdz4r[k][5] * _u1(i - 1, j, nz - 6) +
-                  phdz4r[k][4] * _u1(i - 1, j, nz - 5) +
-                  phdz4r[k][3] * _u1(i - 1, j, nz - 4) +
-                  phdz4r[k][2] * _u1(i - 1, j, nz - 3) +
-                  phdz4r[k][1] * _u1(i - 1, j, nz - 2) +
-                  phdz4r[k][0] * _u1(i - 1, j, nz - 1)) +
-             px4[2] * _f1_1(i + 1, j) *
-                 (phdz4r[k][8] * _u1(i + 1, j, nz - 9) +
-                  phdz4r[k][7] * _u1(i + 1, j, nz - 8) +
-                  phdz4r[k][6] * _u1(i + 1, j, nz - 7) +
-                  phdz4r[k][5] * _u1(i + 1, j, nz - 6) +
-                  phdz4r[k][4] * _u1(i + 1, j, nz - 5) +
-                  phdz4r[k][3] * _u1(i + 1, j, nz - 4) +
-                  phdz4r[k][2] * _u1(i + 1, j, nz - 3) +
-                  phdz4r[k][1] * _u1(i + 1, j, nz - 2) +
-                  phdz4r[k][0] * _u1(i + 1, j, nz - 1)) +
-             px4[3] * _f1_1(i + 2, j) *
-                 (phdz4r[k][8] * _u1(i + 2, j, nz - 9) +
-                  phdz4r[k][7] * _u1(i + 2, j, nz - 8) +
-                  phdz4r[k][6] * _u1(i + 2, j, nz - 7) +
-                  phdz4r[k][5] * _u1(i + 2, j, nz - 6) +
-                  phdz4r[k][4] * _u1(i + 2, j, nz - 5) +
-                  phdz4r[k][3] * _u1(i + 2, j, nz - 4) +
-                  phdz4r[k][2] * _u1(i + 2, j, nz - 3) +
-                  phdz4r[k][1] * _u1(i + 2, j, nz - 2) +
-                  phdz4r[k][0] * _u1(i + 2, j, nz - 1)));
-    float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(nz - 1 - k);
-    _s11(i, j, nz - 1 - k) =
-        (a * _s11(i, j, nz - 1 - k) + lam * div +
-         twomu * (dx4[1] * _u1(i, j, nz - 1 - k) +
-                  dx4[0] * _u1(i - 1, j, nz - 1 - k) +
-                  dx4[2] * _u1(i + 1, j, nz - 1 - k) +
-                  dx4[3] * _u1(i + 2, j, nz - 1 - k)) -
-         twomu * Jii * _g_c(nz - 1 - k) *
-             (px4[1] * _f1_1(i, j) *
-                  (phdz4r[k][8] * _u1(i, j, nz - 9) +
-                   phdz4r[k][7] * _u1(i, j, nz - 8) +
-                   phdz4r[k][6] * _u1(i, j, nz - 7) +
-                   phdz4r[k][5] * _u1(i, j, nz - 6) +
-                   phdz4r[k][4] * _u1(i, j, nz - 5) +
-                   phdz4r[k][3] * _u1(i, j, nz - 4) +
-                   phdz4r[k][2] * _u1(i, j, nz - 3) +
-                   phdz4r[k][1] * _u1(i, j, nz - 2) +
-                   phdz4r[k][0] * _u1(i, j, nz - 1)) +
-              px4[0] * _f1_1(i - 1, j) *
-                  (phdz4r[k][8] * _u1(i - 1, j, nz - 9) +
-                   phdz4r[k][7] * _u1(i - 1, j, nz - 8) +
-                   phdz4r[k][6] * _u1(i - 1, j, nz - 7) +
-                   phdz4r[k][5] * _u1(i - 1, j, nz - 6) +
-                   phdz4r[k][4] * _u1(i - 1, j, nz - 5) +
-                   phdz4r[k][3] * _u1(i - 1, j, nz - 4) +
-                   phdz4r[k][2] * _u1(i - 1, j, nz - 3) +
-                   phdz4r[k][1] * _u1(i - 1, j, nz - 2) +
-                   phdz4r[k][0] * _u1(i - 1, j, nz - 1)) +
-              px4[2] * _f1_1(i + 1, j) *
-                  (phdz4r[k][8] * _u1(i + 1, j, nz - 9) +
-                   phdz4r[k][7] * _u1(i + 1, j, nz - 8) +
-                   phdz4r[k][6] * _u1(i + 1, j, nz - 7) +
-                   phdz4r[k][5] * _u1(i + 1, j, nz - 6) +
-                   phdz4r[k][4] * _u1(i + 1, j, nz - 5) +
-                   phdz4r[k][3] * _u1(i + 1, j, nz - 4) +
-                   phdz4r[k][2] * _u1(i + 1, j, nz - 3) +
-                   phdz4r[k][1] * _u1(i + 1, j, nz - 2) +
-                   phdz4r[k][0] * _u1(i + 1, j, nz - 1)) +
-              px4[3] * _f1_1(i + 2, j) *
-                  (phdz4r[k][8] * _u1(i + 2, j, nz - 9) +
-                   phdz4r[k][7] * _u1(i + 2, j, nz - 8) +
-                   phdz4r[k][6] * _u1(i + 2, j, nz - 7) +
-                   phdz4r[k][5] * _u1(i + 2, j, nz - 6) +
-                   phdz4r[k][4] * _u1(i + 2, j, nz - 5) +
-                   phdz4r[k][3] * _u1(i + 2, j, nz - 4) +
-                   phdz4r[k][2] * _u1(i + 2, j, nz - 3) +
-                   phdz4r[k][1] * _u1(i + 2, j, nz - 2) +
-                   phdz4r[k][0] * _u1(i + 2, j, nz - 1)))) *
-        f_dcrj;
-    _s22(i, j, nz - 1 - k) =
-        (a * _s22(i, j, nz - 1 - k) + lam * div +
-         twomu * (dhy4[2] * _u2(i, j, nz - 1 - k) +
-                  dhy4[0] * _u2(i, j - 2, nz - 1 - k) +
-                  dhy4[1] * _u2(i, j - 1, nz - 1 - k) +
-                  dhy4[3] * _u2(i, j + 1, nz - 1 - k)) -
-         twomu * Jii * _g_c(nz - 1 - k) *
-             (phy4[2] * _f2_2(i, j) *
-                  (phdz4r[k][8] * _u2(i, j, nz - 9) +
-                   phdz4r[k][7] * _u2(i, j, nz - 8) +
-                   phdz4r[k][6] * _u2(i, j, nz - 7) +
-                   phdz4r[k][5] * _u2(i, j, nz - 6) +
-                   phdz4r[k][4] * _u2(i, j, nz - 5) +
-                   phdz4r[k][3] * _u2(i, j, nz - 4) +
-                   phdz4r[k][2] * _u2(i, j, nz - 3) +
-                   phdz4r[k][1] * _u2(i, j, nz - 2) +
-                   phdz4r[k][0] * _u2(i, j, nz - 1)) +
-              phy4[0] * _f2_2(i, j - 2) *
-                  (phdz4r[k][8] * _u2(i, j - 2, nz - 9) +
-                   phdz4r[k][7] * _u2(i, j - 2, nz - 8) +
-                   phdz4r[k][6] * _u2(i, j - 2, nz - 7) +
-                   phdz4r[k][5] * _u2(i, j - 2, nz - 6) +
-                   phdz4r[k][4] * _u2(i, j - 2, nz - 5) +
-                   phdz4r[k][3] * _u2(i, j - 2, nz - 4) +
-                   phdz4r[k][2] * _u2(i, j - 2, nz - 3) +
-                   phdz4r[k][1] * _u2(i, j - 2, nz - 2) +
-                   phdz4r[k][0] * _u2(i, j - 2, nz - 1)) +
-              phy4[1] * _f2_2(i, j - 1) *
-                  (phdz4r[k][8] * _u2(i, j - 1, nz - 9) +
-                   phdz4r[k][7] * _u2(i, j - 1, nz - 8) +
-                   phdz4r[k][6] * _u2(i, j - 1, nz - 7) +
-                   phdz4r[k][5] * _u2(i, j - 1, nz - 6) +
-                   phdz4r[k][4] * _u2(i, j - 1, nz - 5) +
-                   phdz4r[k][3] * _u2(i, j - 1, nz - 4) +
-                   phdz4r[k][2] * _u2(i, j - 1, nz - 3) +
-                   phdz4r[k][1] * _u2(i, j - 1, nz - 2) +
-                   phdz4r[k][0] * _u2(i, j - 1, nz - 1)) +
-              phy4[3] * _f2_2(i, j + 1) *
-                  (phdz4r[k][8] * _u2(i, j + 1, nz - 9) +
-                   phdz4r[k][7] * _u2(i, j + 1, nz - 8) +
-                   phdz4r[k][6] * _u2(i, j + 1, nz - 7) +
-                   phdz4r[k][5] * _u2(i, j + 1, nz - 6) +
-                   phdz4r[k][4] * _u2(i, j + 1, nz - 5) +
-                   phdz4r[k][3] * _u2(i, j + 1, nz - 4) +
-                   phdz4r[k][2] * _u2(i, j + 1, nz - 3) +
-                   phdz4r[k][1] * _u2(i, j + 1, nz - 2) +
-                   phdz4r[k][0] * _u2(i, j + 1, nz - 1)))) *
-        f_dcrj;
-    _s33(i, j, nz - 1 - k) = (a * _s33(i, j, nz - 1 - k) + lam * div +
-                              twomu * Jii *
-                                  (dhz4r[k][7] * _u3(i, j, nz - 8) +
-                                   dhz4r[k][6] * _u3(i, j, nz - 7) +
-                                   dhz4r[k][5] * _u3(i, j, nz - 6) +
-                                   dhz4r[k][4] * _u3(i, j, nz - 5) +
-                                   dhz4r[k][3] * _u3(i, j, nz - 4) +
-                                   dhz4r[k][2] * _u3(i, j, nz - 3) +
-                                   dhz4r[k][1] * _u3(i, j, nz - 2) +
-                                   dhz4r[k][0] * _u3(i, j, nz - 1))) *
-                             f_dcrj;
-    _s12(i, j, nz - 1 - k) =
-        (a * _s12(i, j, nz - 1 - k) +
-         mu12 * (dhx4[2] * _u2(i, j, nz - 1 - k) +
-                 dhx4[0] * _u2(i - 2, j, nz - 1 - k) +
-                 dhx4[1] * _u2(i - 1, j, nz - 1 - k) +
-                 dhx4[3] * _u2(i + 1, j, nz - 1 - k) +
-                 dy4[1] * _u1(i, j, nz - 1 - k) +
-                 dy4[0] * _u1(i, j - 1, nz - 1 - k) +
-                 dy4[2] * _u1(i, j + 1, nz - 1 - k) +
-                 dy4[3] * _u1(i, j + 2, nz - 1 - k) -
-                 J12i * _g_c(nz - 1 - k) *
-                     (phx4[2] * _f1_2(i, j) *
-                          (phdz4r[k][8] * _u2(i, j, nz - 9) +
-                           phdz4r[k][7] * _u2(i, j, nz - 8) +
-                           phdz4r[k][6] * _u2(i, j, nz - 7) +
-                           phdz4r[k][5] * _u2(i, j, nz - 6) +
-                           phdz4r[k][4] * _u2(i, j, nz - 5) +
-                           phdz4r[k][3] * _u2(i, j, nz - 4) +
-                           phdz4r[k][2] * _u2(i, j, nz - 3) +
-                           phdz4r[k][1] * _u2(i, j, nz - 2) +
-                           phdz4r[k][0] * _u2(i, j, nz - 1)) +
-                      phx4[0] * _f1_2(i - 2, j) *
-                          (phdz4r[k][8] * _u2(i - 2, j, nz - 9) +
-                           phdz4r[k][7] * _u2(i - 2, j, nz - 8) +
-                           phdz4r[k][6] * _u2(i - 2, j, nz - 7) +
-                           phdz4r[k][5] * _u2(i - 2, j, nz - 6) +
-                           phdz4r[k][4] * _u2(i - 2, j, nz - 5) +
-                           phdz4r[k][3] * _u2(i - 2, j, nz - 4) +
-                           phdz4r[k][2] * _u2(i - 2, j, nz - 3) +
-                           phdz4r[k][1] * _u2(i - 2, j, nz - 2) +
-                           phdz4r[k][0] * _u2(i - 2, j, nz - 1)) +
-                      phx4[1] * _f1_2(i - 1, j) *
-                          (phdz4r[k][8] * _u2(i - 1, j, nz - 9) +
-                           phdz4r[k][7] * _u2(i - 1, j, nz - 8) +
-                           phdz4r[k][6] * _u2(i - 1, j, nz - 7) +
-                           phdz4r[k][5] * _u2(i - 1, j, nz - 6) +
-                           phdz4r[k][4] * _u2(i - 1, j, nz - 5) +
-                           phdz4r[k][3] * _u2(i - 1, j, nz - 4) +
-                           phdz4r[k][2] * _u2(i - 1, j, nz - 3) +
-                           phdz4r[k][1] * _u2(i - 1, j, nz - 2) +
-                           phdz4r[k][0] * _u2(i - 1, j, nz - 1)) +
-                      phx4[3] * _f1_2(i + 1, j) *
-                          (phdz4r[k][8] * _u2(i + 1, j, nz - 9) +
-                           phdz4r[k][7] * _u2(i + 1, j, nz - 8) +
-                           phdz4r[k][6] * _u2(i + 1, j, nz - 7) +
-                           phdz4r[k][5] * _u2(i + 1, j, nz - 6) +
-                           phdz4r[k][4] * _u2(i + 1, j, nz - 5) +
-                           phdz4r[k][3] * _u2(i + 1, j, nz - 4) +
-                           phdz4r[k][2] * _u2(i + 1, j, nz - 3) +
-                           phdz4r[k][1] * _u2(i + 1, j, nz - 2) +
-                           phdz4r[k][0] * _u2(i + 1, j, nz - 1))) -
-                 J12i * _g_c(nz - 1 - k) *
-                     (py4[1] * _f2_1(i, j) *
-                          (phdz4r[k][8] * _u1(i, j, nz - 9) +
-                           phdz4r[k][7] * _u1(i, j, nz - 8) +
-                           phdz4r[k][6] * _u1(i, j, nz - 7) +
-                           phdz4r[k][5] * _u1(i, j, nz - 6) +
-                           phdz4r[k][4] * _u1(i, j, nz - 5) +
-                           phdz4r[k][3] * _u1(i, j, nz - 4) +
-                           phdz4r[k][2] * _u1(i, j, nz - 3) +
-                           phdz4r[k][1] * _u1(i, j, nz - 2) +
-                           phdz4r[k][0] * _u1(i, j, nz - 1)) +
-                      py4[0] * _f2_1(i, j - 1) *
-                          (phdz4r[k][8] * _u1(i, j - 1, nz - 9) +
-                           phdz4r[k][7] * _u1(i, j - 1, nz - 8) +
-                           phdz4r[k][6] * _u1(i, j - 1, nz - 7) +
-                           phdz4r[k][5] * _u1(i, j - 1, nz - 6) +
-                           phdz4r[k][4] * _u1(i, j - 1, nz - 5) +
-                           phdz4r[k][3] * _u1(i, j - 1, nz - 4) +
-                           phdz4r[k][2] * _u1(i, j - 1, nz - 3) +
-                           phdz4r[k][1] * _u1(i, j - 1, nz - 2) +
-                           phdz4r[k][0] * _u1(i, j - 1, nz - 1)) +
-                      py4[2] * _f2_1(i, j + 1) *
-                          (phdz4r[k][8] * _u1(i, j + 1, nz - 9) +
-                           phdz4r[k][7] * _u1(i, j + 1, nz - 8) +
-                           phdz4r[k][6] * _u1(i, j + 1, nz - 7) +
-                           phdz4r[k][5] * _u1(i, j + 1, nz - 6) +
-                           phdz4r[k][4] * _u1(i, j + 1, nz - 5) +
-                           phdz4r[k][3] * _u1(i, j + 1, nz - 4) +
-                           phdz4r[k][2] * _u1(i, j + 1, nz - 3) +
-                           phdz4r[k][1] * _u1(i, j + 1, nz - 2) +
-                           phdz4r[k][0] * _u1(i, j + 1, nz - 1)) +
-                      py4[3] * _f2_1(i, j + 2) *
-                          (phdz4r[k][8] * _u1(i, j + 2, nz - 9) +
-                           phdz4r[k][7] * _u1(i, j + 2, nz - 8) +
-                           phdz4r[k][6] * _u1(i, j + 2, nz - 7) +
-                           phdz4r[k][5] * _u1(i, j + 2, nz - 6) +
-                           phdz4r[k][4] * _u1(i, j + 2, nz - 5) +
-                           phdz4r[k][3] * _u1(i, j + 2, nz - 4) +
-                           phdz4r[k][2] * _u1(i, j + 2, nz - 3) +
-                           phdz4r[k][1] * _u1(i, j + 2, nz - 2) +
-                           phdz4r[k][0] * _u1(i, j + 2, nz - 1))))) *
-        f_dcrj;
-    _s13(i, j, nz - 1 - k) =
-        (a * _s13(i, j, nz - 1 - k) +
-         mu13 * (dhx4[2] * _u3(i, j, nz - 1 - k) +
-                 dhx4[0] * _u3(i - 2, j, nz - 1 - k) +
-                 dhx4[1] * _u3(i - 1, j, nz - 1 - k) +
-                 dhx4[3] * _u3(i + 1, j, nz - 1 - k) +
-                 J13i * (dz4r[k][6] * _u1(i, j, nz - 7) +
-                         dz4r[k][5] * _u1(i, j, nz - 6) +
-                         dz4r[k][4] * _u1(i, j, nz - 5) +
-                         dz4r[k][3] * _u1(i, j, nz - 4) +
-                         dz4r[k][2] * _u1(i, j, nz - 3) +
-                         dz4r[k][1] * _u1(i, j, nz - 2) +
-                         dz4r[k][0] * _u1(i, j, nz - 1)) -
-                 J13i * _g(nz - 1 - k) *
-                     (phx4[2] * _f1_c(i, j) *
-                          (pdhz4r[k][8] * _u3(i, j, nz - 9) +
-                           pdhz4r[k][7] * _u3(i, j, nz - 8) +
-                           pdhz4r[k][6] * _u3(i, j, nz - 7) +
-                           pdhz4r[k][5] * _u3(i, j, nz - 6) +
-                           pdhz4r[k][4] * _u3(i, j, nz - 5) +
-                           pdhz4r[k][3] * _u3(i, j, nz - 4) +
-                           pdhz4r[k][2] * _u3(i, j, nz - 3) +
-                           pdhz4r[k][1] * _u3(i, j, nz - 2) +
-                           pdhz4r[k][0] * _u3(i, j, nz - 1)) +
-                      phx4[0] * _f1_c(i - 2, j) *
-                          (pdhz4r[k][8] * _u3(i - 2, j, nz - 9) +
-                           pdhz4r[k][7] * _u3(i - 2, j, nz - 8) +
-                           pdhz4r[k][6] * _u3(i - 2, j, nz - 7) +
-                           pdhz4r[k][5] * _u3(i - 2, j, nz - 6) +
-                           pdhz4r[k][4] * _u3(i - 2, j, nz - 5) +
-                           pdhz4r[k][3] * _u3(i - 2, j, nz - 4) +
-                           pdhz4r[k][2] * _u3(i - 2, j, nz - 3) +
-                           pdhz4r[k][1] * _u3(i - 2, j, nz - 2) +
-                           pdhz4r[k][0] * _u3(i - 2, j, nz - 1)) +
-                      phx4[1] * _f1_c(i - 1, j) *
-                          (pdhz4r[k][8] * _u3(i - 1, j, nz - 9) +
-                           pdhz4r[k][7] * _u3(i - 1, j, nz - 8) +
-                           pdhz4r[k][6] * _u3(i - 1, j, nz - 7) +
-                           pdhz4r[k][5] * _u3(i - 1, j, nz - 6) +
-                           pdhz4r[k][4] * _u3(i - 1, j, nz - 5) +
-                           pdhz4r[k][3] * _u3(i - 1, j, nz - 4) +
-                           pdhz4r[k][2] * _u3(i - 1, j, nz - 3) +
-                           pdhz4r[k][1] * _u3(i - 1, j, nz - 2) +
-                           pdhz4r[k][0] * _u3(i - 1, j, nz - 1)) +
-                      phx4[3] * _f1_c(i + 1, j) *
-                          (pdhz4r[k][8] * _u3(i + 1, j, nz - 9) +
-                           pdhz4r[k][7] * _u3(i + 1, j, nz - 8) +
-                           pdhz4r[k][6] * _u3(i + 1, j, nz - 7) +
-                           pdhz4r[k][5] * _u3(i + 1, j, nz - 6) +
-                           pdhz4r[k][4] * _u3(i + 1, j, nz - 5) +
-                           pdhz4r[k][3] * _u3(i + 1, j, nz - 4) +
-                           pdhz4r[k][2] * _u3(i + 1, j, nz - 3) +
-                           pdhz4r[k][1] * _u3(i + 1, j, nz - 2) +
-                           pdhz4r[k][0] * _u3(i + 1, j, nz - 1))))) *
-        f_dcrj;
-    _s23(i, j, nz - 1 - k) =
-        (a * _s23(i, j, nz - 1 - k) +
-         mu23 * (dy4[1] * _u3(i, j, nz - 1 - k) +
-                 dy4[0] * _u3(i, j - 1, nz - 1 - k) +
-                 dy4[2] * _u3(i, j + 1, nz - 1 - k) +
-                 dy4[3] * _u3(i, j + 2, nz - 1 - k) +
-                 J23i * (dz4r[k][6] * _u2(i, j, nz - 7) +
-                         dz4r[k][5] * _u2(i, j, nz - 6) +
-                         dz4r[k][4] * _u2(i, j, nz - 5) +
-                         dz4r[k][3] * _u2(i, j, nz - 4) +
-                         dz4r[k][2] * _u2(i, j, nz - 3) +
-                         dz4r[k][1] * _u2(i, j, nz - 2) +
-                         dz4r[k][0] * _u2(i, j, nz - 1)) -
-                 J23i * _g(nz - 1 - k) *
-                     (py4[1] * _f2_c(i, j) *
-                          (pdhz4r[k][8] * _u3(i, j, nz - 9) +
-                           pdhz4r[k][7] * _u3(i, j, nz - 8) +
-                           pdhz4r[k][6] * _u3(i, j, nz - 7) +
-                           pdhz4r[k][5] * _u3(i, j, nz - 6) +
-                           pdhz4r[k][4] * _u3(i, j, nz - 5) +
-                           pdhz4r[k][3] * _u3(i, j, nz - 4) +
-                           pdhz4r[k][2] * _u3(i, j, nz - 3) +
-                           pdhz4r[k][1] * _u3(i, j, nz - 2) +
-                           pdhz4r[k][0] * _u3(i, j, nz - 1)) +
-                      py4[0] * _f2_c(i, j - 1) *
-                          (pdhz4r[k][8] * _u3(i, j - 1, nz - 9) +
-                           pdhz4r[k][7] * _u3(i, j - 1, nz - 8) +
-                           pdhz4r[k][6] * _u3(i, j - 1, nz - 7) +
-                           pdhz4r[k][5] * _u3(i, j - 1, nz - 6) +
-                           pdhz4r[k][4] * _u3(i, j - 1, nz - 5) +
-                           pdhz4r[k][3] * _u3(i, j - 1, nz - 4) +
-                           pdhz4r[k][2] * _u3(i, j - 1, nz - 3) +
-                           pdhz4r[k][1] * _u3(i, j - 1, nz - 2) +
-                           pdhz4r[k][0] * _u3(i, j - 1, nz - 1)) +
-                      py4[2] * _f2_c(i, j + 1) *
-                          (pdhz4r[k][8] * _u3(i, j + 1, nz - 9) +
-                           pdhz4r[k][7] * _u3(i, j + 1, nz - 8) +
-                           pdhz4r[k][6] * _u3(i, j + 1, nz - 7) +
-                           pdhz4r[k][5] * _u3(i, j + 1, nz - 6) +
-                           pdhz4r[k][4] * _u3(i, j + 1, nz - 5) +
-                           pdhz4r[k][3] * _u3(i, j + 1, nz - 4) +
-                           pdhz4r[k][2] * _u3(i, j + 1, nz - 3) +
-                           pdhz4r[k][1] * _u3(i, j + 1, nz - 2) +
-                           pdhz4r[k][0] * _u3(i, j + 1, nz - 1)) +
-                      py4[3] * _f2_c(i, j + 2) *
-                          (pdhz4r[k][8] * _u3(i, j + 2, nz - 9) +
-                           pdhz4r[k][7] * _u3(i, j + 2, nz - 8) +
-                           pdhz4r[k][6] * _u3(i, j + 2, nz - 7) +
-                           pdhz4r[k][5] * _u3(i, j + 2, nz - 6) +
-                           pdhz4r[k][4] * _u3(i, j + 2, nz - 5) +
-                           pdhz4r[k][3] * _u3(i, j + 2, nz - 4) +
-                           pdhz4r[k][2] * _u3(i, j + 2, nz - 3) +
-                           pdhz4r[k][1] * _u3(i, j + 2, nz - 2) +
-                           pdhz4r[k][0] * _u3(i, j + 2, nz - 1))))) *
-        f_dcrj;
-  }
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
-#undef _f_1
-#undef _f_2
-#undef _f_c
-#undef _g
-#undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _lami
-#undef _mui
-#undef _s11
-#undef _s12
-#undef _s13
-#undef _s22
-#undef _s23
-#undef _s33
-#undef _u1
-#undef _u2
-#undef _u3
-}
-
-__global__ void dtopo_init_material_111(float *__restrict__ lami,
-                                        float *__restrict__ mui,
-                                        float *__restrict__ rho, const int nx,
-                                        const int ny, const int nz) {
-  const int i = threadIdx.z + blockIdx.z * blockDim.z;
-  if (i >= nx)
-    return;
-  const int j = threadIdx.y + blockIdx.y * blockDim.y;
-  if (j >= ny)
-    return;
-  const int k = threadIdx.x + blockIdx.x * blockDim.x;
-  if (k >= nz)
-    return;
-#define _lami(i, j, k) lami[(i)*ny * nz + (j)*nz + (k)]
-#define _mui(i, j, k) mui[(i)*ny * nz + (j)*nz + (k)]
-#define _rho(i, j, k) rho[(i)*ny * nz + (j)*nz + (k)]
-  _rho(i, j, k) = 1.0;
-  _lami(i, j, k) = 1.0;
-  _mui(i, j, k) = 1.0;
-#undef _lami
-#undef _mui
-#undef _rho
-}
diff --git a/src/topography/kernels/stress_attenuation.cu b/src/topography/kernels/stress.cu
similarity index 95%
rename from src/topography/kernels/stress_attenuation.cu
rename to src/topography/kernels/stress.cu
index a1eba47..d436b60 100644
--- a/src/topography/kernels/stress_attenuation.cu
+++ b/src/topography/kernels/stress.cu
@@ -3,7 +3,6 @@
 
 #include <awp/definitions.h>
 #include <test/test.h>
-#include <topography/kernels/stress_attenuation.cuh>
 #include <stdio.h>
 #define CURVILINEAR
 #define _f(i, j) f[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
@@ -79,14 +78,14 @@ dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
        _prec*  __restrict__ u1, 
        _prec*  __restrict__ v1,    
        _prec*  __restrict__ w1,    
-       const float *__restrict__ f,
-       const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-       const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-       const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-       const float *__restrict__ f_1, const float *__restrict__ f_2,
-       const float *__restrict__ f_c, const float *__restrict__ g,
-       const float *__restrict__ g3, const float *__restrict__ g3_c,
-       const float *__restrict__ g_c,
+       const _prec *__restrict__ f,
+       const _prec *__restrict__ f1_1, const _prec *__restrict__ f1_2,
+       const _prec *__restrict__ f1_c, const _prec *__restrict__ f2_1,
+       const _prec *__restrict__ f2_2, const _prec *__restrict__ f2_c,
+       const _prec *__restrict__ f_1, const _prec *__restrict__ f_2,
+       const _prec *__restrict__ f_c, const _prec *__restrict__ g,
+       const _prec *__restrict__ g3, const _prec *__restrict__ g3_c,
+       const _prec *__restrict__ g_c,
        const _prec *__restrict__  lam,   
        const _prec *__restrict__  mu,     
        const _prec *__restrict__  qp,
@@ -119,31 +118,31 @@ dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
   _prec f_xx, f_yy, f_zz, f_xy, f_xz, f_yz;
   int maxk, mink;
 
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, -0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, -0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
@@ -337,7 +336,7 @@ dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
 
 #ifdef CURVILINEAR
 
-    float Jii = _f_c(i, j) * _g3_c(k);
+    _prec Jii = _f_c(i, j) * _g3_c(k);
           Jii = 1.0 * 1.0 / Jii;
           
     vs1 =
@@ -468,7 +467,7 @@ dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
 
     // xy
 #ifdef CURVILINEAR
-  float J12i = _f(i, j) * _g3_c(k + 6);
+  _prec J12i = _f(i, j) * _g3_c(k + 6);
   J12i = 1.0 / J12i;
 
   vs1 =
@@ -547,7 +546,7 @@ dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
     // xz
 #ifdef CURVILINEAR
 
-  float J13i = _f_1(i, j) * _g3(k);
+  _prec J13i = _f_1(i, j) * _g3(k);
   J13i = 1.0 * 1.0 / J13i;
 
   vs1 = J13i * (dz4[1] * _u1(i, j, k) + dz4[0] * _u1(i, j, k - 1) +
@@ -597,7 +596,7 @@ dtopo_str_111(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
     // yz
 
 #ifdef CURVILINEAR
-    float J23i = _f_2(i, j) * _g3(k);
+    _prec J23i = _f_2(i, j) * _g3(k);
     J23i = 1.0 * 1.0 / J23i;
     vs1 = J23i * (dz4[1] * _v1(i, j, k) + dz4[0] * _v1(i, j, k - 1) +
                         dz4[2] * _v1(i, j, k + 1) + dz4[3] * _v1(i, j, k + 2));
@@ -675,14 +674,14 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
        _prec*  __restrict__ u1, 
        _prec*  __restrict__ v1,    
        _prec*  __restrict__ w1,    
-       const float *__restrict__ f,
-       const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-       const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-       const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-       const float *__restrict__ f_1, const float *__restrict__ f_2,
-       const float *__restrict__ f_c, const float *__restrict__ g,
-       const float *__restrict__ g3, const float *__restrict__ g3_c,
-       const float *__restrict__ g_c,
+       const _prec *__restrict__ f,
+       const _prec *__restrict__ f1_1, const _prec *__restrict__ f1_2,
+       const _prec *__restrict__ f1_c, const _prec *__restrict__ f2_1,
+       const _prec *__restrict__ f2_2, const _prec *__restrict__ f2_c,
+       const _prec *__restrict__ f_1, const _prec *__restrict__ f_2,
+       const _prec *__restrict__ f_c, const _prec *__restrict__ g,
+       const _prec *__restrict__ g3, const _prec *__restrict__ g3_c,
+       const _prec *__restrict__ g_c,
        const _prec *__restrict__  lam,   
        const _prec *__restrict__  mu,     
        const _prec *__restrict__  qp,
@@ -715,15 +714,15 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
   _prec f_xx, f_yy, f_zz, f_xy, f_xz, f_yz;
   int maxk, mink;
 
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, -0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float phdz4r[6][9] = {
+  const _prec phdz4r[6][9] = {
       {1.5373923010673116, -1.0330083346742178, -0.6211677623382129,
        -0.0454110758451345, 0.1680934225988761, -0.0058985508086226,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -742,7 +741,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
       {0.0009619461344193, 0.0035553215968974, -0.0124936029037323,
        -0.0773639466787397, 0.6736586580761996, 0.0002232904416222,
        -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-  const float dz4r[6][7] = {
+  const _prec dz4r[6][7] = {
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000},
@@ -761,27 +760,27 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
        0.0416666666666667}};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, -0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float dhz4r[6][8] = {
+  const _prec dhz4r[6][8] = {
       {0.0000000000000000, 1.4511412472637157, -1.8534237417911470,
        0.3534237417911469, 0.0488587527362844, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000},
@@ -800,7 +799,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, -0.0416666666666667, 1.1250000000000000,
        -1.1250000000000000, 0.0416666666666667}};
-  const float pdhz4r[6][9] = {
+  const _prec pdhz4r[6][9] = {
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1015,10 +1014,10 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
 #define _g3(k) g3[(k) + align]
 
 #ifdef CURVILINEAR
-  float Jii = _f_c(i, j) * _g3_c(nz - 1 - kc - 6);
+  _prec Jii = _f_c(i, j) * _g3_c(nz - 1 - kc - 6);
   Jii = 1.0 * 1.0 / Jii;
   // xx, yy, zz
-  float vs1 = dx4[1] * _u1(i, j, nz - 1 - kc - 6) +
+  _prec vs1 = dx4[1] * _u1(i, j, nz - 1 - kc - 6) +
               dx4[0] * _u1(i - 1, j, nz - 1 - kc - 6) +
               dx4[2] * _u1(i + 1, j, nz - 1 - kc - 6) +
               dx4[3] * _u1(i + 2, j, nz - 1 - kc - 6) -
@@ -1063,7 +1062,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
                         phdz4r[kb][2] * _u1(i + 2, j, nz - 3) +
                         phdz4r[kb][1] * _u1(i + 2, j, nz - 2) +
                         phdz4r[kb][0] * _u1(i + 2, j, nz - 1)));
-  float vs2 = dhy4[2] * _v1(i, j, nz - 1 - kc - 6) +
+  _prec vs2 = dhy4[2] * _v1(i, j, nz - 1 - kc - 6) +
               dhy4[0] * _v1(i, j - 2, nz - 1 - kc - 6) +
               dhy4[1] * _v1(i, j - 1, nz - 1 - kc - 6) +
               dhy4[3] * _v1(i, j + 1, nz - 1 - kc - 6) -
@@ -1108,7 +1107,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
                         phdz4r[kb][2] * _v1(i, j + 1, nz - 3) +
                         phdz4r[kb][1] * _v1(i, j + 1, nz - 2) +
                         phdz4r[kb][0] * _v1(i, j + 1, nz - 1)));
-  float vs3 =
+  _prec vs3 =
       Jii * (dhz4r[kb][7] * _w1(i, j, nz - 8) + dhz4r[kb][6] * _w1(i, j, nz - 7) +
              dhz4r[kb][5] * _w1(i, j, nz - 6) + dhz4r[kb][4] * _w1(i, j, nz - 5) +
              dhz4r[kb][3] * _w1(i, j, nz - 4) + dhz4r[kb][2] * _w1(i, j, nz - 3) +
@@ -1146,7 +1145,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
 
     // xy
 #ifdef CURVILINEAR
-  float J12i = _f(i, j) * _g3_c(nz - 1 - kc - 6);
+  _prec J12i = _f(i, j) * _g3_c(nz - 1 - kc - 6);
   J12i = 1.0 * 1.0 / J12i;
   vs1 = dy4[1] * _u1(i, j, nz - 1 - kc - 6) +
               dy4[0] * _u1(i, j - 1, nz - 1 - kc - 6) +
@@ -1251,7 +1250,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
 
     // xz
 #ifdef CURVILINEAR
-  float J13i = _f_1(i, j) * _g3(nz - 1 - kc - 6);
+  _prec J13i = _f_1(i, j) * _g3(nz - 1 - kc - 6);
   J13i = 1.0 * 1.0 / J13i;
   vs1 =
       J13i * (dz4r[kb][6] * _u1(i, j, nz - 7) + dz4r[kb][5] * _u1(i, j, nz - 6) +
@@ -1316,7 +1315,7 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
     // yz
 
 #ifdef CURVILINEAR
-  float J23i = _f_2(i, j) * _g3(nz - 1 - kc - 6);
+  _prec J23i = _f_2(i, j) * _g3(nz - 1 - kc - 6);
   J23i = 1.0 * 1.0 / J23i;
   vs1 =
       J23i * (dz4r[kb][6] * _v1(i, j, nz - 7) + dz4r[kb][5] * _v1(i, j, nz - 6) +
@@ -1416,22 +1415,22 @@ dtopo_str_112(_prec*  __restrict__ xx, _prec*  __restrict__ yy, _prec*  __restri
 
 // Kernel functions without attenuation
 __global__ void dtopo_str_110(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
+    _prec *__restrict__ s11, _prec *__restrict__ s12, _prec *__restrict__ s13,
+    _prec *__restrict__ s22, _prec *__restrict__ s23, _prec *__restrict__ s33,
+    _prec *__restrict__ u1, _prec *__restrict__ u2, _prec *__restrict__ u3,
+    const _prec *__restrict__ dcrjx, const _prec *__restrict__ dcrjy,
+    const _prec *__restrict__ dcrjz, const _prec *__restrict__ f,
+    const _prec *__restrict__ f1_1, const _prec *__restrict__ f1_2,
+    const _prec *__restrict__ f1_c, const _prec *__restrict__ f2_1,
+    const _prec *__restrict__ f2_2, const _prec *__restrict__ f2_c,
+    const _prec *__restrict__ f_1, const _prec *__restrict__ f_2,
+    const _prec *__restrict__ f_c, const _prec *__restrict__ g,
+    const _prec *__restrict__ g3, const _prec *__restrict__ g3_c,
+    const _prec *__restrict__ g_c, const _prec *__restrict__ lami,
+    const _prec *__restrict__ mui, const _prec a, const _prec nu, const int nx,
     const int ny, const int nz, const int bi, const int bj, const int ei,
     const int ej) {
-  const float phz4l[6][7] = {
+  const _prec phz4l[6][7] = {
       {0.8338228784688313, 0.1775123316429260, 0.1435067013076542,
        -0.1548419114194114, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000},
@@ -1450,11 +1449,11 @@ __global__ void dtopo_str_110(
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        -0.0625000000000000, 0.5625000000000000, 0.5625000000000000,
        -0.0625000000000000}};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dhz4l[6][7] = {
+  const _prec dhz4l[6][7] = {
       {-1.4511412472637157, 1.8534237417911470, -0.3534237417911469,
        -0.0488587527362844, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000},
@@ -1473,7 +1472,7 @@ __global__ void dtopo_str_110(
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0416666666666667, -1.1250000000000000, 1.1250000000000000,
        -0.0416666666666667}};
-  const float phdz4l[6][9] = {
+  const _prec phdz4l[6][9] = {
       {-1.5373923010673116, 1.0330083346742178, 0.6211677623382129,
        0.0454110758451345, -0.1680934225988761, 0.0058985508086226,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1492,19 +1491,19 @@ __global__ void dtopo_str_110(
       {-0.0009619461344193, -0.0035553215968974, 0.0124936029037323,
        0.0773639466787397, -0.6736586580761996, -0.0002232904416222,
        0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dz4l[6][8] = {
+  const _prec dz4l[6][8] = {
       {-1.7779989465546748, 1.3337480247900155, 0.7775013168066564,
        -0.3332503950419969, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000},
@@ -1523,7 +1522,7 @@ __global__ void dtopo_str_110(
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0416666666666667, -1.1250000000000000,
        1.1250000000000000, -0.0416666666666667}};
-  const float pdhz4l[6][9] = {
+  const _prec pdhz4l[6][9] = {
       {-1.5886075042755416, 2.2801810182668110, -0.8088980291471827,
        0.1316830205960989, -0.0143585054401857, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1628,15 +1627,15 @@ __global__ void dtopo_str_110(
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
   for (int i = bi; i < ei; ++i) {
-    float Jii = _f_c(i, j) * _g3_c(k);
+    _prec Jii = _f_c(i, j) * _g3_c(k);
     Jii = 1.0 * 1.0 / Jii;
-    float J12i = _f(i, j) * _g3_c(k);
+    _prec J12i = _f(i, j) * _g3_c(k);
     J12i = 1.0 * 1.0 / J12i;
-    float J13i = _f_1(i, j) * _g3(k);
+    _prec J13i = _f_1(i, j) * _g3(k);
     J13i = 1.0 * 1.0 / J13i;
-    float J23i = _f_2(i, j) * _g3(k);
+    _prec J23i = _f_2(i, j) * _g3(k);
     J23i = 1.0 * 1.0 / J23i;
-    float lam =
+    _prec lam =
         nu * 1.0 /
         (phz4l[k][0] *
              (phy4[2] *
@@ -1750,7 +1749,7 @@ __global__ void dtopo_str_110(
                          px4[0] * _lami(i - 1, j + 1, 6) +
                          px4[2] * _lami(i + 1, j + 1, 6) +
                          px4[3] * _lami(i + 2, j + 1, 6))));
-    float twomu =
+    _prec twomu =
         2 * nu * 1.0 /
         (phz4l[k][0] *
              (phy4[2] *
@@ -1864,18 +1863,18 @@ __global__ void dtopo_str_110(
                   (px4[1] * _mui(i, j + 1, 6) + px4[0] * _mui(i - 1, j + 1, 6) +
                    px4[2] * _mui(i + 1, j + 1, 6) +
                    px4[3] * _mui(i + 2, j + 1, 6))));
-    float mu12 = nu * 1.0 /
+    _prec mu12 = nu * 1.0 /
                  (phz4l[k][0] * _mui(i, j, 0) + phz4l[k][1] * _mui(i, j, 1) +
                   phz4l[k][2] * _mui(i, j, 2) + phz4l[k][3] * _mui(i, j, 3) +
                   phz4l[k][4] * _mui(i, j, 4) + phz4l[k][5] * _mui(i, j, 5) +
                   phz4l[k][6] * _mui(i, j, 6));
-    float mu13 = nu * 1.0 /
+    _prec mu13 = nu * 1.0 /
                  (phy4[2] * _mui(i, j, k) + phy4[0] * _mui(i, j - 2, k) +
                   phy4[1] * _mui(i, j - 1, k) + phy4[3] * _mui(i, j + 1, k));
-    float mu23 = nu * 1.0 /
+    _prec mu23 = nu * 1.0 /
                  (px4[1] * _mui(i, j, k) + px4[0] * _mui(i - 1, j, k) +
                   px4[2] * _mui(i + 1, j, k) + px4[3] * _mui(i + 2, j, k));
-    float div =
+    _prec div =
         dhy4[2] * _u2(i, j, k) + dhy4[0] * _u2(i, j - 2, k) +
         dhy4[1] * _u2(i, j - 1, k) + dhy4[3] * _u2(i, j + 1, k) +
         dx4[1] * _u1(i, j, k) + dx4[0] * _u1(i - 1, j, k) +
@@ -1958,7 +1957,7 @@ __global__ void dtopo_str_110(
                   phdz4l[k][6] * _u1(i + 2, j, 6) +
                   phdz4l[k][7] * _u1(i + 2, j, 7) +
                   phdz4l[k][8] * _u1(i + 2, j, 8)));
-    float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
+    _prec f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
     _s11(i, j, k) =
         (a * _s11(i, j, k) + lam * div +
          twomu * (dx4[1] * _u1(i, j, k) + dx4[0] * _u1(i - 1, j, k) +
@@ -2274,48 +2273,48 @@ __global__ void dtopo_str_110(
 }
 
 __global__ void dtopo_str_111(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
+    _prec *__restrict__ s11, _prec *__restrict__ s12, _prec *__restrict__ s13,
+    _prec *__restrict__ s22, _prec *__restrict__ s23, _prec *__restrict__ s33,
+    _prec *__restrict__ u1, _prec *__restrict__ u2, _prec *__restrict__ u3,
+    const _prec *__restrict__ dcrjx, const _prec *__restrict__ dcrjy,
+    const _prec *__restrict__ dcrjz, const _prec *__restrict__ f,
+    const _prec *__restrict__ f1_1, const _prec *__restrict__ f1_2,
+    const _prec *__restrict__ f1_c, const _prec *__restrict__ f2_1,
+    const _prec *__restrict__ f2_2, const _prec *__restrict__ f2_c,
+    const _prec *__restrict__ f_1, const _prec *__restrict__ f_2,
+    const _prec *__restrict__ f_c, const _prec *__restrict__ g,
+    const _prec *__restrict__ g3, const _prec *__restrict__ g3_c,
+    const _prec *__restrict__ g_c, const _prec *__restrict__ lami,
+    const _prec *__restrict__ mui, const _prec a, const _prec nu, const int nx,
     const int ny, const int nz, const int bi, const int bj, const int ei,
     const int ej) {
-  const float phz4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phz4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, -0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, -0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
@@ -2406,15 +2405,15 @@ __global__ void dtopo_str_111(
      (2 * align + nz) * ((j) + ngsl + 2)]
 
   for (int i = bi; i < ei; ++i) {
-    float Jii = _f_c(i, j) * _g3_c(k + 6);
+    _prec Jii = _f_c(i, j) * _g3_c(k + 6);
     Jii = 1.0 * 1.0 / Jii;
-    float J12i = _f(i, j) * _g3_c(k + 6);
+    _prec J12i = _f(i, j) * _g3_c(k + 6);
     J12i = 1.0 * 1.0 / J12i;
-    float J13i = _f_1(i, j) * _g3(k + 6);
+    _prec J13i = _f_1(i, j) * _g3(k + 6);
     J13i = 1.0 * 1.0 / J13i;
-    float J23i = _f_2(i, j) * _g3(k + 6);
+    _prec J23i = _f_2(i, j) * _g3(k + 6);
     J23i = 1.0 * 1.0 / J23i;
-    float lam = nu * 1.0 /
+    _prec lam = nu * 1.0 /
                 (phz4[0] * (phy4[2] * (px4[1] * _lami(i, j, k + 4) +
                                        px4[0] * _lami(i - 1, j, k + 4) +
                                        px4[2] * _lami(i + 1, j, k + 4) +
@@ -2479,7 +2478,7 @@ __global__ void dtopo_str_111(
                                        px4[0] * _lami(i - 1, j + 1, k + 7) +
                                        px4[2] * _lami(i + 1, j + 1, k + 7) +
                                        px4[3] * _lami(i + 2, j + 1, k + 7))));
-    float twomu = 2 * nu * 1.0 /
+    _prec twomu = 2 * nu * 1.0 /
                   (phz4[0] * (phy4[2] * (px4[1] * _mui(i, j, k + 4) +
                                          px4[0] * _mui(i - 1, j, k + 4) +
                                          px4[2] * _mui(i + 1, j, k + 4) +
@@ -2544,18 +2543,18 @@ __global__ void dtopo_str_111(
                                          px4[0] * _mui(i - 1, j + 1, k + 7) +
                                          px4[2] * _mui(i + 1, j + 1, k + 7) +
                                          px4[3] * _mui(i + 2, j + 1, k + 7))));
-    float mu12 = nu * 1.0 /
+    _prec mu12 = nu * 1.0 /
                  (phz4[0] * _mui(i, j, k + 4) + phz4[1] * _mui(i, j, k + 5) +
                   phz4[2] * _mui(i, j, k + 6) + phz4[3] * _mui(i, j, k + 7));
-    float mu13 =
+    _prec mu13 =
         nu * 1.0 /
         (phy4[2] * _mui(i, j, k + 6) + phy4[0] * _mui(i, j - 2, k + 6) +
          phy4[1] * _mui(i, j - 1, k + 6) + phy4[3] * _mui(i, j + 1, k + 6));
-    float mu23 =
+    _prec mu23 =
         nu * 1.0 /
         (px4[1] * _mui(i, j, k + 6) + px4[0] * _mui(i - 1, j, k + 6) +
          px4[2] * _mui(i + 1, j, k + 6) + px4[3] * _mui(i + 2, j, k + 6));
-    float div =
+    _prec div =
         dhy4[2] * _u2(i, j, k + 6) + dhy4[0] * _u2(i, j - 2, k + 6) +
         dhy4[1] * _u2(i, j - 1, k + 6) + dhy4[3] * _u2(i, j + 1, k + 6) +
         dx4[1] * _u1(i, j, k + 6) + dx4[0] * _u1(i - 1, j, k + 6) +
@@ -2622,7 +2621,7 @@ __global__ void dtopo_str_111(
                   phdz4[4] * _u1(i + 2, j, k + 7) +
                   phdz4[5] * _u1(i + 2, j, k + 8) +
                   phdz4[6] * _u1(i + 2, j, k + 9)));
-    float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k + 6);
+    _prec f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k + 6);
     _s11(i, j, k + 6) =
         (a * _s11(i, j, k + 6) + lam * div +
          twomu *
@@ -2889,22 +2888,22 @@ __global__ void dtopo_str_111(
 }
 
 __global__ void dtopo_str_112(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
+    _prec *__restrict__ s11, _prec *__restrict__ s12, _prec *__restrict__ s13,
+    _prec *__restrict__ s22, _prec *__restrict__ s23, _prec *__restrict__ s33,
+    _prec *__restrict__ u1, _prec *__restrict__ u2, _prec *__restrict__ u3,
+    const _prec *__restrict__ dcrjx, const _prec *__restrict__ dcrjy,
+    const _prec *__restrict__ dcrjz, const _prec *__restrict__ f,
+    const _prec *__restrict__ f1_1, const _prec *__restrict__ f1_2,
+    const _prec *__restrict__ f1_c, const _prec *__restrict__ f2_1,
+    const _prec *__restrict__ f2_2, const _prec *__restrict__ f2_c,
+    const _prec *__restrict__ f_1, const _prec *__restrict__ f_2,
+    const _prec *__restrict__ f_c, const _prec *__restrict__ g,
+    const _prec *__restrict__ g3, const _prec *__restrict__ g3_c,
+    const _prec *__restrict__ g_c, const _prec *__restrict__ lami,
+    const _prec *__restrict__ mui, const _prec a, const _prec nu, const int nx,
     const int ny, const int nz, const int bi, const int bj, const int ei,
     const int ej) {
-  const float phz4r[6][8] = {
+  const _prec phz4r[6][8] = {
       {0.0000000000000000, 0.8338228784688313, 0.1775123316429260,
        0.1435067013076542, -0.1548419114194114, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000},
@@ -2923,11 +2922,11 @@ __global__ void dtopo_str_112(
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, -0.0625000000000000, 0.5625000000000000,
        0.5625000000000000, -0.0625000000000000}};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dhz4r[6][8] = {
+  const _prec dhz4r[6][8] = {
       {0.0000000000000000, 1.4511412472637157, -1.8534237417911470,
        0.3534237417911469, 0.0488587527362844, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000},
@@ -2946,7 +2945,7 @@ __global__ void dtopo_str_112(
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, -0.0416666666666667, 1.1250000000000000,
        -1.1250000000000000, 0.0416666666666667}};
-  const float phdz4r[6][9] = {
+  const _prec phdz4r[6][9] = {
       {1.5373923010673116, -1.0330083346742178, -0.6211677623382129,
        -0.0454110758451345, 0.1680934225988761, -0.0058985508086226,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -2965,19 +2964,19 @@ __global__ void dtopo_str_112(
       {0.0009619461344193, 0.0035553215968974, -0.0124936029037323,
        -0.0773639466787397, 0.6736586580761996, 0.0002232904416222,
        -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dz4r[6][7] = {
+  const _prec dz4r[6][7] = {
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000},
@@ -2996,7 +2995,7 @@ __global__ void dtopo_str_112(
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
        0.0416666666666667}};
-  const float pdhz4r[6][9] = {
+  const _prec pdhz4r[6][9] = {
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -3101,15 +3100,15 @@ __global__ void dtopo_str_112(
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
   for (int i = bi; i < ei; ++i) {
-    float Jii = _f_c(i, j) * _g3_c(nz - 1 - k);
+    _prec Jii = _f_c(i, j) * _g3_c(nz - 1 - k);
     Jii = 1.0 * 1.0 / Jii;
-    float J12i = _f(i, j) * _g3_c(nz - 1 - k);
+    _prec J12i = _f(i, j) * _g3_c(nz - 1 - k);
     J12i = 1.0 * 1.0 / J12i;
-    float J13i = _f_1(i, j) * _g3(nz - 1 - k);
+    _prec J13i = _f_1(i, j) * _g3(nz - 1 - k);
     J13i = 1.0 * 1.0 / J13i;
-    float J23i = _f_2(i, j) * _g3(nz - 1 - k);
+    _prec J23i = _f_2(i, j) * _g3(nz - 1 - k);
     J23i = 1.0 * 1.0 / J23i;
-    float lam =
+    _prec lam =
         nu * 1.0 /
         (phz4r[k][7] * (phy4[2] * (px4[1] * _lami(i, j, nz - 8) +
                                    px4[0] * _lami(i - 1, j, nz - 8) +
@@ -3239,7 +3238,7 @@ __global__ void dtopo_str_112(
                                    px4[0] * _lami(i - 1, j + 1, nz - 1) +
                                    px4[2] * _lami(i + 1, j + 1, nz - 1) +
                                    px4[3] * _lami(i + 2, j + 1, nz - 1))));
-    float twomu =
+    _prec twomu =
         2 * nu * 1.0 /
         (phz4r[k][7] * (phy4[2] * (px4[1] * _mui(i, j, nz - 8) +
                                    px4[0] * _mui(i - 1, j, nz - 8) +
@@ -3369,23 +3368,23 @@ __global__ void dtopo_str_112(
                                    px4[0] * _mui(i - 1, j + 1, nz - 1) +
                                    px4[2] * _mui(i + 1, j + 1, nz - 1) +
                                    px4[3] * _mui(i + 2, j + 1, nz - 1))));
-    float mu12 =
+    _prec mu12 =
         nu * 1.0 /
         (phz4r[k][7] * _mui(i, j, nz - 8) + phz4r[k][6] * _mui(i, j, nz - 7) +
          phz4r[k][5] * _mui(i, j, nz - 6) + phz4r[k][4] * _mui(i, j, nz - 5) +
          phz4r[k][3] * _mui(i, j, nz - 4) + phz4r[k][2] * _mui(i, j, nz - 3) +
          phz4r[k][1] * _mui(i, j, nz - 2) + phz4r[k][0] * _mui(i, j, nz - 1));
-    float mu13 = nu * 1.0 /
+    _prec mu13 = nu * 1.0 /
                  (phy4[2] * _mui(i, j, nz - 1 - k) +
                   phy4[0] * _mui(i, j - 2, nz - 1 - k) +
                   phy4[1] * _mui(i, j - 1, nz - 1 - k) +
                   phy4[3] * _mui(i, j + 1, nz - 1 - k));
-    float mu23 =
+    _prec mu23 =
         nu * 1.0 /
         (px4[1] * _mui(i, j, nz - 1 - k) + px4[0] * _mui(i - 1, j, nz - 1 - k) +
          px4[2] * _mui(i + 1, j, nz - 1 - k) +
          px4[3] * _mui(i + 2, j, nz - 1 - k));
-    float div =
+    _prec div =
         dhy4[2] * _u2(i, j, nz - 1 - k) + dhy4[0] * _u2(i, j - 2, nz - 1 - k) +
         dhy4[1] * _u2(i, j - 1, nz - 1 - k) +
         dhy4[3] * _u2(i, j + 1, nz - 1 - k) + dx4[1] * _u1(i, j, nz - 1 - k) +
@@ -3480,7 +3479,7 @@ __global__ void dtopo_str_112(
                   phdz4r[k][2] * _u1(i + 2, j, nz - 3) +
                   phdz4r[k][1] * _u1(i + 2, j, nz - 2) +
                   phdz4r[k][0] * _u1(i + 2, j, nz - 1)));
-    float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(nz - 1 - k);
+    _prec f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(nz - 1 - k);
     _s11(i, j, nz - 1 - k) =
         (a * _s11(i, j, nz - 1 - k) + lam * div +
          twomu * (dx4[1] * _u1(i, j, nz - 1 - k) +
@@ -3822,9 +3821,9 @@ __global__ void dtopo_str_112(
 #undef _u3
 }
 
-__global__ void dtopo_init_material_111(float *__restrict__ lami,
-                                        float *__restrict__ mui,
-                                        float *__restrict__ rho, const int nx,
+__global__ void dtopo_init_material_111(_prec *__restrict__ lami,
+                                        _prec *__restrict__ mui,
+                                        _prec *__restrict__ rho, const int nx,
                                         const int ny, const int nz) {
   const int i = threadIdx.z + blockIdx.z * blockDim.z;
   if (i >= nx)
diff --git a/src/topography/kernels/stress_index_unroll.cu b/src/topography/kernels/stress_index_unroll.cu
new file mode 100644
index 0000000..0cae4af
--- /dev/null
+++ b/src/topography/kernels/stress_index_unroll.cu
@@ -0,0 +1,756 @@
+#define _f(i, j) f[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]  // 2D (i, j) accessors: j is the contiguous index (offset by align), i strides by 2*align + 2*ngsl + ny + 4
+#define _f_1(i, j) f_1[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]  // every _f* macro below uses the same index formula on a different array
+#define _f_2(i, j) f_2[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]  // `align`, `ngsl`, `ny` and the array name must be in scope at the point of use
+#define _f2_c(i, j) f2_c[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _f1_1(i, j) f1_1[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _f2_1(i, j) f2_1[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _f2_2(i, j) f2_2[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _f_c(i, j) f_c[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _f1_c(i, j) f1_c[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _f1_2(i, j) f1_2[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+#define _g3_c(k) g3_c[(k)]  // 1D accessors: plain index k with no offset applied by the macro
+#define _g_c(k) g_c[(k)]
+#define _g(k) g[(k)]
+#define _g3(k) g3[(k)]
+
+#define RSTRCT __restrict__  // shorthand used on the kernel's pointer parameters
+#define LDG(x) x  // currently expands to its argument unchanged; NOTE(review): presumably a hook for __ldg() read-only loads - confirm
+#define OVERLAP_ZONE_INDEX 8  // where k - align is below this, the kernel sets mapping = 0 and forces Jii, J13i, J23i to 1
+
+template <int tx, int ty, int tz, int na, int nb>
+__launch_bounds__ (tx*ty*tz)
+__global__ void dtopo_str_111_index_unroll(_prec*  RSTRCT xx, _prec*  RSTRCT yy, _prec*  RSTRCT zz,
+           _prec*  RSTRCT xy, _prec*  RSTRCT xz, _prec*  RSTRCT yz,
+       _prec*  RSTRCT r1, _prec*  RSTRCT r2,  _prec*  RSTRCT r3, 
+       _prec*  RSTRCT r4, _prec*  RSTRCT r5,  _prec*  RSTRCT r6,
+       _prec*  RSTRCT u1, 
+       _prec*  RSTRCT v1,    
+       _prec*  RSTRCT w1,    
+       const _prec *RSTRCT f,
+       const _prec *RSTRCT f1_1, const _prec *RSTRCT f1_2,
+       const _prec *RSTRCT f1_c, const _prec *RSTRCT f2_1,
+       const _prec *RSTRCT f2_2, const _prec *RSTRCT f2_c,
+       const _prec *RSTRCT f_1, const _prec *RSTRCT f_2,
+       const _prec *RSTRCT f_c, const _prec *RSTRCT g,
+       const _prec *RSTRCT g3, const _prec *RSTRCT g3_c,
+       const _prec *RSTRCT g_c,
+       const _prec *RSTRCT  lam,   
+       const _prec *RSTRCT  mu,     
+       const _prec *RSTRCT  qp,
+       const _prec *RSTRCT  coeff, 
+       const _prec *RSTRCT  qs, 
+       const _prec *RSTRCT  dcrjx, 
+       const _prec *RSTRCT  dcrjy, 
+       const _prec *RSTRCT  dcrjz, 
+       const _prec *RSTRCT d_vx1, 
+       const _prec *RSTRCT d_vx2, 
+       const int *RSTRCT d_ww, 
+       const _prec *RSTRCT d_wwo,
+       int NX, int ny, int nz, int rankx, int ranky, 
+       int nzt, int s_i, int e_i, int s_j, int e_j) 
+{ 
+  register int   i,  j,  k;
+  register int   j0,  k0;
+  register int   pos,     pos_ip1, pos_im2, pos_im1;
+  register int   pos_km2, pos_km1, pos_kp1, pos_kp2;
+  register int   pos_jm2, pos_jm1, pos_jp1, pos_jp2;
+  register int   pos_ik1, pos_jk1, pos_ijk, pos_ijk1,f_ww;
+  register _prec vs1, vs2, vs3, a1, tmp, vx1,f_wwo;
+  register _prec xl,  xm,  xmu1, xmu2, xmu3;
+  register _prec qpa, h,   h1,   h2,   h3;
+  register _prec qpaw,hw,h1w,h2w,h3w; 
+  register _prec f_vx1, f_vx2,  f_dcrj, f_r,  f_dcrjy, f_dcrjz;
+  register _prec f_rtmp;
+  register _prec f_u1, u1_ip1, u1_ip2, u1_im1;
+  register _prec f_v1, v1_im1, v1_ip1, v1_im2;
+  register _prec f_w1, w1_im1, w1_im2, w1_ip1;
+  _prec f_xx, f_yy, f_zz, f_xy, f_xz, f_yz;
+
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
+                        0.5625000000000000, -0.0625000000000000};
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+                         1.1250000000000000, -0.0416666666666667};
+  const _prec phdz4[7] = {-0.0026041666666667, 0.0937500000000000,
+                          -0.6796875000000000, -0.0000000000000000,
+                          0.6796875000000000,  -0.0937500000000000,
+                          0.0026041666666667};
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
+                        1.1250000000000000, -0.0416666666666667};
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+                         0.5625000000000000, -0.0625000000000000};
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+                         0.5625000000000000, -0.0625000000000000};
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+                         1.1250000000000000, -0.0416666666666667};
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+                         1.1250000000000000, -0.0416666666666667};
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
+                        0.5625000000000000, -0.0625000000000000};
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
+                        1.1250000000000000, -0.0416666666666667};
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
+                        1.1250000000000000, -0.0416666666666667};
+  const _prec pdhz4[7] = {-0.0026041666666667, 0.0937500000000000,
+                          -0.6796875000000000, -0.0000000000000000,
+                          0.6796875000000000,  -0.0937500000000000,
+                          0.0026041666666667};
+
+    
+  int dm_offset = 3;
+  k0 = na * (blockIdx.x * blockDim.x + threadIdx.x) + align;
+  j0 = nb * (blockIdx.y * blockDim.y + threadIdx.y) + s_j;
+  i = blockIdx.z * blockDim.z + threadIdx.z + s_i;
+
+  _prec rxx[nb][na], ryy[nb][na], rzz[nb][na];
+  _prec rxy[nb][na], rxz[nb][na], ryz[nb][na];
+  _prec rr1[nb][na], rr2[nb][na], rr3[nb][na];
+  _prec rr4[nb][na], rr5[nb][na], rr6[nb][na];
+
+  if (i >= e_i)
+    return;
+  if (j0 >= e_j)
+    return;
+  
+#pragma unroll
+  for (int b = 0; b < nb; ++b) {
+          j = j0 + b;
+#pragma unroll
+  for (int a = 0; a < na; ++a) {
+          k = k0 + a;
+
+  pos  = i*d_slice_1+j*d_yline_1+k;
+
+
+
+  u1_ip1 = u1[pos+d_slice_2];
+  f_u1   = u1[pos+d_slice_1];
+  u1_im1 = u1[pos];    
+  f_v1   = v1[pos+d_slice_1];
+  v1_im1 = v1[pos];
+  v1_im2 = v1[pos-d_slice_1];
+  f_w1   = w1[pos+d_slice_1];
+  w1_im1 = w1[pos];
+  w1_im2 = w1[pos-d_slice_1];
+  f_dcrjz = dcrjz[k];
+  f_dcrjy = dcrjy[j];
+
+  // i - 2, j, k - 3: k + 3
+  int m2p0m3 = pos - d_slice_2 - 3;
+  int m2p0m2 = pos - d_slice_2 - 2;
+  int m2p0m1 = pos - d_slice_2 - 1;
+  int m2p0p0 = pos - d_slice_2 + 0;
+  int m2p0p1 = pos - d_slice_2 + 1;
+  int m2p0p2 = pos - d_slice_2 + 2;
+  int m2p0p3 = pos - d_slice_2 + 3;
+
+
+  // i - 1, j, k - 3: k + 3
+  int m1p0m3 = pos - d_slice_1 - 3;
+  int m1p0m2 = pos - d_slice_1 - 2;
+  int m1p0m1 = pos - d_slice_1 - 1;
+  int m1p0p0 = pos - d_slice_1 + 0;
+  int m1p0p1 = pos - d_slice_1 + 1;
+  int m1p0p2 = pos - d_slice_1 + 2;
+  int m1p0p3 = pos - d_slice_1 + 3;
+
+  // i, j, k - 3: k + 3
+  int p0p0m3 = pos - 3;
+  int p0p0m2 = pos - 2;
+  int p0p0m1 = pos - 1;
+  int p0p0p0 = pos + 0;
+  int p0p0p1 = pos + 1;
+  int p0p0p2 = pos + 2;
+  int p0p0p3 = pos + 3;
+
+  // i + 1, j, k - 3: k + 3
+  int p1p0m3 = pos + d_slice_1 - 3;
+  int p1p0m2 = pos + d_slice_1 - 2;
+  int p1p0m1 = pos + d_slice_1 - 1;
+  int p1p0p0 = pos + d_slice_1 + 0;
+  int p1p0p1 = pos + d_slice_1 + 1;
+  int p1p0p2 = pos + d_slice_1 + 2;
+  int p1p0p3 = pos + d_slice_1 + 3;
+
+  // i + 2, j, k - 3: k + 3
+  int p2p0m3 = pos + d_slice_2 - 3;
+  int p2p0m2 = pos + d_slice_2 - 2;
+  int p2p0m1 = pos + d_slice_2 - 1;
+  int p2p0p0 = pos + d_slice_2 + 0;
+  int p2p0p1 = pos + d_slice_2 + 1;
+  int p2p0p2 = pos + d_slice_2 + 2;
+  int p2p0p3 = pos + d_slice_2 + 3;
+
+  // i, j - 2, k - 3: k + 3
+  int p0m2m3 = pos - d_yline_2 - 3;
+  int p0m2m2 = pos - d_yline_2 - 2;
+  int p0m2m1 = pos - d_yline_2 - 1;
+  int p0m2p0 = pos - d_yline_2 + 0;
+  int p0m2p1 = pos - d_yline_2 + 1;
+  int p0m2p2 = pos - d_yline_2 + 2;
+  int p0m2p3 = pos - d_yline_2 + 3;
+
+  // i, j - 1, k - 3: k + 3
+  int p0m1m3 = pos - d_yline_1 - 3;
+  int p0m1m2 = pos - d_yline_1 - 2;
+  int p0m1m1 = pos - d_yline_1 - 1;
+  int p0m1p0 = pos - d_yline_1 + 0;
+  int p0m1p1 = pos - d_yline_1 + 1;
+  int p0m1p2 = pos - d_yline_1 + 2;
+  int p0m1p3 = pos - d_yline_1 + 3;
+
+  // i, j + 1, k - 3: k + 3
+  int p0p1m3 = pos + d_yline_1 - 3;
+  int p0p1m2 = pos + d_yline_1 - 2;
+  int p0p1m1 = pos + d_yline_1 - 1;
+  int p0p1p0 = pos + d_yline_1 + 0;
+  int p0p1p1 = pos + d_yline_1 + 1;
+  int p0p1p2 = pos + d_yline_1 + 2;
+  int p0p1p3 = pos + d_yline_1 + 3;
+
+  // i, j + 2, k - 3: k + 3
+  int p0p2m3 = pos + d_yline_2 - 3;
+  int p0p2m2 = pos + d_yline_2 - 2;
+  int p0p2m1 = pos + d_yline_2 - 1;
+  int p0p2p0 = pos + d_yline_2 + 0;
+  int p0p2p1 = pos + d_yline_2 + 1;
+  int p0p2p2 = pos + d_yline_2 + 2;
+  int p0p2p3 = pos + d_yline_2 + 3;
+
+
+  // i - 2 : i + 1, j
+  //int m2p0 = fpos - d_fline_2;
+  //int m1p0 = fpos - d_fline_1;
+  //int p0p0 = fpos;
+  //int p1p0 = fpos + d_fline_1;
+  //int p2p0 = fpos + d_fline_2;
+
+
+    f_vx1 = d_vx1[pos];
+    f_vx2 = d_vx2[pos];
+    f_ww  = d_ww[pos];
+    f_wwo = d_wwo[pos];
+    
+    f_dcrj   = dcrjx[i]*f_dcrjy*f_dcrjz;
+
+
+    pos_km2  = pos-2;
+    pos_km1  = pos-1;
+    pos_kp1  = pos+1;
+    pos_kp2  = pos+2;
+    pos_jm2  = pos-d_yline_2;
+    pos_jm1  = pos-d_yline_1;
+    pos_jp1  = pos+d_yline_1;
+    pos_jp2  = pos+d_yline_2;
+    pos_im2  = pos-d_slice_2;
+    pos_im1  = pos-d_slice_1;
+    pos_ip1  = pos+d_slice_1;
+    pos_jk1  = pos-d_yline_1-1;
+    pos_ik1  = pos+d_slice_1-1;
+    pos_ijk  = pos+d_slice_1-d_yline_1;
+    pos_ijk1 = pos+d_slice_1-d_yline_1-1;
+
+    xl       = 8.0f/(  LDG(lam[pos])      + LDG(lam[pos_ip1]) + LDG(lam[pos_jm1]) + LDG(lam[pos_ijk])
+                       + LDG(lam[pos_km1])  + LDG(lam[pos_ik1]) + LDG(lam[pos_jk1]) + LDG(lam[pos_ijk1]) );
+    xm       = 16.0f/( LDG(mu[pos])       + LDG(mu[pos_ip1])  + LDG(mu[pos_jm1])  + LDG(mu[pos_ijk])
+                       + LDG(mu[pos_km1])   + LDG(mu[pos_ik1])  + LDG(mu[pos_jk1])  + LDG(mu[pos_ijk1]) );
+    xmu1     = 2.0f/(  LDG(mu[pos])       + LDG(mu[pos_km1]) );
+    xmu2     = 2.0/(  LDG(mu[pos])       + LDG(mu[pos_jm1]) );
+    xmu3     = 2.0/(  LDG(mu[pos])       + LDG(mu[pos_ip1]) );
+    xl       = xl  +  xm;
+    qpa      = 0.0625f*( LDG(qp[pos])     + LDG(qp[pos_ip1]) + LDG(qp[pos_jm1]) + LDG(qp[pos_ijk])
+                         + LDG(qp[pos_km1]) + LDG(qp[pos_ik1]) + LDG(qp[pos_jk1]) + LDG(qp[pos_ijk1]) );
+
+    if(1.0f/(qpa*2.0f)<=200.0f)
+    {
+      qpaw=coeff[f_ww*2-2]*(2.*qpa)*(2.*qpa)+coeff[f_ww*2-1]*(2.*qpa);
+    }
+    else {
+        //suggested by Kyle
+	qpaw  = 2.0f*f_wwo*qpa;
+        // qpaw  = f_wwo*qpa;
+    }
+    qpaw=qpaw/f_wwo;
+
+
+    h        = 0.0625f*( LDG(qs[pos])     + LDG(qs[pos_ip1]) + LDG(qs[pos_jm1]) + LDG(qs[pos_ijk])
+                         + LDG(qs[pos_km1]) + LDG(qs[pos_ik1]) + LDG(qs[pos_jk1]) + LDG(qs[pos_ijk1]) );
+
+    if(1.0f/(h*2.0f)<=200.0f)
+    {
+      hw=coeff[f_ww*2-2]*(2.0f*h)*(2.0f*h)+coeff[f_ww*2-1]*(2.0f*h);
+    }
+    else {
+      //suggested by Kyle
+      hw  = 2.0f*f_wwo*h;
+      // hw  = f_wwo*h;
+    }
+    hw=hw/f_wwo;
+
+
+    h1       = 0.250f*(  qs[pos]     + qs[pos_km1] );
+
+    if(1.0f/(h1*2.0f)<=200.0f)
+    {
+      h1w=coeff[f_ww*2-2]*(2.0f*h1)*(2.0f*h1)+coeff[f_ww*2-1]*(2.0f*h1);
+    }
+    else {
+        //suggested by Kyle
+	h1w  = 2.0f*f_wwo*h1;
+        // h1w  = f_wwo*h1;
+    }
+    h1w=h1w/f_wwo;
+
+    h2       = 0.250f*(  qs[pos]     + qs[pos_jm1] );
+    if(1.0f/(h2*2.0f)<=200.0f)
+    {
+      h2w=coeff[f_ww*2-2]*(2.0f*h2)*(2.0f*h2)+coeff[f_ww*2-1]*(2.0f*h2);
+    }
+    else {
+        //suggested by Kyle
+        //h2w  = f_wwo*h2;
+	h2w  = 2.0f*f_wwo*h2;
+    }
+    h2w=h2w/f_wwo;
+
+
+    h3       = 0.250f*(  qs[pos]     + qs[pos_ip1] );
+    if(1.0f/(h3*2.0f)<=200.0f)
+    {
+      h3w=coeff[f_ww*2-2]*(2.0f*h3)*(2.0f*h3)+coeff[f_ww*2-1]*(2.0f*h3);
+    }
+    else {
+      //suggested by Kyle
+      h3w  = 2.0f*f_wwo*h3;
+      //h3w  = f_wwo*h3;
+    }
+    h3w=h3w/f_wwo;
+
+    h        = -xm*hw*d_dh1;
+    h1       = -xmu1*h1w*d_dh1;
+    h2       = -xmu2*h2w*d_dh1;
+    h3       = -xmu3*h3w*d_dh1;
+
+
+    qpa      = -qpaw*xl*d_dh1;
+
+    xm       = xm*d_dth;
+    xmu1     = xmu1*d_dth;
+    xmu2     = xmu2*d_dth;
+    xmu3     = xmu3*d_dth;
+    xl       = xl*d_dth;
+    h        = h*f_vx1;
+    h1       = h1*f_vx1;
+    h2       = h2*f_vx1;
+    h3       = h3*f_vx1;
+    qpa      = qpa*f_vx1;
+
+    xm       = xm+d_DT*h;
+    xmu1     = xmu1+d_DT*h1;
+    xmu2     = xmu2+d_DT*h2;
+    xmu3     = xmu3+d_DT*h3;
+    vx1      = d_DT*(1+f_vx2*f_vx1);
+        
+    u1_ip2   = u1_ip1;
+    u1_ip1   = f_u1;
+    f_u1     = u1_im1;
+    u1_im1   = u1[pos_im1];
+    v1_ip1   = f_v1;
+    f_v1     = v1_im1;
+    v1_im1   = v1_im2;
+    v1_im2   = v1[pos_im2];
+    w1_ip1   = f_w1;
+    f_w1     = w1_im1;
+    w1_im1   = w1_im2;
+    w1_im2   = w1[pos_im2];
+
+
+    float mapping = 1.0;
+    if (k - align < OVERLAP_ZONE_INDEX)
+        mapping = 0.0;
+
+
+    // xx, yy, zz
+
+    _prec Jii = _f_c(i, j) * _g3_c(k);
+          Jii = 1.0 * 1.0 / Jii;
+
+    if (k - align < OVERLAP_ZONE_INDEX)
+        Jii = 1.0;
+
+
+
+    vs1 =
+      dx4[1] * u1[p0p0p0] + dx4[0] * u1[m1p0p0] +
+      dx4[2] * u1[p1p0p0] + dx4[3] * u1[p2p0p0] -
+      mapping * Jii * _g_c(k) *
+          (
+           px4[0] * _f1_1(i - 1, j) *
+               (
+                phdz4[0] * u1[m1p0m3] +
+                phdz4[1] * u1[m1p0m2] +
+                phdz4[2] * u1[m1p0m1] +
+                phdz4[3] * u1[m1p0p0] +
+                phdz4[4] * u1[m1p0p1] +
+                phdz4[5] * u1[m1p0p2] +
+                phdz4[6] * u1[m1p0p3]
+                ) 
+               +
+           px4[1] * _f1_1(i, j) *
+               (
+                phdz4[0] * u1[p0p0m3] +
+                phdz4[1] * u1[p0p0m2] +
+                phdz4[2] * u1[p0p0m1] +
+                phdz4[3] * u1[p0p0p0] +
+                phdz4[4] * u1[p0p0p1] +
+                phdz4[5] * u1[p0p0p2] +
+                phdz4[6] * u1[p0p0p3]
+                ) +
+           px4[2] * _f1_1(i + 1, j) *
+               (
+                phdz4[0] * u1[p1p0m3] +
+                phdz4[1] * u1[p1p0m2] +
+                phdz4[2] * u1[p1p0m1] +
+                phdz4[3] * u1[p1p0p0] +
+                phdz4[4] * u1[p1p0p1] +
+                phdz4[5] * u1[p1p0p2] +
+                phdz4[6] * u1[p1p0p3]
+                ) +
+           px4[3] * _f1_1(i + 2, j) *
+               (
+                phdz4[0] * u1[p2p0m3] +
+                phdz4[1] * u1[p2p0m2] +
+                phdz4[2] * u1[p2p0m1] +
+                phdz4[3] * u1[p2p0p0] +
+                phdz4[4] * u1[p2p0p1] +
+                phdz4[5] * u1[p2p0p2] +
+                phdz4[6] * u1[p2p0p3]
+                )
+         );
+    vs2 =
+      dhy4[2] * v1[p0p0p0] + dhy4[0] * v1[p0m2p0] +
+      dhy4[1] * v1[p0m1p0] + dhy4[3] * v1[p0p1p0] -
+      mapping * Jii * _g_c(k) *
+           (phy4[0] * _f2_2(i, j - 2) *
+                (
+                phdz4[0] * v1[p0m2m3] +
+                phdz4[1] * v1[p0m2m2] +
+                phdz4[2] * v1[p0m2m1] +
+                phdz4[3] * v1[p0m2p0] +
+                phdz4[4] * v1[p0m2p1] +
+                phdz4[5] * v1[p0m2p2] +
+                phdz4[6] * v1[p0m2p3]
+                ) +
+           phy4[1] * _f2_2(i, j - 1) *
+               (
+                phdz4[0] * v1[p0m1m3] +
+                phdz4[1] * v1[p0m1m2] +
+                phdz4[2] * v1[p0m1m1] +
+                phdz4[3] * v1[p0m1p0] +
+                phdz4[4] * v1[p0m1p1] +
+                phdz4[5] * v1[p0m1p2] +
+                phdz4[6] * v1[p0m1p3]
+               ) +
+          phy4[2] * _f2_2(i, j) *
+               (
+                phdz4[0] * v1[p0p0m3] +
+                phdz4[1] * v1[p0p0m2] +
+                phdz4[2] * v1[p0p0m1] +
+                phdz4[3] * v1[p0p0p0] +
+                phdz4[4] * v1[p0p0p1] +
+                phdz4[5] * v1[p0p0p2] +
+                phdz4[6] * v1[p0p0p3]
+                ) +
+           phy4[3] * _f2_2(i, j + 1) *
+               (
+                phdz4[0] * v1[p0p1m3] +
+                phdz4[1] * v1[p0p1m2] +
+                phdz4[2] * v1[p0p1m1] +
+                phdz4[3] * v1[p0p1p0] +
+                phdz4[4] * v1[p0p1p1] +
+                phdz4[5] * v1[p0p1p2] +
+                phdz4[6] * v1[p0p1p3]
+                )
+               );
+  vs3 =
+      Jii * (dhz4[2] * w1[p0p0p0] + dhz4[0] * w1[p0p0m2] +
+             dhz4[1] * w1[p0p0m1] + dhz4[3] * w1[p0p0p1]);
+
+    tmp      = xl*(vs1+vs2+vs3);
+
+    a1       = qpa*(vs1+vs2+vs3);
+    tmp      = tmp+d_DT*a1;
+
+    f_r      = r1[pos];
+    f_rtmp   = -h*(vs2+vs3) + a1; 
+    f_xx     = xx[pos]  + tmp - xm*(vs2+vs3) + vx1*f_r;  
+    rr1[b][a]  = f_vx2*f_r + f_wwo*f_rtmp;
+    f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
+    rxx[b][a]  = (f_xx + d_DT*f_rtmp)*f_dcrj;
+
+    f_r      = r2[pos];
+    f_rtmp   = -h*(vs1+vs3) + a1;  
+    f_yy     = (yy[pos]  + tmp - xm*(vs1+vs3) + vx1*f_r)*f_dcrj;
+    rr2[b][a]  = f_vx2*f_r + f_wwo*f_rtmp; 
+    f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1); 
+    ryy[b][a]  = (f_yy + d_DT*f_rtmp)*f_dcrj;
+	
+    f_r      = r3[pos];
+    f_rtmp   = -h*(vs1+vs2) + a1;
+    f_zz     = (zz[pos]  + tmp - xm*(vs1+vs2) + vx1*f_r)*f_dcrj;
+    rr3[b][a]  = f_vx2*f_r + f_wwo*f_rtmp;
+    f_rtmp   = f_rtmp*(f_wwo-1.0f) + f_vx2*f_r*(1.0f-f_vx1);  
+    rzz[b][a]  = (f_zz + d_DT*f_rtmp)*f_dcrj;
+
+    // xy
+  _prec J12i = _f(i, j) * _g3_c(k);
+  J12i = 1.0 / J12i;
+
+  vs1 =
+      dy4[1] * u1[p0p0p0] + dy4[0] * u1[p0m1p0] +
+      dy4[2] * u1[p0p1p0] + dy4[3] * u1[p0p2p0] -
+      mapping * J12i * _g_c(k) *
+          (
+           py4[0] * _f2_1(i, j - 1) *
+               (
+                phdz4[0] * u1[p0m1m3] +
+                phdz4[1] * u1[p0m1m2] +
+                phdz4[2] * u1[p0m1m1] +
+                phdz4[3] * u1[p0m1p0] +
+                phdz4[4] * u1[p0m1p1] +
+                phdz4[5] * u1[p0m1p2] +
+                phdz4[6] * u1[p0m1p3]) +
+           py4[1] * _f2_1(i, j) *
+               (
+                phdz4[0] * u1[p0p0m3] +
+                phdz4[1] * u1[p0p0m2] +
+                phdz4[2] * u1[p0p0m1] +
+                phdz4[3] * u1[p0p0p0] +
+                phdz4[4] * u1[p0p0p1] +
+                phdz4[5] * u1[p0p0p2] +
+                phdz4[6] * u1[p0p0p3]) +
+           py4[2] * _f2_1(i, j + 1) *
+               (
+                phdz4[0] * u1[p0p1m3] +
+                phdz4[1] * u1[p0p1m2] +
+                phdz4[2] * u1[p0p1m1] +
+                phdz4[3] * u1[p0p1p0] +
+                phdz4[4] * u1[p0p1p1] +
+                phdz4[5] * u1[p0p1p2] +
+                phdz4[6] * u1[p0p1p3]) +
+           py4[3] * _f2_1(i, j + 2) *
+               (
+                phdz4[0] * u1[p0p2m3] +
+                phdz4[1] * u1[p0p2m2] +
+                phdz4[2] * u1[p0p2m1] +
+                phdz4[3] * u1[p0p2p0] +
+                phdz4[4] * u1[p0p2p1] +
+                phdz4[5] * u1[p0p2p2] +
+                phdz4[6] * u1[p0p2p3]) 
+                );
+  vs2 =
+      dhx4[2] * v1[p0p0p0] + dhx4[0] * v1[m2p0p0] +
+      dhx4[1] * v1[m1p0p0] + dhx4[3] * v1[p1p0p0] -
+      mapping * J12i * _g_c(k) *
+          (
+           phx4[0] * _f1_2(i - 2, j) *
+               (
+                phdz4[0] * v1[m2p0m3] +
+                phdz4[1] * v1[m2p0m2] +
+                phdz4[2] * v1[m2p0m1] +
+                phdz4[3] * v1[m2p0p0] +
+                phdz4[4] * v1[m2p0p1] +
+                phdz4[5] * v1[m2p0p2] +
+                phdz4[6] * v1[m2p0p3]
+                ) +
+           phx4[1] * _f1_2(i - 1, j) *
+               (
+                phdz4[0] * v1[m1p0m3] +
+                phdz4[1] * v1[m1p0m2] +
+                phdz4[2] * v1[m1p0m1] +
+                phdz4[3] * v1[m1p0p0] +
+                phdz4[4] * v1[m1p0p1] +
+                phdz4[5] * v1[m1p0p2] +
+                phdz4[6] * v1[m1p0p3]
+                ) +
+           phx4[2] * _f1_2(i, j) *
+               (
+                phdz4[0] * v1[p0p0m3] +
+                phdz4[1] * v1[p0p0m2] +
+                phdz4[2] * v1[p0p0m1] +
+                phdz4[3] * v1[p0p0p0] +
+                phdz4[4] * v1[p0p0p1] +
+                phdz4[5] * v1[p0p0p2] +
+                phdz4[6] * v1[p0p0p3]
+                ) +
+           phx4[3] * _f1_2(i + 1, j) *
+               (
+                phdz4[0] * v1[p1p0m3] +
+                phdz4[1] * v1[p1p0m2] +
+                phdz4[2] * v1[p1p0m1] +
+                phdz4[3] * v1[p1p0p0] +
+                phdz4[4] * v1[p1p0p1] +
+                phdz4[5] * v1[p1p0p2] +
+                phdz4[6] * v1[p1p0p3]
+                ));
+
+    f_r      = r4[pos];
+    f_rtmp   = h1*(vs1+vs2); 
+    f_xy     = xy[pos]  + xmu1*(vs1+vs2) + vx1*f_r;
+    rr4[b][a]  = f_vx2*f_r + f_wwo*f_rtmp; 
+    f_rtmp   = f_rtmp*(f_wwo-1) + f_vx2*f_r*(1-f_vx1);
+    rxy[b][a]  = (f_xy + d_DT*f_rtmp)*f_dcrj;
+
+    // xz
+
+  _prec J13i = _f_1(i, j) * _g3(k);
+  J13i = 1.0 * 1.0 / J13i;
+  if (k - align  < OVERLAP_ZONE_INDEX)
+      J13i = 1.0;
+
+  vs1 = J13i * (dz4[1] * u1[p0p0p0] + dz4[0] * u1[p0p0m1] +
+                dz4[2] * u1[p0p0p1] + dz4[3] * u1[p0p0p2]);
+  vs2 =
+      dhx4[2] * w1[p0p0p0] + dhx4[0] * w1[m2p0p0] +
+      dhx4[1] * w1[m1p0p0] + dhx4[3] * w1[p1p0p0] -
+      J13i * _g(k) *
+          (     
+           phx4[0] * _f1_c(i - 2, j) *
+               (
+                pdhz4[0] * w1[m2p0m3] +
+                pdhz4[1] * w1[m2p0m2] +
+                pdhz4[2] * w1[m2p0m1] +
+                pdhz4[3] * w1[m2p0p0] +
+                pdhz4[4] * w1[m2p0p1] +
+                pdhz4[5] * w1[m2p0p2] +
+                pdhz4[6] * w1[m2p0p3]
+               ) + 
+           phx4[1] * _f1_c(i - 1, j) *
+                (
+                pdhz4[0] * w1[m1p0m3] +
+                pdhz4[1] * w1[m1p0m2] +
+                pdhz4[2] * w1[m1p0m1] +
+                pdhz4[3] * w1[m1p0p0] +
+                pdhz4[4] * w1[m1p0p1] +
+                pdhz4[5] * w1[m1p0p2] +
+                pdhz4[6] * w1[m1p0p3]) +
+           phx4[2] * _f1_c(i, j) *
+               (pdhz4[0] * w1[p0p0m3] +
+                pdhz4[1] * w1[p0p0m2] +
+                pdhz4[2] * w1[p0p0m1] +
+                pdhz4[3] * w1[p0p0p0] +
+                pdhz4[4] * w1[p0p0p1] +
+                pdhz4[5] * w1[p0p0p2] +
+                pdhz4[6] * w1[p0p0p3]) +
+           phx4[3] * _f1_c(i + 1, j) *
+               (pdhz4[0] * w1[p1p0m3] +
+                pdhz4[1] * w1[p1p0m2] +
+                pdhz4[2] * w1[p1p0m1] +
+                pdhz4[3] * w1[p1p0p0] +
+                pdhz4[4] * w1[p1p0p1] +
+                pdhz4[5] * w1[p1p0p2] +
+                pdhz4[6] * w1[p1p0p3]
+                ));
+    f_r     = r5[pos];
+    f_rtmp  = h2*(vs1+vs2);
+    f_xz    = xz[pos]  + xmu2*(vs1+vs2) + vx1*f_r; 
+    rr5[b][a] = f_vx2*f_r + f_wwo*f_rtmp; 
+    f_rtmp  = f_rtmp*(f_wwo-1.0f) + f_vx2*f_r*(1.0f-f_vx1); 
+    rxz[b][a] = (f_xz + d_DT*f_rtmp)*f_dcrj;
+
+    // yz
+
+    _prec J23i = _f_2(i, j) * _g3(k);
+    J23i = 1.0 * 1.0 / J23i;
+  if (k - align  < OVERLAP_ZONE_INDEX)
+      J23i = 1.0;
+    vs1 = J23i * (dz4[1] * v1[p0p0p0] + dz4[0] * v1[p0p0m1] +
+                  dz4[2] * v1[p0p0p1] + dz4[3] * v1[p0p0p2]);
+    vs2 =
+        dy4[1] * w1[p0p0p0] + dy4[0] * w1[p0m1p0] +
+        dy4[2] * w1[p0p1p0] + dy4[3] * w1[p0p2p0] -
+        J23i * _g(k) *
+            (
+             py4[0] * _f2_c(i, j - 1) *
+                 (
+                  pdhz4[0] * w1[p0m1m3] +
+                  pdhz4[1] * w1[p0m1m2] +
+                  pdhz4[2] * w1[p0m1m1] +
+                  pdhz4[3] * w1[p0m1p0] +
+                  pdhz4[4] * w1[p0m1p1] +
+                  pdhz4[5] * w1[p0m1p2] +
+                  pdhz4[6] * w1[p0m1p3]
+                  ) +
+             py4[1] * _f2_c(i, j) *
+                 (
+                  pdhz4[0] * w1[p0p0m3] +
+                  pdhz4[1] * w1[p0p0m2] +
+                  pdhz4[2] * w1[p0p0m1] +
+                  pdhz4[3] * w1[p0p0p0] +
+                  pdhz4[4] * w1[p0p0p1] +
+                  pdhz4[5] * w1[p0p0p2] +
+                  pdhz4[6] * w1[p0p0p3]
+                  ) +
+             py4[2] * _f2_c(i, j + 1) *
+                 (
+                  pdhz4[0] * w1[p0p1m3] +
+                  pdhz4[1] * w1[p0p1m2] +
+                  pdhz4[2] * w1[p0p1m1] +
+                  pdhz4[3] * w1[p0p1p0] +
+                  pdhz4[4] * w1[p0p1p1] +
+                  pdhz4[5] * w1[p0p1p2] +
+                  pdhz4[6] * w1[p0p1p3]
+                  ) +
+             py4[3] * _f2_c(i, j + 2) *
+                 (
+                  pdhz4[0] * w1[p0p2m3] +
+                  pdhz4[1] * w1[p0p2m2] +
+                  pdhz4[2] * w1[p0p2m1] +
+                  pdhz4[3] * w1[p0p2p0] +
+                  pdhz4[4] * w1[p0p2p1] +
+                  pdhz4[5] * w1[p0p2p2] +
+                  pdhz4[6] * w1[p0p2p3]
+                  ));
+           
+    f_r     = r6[pos];
+    f_rtmp  = h3*(vs1+vs2);
+    f_yz    = yz[pos]  + xmu3*(vs1+vs2) + vx1*f_r;
+    rr6[b][a] = f_vx2*f_r + f_wwo*f_rtmp;
+    f_rtmp  = f_rtmp*(f_wwo-1.0f) + f_vx2*f_r*(1.0f-f_vx1); 
+    ryz[b][a] = (f_yz + d_DT*f_rtmp)*f_dcrj; 
+  }
+  }
+
+#pragma unroll
+  for (int b = 0; b < nb; ++b) {
+          j = j0 + b;
+     if (j >= e_j)
+       continue;
+#pragma unroll
+  for (int a = 0; a < na; ++a) {
+     k = k0 + a;
+     pos  = i*d_slice_1+j*d_yline_1+k;
+     if (k < dm_offset + align)
+       continue;
+     if (k >= nz - 6 + align)
+       continue;
+
+        xx[pos] =  rxx[b][a];
+        yy[pos] =  ryy[b][a];
+        zz[pos] =  rzz[b][a];
+        xy[pos] =  rxy[b][a];
+        xz[pos] =  rxz[b][a];
+        yz[pos] =  ryz[b][a];
+        
+        r1[pos] =  rr1[b][a];
+        r2[pos] =  rr2[b][a];
+        r3[pos] =  rr3[b][a];
+        r4[pos] =  rr4[b][a];
+        r5[pos] =  rr5[b][a];
+        r6[pos] =  rr6[b][a];
+
+        }
+  }
+  
+
+}
+#undef OVERLAP_ZONE_INDEX
diff --git a/src/topography/kernels/optimized_velocity.cu b/src/topography/kernels/velocity.cu
similarity index 90%
rename from src/topography/kernels/optimized_velocity.cu
rename to src/topography/kernels/velocity.cu
index 84278d3..8b973a3 100644
--- a/src/topography/kernels/optimized_velocity.cu
+++ b/src/topography/kernels/velocity.cu
@@ -1,5 +1,3 @@
-#include <topography/kernels/optimized_launch_config.cuh>
-#include <topography/kernels/optimized_velocity.cuh>
 
 // Turning __restrict__ on or off...
 //#define RSTRCT
@@ -7,25 +5,31 @@
 #include <stdio.h>
 #include <test/test.h>
 
+
+#define OVERLAP_ZONE_INDEX 8 // NOTE(review): presumably a k-index cutoff for the overlap zone; confirm against its uses in this file
+#ifndef APPLY_BC // guard lets the build override the default, e.g. -DAPPLY_BC=0
+#define APPLY_BC 1 // default: nonzero compiles the `#if APPLY_BC` set of coefficient tables in dtopo_vel_112, 0 the `#else` set
+#endif // APPLY_BC
+
 __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
 
     __global__ void dtopo_vel_110(
-        float* RSTRCT u1, float* RSTRCT u2, float* RSTRCT u3,
-        const float* RSTRCT dcrjx, const float* RSTRCT dcrjy,
-        const float* RSTRCT dcrjz, const float* RSTRCT f,
-        const float* RSTRCT f1_1, const float* RSTRCT f1_2,
-        const float* RSTRCT f1_c, const float* RSTRCT f2_1,
-        const float* RSTRCT f2_2, const float* RSTRCT f2_c,
-        const float* RSTRCT f_1, const float* RSTRCT f_2,
-        const float* RSTRCT f_c, const float* RSTRCT g,
-        const float* RSTRCT g3, const float* RSTRCT g3_c,
-        const float* RSTRCT g_c, const float* RSTRCT rho,
-        const float* RSTRCT s11, const float* RSTRCT s12,
-        const float* RSTRCT s13, const float* RSTRCT s22,
-        const float* RSTRCT s23, const float* RSTRCT s33,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec* RSTRCT u1, _prec* RSTRCT u2, _prec* RSTRCT u3,
+        const _prec* RSTRCT dcrjx, const _prec* RSTRCT dcrjy,
+        const _prec* RSTRCT dcrjz, const _prec* RSTRCT f,
+        const _prec* RSTRCT f1_1, const _prec* RSTRCT f1_2,
+        const _prec* RSTRCT f1_c, const _prec* RSTRCT f2_1,
+        const _prec* RSTRCT f2_2, const _prec* RSTRCT f2_c,
+        const _prec* RSTRCT f_1, const _prec* RSTRCT f_2,
+        const _prec* RSTRCT f_c, const _prec* RSTRCT g,
+        const _prec* RSTRCT g3, const _prec* RSTRCT g3_c,
+        const _prec* RSTRCT g_c, const _prec* RSTRCT rho,
+        const _prec* RSTRCT s11, const _prec* RSTRCT s12,
+        const _prec* RSTRCT s13, const _prec* RSTRCT s22,
+        const _prec* RSTRCT s23, const _prec* RSTRCT s33,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bi, const int bj, const int ei, const int ej) {
-        const float phz2l[6][7] = {
+        const _prec phz2l[6][7] = {
             {1.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000},
@@ -44,9 +48,9 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.5000000000000000, 0.5000000000000000,
              0.0000000000000000}};
-        const float phy2[2] = {0.5000000000000000, 0.5000000000000000};
-        const float phx2[2] = {0.5000000000000000, 0.5000000000000000};
-        const float dhpz4l[6][9] = {
+        const _prec phy2[2] = {0.5000000000000000, 0.5000000000000000};
+        const _prec phx2[2] = {0.5000000000000000, 0.5000000000000000};
+        const _prec dhpz4l[6][9] = {
             {-1.4276800979942257, 0.2875185051606178, 2.0072491465276454,
              -0.8773816261307504, 0.0075022330101095, 0.0027918394266035,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -65,15 +69,15 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
             {-0.0020323834153791, -0.0002106933140862, 0.0013351454085978,
              0.0938400881871787, -0.6816971139746001, 0.0002232904416222,
              0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-        const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhz4l[6][7] = {
+        const _prec dhz4l[6][7] = {
             {-1.4511412472637157, 1.8534237417911470, -0.3534237417911469,
              -0.0488587527362844, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000},
@@ -92,15 +96,15 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0416666666666667, -1.1250000000000000, 1.1250000000000000,
              -0.0416666666666667}};
-        const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dphz4l[6][9] = {
+        const _prec dphz4l[6][9] = {
             {-1.3764648947859957, 1.8523239861274134, -0.5524268681758195,
              0.0537413571133823, 0.0228264197210198, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -119,7 +123,7 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, -0.0026041666666667,
              0.0937500000000000, -0.6796875000000000, 0.0000000000000000,
              0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-        const float dz4l[6][8] = {
+        const _prec dz4l[6][8] = {
             {-1.7779989465546748, 1.3337480247900155, 0.7775013168066564,
              -0.3332503950419969, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000},
@@ -223,7 +227,7 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
         u3[(k) + align +                                               \
            (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
            (2 * align + nz) * ((j) + ngsl + 2)]
-        float rho1 =
+        _prec rho1 =
             phz2l[k][0] *
                 (phy2[1] * _rho(i, j, 0) + phy2[0] * _rho(i, j - 1, 0)) +
             phz2l[k][1] *
@@ -238,7 +242,7 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
                 (phy2[1] * _rho(i, j, 5) + phy2[0] * _rho(i, j - 1, 5)) +
             phz2l[k][6] *
                 (phy2[1] * _rho(i, j, 6) + phy2[0] * _rho(i, j - 1, 6));
-        float rho2 =
+        _prec rho2 =
             phz2l[k][0] *
                 (phx2[1] * _rho(i, j, 0) + phx2[0] * _rho(i - 1, j, 0)) +
             phz2l[k][1] *
@@ -253,17 +257,17 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
                 (phx2[1] * _rho(i, j, 5) + phx2[0] * _rho(i - 1, j, 5)) +
             phz2l[k][6] *
                 (phx2[1] * _rho(i, j, 6) + phx2[0] * _rho(i - 1, j, 6));
-        float rho3 =
+        _prec rho3 =
             phy2[1] * (phx2[1] * _rho(i, j, k) + phx2[0] * _rho(i - 1, j, k)) +
             phy2[0] *
                 (phx2[1] * _rho(i, j - 1, k) + phx2[0] * _rho(i - 1, j - 1, k));
-        float Ai1 = _f_1(i, j) * _g3_c(k) * rho1;
+        _prec Ai1 = _f_1(i, j) * _g3_c(k) * rho1;
         Ai1 = nu * 1.0 / Ai1;
-        float Ai2 = _f_2(i, j) * _g3_c(k) * rho2;
+        _prec Ai2 = _f_2(i, j) * _g3_c(k) * rho2;
         Ai2 = nu * 1.0 / Ai2;
-        float Ai3 = _f_c(i, j) * _g3(k) * rho3;
+        _prec Ai3 = _f_c(i, j) * _g3(k) * rho3;
         Ai3 = nu * 1.0 / Ai3;
-        float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
+        _prec f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
         _u1(i, j, k) =
             (a * _u1(i, j, k) +
              Ai1 * (dhx4[2] * _f_c(i, j) * _g3_c(k) * _s11(i, j, k) +
@@ -611,48 +615,48 @@ __launch_bounds__(DTOPO_VEL_110_MAX_THREADS_PER_BLOCK)
 __launch_bounds__(DTOPO_VEL_111_MAX_THREADS_PER_BLOCK)
 
 __global__ void dtopo_vel_111(
-        float *RSTRCT u1, float *RSTRCT u2, float *RSTRCT u3,
-        const float *RSTRCT dcrjx, const float *RSTRCT dcrjy,
-        const float *RSTRCT dcrjz, const float *RSTRCT f,
-        const float *RSTRCT f1_1, const float *RSTRCT f1_2,
-        const float *RSTRCT f1_c, const float *RSTRCT f2_1,
-        const float *RSTRCT f2_2, const float *RSTRCT f2_c,
-        const float *RSTRCT f_1, const float *RSTRCT f_2,
-        const float *RSTRCT f_c, const float *RSTRCT g,
-        const float *RSTRCT g3, const float *RSTRCT g3_c,
-        const float *RSTRCT g_c, const float *RSTRCT rho,
-        const float *RSTRCT s11, const float *RSTRCT s12,
-        const float *RSTRCT s13, const float *RSTRCT s22,
-        const float *RSTRCT s23, const float *RSTRCT s33,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec *RSTRCT u1, _prec *RSTRCT u2, _prec *RSTRCT u3,
+        const _prec *RSTRCT dcrjx, const _prec *RSTRCT dcrjy,
+        const _prec *RSTRCT dcrjz, const _prec *RSTRCT f,
+        const _prec *RSTRCT f1_1, const _prec *RSTRCT f1_2,
+        const _prec *RSTRCT f1_c, const _prec *RSTRCT f2_1,
+        const _prec *RSTRCT f2_2, const _prec *RSTRCT f2_c,
+        const _prec *RSTRCT f_1, const _prec *RSTRCT f_2,
+        const _prec *RSTRCT f_c, const _prec *RSTRCT g,
+        const _prec *RSTRCT g3, const _prec *RSTRCT g3_c,
+        const _prec *RSTRCT g_c, const _prec *RSTRCT rho,
+        const _prec *RSTRCT s11, const _prec *RSTRCT s12,
+        const _prec *RSTRCT s13, const _prec *RSTRCT s22,
+        const _prec *RSTRCT s23, const _prec *RSTRCT s33,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bi, const int bj, const int ei, const int ej) {
-  const float dhpz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec dhpz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, 0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dphz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec dphz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, 0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
   int dm_offset = 3;
   const int i = threadIdx.z + blockIdx.z * blockDim.z + bi;
@@ -744,20 +748,20 @@ __global__ void dtopo_vel_111(
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
-  float rho1 = 0.25 * (_rho(i, j, k - 1) + _rho(i, j - 1, k - 1)) +
+  _prec rho1 = 0.25 * (_rho(i, j, k - 1) + _rho(i, j - 1, k - 1)) +
                0.25 * (_rho(i, j, k) + _rho(i, j - 1, k));
-  float rho2 = 0.25 * (_rho(i, j, k - 1) + _rho(i - 1, j, k - 1)) +
+  _prec rho2 = 0.25 * (_rho(i, j, k - 1) + _rho(i - 1, j, k - 1)) +
                0.25 * (_rho(i, j, k) + _rho(i - 1, j, k));
-  float rho3 = 0.25 * (_rho(i, j, k) + _rho(i - 1, j, k)) +
+  _prec rho3 = 0.25 * (_rho(i, j, k) + _rho(i - 1, j, k)) +
                0.25 * (_rho(i, j - 1, k) + _rho(i - 1, j - 1, k));
 
-  float Ai1 = _f_1(i, j) * _g3_c(k) * rho1;
+  _prec Ai1 = _f_1(i, j) * _g3_c(k) * rho1;
   Ai1 = nu * 1.0 / Ai1;
-  float Ai2 = _f_2(i, j) * _g3_c(k) * rho2;
+  _prec Ai2 = _f_2(i, j) * _g3_c(k) * rho2;
   Ai2 = nu * 1.0 / Ai2;
-  float Ai3 = _f_c(i, j) * _g3(k) * rho3;
+  _prec Ai3 = _f_c(i, j) * _g3(k) * rho3;
   Ai3 = nu * 1.0 / Ai3;
-  float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
+  _prec f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(k);
   _u1(i, j, k) =
       (a * _u1(i, j, k) +
        Ai1 * (dhx4[2] * _f_c(i, j) * _g3_c(k) * _s11(i, j, k) +
@@ -1038,22 +1042,23 @@ __global__ void dtopo_vel_111(
 __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
 
     __global__ void dtopo_vel_112(
-        float* RSTRCT u1, float* RSTRCT u2, float* RSTRCT u3,
-        const float* RSTRCT dcrjx, const float* RSTRCT dcrjy,
-        const float* RSTRCT dcrjz, const float* RSTRCT f,
-        const float* RSTRCT f1_1, const float* RSTRCT f1_2,
-        const float* RSTRCT f1_c, const float* RSTRCT f2_1,
-        const float* RSTRCT f2_2, const float* RSTRCT f2_c,
-        const float* RSTRCT f_1, const float* RSTRCT f_2,
-        const float* RSTRCT f_c, const float* RSTRCT g,
-        const float* RSTRCT g3, const float* RSTRCT g3_c,
-        const float* RSTRCT g_c, const float* RSTRCT rho,
-        const float* RSTRCT s11, const float* RSTRCT s12,
-        const float* RSTRCT s13, const float* RSTRCT s22,
-        const float* RSTRCT s23, const float* RSTRCT s33,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec* RSTRCT u1, _prec* RSTRCT u2, _prec* RSTRCT u3,
+        const _prec* RSTRCT dcrjx, const _prec* RSTRCT dcrjy,
+        const _prec* RSTRCT dcrjz, const _prec* RSTRCT f,
+        const _prec* RSTRCT f1_1, const _prec* RSTRCT f1_2,
+        const _prec* RSTRCT f1_c, const _prec* RSTRCT f2_1,
+        const _prec* RSTRCT f2_2, const _prec* RSTRCT f2_c,
+        const _prec* RSTRCT f_1, const _prec* RSTRCT f_2,
+        const _prec* RSTRCT f_c, const _prec* RSTRCT g,
+        const _prec* RSTRCT g3, const _prec* RSTRCT g3_c,
+        const _prec* RSTRCT g_c, const _prec* RSTRCT rho,
+        const _prec* RSTRCT s11, const _prec* RSTRCT s12,
+        const _prec* RSTRCT s13, const _prec* RSTRCT s22,
+        const _prec* RSTRCT s23, const _prec* RSTRCT s33,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bi, const int bj, const int ei, const int ej) {
-        const float dhpz4r[6][9] = {
+#if APPLY_BC
+        const _prec dhpz4r[6][9] = {
             {-1.5373923010673118, -1.1059180740634813, -0.2134752473866528,
              -0.0352027995732726, -0.0075022330101095, -0.0027918394266035,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1072,15 +1077,15 @@ __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0020323834153791, 0.0002106933140862, -0.0013351454085978,
              -0.0938400881871787, 0.6816971139746001, -0.0002232904416222,
              -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-        const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhz4r[6][8] = {
+        const _prec dhz4r[6][8] = {
             {0.0000000000000000, -1.4511412472637157, -1.8534237417911470,
              0.3534237417911469, 0.0488587527362844, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000},
@@ -1099,15 +1104,15 @@ __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, -0.0416666666666667, 1.1250000000000000,
              -1.1250000000000000, 0.0416666666666667}};
-        const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dphz4r[6][9] = {
+        const _prec dphz4r[6][9] = {
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1126,7 +1131,7 @@ __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, -0.0040378273193044, 0.0064139372778371,
              -0.0890062133451850, 0.6749219241340761, 0.0002498459192428,
              -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-        const float dz4r[6][7] = {
+        const _prec dz4r[6][7] = {
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000},
@@ -1145,6 +1150,101 @@ __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
              0.0416666666666667}};
+
+#else
+  const _prec dhpz4r[6][9] = {
+      {1.4276800979942257, -0.2875185051606178, -2.0072491465276454,
+       0.8773816261307504, -0.0075022330101095, -0.0027918394266035,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
+      {0.8139439685257414, 0.1273679143938725, -1.1932750007455710,
+       0.1475120181828087, 0.1125814499297686, -0.0081303502866204,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
+      {0.1639182541610305, 0.3113839909089030, -0.0536007135209480,
+       -0.3910958927076030, -0.0401741813821989, 0.0095685425408165,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
+      {0.0171478318814576, -0.0916600077207278, 0.7187220404622645,
+       -0.1434031863528334, -0.5827389738506837, 0.0847863081664324,
+       -0.0028540125859095, 0.0000000000000000, 0.0000000000000000},
+      {-0.0579176640853654, 0.0022069616616207, 0.0108792602269819,
+       0.6803612607837533, -0.0530169938441240, -0.6736586580761996,
+       0.0937500000000000, -0.0026041666666667, 0.0000000000000000},
+      {0.0020323834153791, 0.0002106933140862, -0.0013351454085978,
+       -0.0938400881871787, 0.6816971139746001, -0.0002232904416222,
+       -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+                         0.5625000000000000, -0.0625000000000000};
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+                         0.5625000000000000, -0.0625000000000000};
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+                         1.1250000000000000, -0.0416666666666667};
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+                         1.1250000000000000, -0.0416666666666667};
+  const _prec dhz4r[6][8] = {
+      {0.0000000000000000, 1.4511412472637157, -1.8534237417911470,
+       0.3534237417911469, 0.0488587527362844, 0.0000000000000000,
+       0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, 0.8577143189081458, -0.5731429567244373,
+       -0.4268570432755628, 0.1422856810918542, 0.0000000000000000,
+       0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, 0.1674548505882877, 0.4976354482351368,
+       -0.4976354482351368, -0.1674548505882877, 0.0000000000000000,
+       0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, -0.1027061113405124, 0.2624541326469860,
+       0.8288742701021167, -1.0342864927831414, 0.0456642013745513,
+       0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
+       0.0416666666666667, 0.0000000000000000},
+      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000, -0.0416666666666667, 1.1250000000000000,
+       -1.1250000000000000, 0.0416666666666667}};
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
+                        0.5625000000000000, -0.0625000000000000};
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
+                        0.5625000000000000, -0.0625000000000000};
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
+                        1.1250000000000000, -0.0416666666666667};
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
+                        1.1250000000000000, -0.0416666666666667};
+  const _prec dphz4r[6][9] = {
+      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, 1.3764648947859957, -1.8523239861274132,
+       0.5524268681758197, -0.0537413571133823, -0.0228264197210198,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, 0.4428256655817484, -0.0574614517751294,
+       -0.2022259589759502, -0.1944663890497050, 0.0113281342190362,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, -0.3360140866060758, 1.2113298407847195,
+       -0.3111668377093505, -0.6714462506479002, 0.1111440843153523,
+       -0.0038467501367455, 0.0000000000000000, 0.0000000000000000},
+      {0.0000000000000000, 0.0338560531369653, -0.0409943223643902,
+       0.5284757132923059, 0.0115571196122084, -0.6162252315536446,
+       0.0857115441015996, -0.0023808762250444, 0.0000000000000000},
+      {0.0000000000000000, -0.0040378273193044, 0.0064139372778371,
+       -0.0890062133451850, 0.6749219241340761, 0.0002498459192428,
+       -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
+  const _prec dz4r[6][7] = {
+      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000},
+      {1.7779989465546748, -1.3337480247900155, -0.7775013168066564,
+       0.3332503950419969, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000},
+      {0.4410217341392059, 0.1730842484889890, -0.4487228323259926,
+       -0.1653831503022022, 0.0000000000000000, 0.0000000000000000,
+       0.0000000000000000},
+      {-0.1798793213882701, 0.2757257254150788, 0.9597948548284453,
+       -1.1171892610431817, 0.0615480021879277, 0.0000000000000000,
+       0.0000000000000000},
+      {-0.0153911381507088, -0.0568851455503591, 0.1998976464597171,
+       0.8628231468598346, -1.0285385292191949, 0.0380940196007109,
+       0.0000000000000000},
+      {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
+       -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
+       0.0416666666666667}};
+#endif
         const int i = threadIdx.z + blockIdx.z * blockDim.z + bi;
         if (i >= nx) return;
         if (i >= ei) return;
@@ -1232,20 +1332,20 @@ __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
            (2 * align + nz) * ((j) + ngsl + 2)]
 
         int kb = nz - k - 2;
-        float rho1 = 0.25 * (_rho(i, j, kb + 0) + _rho(i, j - 1, kb + 0)) +
+        _prec rho1 = 0.25 * (_rho(i, j, kb + 0) + _rho(i, j - 1, kb + 0)) +
                      0.25 * (_rho(i, j, kb + 1) + _rho(i, j - 1, kb + 1));
-        float rho2 = 0.25 * (_rho(i, j, kb + 0) + _rho(i - 1, j, kb + 0)) +
-                     0.25 * (_rho(i, j, kb + 1) + _rho(i - 1, j, kb + 1));
-        float rho3 = 0.25 * (_rho(i, j, kb + 1) + _rho(i - 1, j, kb + 1)) +
-                     0.25 * (_rho(i, j - 1, kb + 1) + _rho(i - 1, j - 1, kb + 1));
+        _prec rho2 = 0.25 * (_rho(i, j, kb + 0) + _rho(i + 1, j, kb + 0)) +
+                     0.25 * (_rho(i, j, kb + 1) + _rho(i + 1, j, kb + 1));
+        _prec rho3 = 0.25 * (_rho(i, j, kb + 1) + _rho(i + 1, j, kb + 1)) +
+                     0.25 * (_rho(i, j - 1, kb + 1) + _rho(i + 1, j - 1, kb + 1));
 
-        float Ai1 = _f_1(i, j) * _g3_c(nz - 1 - k) * rho1;
+        _prec Ai1 = _f_1(i, j) * _g3_c(nz - 1 - k) * rho1;
         Ai1 = nu * 1.0 / Ai1;
-        float Ai2 = _f_2(i, j) * _g3_c(nz - 1 - k) * rho2;
+        _prec Ai2 = _f_2(i, j) * _g3_c(nz - 1 - k) * rho2;
         Ai2 = nu * 1.0 / Ai2;
-        float Ai3 = _f_c(i, j) * _g3(nz - 1 - k) * rho3;
+        _prec Ai3 = _f_c(i, j) * _g3(nz - 1 - k) * rho3;
         Ai3 = nu * 1.0 / Ai3;
-        float f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(nz - 1 - k);
+        _prec f_dcrj = _dcrjx(i) * _dcrjy(j) * _dcrjz(nz - 1 - k);
         _u1(i, j, nz - 1 - k) =
             (a * _u1(i, j, nz - 1 - k) +
              Ai1 * (dhx4[2] * _f_c(i, j) * _g3_c(nz - 1 - k) *
@@ -1628,24 +1728,24 @@ __launch_bounds__(DTOPO_VEL_112_MAX_THREADS_PER_BLOCK)
 __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
 
     __global__ void dtopo_buf_vel_110(
-        float* RSTRCT buf_u1, float* RSTRCT buf_u2,
-        float* RSTRCT buf_u3, const float* RSTRCT dcrjx,
-        const float* RSTRCT dcrjy, const float* RSTRCT dcrjz,
-        const float* RSTRCT f, const float* RSTRCT f1_1,
-        const float* RSTRCT f1_2, const float* RSTRCT f1_c,
-        const float* RSTRCT f2_1, const float* RSTRCT f2_2,
-        const float* RSTRCT f2_c, const float* RSTRCT f_1,
-        const float* RSTRCT f_2, const float* RSTRCT f_c,
-        const float* RSTRCT g, const float* RSTRCT g3,
-        const float* RSTRCT g3_c, const float* RSTRCT g_c,
-        const float* RSTRCT rho, const float* RSTRCT s11,
-        const float* RSTRCT s12, const float* RSTRCT s13,
-        const float* RSTRCT s22, const float* RSTRCT s23,
-        const float* RSTRCT s33, const float* RSTRCT u1,
-        const float* RSTRCT u2, const float* RSTRCT u3,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec* RSTRCT buf_u1, _prec* RSTRCT buf_u2,
+        _prec* RSTRCT buf_u3, const _prec* RSTRCT dcrjx,
+        const _prec* RSTRCT dcrjy, const _prec* RSTRCT dcrjz,
+        const _prec* RSTRCT f, const _prec* RSTRCT f1_1,
+        const _prec* RSTRCT f1_2, const _prec* RSTRCT f1_c,
+        const _prec* RSTRCT f2_1, const _prec* RSTRCT f2_2,
+        const _prec* RSTRCT f2_c, const _prec* RSTRCT f_1,
+        const _prec* RSTRCT f_2, const _prec* RSTRCT f_c,
+        const _prec* RSTRCT g, const _prec* RSTRCT g3,
+        const _prec* RSTRCT g3_c, const _prec* RSTRCT g_c,
+        const _prec* RSTRCT rho, const _prec* RSTRCT s11,
+        const _prec* RSTRCT s12, const _prec* RSTRCT s13,
+        const _prec* RSTRCT s22, const _prec* RSTRCT s23,
+        const _prec* RSTRCT s33, const _prec* RSTRCT u1,
+        const _prec* RSTRCT u2, const _prec* RSTRCT u3,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bj, const int ej, const int rj0) {
-        const float phz2l[6][7] = {
+        const _prec phz2l[6][7] = {
             {1.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000},
@@ -1664,9 +1764,9 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.5000000000000000, 0.5000000000000000,
              0.0000000000000000}};
-        const float phy2[2] = {0.5000000000000000, 0.5000000000000000};
-        const float phx2[2] = {0.5000000000000000, 0.5000000000000000};
-        const float dhpz4l[6][9] = {
+        const _prec phy2[2] = {0.5000000000000000, 0.5000000000000000};
+        const _prec phx2[2] = {0.5000000000000000, 0.5000000000000000};
+        const _prec dhpz4l[6][9] = {
             {-1.4276800979942257, 0.2875185051606178, 2.0072491465276454,
              -0.8773816261307504, 0.0075022330101095, 0.0027918394266035,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1685,15 +1785,15 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
             {-0.0020323834153791, -0.0002106933140862, 0.0013351454085978,
              0.0938400881871787, -0.6816971139746001, 0.0002232904416222,
              0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-        const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhz4l[6][7] = {
+        const _prec dhz4l[6][7] = {
             {-1.4511412472637157, 1.8534237417911470, -0.3534237417911469,
              -0.0488587527362844, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000},
@@ -1712,15 +1812,15 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0416666666666667, -1.1250000000000000, 1.1250000000000000,
              -0.0416666666666667}};
-        const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dphz4l[6][9] = {
+        const _prec dphz4l[6][9] = {
             {-1.3764648947859957, 1.8523239861274134, -0.5524268681758195,
              0.0537413571133823, 0.0228264197210198, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -1739,7 +1839,7 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, -0.0026041666666667,
              0.0937500000000000, -0.6796875000000000, 0.0000000000000000,
              0.6796875000000000, -0.0937500000000000, 0.0026041666666667}};
-        const float dz4l[6][8] = {
+        const _prec dz4l[6][8] = {
             {-1.7779989465546748, 1.3337480247900155, 0.7775013168066564,
              -0.3332503950419969, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000},
@@ -1851,7 +1951,7 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
         u3[(k) + align +                                               \
            (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
            (2 * align + nz) * ((j) + ngsl + 2)]
-        float rho1 = phz2l[k][0] * (phy2[1] * _rho(i, j + rj0, 0) +
+        _prec rho1 = phz2l[k][0] * (phy2[1] * _rho(i, j + rj0, 0) +
                                     phy2[0] * _rho(i, j + rj0 - 1, 0)) +
                      phz2l[k][1] * (phy2[1] * _rho(i, j + rj0, 1) +
                                     phy2[0] * _rho(i, j + rj0 - 1, 1)) +
@@ -1865,7 +1965,7 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
                                     phy2[0] * _rho(i, j + rj0 - 1, 5)) +
                      phz2l[k][6] * (phy2[1] * _rho(i, j + rj0, 6) +
                                     phy2[0] * _rho(i, j + rj0 - 1, 6));
-        float rho2 = phz2l[k][0] * (phx2[1] * _rho(i, j + rj0, 0) +
+        _prec rho2 = phz2l[k][0] * (phx2[1] * _rho(i, j + rj0, 0) +
                                     phx2[0] * _rho(i - 1, j + rj0, 0)) +
                      phz2l[k][1] * (phx2[1] * _rho(i, j + rj0, 1) +
                                     phx2[0] * _rho(i - 1, j + rj0, 1)) +
@@ -1879,17 +1979,17 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
                                     phx2[0] * _rho(i - 1, j + rj0, 5)) +
                      phz2l[k][6] * (phx2[1] * _rho(i, j + rj0, 6) +
                                     phx2[0] * _rho(i - 1, j + rj0, 6));
-        float rho3 = phy2[1] * (phx2[1] * _rho(i, j + rj0, k) +
+        _prec rho3 = phy2[1] * (phx2[1] * _rho(i, j + rj0, k) +
                                 phx2[0] * _rho(i - 1, j + rj0, k)) +
                      phy2[0] * (phx2[1] * _rho(i, j + rj0 - 1, k) +
                                 phx2[0] * _rho(i - 1, j + rj0 - 1, k));
-        float Ai1 = _f_1(i, j + rj0) * _g3_c(k) * rho1;
+        _prec Ai1 = _f_1(i, j + rj0) * _g3_c(k) * rho1;
         Ai1 = nu * 1.0 / Ai1;
-        float Ai2 = _f_2(i, j + rj0) * _g3_c(k) * rho2;
+        _prec Ai2 = _f_2(i, j + rj0) * _g3_c(k) * rho2;
         Ai2 = nu * 1.0 / Ai2;
-        float Ai3 = _f_c(i, j + rj0) * _g3(k) * rho3;
+        _prec Ai3 = _f_c(i, j + rj0) * _g3(k) * rho3;
         Ai3 = nu * 1.0 / Ai3;
-        float f_dcrj = _dcrjx(i) * _dcrjy(j + rj0) * _dcrjz(k);
+        _prec f_dcrj = _dcrjx(i) * _dcrjy(j + rj0) * _dcrjz(k);
         _buf_u1(i, j, k) =
             (a * _u1(i, j + rj0, k) +
              Ai1 *
@@ -2275,50 +2375,50 @@ __launch_bounds__(DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK)
 __launch_bounds__(DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK)
 
     __global__ void dtopo_buf_vel_111(
-        float *RSTRCT buf_u1, float *RSTRCT buf_u2,
-        float *RSTRCT buf_u3, const float *RSTRCT dcrjx,
-        const float *RSTRCT dcrjy, const float *RSTRCT dcrjz,
-        const float *RSTRCT f, const float *RSTRCT f1_1,
-        const float *RSTRCT f1_2, const float *RSTRCT f1_c,
-        const float *RSTRCT f2_1, const float *RSTRCT f2_2,
-        const float *RSTRCT f2_c, const float *RSTRCT f_1,
-        const float *RSTRCT f_2, const float *RSTRCT f_c,
-        const float *RSTRCT g, const float *RSTRCT g3,
-        const float *RSTRCT g3_c, const float *RSTRCT g_c,
-        const float *RSTRCT rho, const float *RSTRCT s11,
-        const float *RSTRCT s12, const float *RSTRCT s13,
-        const float *RSTRCT s22, const float *RSTRCT s23,
-        const float *RSTRCT s33, const float *RSTRCT u1,
-        const float *RSTRCT u2, const float *RSTRCT u3,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec *RSTRCT buf_u1, _prec *RSTRCT buf_u2,
+        _prec *RSTRCT buf_u3, const _prec *RSTRCT dcrjx,
+        const _prec *RSTRCT dcrjy, const _prec *RSTRCT dcrjz,
+        const _prec *RSTRCT f, const _prec *RSTRCT f1_1,
+        const _prec *RSTRCT f1_2, const _prec *RSTRCT f1_c,
+        const _prec *RSTRCT f2_1, const _prec *RSTRCT f2_2,
+        const _prec *RSTRCT f2_c, const _prec *RSTRCT f_1,
+        const _prec *RSTRCT f_2, const _prec *RSTRCT f_c,
+        const _prec *RSTRCT g, const _prec *RSTRCT g3,
+        const _prec *RSTRCT g3_c, const _prec *RSTRCT g_c,
+        const _prec *RSTRCT rho, const _prec *RSTRCT s11,
+        const _prec *RSTRCT s12, const _prec *RSTRCT s13,
+        const _prec *RSTRCT s22, const _prec *RSTRCT s23,
+        const _prec *RSTRCT s33, const _prec *RSTRCT u1,
+        const _prec *RSTRCT u2, const _prec *RSTRCT u3,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bj, const int ej, const int rj0) {
-  const float dhpz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec dhpz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, 0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dphz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec dphz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, 0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
 
   int dm_offset = 3;
@@ -2417,19 +2517,20 @@ __launch_bounds__(DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK)
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
-  float rho1 = 0.25 * (_rho(i, j + rj0, k - 1) + _rho(i, j + rj0 - 1, k - 1)) +
+  _prec rho1 = 0.25 * (_rho(i, j + rj0, k - 1) + _rho(i, j + rj0 - 1, k - 1)) +
                0.25 * (_rho(i, j + rj0, k) + _rho(i, j + rj0 - 1, k));
-  float rho2 = 0.25 * (_rho(i, j + rj0, k - 1) + _rho(i - 1, j + rj0, k - 1)) +
-               0.25 * (_rho(i, j + rj0, k) + _rho(i - 1, j + rj0, k));
-  float rho3 = 0.25 * (_rho(i, j + rj0, k) + _rho(i - 1, j + rj0, k)) +
-               0.25 * (_rho(i, j + rj0 - 1, k) + _rho(i - 1, j + rj0 - 1, k));
-  float Ai1 = _f_1(i, j + rj0) * _g3_c(k) * rho1;
+  _prec rho2 = 0.25 * (_rho(i, j + rj0, k - 1) + _rho(i + 1, j + rj0, k - 1)) +
+               0.25 * (_rho(i, j + rj0, k) + _rho(i + 1, j + rj0, k));
+  _prec rho3 = 0.25 * (_rho(i, j + rj0, k) + _rho(i + 1, j + rj0, k)) +
+               0.25 * (_rho(i, j + rj0 - 1, k) + _rho(i + 1, j + rj0 - 1, k));
+
+  _prec Ai1 = _f_1(i, j + rj0) * _g3_c(k) * rho1;
   Ai1 = nu * 1.0 / Ai1;
-  float Ai2 = _f_2(i, j + rj0) * _g3_c(k) * rho2;
+  _prec Ai2 = _f_2(i, j + rj0) * _g3_c(k) * rho2;
   Ai2 = nu * 1.0 / Ai2;
-  float Ai3 = _f_c(i, j + rj0) * _g3(k) * rho3;
+  _prec Ai3 = _f_c(i, j + rj0) * _g3(k) * rho3;
   Ai3 = nu * 1.0 / Ai3;
-  float f_dcrj = _dcrjx(i) * _dcrjy(j + rj0) * _dcrjz(k);
+  _prec f_dcrj = _dcrjx(i) * _dcrjy(j + rj0) * _dcrjz(k);
   _buf_u1(i, j, k) =
       (a * _u1(i, j + rj0, k) +
        Ai1 *
@@ -2519,6 +2620,26 @@ __launch_bounds__(DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK)
                       phy4[1] * _s12(i, j + rj0 - 1, k + 3) +
                       phy4[3] * _s12(i, j + rj0 + 1, k + 3))))) *
       f_dcrj;
+
+ if (k <  OVERLAP_ZONE_INDEX) {  // overlap zone: overwrites the _buf_u1 value assigned just above
+  _buf_u1(i, j, k) =
+      (a * _u1(i, j + rj0, k) +
+       nu / rho1 *  // scaled by nu / rho1 only; no _f_* / _g3* factors appear in this branch
+           (dhx4[2] * _s11(i, j + rj0, k) +  // dhx4 stencil on s11, offsets i-2 .. i+1
+            dhx4[0] * _s11(i - 2, j + rj0, k) +
+            dhx4[1] * _s11(i - 1, j + rj0, k) +
+            dhx4[3] * _s11(i + 1, j + rj0, k) +
+            dhy4[2] * _s12(i, j + rj0, k) +  // dhy4 stencil on s12, offsets j-2 .. j+1
+            dhy4[0] * _s12(i, j + rj0 - 2, k) +
+            dhy4[1] * _s12(i, j + rj0 - 1, k) +
+            dhy4[3] * _s12(i, j + rj0 + 1, k) +
+            dhz4[2] * _s13(i, j + rj0, k) + // dhz4 stencil on s13, offsets k-2 .. k+1
+            dhz4[0] * _s13(i, j + rj0, k - 2) +  // NOTE(review): k's lower bound is not visible in this hunk - confirm k - 2 is in range
+            dhz4[1] * _s13(i, j + rj0, k - 1) +
+            dhz4[3] * _s13(i, j + rj0, k + 1)
+            )) * f_dcrj;  // same f_dcrj factor as the assignment this branch replaces
+  }
+
   _buf_u2(i, j, k) =
       (a * _u2(i, j + rj0, k) +
        Ai2 *
@@ -2604,6 +2725,26 @@ __launch_bounds__(DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK)
                                       py4[2] * _s22(i, j + rj0 + 1, k + 3) +
                                       py4[3] * _s22(i, j + rj0 + 2, k + 3))))) *
       f_dcrj;
+
+ if ( k  < OVERLAP_ZONE_INDEX) {  // overlap zone: overwrites the _buf_u2 value assigned just above
+  _buf_u2(i, j, k) =
+      (a * _u2(i, j + rj0, k) +
+       nu / rho2 *  // scaled by nu / rho2 only; no _f_* / _g3* factors appear in this branch
+           (dhz4[2] * _s23(i, j + rj0, k) + // dhz4 stencil on s23, offsets k-2 .. k+1
+            dhz4[0] * _s23(i, j + rj0, k - 2) +  // NOTE(review): k's lower bound is not visible in this hunk - confirm k - 2 is in range
+            dhz4[1] * _s23(i, j + rj0, k - 1) +
+            dhz4[3] * _s23(i, j + rj0, k + 1) +
+            dx4[1] * _s12(i, j + rj0, k) +  // dx4 stencil on s12, offsets i-1 .. i+2
+            dx4[0] * _s12(i - 1, j + rj0, k) +
+            dx4[2] * _s12(i + 1, j + rj0, k) +
+            dx4[3] * _s12(i + 2, j + rj0, k) +
+            dy4[1] * _s22(i, j + rj0, k) +  // dy4 stencil on s22, offsets j-1 .. j+2
+            dy4[0] * _s22(i, j + rj0 - 1, k) +
+            dy4[2] * _s22(i, j + rj0 + 1, k) +
+            dy4[3] * _s22(i, j + rj0 + 2, k))
+      ) * f_dcrj;  // same f_dcrj factor as the assignment this branch replaces
+ }
+
   _buf_u3(i, j, k) =
       (a * _u3(i, j + rj0, k) +
        Ai3 *
@@ -2690,6 +2831,26 @@ __launch_bounds__(DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK)
                       phy4[1] * _s23(i, j + rj0 - 1, k + 3) +
                       phy4[3] * _s23(i, j + rj0 + 1, k + 3))))) *
       f_dcrj;
+
+ if ( k  < OVERLAP_ZONE_INDEX) {  // overlap zone: overwrites the _buf_u3 value assigned just above
+  _buf_u3(i, j, k) =
+      (a * _u3(i, j + rj0, k) +
+       nu / rho3 *  // scaled by nu / rho3 only; no _f_* / _g3* factors appear in this branch
+           (dhy4[2] * _s23(i, j + rj0, k) +  // dhy4 stencil on s23, offsets j-2 .. j+1
+            dhy4[0] * _s23(i, j + rj0 - 2, k) +
+            dhy4[1] * _s23(i, j + rj0 - 1, k) +
+            dhy4[3] * _s23(i, j + rj0 + 1, k) +
+            dx4[1] * _s13(i, j + rj0, k) +  // dx4 stencil on s13, offsets i-1 .. i+2
+            dx4[0] * _s13(i - 1, j + rj0, k) +
+            dx4[2] * _s13(i + 1, j + rj0, k) +
+            dx4[3] * _s13(i + 2, j + rj0, k) +
+            dz4[1] * _s33(i, j + rj0, k) + // dz4 stencil on s33, offsets k-1 .. k+2
+            dz4[0] * _s33(i, j + rj0, k - 1) +  // NOTE(review): k's lower bound is not visible in this hunk - confirm k - 1 is in range
+            dz4[2] * _s33(i, j + rj0, k + 1) +
+            dz4[3] * _s33(i, j + rj0, k + 2)
+           ) ) * f_dcrj;  // same f_dcrj factor as the assignment this branch replaces
+ }
+
 #undef _buf_u1
 #undef _buf_u2
 #undef _buf_u3
@@ -2725,24 +2886,24 @@ __launch_bounds__(DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK)
 __launch_bounds__(DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK)
 
     __global__ void dtopo_buf_vel_112(
-        float* RSTRCT buf_u1, float* RSTRCT buf_u2,
-        float* RSTRCT buf_u3, const float* RSTRCT dcrjx,
-        const float* RSTRCT dcrjy, const float* RSTRCT dcrjz,
-        const float* RSTRCT f, const float* RSTRCT f1_1,
-        const float* RSTRCT f1_2, const float* RSTRCT f1_c,
-        const float* RSTRCT f2_1, const float* RSTRCT f2_2,
-        const float* RSTRCT f2_c, const float* RSTRCT f_1,
-        const float* RSTRCT f_2, const float* RSTRCT f_c,
-        const float* RSTRCT g, const float* RSTRCT g3,
-        const float* RSTRCT g3_c, const float* RSTRCT g_c,
-        const float* RSTRCT rho, const float* RSTRCT s11,
-        const float* RSTRCT s12, const float* RSTRCT s13,
-        const float* RSTRCT s22, const float* RSTRCT s23,
-        const float* RSTRCT s33, const float* RSTRCT u1,
-        const float* RSTRCT u2, const float* RSTRCT u3,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec* RSTRCT buf_u1, _prec* RSTRCT buf_u2,
+        _prec* RSTRCT buf_u3, const _prec* RSTRCT dcrjx,
+        const _prec* RSTRCT dcrjy, const _prec* RSTRCT dcrjz,
+        const _prec* RSTRCT f, const _prec* RSTRCT f1_1,
+        const _prec* RSTRCT f1_2, const _prec* RSTRCT f1_c,
+        const _prec* RSTRCT f2_1, const _prec* RSTRCT f2_2,
+        const _prec* RSTRCT f2_c, const _prec* RSTRCT f_1,
+        const _prec* RSTRCT f_2, const _prec* RSTRCT f_c,
+        const _prec* RSTRCT g, const _prec* RSTRCT g3,
+        const _prec* RSTRCT g3_c, const _prec* RSTRCT g_c,
+        const _prec* RSTRCT rho, const _prec* RSTRCT s11,
+        const _prec* RSTRCT s12, const _prec* RSTRCT s13,
+        const _prec* RSTRCT s22, const _prec* RSTRCT s23,
+        const _prec* RSTRCT s33, const _prec* RSTRCT u1,
+        const _prec* RSTRCT u2, const _prec* RSTRCT u3,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bj, const int ej, const int rj0) {
-        const float dhpz4r[6][9] = {
+        const _prec dhpz4r[6][9] = {
             {-1.5373923010673118, -1.1059180740634813, -0.2134752473866528,
              -0.0352027995732726, -0.0075022330101095, -0.0027918394266035,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -2761,15 +2922,15 @@ __launch_bounds__(DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0020323834153791, 0.0002106933140862, -0.0013351454085978,
              -0.0938400881871787, 0.6816971139746001, -0.0002232904416222,
              -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-        const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                                0.5625000000000000, -0.0625000000000000};
-        const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                                1.1250000000000000, -0.0416666666666667};
-        const float dhz4r[6][8] = {
+        const _prec dhz4r[6][8] = {
             {0.0000000000000000, -1.4511412472637157, -1.8534237417911470,
              0.3534237417911469, 0.0488587527362844, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000},
@@ -2788,15 +2949,15 @@ __launch_bounds__(DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, -0.0416666666666667, 1.1250000000000000,
              -1.1250000000000000, 0.0416666666666667}};
-        const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+        const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                               0.5625000000000000, -0.0625000000000000};
-        const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+        const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                               1.1250000000000000, -0.0416666666666667};
-        const float dphz4r[6][9] = {
+        const _prec dphz4r[6][9] = {
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000},
@@ -2815,7 +2976,7 @@ __launch_bounds__(DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK)
             {0.0000000000000000, -0.0040378273193044, 0.0064139372778371,
              -0.0890062133451850, 0.6749219241340761, 0.0002498459192428,
              -0.6796875000000000, 0.0937500000000000, -0.0026041666666667}};
-        const float dz4r[6][7] = {
+        const _prec dz4r[6][7] = {
             {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
              0.0000000000000000},
@@ -2928,23 +3089,23 @@ __launch_bounds__(DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK)
            (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
            (2 * align + nz) * ((j) + ngsl + 2)]
         int kb = nz - k - 2;
-        float rho1 =
+        _prec rho1 =
             0.25 * (_rho(i, j + rj0, kb + 0) + _rho(i, j + rj0 - 1, kb + 0)) +
             0.25 * (_rho(i, j + rj0, kb + 1) + _rho(i, j + rj0 - 1, kb + 1));
-        float rho2 =
-            0.25 * (_rho(i, j + rj0, kb + 0) + _rho(i - 1, j + rj0, kb + 0)) +
-            0.25 * (_rho(i, j + rj0, kb + 1) + _rho(i - 1, j + rj0, kb + 1));
-        float rho3 =
-            0.25 * (_rho(i, j + rj0, kb + 1) + _rho(i - 1, j + rj0, kb + 1)) +
+        _prec rho2 =
+            0.25 * (_rho(i, j + rj0, kb + 0) + _rho(i + 1, j + rj0, kb + 0)) +
+            0.25 * (_rho(i, j + rj0, kb + 1) + _rho(i + 1, j + rj0, kb + 1));
+        _prec rho3 =
+            0.25 * (_rho(i, j + rj0, kb + 1) + _rho(i + 1, j + rj0, kb + 1)) +
             0.25 * (_rho(i, j + rj0 - 1, kb + 1) +
-                    _rho(i - 1, j + rj0 - 1, kb + 1));
-        float Ai1 = _f_1(i, j + rj0) * _g3_c(nz - 1 - k) * rho1;
+                    _rho(i + 1, j + rj0 - 1, kb + 1));
+        _prec Ai1 = _f_1(i, j + rj0) * _g3_c(nz - 1 - k) * rho1;
         Ai1 = nu * 1.0 / Ai1;
-        float Ai2 = _f_2(i, j + rj0) * _g3_c(nz - 1 - k) * rho2;
+        _prec Ai2 = _f_2(i, j + rj0) * _g3_c(nz - 1 - k) * rho2;
         Ai2 = nu * 1.0 / Ai2;
-        float Ai3 = _f_c(i, j + rj0) * _g3(nz - 1 - k) * rho3;
+        _prec Ai3 = _f_c(i, j + rj0) * _g3(nz - 1 - k) * rho3;
         Ai3 = nu * 1.0 / Ai3;
-        float f_dcrj = _dcrjx(i) * _dcrjy(j + rj0) * _dcrjz(nz - 1 - k);
+        _prec f_dcrj = _dcrjx(i) * _dcrjy(j + rj0) * _dcrjz(nz - 1 - k);
         _buf_u1(i, j, nz - 1 - k) =
             (a * _u1(i, j + rj0, nz - 1 - k) +
              Ai1 * (dhx4[2] * _f_c(i, j + rj0) * _g3_c(nz - 1 - k) *
@@ -3333,3 +3494,5 @@ __launch_bounds__(DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK)
 #undef _u3
 }
 
+#undef OVERLAP_ZONE_INDEX
+
diff --git a/src/topography/kernels/velocity_unroll.cu b/src/topography/kernels/velocity_unroll.cu
index 84156de..59714aa 100644
--- a/src/topography/kernels/velocity_unroll.cu
+++ b/src/topography/kernels/velocity_unroll.cu
@@ -5,49 +5,50 @@ __launch_bounds__ (256)
 #else
 __launch_bounds__ (128)
 #endif
+#define OVERLAP_ZONE_INDEX 8
 __global__ void dtopo_vel_111_unroll(
-        float *RSTRCT u1, float *RSTRCT u2, float *RSTRCT u3,
-        const float *RSTRCT dcrjx, const float *RSTRCT dcrjy,
-        const float *RSTRCT dcrjz, const float *RSTRCT f,
-        const float *RSTRCT f1_1, const float *RSTRCT f1_2,
-        const float *RSTRCT f1_c, const float *RSTRCT f2_1,
-        const float *RSTRCT f2_2, const float *RSTRCT f2_c,
-        const float *RSTRCT f_1, const float *RSTRCT f_2,
-        const float *RSTRCT f_c, const float *RSTRCT g,
-        const float *RSTRCT g3, const float *RSTRCT g3_c,
-        const float *RSTRCT g_c, const float *RSTRCT rho,
-        const float *RSTRCT s11, const float *RSTRCT s12,
-        const float *RSTRCT s13, const float *RSTRCT s22,
-        const float *RSTRCT s23, const float *RSTRCT s33,
-        const float a, const float nu, const int nx, const int ny, const int nz,
+        _prec *RSTRCT u1, _prec *RSTRCT u2, _prec *RSTRCT u3,
+        const _prec *RSTRCT dcrjx, const _prec *RSTRCT dcrjy,
+        const _prec *RSTRCT dcrjz, const _prec *RSTRCT f,
+        const _prec *RSTRCT f1_1, const _prec *RSTRCT f1_2,
+        const _prec *RSTRCT f1_c, const _prec *RSTRCT f2_1,
+        const _prec *RSTRCT f2_2, const _prec *RSTRCT f2_c,
+        const _prec *RSTRCT f_1, const _prec *RSTRCT f_2,
+        const _prec *RSTRCT f_c, const _prec *RSTRCT g,
+        const _prec *RSTRCT g3, const _prec *RSTRCT g3_c,
+        const _prec *RSTRCT g_c, const _prec *RSTRCT rho,
+        const _prec *RSTRCT s11, const _prec *RSTRCT s12,
+        const _prec *RSTRCT s13, const _prec *RSTRCT s22,
+        const _prec *RSTRCT s23, const _prec *RSTRCT s33,
+        const _prec a, const _prec nu, const int nx, const int ny, const int nz,
         const int bi, const int bj, const int ei, const int ej) {
-  const float dhpz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec dhpz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, 0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float phx4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phx4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float phy4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec phy4[4] = {-0.0625000000000000, 0.5625000000000000,
                          0.5625000000000000, -0.0625000000000000};
-  const float dhy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhy4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhx4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float dhz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dhz4[4] = {0.0416666666666667, -1.1250000000000000,
                          1.1250000000000000, -0.0416666666666667};
-  const float px4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec px4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float py4[4] = {-0.0625000000000000, 0.5625000000000000,
+  const _prec py4[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
-  const float dx4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dx4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dy4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dy4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
-  const float dphz4[7] = {-0.0026041666666667, 0.0937500000000000,
+  const _prec dphz4[7] = {-0.0026041666666667, 0.0937500000000000,
                           -0.6796875000000000, 0.0000000000000000,
                           0.6796875000000000,  -0.0937500000000000,
                           0.0026041666666667};
-  const float dz4[4] = {0.0416666666666667, -1.1250000000000000,
+  const _prec dz4[4] = {0.0416666666666667, -1.1250000000000000,
                         1.1250000000000000, -0.0416666666666667};
   int dm_offset = 3;
   const int i = threadIdx.z + blockIdx.z * blockDim.z + bi;
@@ -138,30 +139,30 @@ __global__ void dtopo_vel_111_unroll(
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
 
-  float v1[nq][nr];
-  float v2[nq][nr];
-  float v3[nq][nr];
+  _prec v1[nq][nr];
+  _prec v2[nq][nr];
+  _prec v3[nq][nr];
 #pragma unroll
         for (int q = 0; q < nq; ++q) {
 #pragma unroll
         for (int r = 0; r < nr; ++r) {
 
 
-  float c = 0.25f;
-  float rho1 = c * (_rho(i, j + q, k + r - 1) + _rho(i, j + q - 1, k + r - 1)) +
+  _prec c = 0.25f;
+  _prec rho1 = c * (_rho(i, j + q, k + r - 1) + _rho(i, j + q - 1, k + r - 1)) +
                c * (_rho(i, j + q, k + r) + _rho(i, j + q - 1, k + r));
-  float rho2 = c * (_rho(i, j + q, k + r - 1) + _rho(i - 1, j + q, k + r - 1)) +
-               c * (_rho(i, j + q, k + r) + _rho(i - 1, j + q, k + r));
-  float rho3 = c * (_rho(i, j + q, k + r) + _rho(i - 1, j + q, k + r)) +
-               c * (_rho(i, j + q - 1, k + r) + _rho(i - 1, j + q - 1, k + r));
+  _prec rho2 = c * (_rho(i, j + q, k + r - 1) + _rho(i + 1, j + q, k + r - 1)) +
+               c * (_rho(i, j + q, k + r) + _rho(i + 1, j + q, k + r));
+  _prec rho3 = c * (_rho(i, j + q, k + r) + _rho(i + 1, j + q, k + r)) +
+               c * (_rho(i, j + q - 1, k + r) + _rho(i + 1, j + q - 1, k + r));
 
-  float Ai1 = _f_1(i, j + q) * _g3_c(k + r) * rho1;
+  _prec Ai1 = _f_1(i, j + q) * _g3_c(k + r) * rho1;
   Ai1 = nu * 1.0 / Ai1;
-  float Ai2 = _f_2(i, j + q) * _g3_c(k + r) * rho2;
+  _prec Ai2 = _f_2(i, j + q) * _g3_c(k + r) * rho2;
   Ai2 = nu * 1.0 / Ai2;
-  float Ai3 = _f_c(i, j + q) * _g3(k + r) * rho3;
+  _prec Ai3 = _f_c(i, j + q) * _g3(k + r) * rho3;
   Ai3 = nu * 1.0 / Ai3;
-  float f_dcrj = _dcrjx(i) * _dcrjy(j + q) * _dcrjz(k + r);
+  _prec f_dcrj = _dcrjx(i) * _dcrjy(j + q) * _dcrjz(k + r);
   v1[q][r] =
       (a * _u1(i, j + q, k + r) +
        Ai1 * (dhx4[2] * _f_c(i, j + q) * _g3_c(k + r) * _s11(i, j + q, k + r) +
@@ -245,6 +246,25 @@ __global__ void dtopo_vel_111_unroll(
                         phy4[1] * _s12(i, j + q - 1, k + r + 3) +
                         phy4[3] * _s12(i, j + q + 1, k + r + 3))))) *
       f_dcrj;
+  if (k + r <  OVERLAP_ZONE_INDEX) {
+      v1[q][r] = (a * _u1(i, j + q, k + r) +
+                  nu / rho1 *
+                      (dhx4[2] * _s11(i, j + q, k + r) +
+                       dhx4[0] * _s11(i - 2, j + q, k + r) +
+                       dhx4[1] * _s11(i - 1, j + q, k + r) +
+                       dhx4[3] * _s11(i + 1, j + q, k + r) +
+                       dhy4[2] * _s12(i, j + q, k + r) +
+                       dhy4[0] * _s12(i, j + q - 2, k + r) +
+                       dhy4[1] * _s12(i, j + q - 1, k + r) +
+                       dhy4[3] * _s12(i, j + q + 1, k + r) +
+                       dhz4[2] * _s13(i, j + q, k + r) +
+                       dhz4[0] * _s13(i, j + q, k + r - 2) +
+                       dhz4[1] * _s13(i, j + q, k + r - 1) +
+                       dhz4[3] * _s13(i, j + q, k + r + 1))) *
+                 f_dcrj;
+  }
+
+
   v2[q][r] =
       (a * _u2(i, j + q, k + r) +
        Ai2 *
@@ -327,6 +347,23 @@ __global__ void dtopo_vel_111_unroll(
                       py4[2] * _s22(i, j + q + 1, k + r + 3) +
                       py4[3] * _s22(i, j + q + 2, k + r + 3))))) *
       f_dcrj;
+  if (k + r < OVERLAP_ZONE_INDEX) {
+        v2[q][r] =
+      (a * _u2(i, j + q, k + r) +
+       nu / rho2 *
+           (dhz4[2] * _s23(i, j + q, k + r) + dhz4[0] * _s23(i, j + q, k + r - 2) +
+            dhz4[1] * _s23(i, j + q, k + r - 1) + dhz4[3] * _s23(i, j + q, k + r + 1) +
+            dx4[1] * _s12(i, j + q, k + r) +
+            dx4[0] *  _s12(i - 1, j + q, k + r) +
+            dx4[2] *  _s12(i + 1, j + q, k + r) +
+            dx4[3] *  _s12(i + 2, j + q, k + r) +
+            dy4[1] * _s22(i, j + q, k + r) +
+            dy4[0] * _s22(i, j + q - 1, k + r) +
+            dy4[2] * _s22(i, j + q + 1, k + r) +
+            dy4[3] * _s22(i, j + q + 2, k + r)
+           )) * f_dcrj;
+
+  }
   v3[q][r] =
       (a * _u3(i, j + q, k + r) +
        Ai3 *
@@ -410,6 +447,24 @@ __global__ void dtopo_vel_111_unroll(
                       phy4[1] * _s23(i, j + q - 1, k + r + 3) +
                       phy4[3] * _s23(i, j + q + 1, k + r + 3))))) *
       f_dcrj;
+
+  if (k + r < OVERLAP_ZONE_INDEX) {
+      v3[q][r] = (a * _u3(i, j + q, k + r) +
+                  nu / rho3 *
+                      (dhy4[2] * _g3(k + r) * _s23(i, j + q, k + r) +
+                       dhy4[0] * _s23(i, j + q - 2, k + r) +
+                       dhy4[1] * _s23(i, j + q - 1, k + r) +
+                       dhy4[3] * _s23(i, j + q + 1, k + r) +
+                       dx4[1] * _s13(i, j + q, k + r) +
+                       dx4[0] * _s13(i - 1, j + q, k + r) +
+                       dx4[2] * _s13(i + 1, j + q, k + r) +
+                       dx4[3] * _s13(i + 2, j + q, k + r) +
+                       dz4[1] * _s33(i, j + q, k + r) +
+                       dz4[0] * _s33(i, j + q, k + r - 1) +
+                       dz4[2] * _s33(i, j + q, k + r + 1) +
+                       dz4[3] * _s33(i, j + q, k + r + 2))) *
+                 f_dcrj;
+  }
         }
         }
 
@@ -457,5 +512,5 @@ __global__ void dtopo_vel_111_unroll(
 
 
 #undef RSTRCT
-
+#undef OVERLAP_ZONE_INDEX
 
diff --git a/src/topography/mapping.c b/src/topography/mapping.c
new file mode 100644
index 0000000..58cfa3a
--- /dev/null
+++ b/src/topography/mapping.c
@@ -0,0 +1,200 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+#include <topography/mapping.h>
+
+const int VERBOSE = 0;
+#define EPSILON 1e-4
+
+int map_error = 0;
+
+void hermite_cubic_basis(double b[4], const double t);
+void hermite_cubic_basis_derivative(double db[4], const double t);
+void adjust(double *m0, double *m1, const double s);
+void grid_stretch(struct mapping *map);
+
+void hermite_cubic_basis(double b[4], const double t) { /* Fill b with the four cubic Hermite basis values at t. */
+    b[0] = (1.0 + 2.0 * t) * (1.0 - t) * (1.0 - t); /* multiplies the left value (z[c] in map_eval) */
+    b[1] = t * (1.0 - t) * (1.0 - t); /* multiplies the left slope (m[c] in map_eval) */
+    b[2] = t * t * (3.0 - 2.0 * t); /* multiplies the right value (z[c+1] in map_eval) */
+    b[3] = t * t * (t - 1.0); /* multiplies the right slope (m[c+1] in map_eval) */
+}
+
+void hermite_cubic_basis_derivative(double db[4], const double t) { /* Fill db with d/dt of the four basis functions of hermite_cubic_basis. */
+   db[0] = 6.0 * t * t - 6.0 * t; /* derivative of (1 + 2t)(1 - t)^2 */
+   db[1] = 3.0 * t * t - 4.0 * t + 1.0; /* derivative of t(1 - t)^2 */
+   db[2] = -6.0 * t * t + 6.0 * t; /* derivative of t^2(3 - 2t) */
+   db[3] = 3.0 * t * t - 2.0 * t; /* derivative of t^2(t - 1) */
+}
+
+
+void adjust(double *m0, double *m1, const double s) {
+    /* Limit the slope pair (*m0, *m1) relative to the interval slope s; a negative ratio flags MAP_NON_MONOTONIC. */
+    const double alpha = *m0 / s;
+    const double beta = *m1 / s;
+    if (alpha < 0 || beta < 0) {
+        fprintf(stderr, "%s:%s():%d Non-monotonic mapping function data!\n",
+                __FILE__, __func__, __LINE__);
+        map_error = MAP_NON_MONOTONIC;
+    }
+    const double norm2 = alpha * alpha + beta * beta;
+    if (norm2 > 9) {
+        const double scale = 3.0 / sqrt(norm2); /* pull (alpha, beta) back onto the circle of radius 3 */
+        *m0 = scale * alpha * s;
+        *m1 = scale * beta * s;
+    }
+}
+
+void grid_stretch(struct mapping *map) {
+    /*
+     * Compute the four node slopes map->m[0..3] from the slopes of the three
+     * intervals, limiting each neighbouring pair with adjust().
+     */
+    const double bottom = map->dzb, top = map->dzt, width = map->h;
+    const double s_low = bottom / width;
+    const double s_mid = (1.0 - bottom - top) / (1.0 - 2.0 * width);
+    const double s_top = top / width;
+
+    /* End nodes take the adjacent interval slope; interior nodes take the mean. */
+    double slope[4] = {s_low, 0.5 * (s_low + s_mid), 0.5 * (s_mid + s_top), s_top};
+
+    /* adjust() may rescale a pair in place, so the intervals are processed in order. */
+    adjust(&slope[0], &slope[1], s_low);
+    adjust(&slope[1], &slope[2], s_mid);
+    adjust(&slope[2], &slope[3], s_top);
+
+    map->m[0] = slope[0];
+    map->m[1] = slope[1];
+    map->m[2] = slope[2];
+    map->m[3] = slope[3];
+}
+
+double map_height(const int nz, const double dz) { /* Returns dz * (nz - 2 - MAPPING_START_POINT). NOTE(review): presumably the height of the mapped region; confirm. */
+        return dz * (nz - 2 - MAPPING_START_POINT);
+}
+
+
+struct mapping map_init(const double dzb, const double dzt, const double h) {
+    /*
+     * Build the three-interval mapping on [0, 1]: breakpoints r = {0, h, 1 - h, 1}
+     * are paired with z = {0, dzb, 1 - dzt, 1}.
+     * Members not listed below are zero-initialised until grid_stretch() sets map.m.
+     */
+    struct mapping map = {
+        .dzb = dzb,
+        .dzt = dzt,
+        .h = h,
+        .r = {0.0, h, 1.0 - h, 1.0},
+        .z = {0.0, dzb, 1.0 - dzt, 1.0},
+    };
+
+    /* Fill in map.m, the slopes at the four breakpoints. */
+    grid_stretch(&map);
+
+    return map;
+}
+
+int map_find_cell_r(const double r, const struct mapping *map) {
+    /* Cell index (0, 1 or 2) of r among the r-breakpoints, or -1 if r is outside [0, 1] by more than EPSILON. */
+    if (r < -EPSILON) {
+        fprintf(stderr, "%s:%s():%d Outside interval (r = %f, r < 0)!\n",
+                __FILE__, __func__, __LINE__, r);
+        map_error = MAP_OUTSIDE;
+    }
+    else if (r <= map->h) return 0;
+    else if (r <= 1.0 - map->h) return 1;
+    else if (r <= 1.0 + EPSILON) return 2; /* same tolerance as the lower bound: (1, 1 + EPSILON] stays in the last cell */
+    if (r > 1.0 + EPSILON) {
+        fprintf(stderr, "%s:%s():%d Outside interval (r = %f, r > 1)!\n",
+                __FILE__, __func__, __LINE__, r);
+        map_error = MAP_OUTSIDE;
+    }
+    return -1; /* reached only for out-of-range or NaN r */
+}
+
+int map_find_cell_z(const double z, const struct mapping *map) {
+    /* Cell index (0, 1 or 2) of z among the z-breakpoints, or -1 if z is outside [0, 1] by more than EPSILON. */
+    if (z < -EPSILON) {
+        fprintf(stderr, "%s:%s():%d Outside interval (z = %f, z < 0)!\n",
+                __FILE__, __func__, __LINE__, z);
+        map_error = MAP_OUTSIDE;
+    }
+    else if (z <= map->dzb) return 0;
+    else if (z <= 1.0 - map->dzt) return 1; /* integer literal: the function returns int */
+    else if (z <= 1.0 + EPSILON) return 2; /* same tolerance as the lower bound: (1, 1 + EPSILON] stays in the last cell */
+    if (z > 1.0 + EPSILON) {
+        fprintf(stderr, "%s:%s():%d Outside interval (z = %f, z > 1)!\n",
+                __FILE__, __func__, __LINE__, z);
+        map_error = MAP_OUTSIDE;
+    }
+    return -1; /* reached only for out-of-range or NaN z */
+}
+
+double map_eval(const double r, const struct mapping *map) { /* Evaluate the piecewise cubic Hermite mapping at r. */
+    int c = map_find_cell_r(r, map); /* NOTE(review): c is -1 when r is rejected; the indexing below is then out of bounds */
+    double b[4];
+    double dr = map->r[c+1] - map->r[c]; /* width of cell c in r */
+    hermite_cubic_basis(b, (r - map->r[c]) / dr); /* basis at the cell-local coordinate */
+    return b[0] * map->z[c] + dr * map->m[c] * b[1] + b[2] * map->z[c+1] + dr * b[3] * map->m[c+1]; /* slopes are scaled by dr */
+}
+
+double map_eval_derivative(const double r, const struct mapping *map) { /* Evaluate the derivative of the mapping with respect to r. */
+    int c = map_find_cell_r(r, map); /* NOTE(review): c is -1 when r is rejected; the indexing below is then out of bounds */
+    double b[4]; /* holds basis derivatives with respect to the cell-local coordinate */
+    double dr = map->r[c+1] - map->r[c]; /* width of cell c in r */
+    hermite_cubic_basis_derivative(b, (r - map->r[c]) / dr);
+    double d =  b[0] * map->z[c] + dr * map->m[c] * b[1] + b[2] * map->z[c+1] + dr * b[3] * map->m[c+1];
+    return d / dr; /* chain rule: the local coordinate is (r - r[c]) / dr */
+}
+
+double map_invert(const double z, const struct mapping *map, const double eps, const int maxiter) {
+    /* Solve map_eval(r) = z for r by Newton iteration, clamping r to [0, 1]; returns the last iterate. */
+    double rk = z; /* initial guess: r = z */
+    double fk = map_eval(rk, map);
+    double dfk = map_eval_derivative(z, map); /* rk == z at this point */
+    double h = map->h;
+
+    int k = 0;
+    double rl = rk; /* previous iterate */
+    while ( (fabs(z - fk) > eps * h || fabs(rk - rl) > eps * h) && k < maxiter) { /* stop when residual and step are both within eps * h */
+        if (VERBOSE)
+        printf("k = %d rk = %f fk = %f dfk = %f \n", k, rk, fk, dfk);
+        rl = rk;
+        rk = rk - (fk - z) / dfk; /* Newton step */
+        rk = rk < 0 ? 0 : rk; /* keep the iterate inside [0, 1] */
+        rk = rk > 1 ? 1 : rk;
+        fk = map_eval(rk, map);
+        dfk = map_eval_derivative(rk, map);
+        k = k + 1;
+    }
+    if (VERBOSE)
+        printf("\n");
+    if (k >= maxiter) /* also true when the tolerance was met exactly on the last allowed iteration */
+        printf(
+            "WARNING: Mapping inversion failed to converge. Either increase "
+            "the number of maximum iterations or decrease the tolerance. r = %g, |z - f(r)| = %g \n",
+            rk, fabs(z - fk));
+
+    return rk;
+}
+
+const char* map_error_string(const enum map_err_codes err_code) {
+    /* Translate a mapping error code into a static message string. */
+    /* NOTE(review): only the MAP_SUCCESS text has no trailing newline. */
+    switch (err_code) {
+        case MAP_SUCCESS:
+            return "All mapping operations completed successfully";
+        case MAP_NON_MONOTONIC:
+            return "The mapping function is non-monotonic\n";
+        case MAP_OUTSIDE:
+            return "The query point is outside the domain of definition of the mapping function\n";
+        default:
+            break;
+    }
+    return "Unknown error code\n";
+}
+
+enum map_err_codes map_get_last_error(void) { /* Returns the value currently stored in the file-level map_error. */
+    return map_error; /* nothing in this file resets it after a failure is recorded */
+}
diff --git a/src/topography/metrics/CMakeLists.txt b/src/topography/metrics/CMakeLists.txt
index f5d431e..6207433 100644
--- a/src/topography/metrics/CMakeLists.txt
+++ b/src/topography/metrics/CMakeLists.txt
@@ -1,15 +1,16 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h     
-    ${AWP_MINI_SOURCE_DIR}/include/functions/functions.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/metrics/metrics.h
-    ${AWP_MINI_SOURCE_DIR}/include/topography/metrics/kernel.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h     
+    ${AWP_SOURCE_DIR}/include/functions/functions.h     
+    ${AWP_SOURCE_DIR}/include/topography/metrics/metrics.h
+    ${AWP_SOURCE_DIR}/include/topography/metrics/kernel.h
+    ${AWP_SOURCE_DIR}/include/topography/metrics/shift.h
     )
 
-add_library(metrics metrics.c kernel.c)
+add_library(metrics metrics.c kernel.c shift.c)
 
 target_include_directories(metrics
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
        )
 target_link_libraries(metrics interpolation functions)
 
diff --git a/src/topography/metrics/kernel.c b/src/topography/metrics/kernel.c
index a01e307..02325eb 100644
--- a/src/topography/metrics/kernel.c
+++ b/src/topography/metrics/kernel.c
@@ -1,14 +1,18 @@
 #include <topography/metrics/kernel.h>
+#
+// This parameter pads the compute region. It is needed for the computation of
+// derivative and interpolation stencils. Do not change its value.
+#define padding 8
 
 void metrics_f_interp_1_111(float *df1, const float *f, const int nx, const int ny, const int nz)
 {
      const float phy[4] = {-0.0625000000000000, 0.5625000000000000, 0.5625000000000000, -0.0625000000000000};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = phy[0]*_f(-ngsl + i + 2,-ngsl + j) + phy[1]*_f(-ngsl + i + 2,-ngsl + j + 1) + phy[2]*_f(-ngsl + i + 2,-ngsl + j + 2) + phy[3]*_f(-ngsl + i + 2,-ngsl + j + 3);
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = phy[0]*_f(-padding + i + 2,-padding + j) + phy[1]*_f(-padding + i + 2,-padding + j + 1) + phy[2]*_f(-padding + i + 2,-padding + j + 2) + phy[3]*_f(-padding + i + 2,-padding + j + 3);
                   #undef _f
                   #undef _df1
                   
@@ -22,11 +26,11 @@ void metrics_f_interp_2_111(float *df1, const float *f, const int nx, const int
 {
      const float px[4] = {-0.0625000000000000, 0.5625000000000000, 0.5625000000000000, -0.0625000000000000};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = px[0]*_f(-ngsl + i + 1,-ngsl + j + 2) + px[1]*_f(-ngsl + i + 2,-ngsl + j + 2) + px[2]*_f(-ngsl + i + 3,-ngsl + j + 2) + px[3]*_f(-ngsl + i + 4,-ngsl + j + 2);
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = px[0]*_f(-padding + i + 1,-padding + j + 2) + px[1]*_f(-padding + i + 2,-padding + j + 2) + px[2]*_f(-padding + i + 3,-padding + j + 2) + px[3]*_f(-padding + i + 4,-padding + j + 2);
                   #undef _f
                   #undef _df1
                   
@@ -41,11 +45,11 @@ void metrics_f_interp_c_111(float *df1, const float *f, const int nx, const int
      const float phy[4] = {-0.0625000000000000, 0.5625000000000000, 0.5625000000000000, -0.0625000000000000};
      const float px[4] = {-0.0625000000000000, 0.5625000000000000, 0.5625000000000000, -0.0625000000000000};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = phy[0]*(px[0]*_f(-ngsl + i + 1,-ngsl + j) + px[1]*_f(-ngsl + i + 2,-ngsl + j) + px[2]*_f(-ngsl + i + 3,-ngsl + j) + px[3]*_f(-ngsl + i + 4,-ngsl + j)) + phy[1]*(px[0]*_f(-ngsl + i + 1,-ngsl + j + 1) + px[1]*_f(-ngsl + i + 2,-ngsl + j + 1) + px[2]*_f(-ngsl + i + 3,-ngsl + j + 1) + px[3]*_f(-ngsl + i + 4,-ngsl + j + 1)) + phy[2]*(px[0]*_f(-ngsl + i + 1,-ngsl + j + 2) + px[1]*_f(-ngsl + i + 2,-ngsl + j + 2) + px[2]*_f(-ngsl + i + 3,-ngsl + j + 2) + px[3]*_f(-ngsl + i + 4,-ngsl + j + 2)) + phy[3]*(px[0]*_f(-ngsl + i + 1,-ngsl + j + 3) + px[1]*_f(-ngsl + i + 2,-ngsl + j + 3) + px[2]*_f(-ngsl + i + 3,-ngsl + j + 3) + px[3]*_f(-ngsl + i + 4,-ngsl + j + 3));
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = phy[0]*(px[0]*_f(-padding + i + 1,-padding + j) + px[1]*_f(-padding + i + 2,-padding + j) + px[2]*_f(-padding + i + 3,-padding + j) + px[3]*_f(-padding + i + 4,-padding + j)) + phy[1]*(px[0]*_f(-padding + i + 1,-padding + j + 1) + px[1]*_f(-padding + i + 2,-padding + j + 1) + px[2]*_f(-padding + i + 3,-padding + j + 1) + px[3]*_f(-padding + i + 4,-padding + j + 1)) + phy[2]*(px[0]*_f(-padding + i + 1,-padding + j + 2) + px[1]*_f(-padding + i + 2,-padding + j + 2) + px[2]*_f(-padding + i + 3,-padding + j + 2) + px[3]*_f(-padding + i + 4,-padding + j + 2)) + phy[3]*(px[0]*_f(-padding + i + 1,-padding + j + 3) + px[1]*_f(-padding + i + 2,-padding + j + 3) + px[2]*_f(-padding + i + 3,-padding + j + 3) + px[3]*_f(-padding + i + 4,-padding + j + 3));
                   #undef _f
                   #undef _df1
                   
@@ -59,11 +63,11 @@ void metrics_f_diff_1_1_111(float *df1, const float *f, const float hi, const in
 {
      const float dhx[4] = {0.0416666666666667, -1.1250000000000000, 1.1250000000000000, -0.0416666666666667};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = hi*(dhx[0]*_f(-ngsl + i,-ngsl + j + 2) + dhx[1]*_f(-ngsl + i + 1,-ngsl + j + 2) + dhx[2]*_f(-ngsl + i + 2,-ngsl + j + 2) + dhx[3]*_f(-ngsl + i + 3,-ngsl + j + 2));
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = hi*(dhx[0]*_f(-padding + i,-padding + j + 2) + dhx[1]*_f(-padding + i + 1,-padding + j + 2) + dhx[2]*_f(-padding + i + 2,-padding + j + 2) + dhx[3]*_f(-padding + i + 3,-padding + j + 2));
                   #undef _f
                   #undef _df1
                   
@@ -77,11 +81,11 @@ void metrics_f_diff_1_2_111(float *df1, const float *f, const float hi, const in
 {
      const float dx[4] = {0.0416666666666667, -1.1250000000000000, 1.1250000000000000, -0.0416666666666667};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = hi*(dx[0]*_f(-ngsl + i + 1,-ngsl + j + 2) + dx[1]*_f(-ngsl + i + 2,-ngsl + j + 2) + dx[2]*_f(-ngsl + i + 3,-ngsl + j + 2) + dx[3]*_f(-ngsl + i + 4,-ngsl + j + 2));
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = hi*(dx[0]*_f(-padding + i + 1,-padding + j + 2) + dx[1]*_f(-padding + i + 2,-padding + j + 2) + dx[2]*_f(-padding + i + 3,-padding + j + 2) + dx[3]*_f(-padding + i + 4,-padding + j + 2));
                   #undef _f
                   #undef _df1
                   
@@ -95,11 +99,11 @@ void metrics_f_diff_2_1_111(float *df1, const float *f, const float hi, const in
 {
      const float dhy[4] = {0.0416666666666667, -1.1250000000000000, 1.1250000000000000, -0.0416666666666667};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = hi*(dhy[0]*_f(-ngsl + i + 2,-ngsl + j) + dhy[1]*_f(-ngsl + i + 2,-ngsl + j + 1) + dhy[2]*_f(-ngsl + i + 2,-ngsl + j + 2) + dhy[3]*_f(-ngsl + i + 2,-ngsl + j + 3));
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = hi*(dhy[0]*_f(-padding + i + 2,-padding + j) + dhy[1]*_f(-padding + i + 2,-padding + j + 1) + dhy[2]*_f(-padding + i + 2,-padding + j + 2) + dhy[3]*_f(-padding + i + 2,-padding + j + 3));
                   #undef _f
                   #undef _df1
                   
@@ -113,11 +117,11 @@ void metrics_f_diff_2_2_111(float *df1, const float *f, const float hi, const in
 {
      const float dy[4] = {0.0416666666666667, -1.1250000000000000, 1.1250000000000000, -0.0416666666666667};
      for (int k = 0; k < 1; ++k) {
-         for (int j = 0; j < 2*ngsl + ny - 4; ++j) {
-             for (int i = 0; i < 2*ngsl + nx - 4; ++i) {
-                  #define _f(i,j) f[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  #define _df1(i,j) df1[(j) + align + ngsl + ((i) + ngsl + 2)*(2*align + 2*ngsl + ny + 4) + 2]
-                  _df1(-ngsl + i + 2,-ngsl + j + 2) = hi*(dy[0]*_f(-ngsl + i + 2,-ngsl + j + 1) + dy[1]*_f(-ngsl + i + 2,-ngsl + j + 2) + dy[2]*_f(-ngsl + i + 2,-ngsl + j + 3) + dy[3]*_f(-ngsl + i + 2,-ngsl + j + 4));
+         for (int j = 0; j < 2*padding + ny - 4; ++j) {
+             for (int i = 0; i < 2*padding + nx - 4; ++i) {
+                  #define _f(i,j) f[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  #define _df1(i,j) df1[(j) + align + padding + ((i) + padding + 2)*(2*align + 2*padding + ny + 4) + 2]
+                  _df1(-padding + i + 2,-padding + j + 2) = hi*(dy[0]*_f(-padding + i + 2,-padding + j + 1) + dy[1]*_f(-padding + i + 2,-padding + j + 2) + dy[2]*_f(-padding + i + 2,-padding + j + 3) + dy[3]*_f(-padding + i + 2,-padding + j + 4));
                   #undef _f
                   #undef _df1
                   
@@ -289,4 +293,4 @@ void metrics_g_diff_c_112(float *g3, const float *g, const float hi, const int n
      
 }
 
-
+#undef padding
diff --git a/src/topography/metrics/metrics.c b/src/topography/metrics/metrics.c
index eb85be7..f0b6ea2 100644
--- a/src/topography/metrics/metrics.c
+++ b/src/topography/metrics/metrics.c
@@ -3,28 +3,32 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <grid/grid_3d.h>
 #include <grid/shift.h>
 #include <topography/metrics/metrics.h>
 #include <topography/metrics/kernel.h>
+#include <topography/metrics/shift.h>
 #include <test/test.h>
 #include "interpolation/interpolation.h"
 
-f_grid_t metrics_init_f(const int *size, const _prec gridspacing)
-{
-        f_grid_t out = {
-            .size = {size[0], size[1], 1},
-            .mem = {size[0] + 4 + 2*ngsl, size[1] + 4 + 2*ngsl + 2 * align, 1},
-            .bounds_x = {-ngsl, size[0] + ngsl},
-            .bounds_y = {-ngsl, size[1] + ngsl},
-            .bounds_stress_x = {-ngsl / 2, size[0] + ngsl / 2},
-            .bounds_stress_y = {-ngsl / 2, size[1] + ngsl / 2},
-            .offset = {2 + ngsl, 2 + ngsl + align, 0},
-            .hi = 1.0/gridspacing
-        };
+
+// This parameter pads the compute region. It is needed for the computation of
+// derivative and interpolation stencils. Do not change its value.
+
+f_grid_t metrics_init_f(const int *size, const _prec gridspacing,
+                            const int pad) {
+        f_grid_t out = {.size = {size[0], size[1], 1},
+                        .mem = {size[0] + 4 + 2 * pad,
+                                size[1] + 4 + 2 * pad + 2 * align, 1},
+                        .bounds_x = {-pad, size[0] + pad},
+                        .bounds_y = {-pad, size[1] + pad},
+                        .bounds_stress_x = {-pad / 2, size[0] + pad / 2},
+                        .bounds_stress_y = {-pad / 2, size[1] + pad / 2},
+                        .offset = {2 + pad, 2 + pad + align, 0},
+                        .hi = 1.0 / gridspacing};
         out.line = out.mem[2];
-        out.slice = out.mem[1]*out.mem[2];
+        out.slice = out.mem[1] * out.mem[2];
 
         metrics_h_malloc_f(&out);
         metrics_d_malloc_f(&out);
@@ -168,6 +172,24 @@ void metrics_differentiate_f(f_grid_t *f)
                              f->size[2]);
 }
 
+void metrics_shift_f(f_grid_t *fout, const f_grid_t *fin)
+{
+        /* Run metrics_shift_f_apply on each of the ten metric arrays, sized by fout. */
+        const int nx = fout->size[0], ny = fout->size[1];
+#define SHIFT_FIELD(name) metrics_shift_f_apply(fout->name, fin->name, nx, ny)
+        SHIFT_FIELD(f);
+        SHIFT_FIELD(f_1);
+        SHIFT_FIELD(f_2);
+        SHIFT_FIELD(f_c);
+        SHIFT_FIELD(f1_1);
+        SHIFT_FIELD(f1_2);
+        SHIFT_FIELD(f1_c);
+        SHIFT_FIELD(f2_1);
+        SHIFT_FIELD(f2_2);
+        SHIFT_FIELD(f2_c);
+#undef SHIFT_FIELD
+}
+
 int metrics_interpolate_f_point(const f_grid_t *f, prec *out, const prec *in,
                                 const prec *x, const prec *y,
                                 grid3_t grid, const prec *qx,
@@ -189,7 +211,7 @@ int metrics_interpolate_f_point(const f_grid_t *f, prec *out, const prec *in,
                 out[q] = 0.0;
                 for (int i = 0; i < deg + 1; ++i) {
                 for (int j = 0; j < deg + 1; ++j) {
-                        int pos = pmetrics_f_index(f, ix + i, iy + j);
+                        int pos = metrics_f_index(f, ix + i, iy + j);
                         out[q] += lx[i] * ly[j] * in[pos];
                 }
                 }
@@ -203,6 +225,66 @@ int metrics_interpolate_f_point(const f_grid_t *f, prec *out, const prec *in,
         return err;
 }
 
+// This function might be useful when fixing topo-DM incompatibility
+int metrics_interpolate_jacobian(const f_grid_t *fgrid, float *out, const float *f, const float *dg,
+                        const float *x, const float *y, const float *z,
+                        grid3_t grid, const float *qx,
+                        const float *qy, const float *qz, const int m, const int deg)
+{
+        /*
+         * For each of the m query points (qx[q], qy[q], qz[q]), interpolate the
+         * product f(i, j) * dg(k) with a tensor product of 1-D Lagrange
+         * weights of degree deg and store the result in out[q].
+         *
+         * Returns the status of the last interp_lagrange1_coef call.
+         * NOTE(review): earlier statuses are overwritten and the allocations
+         * are not checked; confirm what callers expect. fgrid is unused.
+         */
+        int err = 0;
+        const int npts = deg + 1;
+        /* One weight array and one node-coordinate array per direction. */
+        prec *lx = calloc(npts, sizeof(*lx));
+        prec *ly = calloc(npts, sizeof(*ly));
+        prec *lz = calloc(npts, sizeof(*lz));
+        prec *xloc = calloc(npts, sizeof(*xloc));
+        prec *yloc = calloc(npts, sizeof(*yloc));
+        prec *zloc = calloc(npts, sizeof(*zloc));
+
+        /* ny is referenced by the _f indexing macro below. */
+        int ny = grid.size.y - 4 - ngsl2;
+        #define _f(i, j) f[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
+        for (int q = 0; q < m; ++q) {
+                int ix = 0; int iy = 0; int iz = 0;
+                err = interp_lagrange1_coef(
+                    xloc, lx, &ix, x, grid.size.x, qx[q], deg);
+                err = interp_lagrange1_coef(
+                    yloc, ly, &iy, y, grid.size.y, qy[q], deg);
+                err = interp_lagrange1_coef(
+                    zloc, lz, &iz, z, grid_boundary_size(grid).z, qz[q], deg);
+                out[q] = 0.0;
+                for (int i = 0; i < deg + 1; ++i) {
+                for (int j = 0; j < deg + 1; ++j) {
+                for (int k = 0; k < deg + 1; ++k) {
+                        /* dg is indexed along z with an offset of align. */
+                        int gpos = align + iz + k;
+                        out[q] += lx[i] * ly[j] * lz[k] * _f(ix + i, iy + j) * dg[gpos];
+                }
+                }
+                }
+        }
+        /* Keep the helper macro from leaking into the rest of the file. */
+        #undef _f
+
+        free(lx);
+        free(ly);
+        free(lz);
+        free(xloc);
+        free(yloc);
+        free(zloc);
+
+        return err;
+}
+
 g_grid_t metrics_init_g(const int *size, const _prec gridspacing)
 {
         g_grid_t out = {
diff --git a/src/topography/metrics/shift.c b/src/topography/metrics/shift.c
new file mode 100644
index 0000000..a64e74d
--- /dev/null
+++ b/src/topography/metrics/shift.c
@@ -0,0 +1,23 @@
+#include <topography/metrics/shift.h>
+#include <stdio.h>
+
+void metrics_shift_f_apply(float *fout, const float *fin, const int nx,
+                           const int ny)
+{
+        /* Copy fin into fout; fin rows are sized with `padding`, fout rows with ngsl. */
+        const int padding = 8;
+        const int shift = padding - ngsl;
+        /* Distance in elements between consecutive i-rows of each array. */
+        const int out_stride = 2 * align + 2 * ngsl + ny + 4;
+        const int in_stride = 2 * align + 2 * padding + ny + 4;
+        const int rows = nx + 2 * ngsl;
+        const int cols = ny + 2 * ngsl;
+
+        for (int i = 0; i < rows; ++i) {
+                const int src0 = align + (i + shift + 2) * in_stride + 2 + shift;
+                const int dst0 = align + (i + 2) * out_stride + 2;
+                for (int j = 0; j < cols; ++j) {
+                        fout[dst0 + j] = fin[src0 + j];
+                }
+        }
+}
diff --git a/src/topography/mms.cu b/src/topography/mms.cu
new file mode 100644
index 0000000..539ba39
--- /dev/null
+++ b/src/topography/mms.cu
@@ -0,0 +1,561 @@
+#include <stdio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <topography/mms.cuh>
+#include <topography/mapping.cuh>
+#include <buffers/buffer.h>
+#include <readers/input.h>
+#include <awp/pmcl3d_cons.h>
+// Background values (P-wave speed, S-wave speed, density)
+float scp0, scs0, srho0;
+// Perturbation values (P-wave speed, S-wave speed, density)
+float sdcp, sdcs, sdrho;
+// Wavenumbers
+float kx, ky, kz;
+// All settings in this section are read from the MMS input file by mms_init().
+// Background values (velocities and stresses)
+float svx0, svy0, svz0, sxx0, syy0, szz0, sxy0, sxz0, syz0;
+// Perturbation values (velocities and stresses)
+float sdvx, sdvy, sdvz, sdxx, sdyy, sdzz, sdxy, sdxz, sdyz;
+
+// Plane wave position
+float src;
+// src is handed to the exact-solution kernels as `rc`, stretch_ratio as `f`.
+
+// Grid stretching ratio
+float stretch_ratio;
+
+
+__inline__ __device__ int in_bounds_stress(int nx, int ny, int nz, int i, int j, int k) {
+                const int first = ngsl / 2 + 2;  // first accepted index in x and y
+                const int inside_x = i >= first && i < nx + 3 * ngsl / 2 + 2;
+                const int inside_y = j >= first && j < ny + 3 * ngsl / 2 + 2;
+                return inside_x && inside_y && k < align + nz;  // no lower bound on k
+}
+
+__inline__ __device__ int in_bounds_velocity(int nx, int ny, int nz, int i, int j, int k) {
+                const int first = ngsl + 2;  // first accepted index in x and y
+                const int inside_x = i >= first && i < nx + ngsl + 2;
+                const int inside_y = j >= first && j < ny + ngsl + 2;
+                return inside_x && inside_y && k < align + nz;  // no lower bound on k
+}
+
+__inline__ __device__ int in_bounds(int i, int j, int k, int bi, int bj, int bk,
+                                    int ei, int ej, int ek) {
+        const int kk = k - align;  // bk and ek are compared with the align offset removed
+        if (bi > i || i >= ei || bj > j || j >= ej) return 0;
+        return bk <= kk && kk < ek;
+}
+
+__inline__ __device__ float length_x(int nx, float h) {
+        return h * (nx - 1);  // nx points with spacing h span nx - 1 intervals
+}
+
+__inline__ __device__ float length_y(int ny, float h) {
+        return h * (ny - 1);  // ny points with spacing h span ny - 1 intervals
+}
+
+__inline__ __device__ float length_z(int nz, float h) {
+        return h * (nz - 2);  // note: nz - 2 intervals here, unlike nx - 1 / ny - 1
+}
+
+__inline__ __device__ float xi(int i, int px, float Lx, float h, int hat=0) {
+                return (i - ngsl - 2 - hat * 0.5f) * h + px * Lx;  // x = 0 at i = ngsl + 2 when px = 0; hat = 1 subtracts h / 2
+}
+
+__inline__ __device__ float yj(int j, int py, float Ly, float h, int hat=0) {
+                // y = 0 at j = ngsl + 2 when py = 0; hat = 1 subtracts half a spacing (h / 2).
+                return (j - ngsl - 2 - hat * 0.5f) * h + py * Ly;
+}
+
+__inline__ __device__ float zk(int k, int pz, float Lz, float h, int hat=0) {
+                return (k - align - hat * 0.5f) * h + pz * Lz;  // z = 0 at k = align when pz = 0; hat = 1 subtracts h / 2
+}
+
+__inline__ __device__ float material_perturbation(float x, float y, float z, float kx, float ky, float kz) {
+        return sin(kx * x) * sin(ky * y) * sin(kz * z);  // product of three sines; zero whenever any wavenumber is zero
+}
+
+__inline__ __device__ float gaussian(float z, float z0, float t, float k, float om) {
+        float tau = k * (z - z0)  + om * t;  // argument of the pulse at position z and time t
+        return exp( - tau * tau );  // exp(-tau^2): equals 1 where tau == 0
+}
+
+
+
+__global__ void material_properties(
+              float *d_d1, float *d_lam,
+              float *d_mu, float *d_qp, float *d_qs,
+              const float lam0, const float mu0, const float rho0,
+              const float dlam, const float dmu, const float drho,
+              const float kx, const float ky, const float kz,
+              const int nx, const int ny,
+              const int nz,
+              const int px, const int py, const int pz,
+              const float h
+              ) {
+                // Fills the material arrays with background + perturbation * S.
+                int i = threadIdx.z + blockDim.z * blockIdx.z;
+                int j = threadIdx.y + blockDim.y * blockIdx.y;
+                int k = align + threadIdx.x + blockDim.x * blockIdx.x;
+
+                if (!in_bounds_stress(nx, ny, nz, i, j, k)) return;
+
+                float Lx = length_x(nx, h);
+                float Ly = length_y(ny, h);
+                float Lz = length_z(nz, h);
+
+                // Coordinates of this node; px, py, pz shift them by whole
+                // multiples of Lx, Ly, Lz.
+                float x = xi(i, px, Lx, h);
+                float y = yj(j, py, Ly, h);
+                float z = zk(k, pz, Lz, h);
+
+                // Flattened index: k varies fastest, then j, then i.
+                int line = 2 * align + nz;
+                int slice = line * (4 + 2 * ngsl + ny);
+                int pos = k + line * j + slice * i;
+
+                float S = material_perturbation(x, y, z, kx, ky, kz);
+
+                d_d1[pos] = rho0 + drho * S;
+                // lam and mu are stored as reciprocals.
+                d_lam[pos] = 1.0f / (lam0 + dlam * S);
+                d_mu[pos] = 1.0f /  (mu0 + dmu * S);
+                d_qp[pos] = 1e-10f;
+                d_qs[pos] = 1e-10f;
+}
+
+__global__ void exact_velocity(
+              float *d_vx, float *d_vy, float *d_vz, 
+              const float vx0, const float vy0, const float vz0, 
+              const float xx0, const float yy0, const float zz0, 
+              const float xy0, const float xz0, const float yz0, 
+              const float dvx, const float dvy, const float dvz, 
+              const float dxx, const float dyy, const float dzz, 
+              const float dxy, const float dxz, const float dyz, 
+              const float cp0, const float cs0, const float rho0,
+              const float dcp, const float dcs, const float drho,
+              const float rc, 
+              const float kx, const float ky, const float kz, 
+              const int nx, const int ny, const int nz, 
+              const int px, const int py, const int pz, 
+              const int bi, const int bj, const int bk, 
+              const int ei, const int ej, const int ek, 
+              const float h, const float t, 
+              const int apply_in_interior, 
+              const float f
+              ) {
+                // Writes v0 + dv * exp(-(kz * (z - zc) + om * t)^2) into d_vx, d_vy, d_vz.
+                int i = threadIdx.z + blockDim.z * blockIdx.z;
+                int j = threadIdx.y + blockDim.y * blockIdx.y;
+                int k = align + threadIdx.x + blockDim.x * blockIdx.x;
+                // Region choice: inside the index box if apply_in_interior is set, else outside it.
+                if (!in_bounds_velocity(nx, ny, nz, i, j, k)) return;
+                int is_in_bounds = in_bounds(i, j, k, bi, bj, bk, ei, ej, ek);
+                if (apply_in_interior && !is_in_bounds) return;
+                if (!apply_in_interior && is_in_bounds) return;
+
+                float Lx =  length_x(nx, h);
+                float Ly =  length_y(ny, h);
+                float Lz =  length_z(nz, h);
+
+                float x = xi(i, px, Lx, h);
+                float y = yj(j, py, Ly, h);
+                // z / zh: mapped node and half-shifted node coordinate; zc: mapped rc.
+                float z = topo_mapping0(f, zk(k, pz, Lz, h), h, nz);
+                float zh = topo_mapping0(f, zk(k, pz, Lz, h, 1), h, nz);
+                float zc = topo_mapping0(f, rc, h, nz);
+
+                // Flattened index: k varies fastest, then j, then i.
+                int line = 2 * align + nz;
+                int slice = line * (4 + 2 * ngsl + ny);
+                int pos = k + line * j + slice * i;
+
+                float S = material_perturbation(x, y, z, kx, ky, kz);
+                float cp = cp0 + dcp * S;
+                float cs = cs0 + dcs * S;
+                // Frequencies follow from the local wave speeds and kz.
+                float om_p = cp * kz;
+                float om_s = cs * kz;
+
+                // vx, vy are evaluated at zh with om_s; vz is evaluated at z with om_p.
+                d_vx[pos] =  dvx*exp(-pow(kz*(zh - zc) + om_s*t, 2)) + vx0;
+                d_vy[pos] =  dvy*exp(-pow(kz*(zh - zc) + om_s*t, 2)) + vy0;
+                d_vz[pos] =  dvz*exp(-pow(kz*(z - zc) + om_p*t, 2)) + vz0;
+
+
+}
+
+__global__ void exact_stress(
+              float *d_xx, float *d_yy, float *d_zz, 
+              float *d_xy, float *d_xz, float *d_yz, 
+              const float vx0, const float vy0, const float vz0, 
+              const float xx0, const float yy0, const float zz0, 
+              const float xy0, const float xz0, const float yz0, 
+              const float dvx, const float dvy, const float dvz, 
+              const float dxx, const float dyy, const float dzz, 
+              const float dxy, const float dxz, const float dyz, 
+              const float cp0, const float cs0, const float rho0,
+              const float dcp, const float dcs, const float drho, 
+              const float rc, 
+              const float kx, const float ky, const float kz, 
+              const int nx, const int ny, const int nz, 
+              const int px, const int py, const int pz, 
+              const int bi, const int bj, const int bk, 
+              const int ei, const int ej, const int ek, 
+              const float h, const float t, const int apply_in_interior, 
+              const float f
+              ) {
+                // Writes the exact stress field at time t into the six stress arrays.
+                int i = threadIdx.z + blockDim.z * blockIdx.z;
+                int j = threadIdx.y + blockDim.y * blockIdx.y;
+                int k = align + threadIdx.x + blockDim.x * blockIdx.x;
+                // Region choice: inside the index box if apply_in_interior is set, else outside it.
+                if (!in_bounds_stress(nx, ny, nz, i, j, k)) return;
+                int is_in_bounds = in_bounds(i, j, k, bi, bj, bk, ei, ej, ek);
+                if (apply_in_interior && !is_in_bounds) return;
+                if (!apply_in_interior && is_in_bounds) return;
+
+                float Lx =  length_x(nx, h);
+                float Ly =  length_y(ny, h);
+                float Lz =  length_z(nz, h);
+
+                float x = xi(i, px, Lx, h);
+                float y = yj(j, py, Ly, h);
+                float z = topo_mapping0(f, zk(k, pz, Lz, h), h, nz);
+                float zh = topo_mapping0(f, zk(k, pz, Lz, h, 1), h, nz);
+                float zc = topo_mapping0(f, rc, h, nz);
+                // Flattened index: k varies fastest, then j, then i.
+                int line = 2 * align + nz;
+                int slice = line * (4 + 2 * ngsl + ny);
+                int pos = k + line * j + slice * i;
+
+                float S = material_perturbation(x, y, z, kx, ky, kz);
+                float cp = cp0 + dcp * S;
+                float cs = cs0 + dcs * S;
+                float rho = rho0;  // background density only; drho is not applied here
+                // Frequencies follow from the local wave speeds and kz.
+                float om_p = cp * kz;
+                float om_s = cs * kz;
+                // Normal stresses: xx, yy are zeroed; zz is evaluated at zh with om_p.
+                d_xx[pos] = 0.0f;
+                d_yy[pos] = 0.0f;
+                d_zz[pos] = rho * cp * (dzz*exp(-pow(kz*(zh - zc) + om_p*t, 2)) + zz0);
+
+                // Shear stresses: xy is zeroed; xz, yz are evaluated at z with om_s.
+                d_xy[pos] = 0.0f;
+                d_xz[pos] = cs * rho * (dxz*exp(-pow(kz*(z - zc) + om_s*t, 2)) + xz0);
+                d_yz[pos] = cs * rho * (dyz*exp(-pow(kz*(z - zc) + om_s*t, 2)) + yz0);
+
+}
+
+__global__ void force_velocity(
+                float *d_vx, float *d_vy, float *d_vz,
+                const float vx0, const float vy0, const float vz0, 
+                const float xx0, const float yy0, const float zz0, 
+                const float xy0, const float xz0, const float yz0, 
+                const float dvx, const float dvy, const float dvz, 
+                const float dxx, const float dyy, const float dzz, 
+                const float dxy, const float dxz, const float dyz, 
+                const float cp0, const float cs0, const float rho0,
+                const float dcp, const float dcs, const float drho, 
+                const int nx, const int ny, const int nz,
+                const int px, const int py, const int pz,
+                const float h, const float t, const float dt) {
+                // NOTE: this kernel contains no stores; d_vx, d_vy, d_vz are left unchanged.
+                int i = threadIdx.z + blockDim.z * blockIdx.z;
+                int j = threadIdx.y + blockDim.y * blockIdx.y;
+                int k = align + threadIdx.x + blockDim.x * blockIdx.x;
+
+                if (!in_bounds_velocity(nx, ny, nz, i, j, k)) return;
+
+                float Lx =  length_x(nx, h);
+                float Ly =  length_y(ny, h);
+                float Lz =  length_z(nz, h);
+                // Wavenumbers are fixed to zero here, so S below is a product of sin(0).
+                float kx = 0.0;
+                float ky = 0.0;
+                float kz = 0.0;
+
+                float x = xi(i, px, Lx, h);
+                float y = yj(j, py, Ly, h);
+                float z = zk(k, pz, Lz, h);
+
+                int line = 2 * align + nz;
+                int slice = line * (4 + 2 * ngsl + ny);
+                int pos = k + line * j + slice * i;
+
+                float S = material_perturbation(x, y, z, kx, ky, kz);
+                // None of the values computed in this kernel feeds a store.
+                float cp = cp0 + dcp * S;
+                float cs = cs0 + dcs * S;
+                float rho = rho0 + drho * S;
+                float om_p = cp * kz;
+}
+
+__global__ void force_stress(
+                float *d_xx, float *d_yy, float *d_zz, 
+                float *d_xy, float *d_xz, float *d_yz,
+                const float vx0, const float vy0, const float vz0, 
+                const float xx0, const float yy0, const float zz0, 
+                const float xy0, const float xz0, const float yz0, 
+                const float dvx, const float dvy, const float dvz, 
+                const float dxx, const float dyy, const float dzz, 
+                const float dxy, const float dxz, const float dyz, 
+                const float cp0, const float cs0, const float rho0,
+                const float dcp, const float dcs, const float drho,
+                const int nx, const int ny, const int nz,
+                const int px, const int py, const int pz,
+                const float h, const float t, const float dt) {
+                // NOTE: this kernel contains no stores; the six stress arrays are left unchanged.
+                int i = threadIdx.z + blockDim.z * blockIdx.z;
+                int j = threadIdx.y + blockDim.y * blockIdx.y;
+                int k = align + threadIdx.x + blockDim.x * blockIdx.x;
+
+                if (!in_bounds_stress(nx, ny, nz, i, j, k)) return;
+
+                float Lx =  length_x(nx, h);
+                float Ly =  length_y(ny, h);
+                float Lz =  length_z(nz, h);
+                // Wavenumbers are fixed to zero here, so S below is a product of sin(0).
+                float kx = 0.0;
+                float ky = 0.0;
+                float kz = 0.0;
+
+                float x = xi(i, px, Lx, h);
+                float y = yj(j, py, Ly, h);
+                float z = zk(k, pz, Lz, h);
+                float zh = zk(k, pz, Lz, h, 1);
+
+                int line = 2 * align + nz;
+                int slice = line * (4 + 2 * ngsl + ny);
+                int pos = k + line * j + slice * i;
+
+                float S = material_perturbation(x, y, z, kx, ky, kz);
+                float cp = cp0 + dcp * S;
+                float cs = cs0 + dcs * S;
+                float rho = rho0 + drho * S;
+
+                float om_p = cp * kz;
+                // lam and mu are derived from the wave speeds and density; neither is used.
+                float lam = rho * (cp * cp  - 2 * cs * cs);
+                float mu = rho * cs * cs;
+}
+
+void mms_init(const char *MMSFILE, const int *nxt,
+              const int *nyt, const int *nzt, const int ngrids, float **d_d1,
+              float **d_lam, float **d_mu, float **d_qp, float **d_qs,
+              float **d_vx, float **d_vy, float **d_vz, float **d_xx,
+              float **d_yy, float **d_zz, float **d_xy, float **d_xz,
+              float **d_yz, int px, int py, int rank, const MPI_Comm comm, const float *h, const float dt)
+{
+        // Reads the MMS settings from MMSFILE, then initializes all `ngrids` grids.
+        FILE *fh = fopen(MMSFILE, "r");
+        if (!fh) {
+                if (rank == 0) {
+                        fprintf(stderr, "Failed to open: %s \n", MMSFILE);
+                        exit(-1);
+                }
+                return;
+        }
+
+        // The format holds 29 conversions, one per address passed to fscanf.
+        int parsed = fscanf(fh,
+                            "%f %f %f %f %f %f | %f %f %f | %f %f %f %f %f %f %f %f "
+                            "%f | %f %f %f %f %f %f %f %f %f | %f %f \n",
+                            &scp0, &scs0, &srho0, &sdcp, &sdcs, &sdrho, &kx, &ky, &kz,
+                            &svx0, &svy0, &svz0, &sxx0, &syy0, &szz0, &sxy0,
+                            &sxz0, &syz0, &sdvx, &sdvy, &sdvz, &sdxx, &sdyy,
+                            &sdzz, &sdxy, &sdxz, &sdyz, &src, &stretch_ratio);
+        fclose(fh);
+        if (parsed != 29 && px == 0 && py == 0)
+                 fprintf(stderr, "Failed to parse: %s \n", MMSFILE);
+
+
+        if (px == 0 && py == 0) {
+                printf("Done reading mms input file\n");
+                printf("Settings: \n");
+                printf("        cp0 = %g cs0 = %g rho0 = %g \n", scp0, scs0, srho0);
+                printf("        dcp = %g dcs = %g drho = %g \n", sdcp, sdcs, sdrho);
+                printf("        kx = %g ky = %g kz = %g \n", kx, ky, kz);
+                printf("        vx0 = %g vy0 = %g vz0 = %g \n", svx0, svy0, svz0);
+                printf("        xx0 = %g yy0 = %g zz0 = %g \n", sxx0, syy0, szz0);
+                printf("        xy0 = %g xz0 = %g yz0 = %g \n", sxy0, sxz0, syz0);
+                printf("        dvx = %g dvy = %g dvz = %g \n", sdvx, sdvy, sdvz);
+                printf("        dxx = %g dyy = %g dzz = %g \n", sdxx, sdyy, sdzz);
+                printf("        dxy = %g dxz = %g dyz = %g \n", sdxy, sdxz, sdyz);
+                printf("        rc = %g \n", src);
+                printf("        stretch_ratio = %g \n", stretch_ratio);
+        }
+
+
+        const int INTERIOR = 1;
+        dim3 threads (32, 4, 1);
+        for (int p = 0; p < ngrids; ++p) {
+                // Launch extents: nzt[p] nodes in z; x and y include 2 * ngsl + 4 extra nodes.
+                int mz = nzt[p];
+                int my = nyt[p] + 2 * ngsl + 4;
+                int mx = nxt[p] + 2 * ngsl + 4;
+
+                float mu0 = scs0 * scs0 * srho0;
+                float dmu = sdcs * sdcs * sdrho;
+                float lam0 = scp0 * scp0 * srho0 - 2.0 * scs0 * scs0 * srho0;
+                float dlam = sdcp * sdcp * sdrho - 2.0 * sdcs * sdcs * sdrho;
+                printf("mu0 = %g lam0 = %g dlam = %g dmu = %g \n", mu0, lam0, dlam, dmu);
+
+                if (px == 0 && py == 0) printf("Setting material properties for grid = %d \n", p);
+                // Set material properties
+                dim3 blocks( (mz - 1) / threads.x + 1, (my - 1) / threads.y + 1, (mx - 1) / threads.z + 1);
+                material_properties<<<blocks, threads>>>(
+                    d_d1[p], d_lam[p], d_mu[p], d_qp[p], d_qs[p], lam0, mu0,
+                    srho0, dlam, dmu, sdrho, kx, ky, kz, nxt[p], nyt[p], nzt[p], px,
+                    py, p, h[p]);
+
+                if (px == 0 && py == 0) printf("Setting velocity initial conditions for grid = %d \n", p);
+                // Set initial conditions for velocity (evaluated at t = -dt / 2)
+                mms_exact_velocity(
+                                d_vx[p], d_vy[p], d_vz[p],
+                                nxt[p], nyt[p], nzt[p], 
+                                px, py, p, 
+                                0, 0, 0, 
+                                2 + 2 * ngsl + nxt[p], 4 + 2 * ngsl + nyt[p], nzt[p], 
+                                h[p], 0.0f - 0.5f * dt, INTERIOR);
+
+                if (px == 0 && py == 0) printf("Setting stress initial conditions for grid = %d \n", p);
+                // Set initial conditions for stress (evaluated at t = 0)
+                mms_exact_stress(
+                                d_xx[p], d_yy[p], d_zz[p], 
+                                d_xy[p], d_xz[p], d_yz[p], 
+                                nxt[p], nyt[p], nzt[p], 
+                                px, py, p, 
+                                0, 0, 0, 
+                                4 + 2 * ngsl + nxt[p], 4 + 2 * ngsl + nyt[p], nzt[p], 
+                                h[p], 0.0f - 0.0f * dt, INTERIOR);
+                CUCHK(cudaGetLastError());
+
+
+
+        }
+
+                if (px == 0 && py == 0) printf("MMS initialization done. \n");
+}
+
+void mms_exact_velocity(
+              float *d_vx, float *d_vy, float *d_vz,
+              const int nx, const int ny, const int nz, 
+              const int px, const int py, const int pz, 
+              const int bi, const int bj, const int bk, 
+              const int ei, const int ej, const int ek, 
+              const float h, const float t, const int apply_in_interior)
+{
+                // Launch layout: x-threads cover k, y-threads cover j, z-threads cover i.
+                // The j and i extents include 2 * ngsl + 4 extra nodes.
+                const dim3 threads(32, 4, 1);
+                const int count_j = ny + 2 * ngsl + 4;
+                const int count_i = nx + 2 * ngsl + 4;
+                const dim3 blocks((nz - 1) / threads.x + 1, (count_j - 1) / threads.y + 1,
+                                  (count_i - 1) / threads.z + 1);
+
+                exact_velocity<<<blocks, threads>>>(
+                                // output arrays
+                                d_vx, d_vy, d_vz,
+                                // background and perturbation amplitudes (velocities, stresses)
+                                svx0, svy0, svz0, sxx0, syy0, szz0, sxy0, sxz0, syz0,
+                                sdvx, sdvy, sdvz, sdxx, sdyy, sdzz, sdxy, sdxz, sdyz,
+                                // background and perturbation material values
+                                scp0, scs0, srho0, sdcp, sdcs, sdrho,
+                                // plane wave position and wavenumbers
+                                src, kx, ky, kz,
+                                // grid size and subdomain offsets
+                                nx, ny, nz, px, py, pz,
+                                // index box [bi, ei) x [bj, ej) x [bk, ek)
+                                bi, bj, bk, ei, ej, ek,
+                                // grid spacing, time, region switch, stretching ratio
+                                h, t, apply_in_interior, stretch_ratio);
+                CUCHK(cudaGetLastError());
+}
+
+void mms_exact_stress(
+              float *d_xx, float *d_yy, float *d_zz, 
+              float *d_xy, float *d_xz, float *d_yz, 
+              const int nx, const int ny, const int nz, 
+              const int px, const int py, const int pz, 
+              const int bi, const int bj, const int bk, 
+              const int ei, const int ej, const int ek, 
+              const float h, const float t, const int apply_in_interior)
+{
+                // Launch layout: x-threads cover k, y-threads cover j, z-threads cover i.
+                // The j and i extents include 2 * ngsl + 4 extra nodes.
+                const dim3 threads(32, 4, 1);
+                const int count_j = ny + 2 * ngsl + 4;
+                const int count_i = nx + 2 * ngsl + 4;
+                const dim3 blocks((nz - 1) / threads.x + 1, (count_j - 1) / threads.y + 1,
+                                  (count_i - 1) / threads.z + 1);
+
+                exact_stress<<<blocks, threads>>>(
+                                // output arrays
+                                d_xx, d_yy, d_zz,
+                                d_xy, d_xz, d_yz,
+                                // background and perturbation amplitudes (velocities, stresses)
+                                svx0, svy0, svz0, sxx0, syy0, szz0, sxy0, sxz0, syz0,
+                                sdvx, sdvy, sdvz, sdxx, sdyy, sdzz, sdxy, sdxz, sdyz,
+                                // background and perturbation material values
+                                scp0, scs0, srho0, sdcp, sdcs, sdrho,
+                                // plane wave position and wavenumbers
+                                src, kx, ky, kz,
+                                // grid size and subdomain offsets
+                                nx, ny, nz, px, py, pz,
+                                // index box [bi, ei) x [bj, ej) x [bk, ek)
+                                bi, bj, bk, ei, ej, ek,
+                                // grid spacing, time, region switch, stretching ratio
+                                h, t, apply_in_interior, stretch_ratio);
+                CUCHK(cudaGetLastError());
+}
+
+void mms_force_velocity(float *d_vx, float *d_vy, float *d_vz, const int nx, const int ny, const int nz, const float h, const int px, const int py, const int pz, const float t, const float dt)
+{
+                // Launch layout: x-threads cover k, y-threads cover j, z-threads cover i.
+                const dim3 threads(32, 4, 1);
+                const int count_j = ny + 2 * ngsl + 4;
+                const int count_i = nx + 2 * ngsl + 4;
+                const dim3 blocks((nz - 1) / threads.x + 1, (count_j - 1) / threads.y + 1,
+                                  (count_i - 1) / threads.z + 1);
+
+                force_velocity<<<blocks, threads>>>(
+                    // output arrays
+                    d_vx, d_vy, d_vz,
+                    // background and perturbation amplitudes (velocities, stresses)
+                    svx0, svy0, svz0, sxx0, syy0, szz0, sxy0, sxz0, syz0,
+                    sdvx, sdvy, sdvz, sdxx, sdyy, sdzz, sdxy, sdxz, sdyz,
+                    // background and perturbation material values
+                    scp0, scs0, srho0, sdcp, sdcs, sdrho,
+                    // grid size, subdomain offsets, grid spacing, time and time step
+                    nx, ny, nz, px, py, pz, h, t, dt);
+                CUCHK(cudaGetLastError());
+}
+void mms_force_stress(float *d_xx, float *d_yy, float *d_zz, float *d_xy,
+                      float *d_xz, float *d_yz, const int nx, const int ny, const int nz,
+                      const float h, const int px, const int py, const int pz, const float t, const float dt) {
+        // Launch layout: x-threads cover k, y-threads cover j, z-threads cover i.
+        const dim3 threads(32, 4, 1);
+        const int count_j = ny + 2 * ngsl + 4;
+        const int count_i = nx + 2 * ngsl + 4;
+        const dim3 blocks((nz - 1) / threads.x + 1, (count_j - 1) / threads.y + 1,
+                          (count_i - 1) / threads.z + 1);
+
+        force_stress<<<blocks, threads>>>(
+                    // output arrays
+                    d_xx, d_yy, d_zz,
+                    d_xy, d_xz, d_yz,
+                    // background and perturbation amplitudes (velocities, stresses)
+                    svx0, svy0, svz0, sxx0, syy0, szz0, sxy0, sxz0, syz0,
+                    sdvx, sdvy, sdvz, sdxx, sdyy, sdzz, sdxy, sdxz, sdyz,
+                    // background and perturbation material values
+                    scp0, scs0, srho0, sdcp, sdcs, sdrho,
+                    // grid size, subdomain offsets, grid spacing, time and time step
+                    nx, ny, nz, px, py, pz, h, t, dt);
+        CUCHK(cudaGetLastError());
+}
+
+
diff --git a/src/topography/mms.py b/src/topography/mms.py
new file mode 100644
index 0000000..67f3e8c
--- /dev/null
+++ b/src/topography/mms.py
@@ -0,0 +1,96 @@
+import sympy as sp
+# Symbolic derivation of a manufactured solution and its forcing terms.
+# The script prints C assignment statements generated with sympy.ccode.
+rho, cp, cs = sp.symbols("rho cp cs")
+#rho0, cp0, cs0 = sp.symbols("rho0 cp0 cs0")
+#drho, dcp, dcs = sp.symbols("drho dcp dcs")
+# Background values (*0) and perturbation amplitudes (d*) of the fields.
+vx0, vy0, vz0 = sp.symbols("vx0 vy0 vz0")
+xx0, yy0, zz0 = sp.symbols("xx0 yy0 zz0")
+xy0, xz0, yz0 = sp.symbols("xy0 xz0 yz0")
+dvx, dvy, dvz = sp.symbols("dvx dvy dvz")
+dxx, dyy, dzz = sp.symbols("dxx dyy dzz")
+dxy, dxz, dyz = sp.symbols("dxy dxz dyz")
+# Coordinates, time, wavenumbers, angular frequencies and time step.
+x, y, z, t, kx, ky, kz, om_p, om_c, dt = sp.symbols("x y z t kx ky kz om_p om_c dt")
+lam, mu = sp.symbols("lam mu")
+# Spatial perturbation pattern; only referenced by the commented-out lines below.
+S = sp.sin(kx * x) * sp.sin(ky * y) * sp.sin(kz * z)
+
+#rho = rho0# + drho * S
+#cs = cs0# + dcs * S
+#cp = cp0# + dcp * S
+# Ansatz: only vz and zz are non-trivial; both share the profile Vp.
+Vp = sp.sin(kz * z) * sp.sin(om_p * t)
+vx = 0
+vy = 0
+vz = vz0 + Vp * dvz
+
+xx = 0
+yy = 0
+zz = zz0 + Vp * dzz
+xy = 0
+xz = 0
+yz = 0
+
+
+# Partial derivatives of every field with respect to t, x, y and z.
+vx_t = sp.diff(vx, t)
+vy_t = sp.diff(vy, t)
+vz_t = sp.diff(vz, t)
+
+vx_x = sp.diff(vx, x)
+vy_x = sp.diff(vy, x)
+vz_x = sp.diff(vz, x)
+
+vx_y = sp.diff(vx, y)
+vy_y = sp.diff(vy, y)
+vz_y = sp.diff(vz, y)
+
+vx_z = sp.diff(vx, z)
+vy_z = sp.diff(vy, z)
+vz_z = sp.diff(vz, z)
+
+xx_x = sp.diff(xx, x)
+yy_x = sp.diff(yy, x)
+zz_x = sp.diff(zz, x)
+
+xx_y = sp.diff(xx, y)
+yy_y = sp.diff(yy, y)
+zz_y = sp.diff(zz, y)
+
+xx_z = sp.diff(xx, z)
+yy_z = sp.diff(yy, z)
+zz_z = sp.diff(zz, z)
+
+xx_t = sp.diff(xx, t)
+yy_t = sp.diff(yy, t)
+zz_t = sp.diff(zz, t)
+
+xy_x = sp.diff(xy, x)
+xz_x = sp.diff(xz, x)
+yz_x = sp.diff(yz, x)
+
+xy_y = sp.diff(xy, y)
+xz_y = sp.diff(xz, y)
+yz_y = sp.diff(yz, y)
+
+xy_z = sp.diff(xy, z)
+xz_z = sp.diff(xz, z)
+yz_z = sp.diff(yz, z)
+
+xy_t = sp.diff(xy, t)
+xz_t = sp.diff(xz, t)
+yz_t = sp.diff(yz, t)
+# Velocity residuals: time derivative minus stress divergence divided by rho.
+f_vx = vx_t  - (xx_x + xy_y + xz_z) / rho
+f_vy = vy_t  - (xy_x + yy_y + yz_z) / rho
+f_vz = vz_t  - (xz_x + yz_y + zz_z) / rho
+# Stress residual for zz, built from the velocity divergence, lam and mu.
+div = vx_x + vy_y + vz_z
+f_zz = zz_t - (lam * div + 2 * mu * vz_z)
+print("d_vz[pos] = ", sp.ccode(vz) + ";")
+print("d_zz[pos] = ", sp.ccode(zz) + ";")
+# Forcing increments over one time step dt (only the vz and zz residuals are printed).
+print("d_vz[pos] += ", sp.ccode(dt * f_vz) + ";")
+print("d_zz[pos] += ", sp.ccode(dt * f_zz) + ";")
diff --git a/src/topography/opt_topography.cu b/src/topography/opt_topography.cu
deleted file mode 100644
index a8af372..0000000
--- a/src/topography/opt_topography.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <nvToolsExt.h>
-#include <stdio.h>
-
-#include <topography/topography.h>
-#include <topography/opt_topography.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
-#include <topography/kernels/optimized_velocity.cuh>
-#include <awp/definitions.h>
-
-void topo_init_material_H(topo_t *T)
-{
-        fprintf(stderr, "Not Implemented\n");
-}
-
-
diff --git a/src/topography/readers/CMakeLists.txt b/src/topography/readers/CMakeLists.txt
index ac49345..52464f6 100644
--- a/src/topography/readers/CMakeLists.txt
+++ b/src/topography/readers/CMakeLists.txt
@@ -1,13 +1,13 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/error.h     
-    ${AWP_MINI_SOURCE_DIR}/include/topography/readers/serial_reader.h
+    ${AWP_SOURCE_DIR}/include/awp/error.h     
+    ${AWP_SOURCE_DIR}/include/topography/readers/serial_reader.h
     )
 
 add_library(topography_readers serial_reader.c)
 
 target_include_directories(topography_readers
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
        )
 
 
diff --git a/src/topography/readers/serial_reader.c b/src/topography/readers/serial_reader.c
index 1178193..010c774 100644
--- a/src/topography/readers/serial_reader.c
+++ b/src/topography/readers/serial_reader.c
@@ -5,6 +5,7 @@
 #include <awp/error.h>
 #include <test/test.h>
 #include <topography/readers/serial_reader.h>
+#include <topography/metrics/metrics.h>
 
 
 int topo_read_serial(const char *filename, const int rank, const int px,
@@ -28,9 +29,9 @@ int topo_read_serial(const char *filename, const int rank, const int px,
         assert(count > 0);
         assert(nx * px == gnx);
         assert(ny * py == gny);
-        assert(padding >= ngsl);
+        assert(padding >= metrics_padding);
 
-        if (nx * px != gnx || ny * py != gny || padding < ngsl) {
+        if (nx * px != gnx || ny * py != gny || padding < metrics_padding) {
                 fclose(fh);
                 return ERR_INCONSISTENT_SIZE;
         }
@@ -38,15 +39,15 @@ int topo_read_serial(const char *filename, const int rank, const int px,
         float *data = malloc(sizeof data * gmx * gmy);
         count = fread(data, sizeof data, gmx * gmy, fh);
 
-        int lmx = 4 + nx + 2 * ngsl;
-        int lmy = 4 + ny + 2 * ngsl + 2 * align;
+        int lmx = 4 + nx + 2 * metrics_padding;
+        int lmy = 4 + ny + 2 * metrics_padding + 2 * align;
 
         if (alloc) {
                 *out = malloc(sizeof out * lmx * lmy); 
         }
 
-        for (int i = 0; i < (nx + 2 * ngsl); ++i) {
-        for (int j = 0; j < (ny + 2 * ngsl); ++j) {
+        for (int i = 0; i < (nx + 2 * metrics_padding); ++i) {
+        for (int j = 0; j < (ny + 2 * metrics_padding); ++j) {
                 size_t global_pos =   (ny * coord[1] + j) 
                                     + (nx * coord[0] + i) * gmy;
                 size_t local_pos = 2 + align + j + (2 + i) * lmy;
diff --git a/src/topography/receivers/CMakeLists.txt b/src/topography/receivers/CMakeLists.txt
index 65fcdfa..b03059b 100644
--- a/src/topography/receivers/CMakeLists.txt
+++ b/src/topography/receivers/CMakeLists.txt
@@ -4,7 +4,7 @@ add_library(topography_receivers
 
 target_include_directories(topography_receivers
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 target_link_libraries(topography_receivers topography_sources)
diff --git a/src/topography/receivers/receiver.c b/src/topography/receivers/receiver.c
index eaad187..f5a2b89 100644
--- a/src/topography/receivers/receiver.c
+++ b/src/topography/receivers/receiver.c
@@ -16,8 +16,10 @@ void receiver_init_indexed(recv_t *recv, const input_t *input,
 
 recv_t receiver_init(const char *filename, 
                      const enum grid_types grid_type,
+                     const enum source_type st,
                      const input_t *input,
                      const grids_t *grids, 
+                     const struct mapping *map, 
                      const int ngrids,
                      const f_grid_t *f, 
                      const int rank,
@@ -27,8 +29,8 @@ recv_t receiver_init(const char *filename,
 
         strcpy(recv.filename, filename);
 
-        source_init_common(&recv, filename, grid_type, input, grids, ngrids, f,
-                           rank, comm);
+        source_init_common(&recv, filename, grid_type, input, grids, map, ngrids, f,
+                           rank, comm, st);
 
         if (!recv.use) {
                 return recv;
diff --git a/src/topography/receivers/receivers.c b/src/topography/receivers/receivers.c
index d35ec6f..ad7154d 100644
--- a/src/topography/receivers/receivers.c
+++ b/src/topography/receivers/receivers.c
@@ -3,7 +3,7 @@
 #include <math.h>
 #include <errno.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <test/test.h>
 #include <topography/receivers/receivers.h>
 #include <topography/receivers/receiver.h>
@@ -21,7 +21,7 @@ static recv_t rz;
 
 static input_t input;
 
-void receivers_init(const char *filename, const grids_t *grids, int ngrids,
+void receivers_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
                     const f_grid_t *f,
                   const MPI_Comm comm, const int rank, const int size)
 {
@@ -35,9 +35,9 @@ void receivers_init(const char *filename, const grids_t *grids, int ngrids,
         AWPCHK(input_broadcast(&input, rank, 0, comm));
 
 
-        rx = receiver_init("x", X, &input, grids, ngrids, f, rank, comm);
-        ry = receiver_init("y", Y, &input, grids, ngrids, f, rank, comm);
-        rz = receiver_init("z", Z, &input, grids, ngrids, f, rank, comm);
+        rx = receiver_init("x", X, RECEIVER, &input, grids, map, ngrids, f, rank, comm);
+        ry = receiver_init("y", Y, RECEIVER, &input, grids, map, ngrids, f, rank, comm);
+        rz = receiver_init("z", Z, RECEIVER, &input, grids, map, ngrids, f, rank, comm);
 }
 
 void receivers_finalize(void)
@@ -78,6 +78,53 @@ size_t receivers_last_step(void)
         return last_step;
 }
 
+/*
+ * Return (by value) the receiver attached to grid `grid_type`.
+ *
+ * Only X, Y and Z have receivers: the file-level rx, ry and rz. For each of
+ * the other grid types listed in the switch, a message is written to stderr
+ * and rx is returned as a fallback. A value that matches no case label also
+ * yields rx, without any message.
+ *
+ * grid_type: grid whose receiver is requested.
+ */
+recv_t receivers_get_receiver(enum grid_types grid_type)
+{
+        // Message for a grid that cannot hold a receiver; NULL means none
+        const char *msg = NULL;
+
+        switch (grid_type)
+        {
+                // Grids that do have a receiver
+                case X:
+                        return rx;
+                case Y:
+                        return ry;
+                case Z:
+                        return rz;
+
+                // Grids without a receiver: remember which message to print
+                case XX:   msg = "No receiver can exist on grid XX\n";   break;
+                case YY:   msg = "No receiver can exist on grid YY\n";   break;
+                case ZZ:   msg = "No receiver can exist on grid ZZ\n";   break;
+                case XY:   msg = "No receiver can exist on grid XY\n";   break;
+                case XZ:   msg = "No receiver can exist on grid XZ\n";   break;
+                case YZ:   msg = "No receiver can exist on grid YZ\n";   break;
+                case SX:   msg = "No receiver can exist on grid SX\n";   break;
+                case SY:   msg = "No receiver can exist on grid SY\n";   break;
+                case SZ:   msg = "No receiver can exist on grid SZ\n";   break;
+                case NODE: msg = "No receiver can exist on grid NODE\n"; break;
+        }
+
+        if (msg != NULL)
+        {
+                fputs(msg, stderr);
+        }
+
+        // Fallback shared by every rejected grid type
+        return rx;
+}
+
 void receivers_step_format(char *out, size_t step, const char *base)
 {
         sprintf(out, "%s_%0*ld", base, leading_zeros, step);
diff --git a/src/topography/receivers/sgt.c b/src/topography/receivers/sgt.c
index c92b876..9c0238a 100644
--- a/src/topography/receivers/sgt.c
+++ b/src/topography/receivers/sgt.c
@@ -3,7 +3,7 @@
 #include <math.h>
 #include <errno.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <test/test.h>
 #include <topography/receivers/receiver.h>
 #include <topography/receivers/sgt.h>
@@ -26,7 +26,7 @@ static input_t input;
 static size_t last_step = 0;
 static int leading_zeros;
 
-void sgt_init(const char *filename, const grids_t *grids, int ngrids,
+void sgt_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
                     const f_grid_t *f,
                   const MPI_Comm comm, const int rank, const int size)
 {
@@ -40,12 +40,12 @@ void sgt_init(const char *filename, const grids_t *grids, int ngrids,
         AWPCHK(input_broadcast(&input, rank, 0, comm));
 
 
-        Gxx = receiver_init("Gxx", XX, &input, grids, ngrids, f, rank, comm);
-        Gyy = receiver_init("Gyy", YY, &input, grids, ngrids, f, rank, comm);
-        Gzz = receiver_init("Gzz", ZZ, &input, grids, ngrids, f, rank, comm);
-        Gxy = receiver_init("Gxy", XY, &input, grids, ngrids, f, rank, comm);
-        Gxz = receiver_init("Gxz", XZ, &input, grids, ngrids, f, rank, comm);
-        Gyz = receiver_init("Gyz", YZ, &input, grids, ngrids, f, rank, comm);
+        Gxx = receiver_init("Gxx", XX, SGT, &input, grids, map, ngrids, f, rank, comm);
+        Gyy = receiver_init("Gyy", YY, SGT, &input, grids, map, ngrids, f, rank, comm);
+        Gzz = receiver_init("Gzz", ZZ, SGT, &input, grids, map, ngrids, f, rank, comm);
+        Gxy = receiver_init("Gxy", XY, SGT, &input, grids, map, ngrids, f, rank, comm);
+        Gxz = receiver_init("Gxz", XZ, SGT, &input, grids, map, ngrids, f, rank, comm);
+        Gyz = receiver_init("Gyz", YZ, SGT, &input, grids, map, ngrids, f, rank, comm);
         
         // Configure material input file so that it outputs without buffering
         input_t material_input = input;
@@ -53,7 +53,7 @@ void sgt_init(const char *filename, const grids_t *grids, int ngrids,
         material_input.cpu_buffer_size = 1;
         material_input.steps = 1;
         material_input.num_writes = 1;
-        mat = receiver_init("", NODE, &material_input, grids, ngrids, f, rank,
+        mat = receiver_init("", NODE, RECEIVER, &material_input, grids, map, ngrids, f, rank,
                            comm);
 }
 
@@ -70,14 +70,14 @@ void sgt_finalize(void)
 }
 
 void sgt_write_material_properties(const prec *d_d1, const prec *d_lami,
-                                   const prec *d_mui, const int grid_num) {
+                                   const prec *d_mui, const int grid_num, const int rank) {
         if (!use) return;
-        printf("Writing material properties\n");
+        if (rank == 0) printf("Writing material properties\n");
         int len = strlen(mat.filename) + 4; 
         char *filename;
         filename = malloc(sizeof filename * len);
         sprintf(filename, "%sd1", mat.filename);
-        receiver_write(&mat, 0, filename, d_lami, grid_num);
+        receiver_write(&mat, 0, filename, d_d1, grid_num);
         sprintf(filename, "%slami", mat.filename);
         receiver_write(&mat, 0, filename, d_lami, grid_num);
         sprintf(filename, "%smui", mat.filename);
diff --git a/src/topography/sources/CMakeLists.txt b/src/topography/sources/CMakeLists.txt
index 07bbedc..29e992d 100644
--- a/src/topography/sources/CMakeLists.txt
+++ b/src/topography/sources/CMakeLists.txt
@@ -4,7 +4,7 @@ add_library(topography_sources
 
 target_include_directories(topography_sources
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
-target_link_libraries(topography_sources readers buffers mpi metrics)
+target_link_libraries(topography_sources readers buffers mpi metrics mapping)
diff --git a/src/topography/sources/forces.c b/src/topography/sources/forces.c
index 4e172a2..0c3b05b 100644
--- a/src/topography/sources/forces.c
+++ b/src/topography/sources/forces.c
@@ -8,7 +8,10 @@
 #include <test/test.h>
 #include <topography/sources/source.h>
 #include <topography/sources/forces.h>
+#include <topography/mapping.h>
+#include <grid/shift.h>
 #include <readers/input.h>
+#include "interpolation/interpolation.h"
 
 static int use;
 
@@ -17,13 +20,41 @@ static source_t Fx;
 static source_t Fy;
 static source_t Fz;
 
+// Density at force location
+static float *d_rho_interp_x, *d_rho_interp_y, *d_rho_interp_z;
+
 static input_t input;
 
 static int myrank;
 
-void forces_init(const char *filename, const grids_t *grids, int ngrids,
-                  const f_grid_t *f, const MPI_Comm comm, const int rank,
-                  const int size) 
+void interpolate_density(float **d_rho_interp, const float *d_rho,
+                         const source_t *F, const grids_t *grids, const int degree);
+void interpolate_density(float **d_rho_interp, const float *d_rho,
+                         const source_t *F, const grids_t *grids, const int degree) {
+        // Purpose: cudaMalloc *d_rho_interp and fill it from d_rho via cuinterp; no-op if F is unused
+        if (!F->use) return;
+        cu_interp_t d_interp;
+       // Interpolate density to the force locations of grid 0 only (every F access uses index [0])
+       size_t num_bytes = sizeof(float) * F->lengths[0]; // one float per grid-0 entry of F
+       cudaMalloc((void**)d_rho_interp, num_bytes); // NOTE(review): return code is not checked
+       // Grid assembled from the fields of grids->z; it is handed to cuinterp_init below
+       grid3_t grid = grid_init_full_grid(grids->z.inner_size,
+                                          grid_node(), grids->z.coordinate, grids->z.boundary1,
+                                          grids->z.boundary2, grids->z.gridspacing);
+        grid_data_t xyz;
+        grid_data_init(&xyz, grid, 0); // NOTE(review): xyz is never released here - confirm whether it owns memory
+        AWPCHK(cuinterp_init(&d_interp, xyz.x, xyz.y, xyz.z,
+                                     grid, F->x[0], F->y[0], F->z[0],
+                                     F->global_indices[0],
+                                     F->lengths[0], degree));
+        cuinterp_interp_H(&d_interp, *d_rho_interp, d_rho); // presumably writes d_rho interpolated into *d_rho_interp - verify
+        cuinterp_finalize(&d_interp);
+        // *d_rho_interp stays allocated on return; forces_finalize() calls cudaFree on the file-level buffers
+}
+
+void forces_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
+                  const f_grid_t *f, const g_grid_t *g, const MPI_Comm comm, const int rank,
+                  const int size, const float *rho, const int istopo) 
 {
         use = strcmp(filename, "") != 0 ? 1 : 0;
 
@@ -31,19 +62,23 @@ void forces_init(const char *filename, const grids_t *grids, int ngrids,
 
         if (!use) return;
 
-        // FIXME: Add support for multiple grids
-
        if (rank == 0) { 
                AWPCHK(input_init(&input, filename));
        }
        AWPCHK(input_broadcast(&input, rank, 0, comm));
 
 
-       Fx = source_init("fx", X, &input, grids, ngrids, f, rank, comm);
-       Fy = source_init("fy", Y, &input, grids, ngrids, f, rank, comm);
-       Fz = source_init("fz", Z, &input, grids, ngrids, f, rank, comm);
+       Fx = source_init("fx", SX, &input, grids, map, ngrids, f, rank, comm, FORCE);
+       Fy = source_init("fy", SY, &input, grids, map, ngrids, f, rank, comm, FORCE);
+       Fz = source_init("fz", SZ, &input, grids, map, ngrids, f, rank, comm, FORCE);
 
-       AWPCHK(forces_boundary_check(&Fx));
+       if (Fx.use) AWPCHK(forces_boundary_check(&Fx));
+       if (Fy.use) AWPCHK(forces_boundary_check(&Fy));
+       if (Fz.use) AWPCHK(forces_boundary_check(&Fz));
+
+       interpolate_density(&d_rho_interp_x, rho, &Fx, grids, input.degree);
+       interpolate_density(&d_rho_interp_y, rho, &Fy, grids, input.degree);
+       interpolate_density(&d_rho_interp_z, rho, &Fz, grids, input.degree);
 
 }
 
@@ -74,6 +109,7 @@ void forces_add(prec *d_u1, prec *d_v1, prec *d_w1, const prec *d_d1, const size
                 const prec h, const prec dt, const f_grid_t *f,
                 const g_grid_t *g, const int grid_num) 
 {
+        
         if (!use) return;
 
         int nx = f->size[0];
@@ -88,12 +124,34 @@ void forces_add(prec *d_u1, prec *d_v1, prec *d_w1, const prec *d_d1, const size
         prec q = 3.55599789310935;
         prec qh = 2.9022824945274315;
 
-        source_add_force(d_u1, d_d1, &Fx, step, h, dt, qh, f->d_f_1, nx, ny, nz,
-                         g->d_g3_c, grid_num);
-        source_add_force(d_v1, d_d1, &Fy, step, h, dt, qh, f->d_f_2, nx, ny, nz,
-                         g->d_g3_c, grid_num);
-        source_add_force(d_w1, d_d1, &Fz, step, h, dt, q, f->d_f_c, nx, ny, nz,
-                         g->d_g3, grid_num);
+        source_add_force(d_u1, d_rho_interp_x, &Fx, step, h, dt, qh, f->d_f_1, nx, ny, nz,
+                         g->d_g3_c, grid_num, 0, 1);
+        source_add_force(d_v1, d_rho_interp_y, &Fy, step, h, dt, qh, f->d_f_2, nx, ny, nz,
+                         g->d_g3_c, grid_num, 0, 2);
+        source_add_force(d_w1, d_rho_interp_z, &Fz, step, h, dt, q, f->d_f_c, nx, ny, nz,
+                         g->d_g3, grid_num, 0, 3);
+}
+
+void forces_add_cartesian(prec *d_xz, prec *d_yz, prec *d_zz, const size_t step,
+                const int nx, const int ny, const int nz, const prec h, const prec dt, const int grid_num)
+{
+        // No-op unless forces are in use; else one source_add_force call per component (trailing args: 1, then 1/2/3)
+        if (use) {
+                source_add_force(d_xz, d_rho_interp_x, &Fx, step, h, dt, 1.0, NULL, nx, ny, nz, NULL, grid_num, 1, 1);
+                source_add_force(d_yz, d_rho_interp_y, &Fy, step, h, dt, 1.0, NULL, nx, ny, nz, NULL, grid_num, 1, 2);
+                source_add_force(d_zz, d_rho_interp_z, &Fz, step, h, dt, 1.0, NULL, nx, ny, nz, NULL, grid_num, 1, 3);
+        }
+}
+
+void forces_add_cartesian_velocity(prec *d_vx, prec *d_vy, prec *d_vz, const size_t step,
+                const int nx, const int ny, const int nz, const prec h, const prec dt, const int grid_num)
+{
+        // No-op unless forces are in use; else one source_add_force call per component (trailing args: 2, then 1/2/3)
+        if (use) {
+                source_add_force(d_vx, d_rho_interp_x, &Fx, step, h, dt, 1.0, NULL, nx, ny, nz, NULL, grid_num, 2, 1);
+                source_add_force(d_vy, d_rho_interp_y, &Fy, step, h, dt, 1.0, NULL, nx, ny, nz, NULL, grid_num, 2, 2);
+                source_add_force(d_vz, d_rho_interp_z, &Fz, step, h, dt, 1.0, NULL, nx, ny, nz, NULL, grid_num, 2, 3);
+        }
+}
 
 void forces_finalize(void)
@@ -103,5 +161,12 @@ void forces_finalize(void)
         source_finalize(&Fx);
         source_finalize(&Fy);
         source_finalize(&Fz);
+
+        if (d_rho_interp_x != NULL) cudaFree(d_rho_interp_x);
+        if (d_rho_interp_y != NULL) cudaFree(d_rho_interp_y);
+        if (d_rho_interp_z != NULL) cudaFree(d_rho_interp_z);
 }
 
+
+
+
diff --git a/src/topography/sources/source.c b/src/topography/sources/source.c
index f431f8a..c2ebe3c 100644
--- a/src/topography/sources/source.c
+++ b/src/topography/sources/source.c
@@ -13,130 +13,134 @@
 #include <interpolation/interpolation.h>
 #include <interpolation/interpolation.cuh>
 #include <topography/sources/source.h>
+#include <topography/mapping.h>
 #include <topography/sources/source.cuh>
 #include <topography/grids.h>
 
-#define OVERLAP 7.0
+//#define DEBUG_SOURCE
 
 void source_init_indexed(source_t *src, const input_t *input, size_t num_reads);
 
-source_t source_init(const char *file_end, 
-                     const enum grid_types grid_type, 
+source_t source_init(const char *file_end,
+                     const enum grid_types grid_type,
                      const input_t *input,
-                     const grids_t *grids, 
+                     const grids_t *grids,
+                     const struct mapping *map,
                      const int ngrids,
-                     const f_grid_t *f, 
+                     const f_grid_t *f,
                      const int rank,
-                     const MPI_Comm comm)
+                     const MPI_Comm comm,
+                     const enum source_type st)
 {
         source_t src;
 
-        source_init_common(&src, file_end, grid_type, input, grids, ngrids, f,
-                           rank, comm);
+        source_init_common(&src, file_end, grid_type, input, grids, map, ngrids, f,
+                           rank, comm, st);
 
-        if (!src.use) {
+        if (!src.use)
+        {
                 return src;
         }
 
-
         size_t num_reads =
             input->steps / (input->cpu_buffer_size * input->gpu_buffer_size);
         source_init_indexed(&src, input, num_reads);
         src.io = mpi_io_idx_init(src.comm, rank, src.offsets, src.blocklen,
-                                     src.length, num_reads);
+                                 src.length, num_reads);
 
         return src;
 }
 
-
 void source_finalize(source_t *src)
 {
-        if (!src->use) return;
+        if (!src->use)
+                return;
         free(src->indices);
         buffer_finalize(&src->buffer);
         free(src->blocklen);
         free(src->offsets);
         free(src->host_buffer_extra);
-        for (int i = 0; i < src->ngrids; ++i) {
-                if (src->x[i] != NULL) free(src->x[i]);
-                if (src->y[i] != NULL) free(src->y[i]);
-                if (src->z[i] != NULL) free(src->z[i]);
-                if (src->xu[i] != NULL) free(src->xu[i]);
-                if (src->yu[i] != NULL) free(src->yu[i]);
-                if (src->zu[i] != NULL) free(src->zu[i]);
-                if (src->type[i] != NULL) free(src->type[i]);
+        for (int i = 0; i < src->ngrids; ++i)
+        {
+                if (src->x[i] != NULL)
+                        free(src->x[i]);
+                if (src->y[i] != NULL)
+                        free(src->y[i]);
+                if (src->z[i] != NULL)
+                        free(src->z[i]);
+                if (src->xu[i] != NULL)
+                        free(src->xu[i]);
+                if (src->yu[i] != NULL)
+                        free(src->yu[i]);
+                if (src->zu[i] != NULL)
+                        free(src->zu[i]);
+                if (src->type[i] != NULL)
+                        free(src->type[i]);
         }
 }
 
-void source_find_grid_number(const input_t *input, const
-                             grids_t *grids, int *grid_number, 
+void source_find_grid_number(const input_t *input, const grids_t *grids, int *grid_number,
                              const int *indices,
                              const int length,
-                             const int num_grids)
+                             const int num_grids,
+                             const int is_topo)
 {
-
-
-        for (int j = 0; j < length; ++j) {
-                grid_number[j] = -1;
+        int *nz = malloc(sizeof *nz * num_grids); // one int per grid (element size, not pointer size)
+        // Mark every entry as unassigned exactly once, up front (also when num_grids == 0)
+        for (int j = 0; j < length; ++j) grid_number[j] = -1;
+        for (int i = 0; i < num_grids; ++i) {
+                nz[i] = grids[i].z.size.z; // z-size of block i, passed to global_to_local below
         }
 
-
-        _prec lower = 0;
-        _prec upper = 0;
-	_prec overlap = 0;
         for (int i = 0; i < num_grids; ++i) {
-		prec *z1 = malloc(sizeof z1 * grids[i].z.size.z);
-		grid1_t z_grid = grid_grid1_z(grids[i].z);
-                grid_fill1(z1, z_grid);
-
-                upper  = lower;
-                lower  = lower - z1[z_grid.end];
-
-                for (int j = 0; j < length; ++j) {
-                        _prec z = input->z[indices[j]];
-                        // Take into account that topography can yield positive
-                        // z-values
-                        if (input->type[indices[j]] == INPUT_SURFACE_COORD) {
-                                grid_number[j] = 0;
-                                continue;
-                        }
-                        else if (z > 0) {
-                                grid_number[j] = 0;
-                        }
-                        else if (z > lower && z <= upper - overlap) {
-                                grid_number[j] = i;
-                        }
-                        }
+                float *z1 = malloc(sizeof z1 * nz[i]);
+                grid1_t z_grid = grid_grid1_z(grids[i].z);
+                grid_fill1(z1, z_grid, 0);
+                _prec h = z_grid.gridspacing;
+                _prec zloc = 0.0;
+                _prec hw = 0.5 * (input->degree + 1) * h; 
+                for (int j = 0; j < length; ++j)
+                {
+                    // Skip assignment if this source/recv has already been assigned a grid number
+                    if (grid_number[j] != -1) continue;
+
+                    // Surface coordinates map to the top block (block 0)
+                    if (input->type[indices[j]] == INPUT_SURFACE_COORD) {
+                        grid_number[j] = 0; 
+                        continue;
+                    }
+                    _prec z = input->z[indices[j]];
+                    global_to_local(&zloc, &grid_number[j], z - hw, h, nz, num_grids, is_topo); 
 
-                if (i + 1 != num_grids) {
-                       	overlap = z_grid.gridspacing * OVERLAP;
-                } else {
-                        overlap = 0.0f;
                 }
-		lower = lower + overlap;
-		free(z1);
-	}
 
+                free(z1);
+
+        }
+
+        free(nz);
 
-        for (int j = 0; j < length; ++j) {
-                if (grid_number[j] == -1) {
-                        fprintf(stderr, 
-                                "Failed to assign source/receiver id=%d "\
-                                " to a grid.\n", j);
+        for (int j = 0; j < length; ++j)
+        {
+                if (grid_number[j] == -1)
+                {
+                        fprintf(stderr,
+                                "Failed to assign source/receiver id=%d "
+                                " to a grid, z=%f.\n",
+                                j, input->z[indices[j]]);
                         exit(1);
                 }
-
         }
-
 }
 
 void source_init_common(source_t *src, const char *filename,
                         const enum grid_types grid_type,
-                        const input_t *input, 
-                        const grids_t *grids, 
+                        const input_t *input,
+                        const grids_t *grids,
+                        const struct mapping *map,
                         const int ngrids,
-                        const f_grid_t *f, 
-                        const int rank, const MPI_Comm comm)
+                        const f_grid_t *f,
+                        const int rank, const MPI_Comm comm, const enum source_type st)
 {
         sprintf(src->filename, "%s_%s", input->file, filename);
 
@@ -144,98 +148,92 @@ void source_init_common(source_t *src, const char *filename,
         _prec *y = malloc(sizeof y * input->length);
         _prec *z = malloc(sizeof z * input->length);
 
+        int is_topo = f == NULL ? 0 : 1;
+
         {
                 int *grid_number = malloc(sizeof grid_number * input->length);
                 int *indices = malloc(sizeof indices * input->length);
 
-                for (size_t i = 0; i < input->length; ++i) {
+                for (size_t i = 0; i < input->length; ++i)
+                {
                         indices[i] = i;
                 }
 
                 source_find_grid_number(input, grids, grid_number, indices,
-                                        input->length, ngrids);
-
-                // Determine offsets for the DM
-                _prec *dm_offset_x = malloc(sizeof dm_offset_x * ngrids);
-                _prec *dm_offset_y = malloc(sizeof dm_offset_y * ngrids);
-                _prec *dm_offset_z = malloc(sizeof dm_offset_z * ngrids);
-                grid3_t grid_top = grids_select(grid_type, &grids[0]);
-                dm_offset_x[0] = 0;
-                dm_offset_y[0] = 0;
-		dm_offset_z[0] = 0;
-                for (int j = 1; j < ngrids; ++j) {
-                        grid3_t grid_pre = grids_select(grid_type, &grids[j-1]);
-			grid3_t grid_cur = grids_select(grid_type, &grids[j]);
-                        dm_offset_x[j] = dm_offset_x[j - 1] +
-                                         SOURCE_DM_OFFSET_X * grid_pre.gridspacing-
-					 (grid_cur.shift.x * 0.5 * grid_cur.gridspacing
-						-grid_pre.shift.x * 0.5 * grid_pre.gridspacing);
-                        dm_offset_y[j] = dm_offset_y[j - 1] +
-                                         SOURCE_DM_OFFSET_Y * grid_pre.gridspacing-
-                                         (grid_cur.shift.y * 0.5 * grid_cur.gridspacing
-                                                -grid_pre.shift.y * 0.5 * grid_pre.gridspacing);
-			dm_offset_z[j] = grid_top.shift.z * 0.5 * grid_top.gridspacing - 
-					 (grid_cur.shift.z * 0.5 * grid_cur.gridspacing);
-                }
+                                        input->length, ngrids, is_topo);
 
-                for (size_t i = 0; i < input->length; ++i) {
-                     // Shift by 0.5 such that x = 0, y = 0 is
-                     // located at a material or topography grid
-                     // point.
-                     x[i] = input->x[i] +
-                            SOURCE_OFFSET_X * grid_top.gridspacing;
-                     y[i] = input->y[i];
-		     z[i] = input->z[i];
-
-                     int grid_num = grid_number[i];
-
-                     // Apply DM-specific shift for all other blocks
-                     x[i] = x[i] + dm_offset_x[grid_num];
-                     y[i] = y[i] + dm_offset_y[grid_num];
-		     z[i] = z[i] + dm_offset_z[grid_num];
+                for (size_t i = 0; i < input->length; ++i)
+                {
+                        x[i] = input->x[i];
+                        y[i] = input->y[i];
+                        z[i] = input->z[i];
                 }
 
-                free(dm_offset_x);
-                free(dm_offset_y);
-                free(dm_offset_z);
                 free(indices);
-                free(grid_number);
-        }
-
 
-        grid3_t grid = grids_select(grid_type, &grids[0]);
+                src->length = 0;
+                size_t *src_count = malloc(sizeof src_count * ngrids);
 
 
-        AWPCHK(dist_indices(&src->indices, &src->length, x,
-                            y, input->length, grid));
 
+                for (int j = 0; j < ngrids; ++j)
+                {
+                        size_t num_sources_in_block = 0;
+                        grid3_t grid = grids_select(grid_type, &grids[j]);
+                        
+                        AWPCHK(dist_indices(&src->indices, &num_sources_in_block, x, y,
+                                            input->length, grid, grid_number, j,
+                                            st, DIST_COUNT));
+                        src_count[j] = src->length;
+                        src->length += num_sources_in_block;
+                }
 
+                src->indices = malloc(sizeof(src->indices) * src->length);
+                for (int j = 0; j < ngrids; ++j)
+                {
+                        grid3_t grid = grids_select(grid_type, &grids[j]);
+                        AWPCHK(dist_indices(&src->indices, &src_count[j], x, y,
+                                            input->length, grid, grid_number, j,
+                                            st, DIST_INSERT_INDICES));
+                }
+                free(grid_number);
+                free(src_count);
+        }
 
         src->ngrids = ngrids;
         src->use = src->length > 0 ? 1 : 0;
+        src->steps = input->steps;
 
         MPI_Comm_split(comm, src->use, rank, &src->comm);
 
-        if (!src->use) {
+        if (!src->use)
+        {
                 return;
         }
 
-        for (int j = 0; j < ngrids; ++j) {
+        for (int j = 0; j < ngrids; ++j)
+        {
                 src->lengths[j] = 0;
         }
 
+        // identify grid number for each local source
         int *grid_number = malloc(sizeof grid_number * src->length);
         source_find_grid_number(input, grids, grid_number, src->indices,
-                                src->length, ngrids);
+                                src->length, ngrids, is_topo);
 
-        for (size_t i = 0; i < src->length; ++i) {
-                for (int j = 0; j < ngrids; ++j) {
-                        if (grid_number[i] == j) src->lengths[j] += 1;
+        // count number of local sources for each grid
+        for (size_t i = 0; i < src->length; ++i)
+        {
+                for (int j = 0; j < ngrids; ++j)
+                {
+                        if (grid_number[i] == j)
+                                src->lengths[j] += 1;
                 }
         }
 
         // Init arrays that contains local coordinates
-        for (int j = 0; j < ngrids; ++j) {
+        for (int j = 0; j < ngrids; ++j)
+        {
                 src->global_indices[j] =
                     calloc(sizeof src->global_indices[j], src->lengths[j]);
                 src->x[j] = malloc(sizeof src->x * src->lengths[j]);
@@ -244,13 +242,17 @@ void source_init_common(source_t *src, const char *filename,
                 src->xu[j] = malloc(sizeof src->x * src->lengths[j]);
                 src->yu[j] = malloc(sizeof src->y * src->lengths[j]);
                 src->zu[j] = malloc(sizeof src->z * src->lengths[j]);
-                src->type[j] = malloc(sizeof src->type *  src->lengths[j]);
+                src->type[j] = malloc(sizeof src->type * src->lengths[j]);
         }
 
-        for (int j = 0; j < ngrids; ++j) {
+        // copy global source data to local source data
+        for (int j = 0; j < ngrids; ++j)
+        {
                 int local_idx = 0;
-                for (size_t i = 0; i < src->length; ++i) {
-                        if (grid_number[i] != j) continue;
+                for (size_t i = 0; i < src->length; ++i)
+                {
+                        if (grid_number[i] != j)
+                                continue;
                         src->global_indices[j][local_idx] = i;
                         src->x[j][local_idx] = x[src->indices[i]];
                         src->y[j][local_idx] = y[src->indices[i]];
@@ -263,33 +265,39 @@ void source_init_common(source_t *src, const char *filename,
                 }
         }
 
-        _prec overlap = 0.0;
-        _prec lower = 0.0;
-        _prec block_height = 0.0;
-        for (int j = 0; j < ngrids; ++j) {
-                grid = grids_select(grid_type, &grids[j]);
-        
-                grid3_t metric_grid = grid_init_metric_grid( grid.inner_size,
-                                grid_node(), grid.coordinate, grid.boundary1,
-                                grid.boundary2, grid.gridspacing);
-
-                if (f!= NULL && j == 0) {
+        double overlap = 0.0;
+        double lower = 0.0;
+        double block_height = 0.0;
+        for (int j = 0; j < ngrids; ++j)
+        {
+                grid3_t grid = grids_select(grid_type, &grids[j]);
+
+                grid3_t metric_grid = grid_init_metric_grid(grid.inner_size,
+                                                            grid_node(), grid.coordinate, grid.boundary1,
+                                                            grid.boundary2, grid.gridspacing);
+
+                if (f != NULL && j == 0)
+                {
                         block_height = grid.gridspacing * (grid.size.z - 2);
                 }
-                else {
+                else
+                {
                         block_height = grid.gridspacing * (grid.size.z - 1);
                 }
 
-                lower  = lower - block_height + overlap;
+                lower = lower - block_height + overlap;
+
 
-                if (src->lengths[j] == 0) {
+                if (src->lengths[j] == 0)
+                {
                         src->x[j] = NULL;
                         src->y[j] = NULL;
                         src->z[j] = NULL;
                         src->type[j] = NULL;
                 }
 
-                if (src->lengths[j] != 0 && f != NULL && j == 0) {
+                if (src->lengths[j] != 0 && f != NULL && j == 0)
+                {
                         // x, y, z grid vectors compatible with topography grid
                         grid1_t x_grid = grid_grid1_x(metric_grid);
                         grid1_t y_grid = grid_grid1_y(metric_grid);
@@ -299,9 +307,10 @@ void source_init_common(source_t *src, const char *filename,
                         prec *y1 = malloc(sizeof y1 * y_grid.size);
                         prec *z1 = malloc(sizeof z1 * z_grid.size);
 
-                        grid_fill1(x1, x_grid);
-                        grid_fill1(y1, y_grid);
-                        grid_fill1(z1, z_grid);
+                        grid_fill1(x1, x_grid, 1);
+                        grid_fill1(y1, y_grid, 0);
+                        grid_fill1(z1, z_grid, 0);
+
 
                         // Interpolate topography data to source location in
                         // (x,y) space
@@ -312,120 +321,109 @@ void source_init_common(source_t *src, const char *filename,
                                                     metric_grid, src->x[j], src->y[j],
                                                     src->lengths[j], input->degree);
 
-
-                        for (size_t k = 0; k < src->lengths[j]; ++k) {
-                                switch (src->type[j][k]) {
-                                        // Map to parameter space
-                                        case INPUT_VOLUME_COORD:
-                                                src->z[j][k] =
-                                                    (block_height + src->z[j][k]) /
-                                                    f_interp[k];
-                                                break;
-                                        case INPUT_SURFACE_COORD:
-                                                src->z[j][k] = z1[z_grid.size - 2];
-                                                break;
-                                                // FIXME: INPUT_BATHYMETRY_COORD
-                                                // Implement treatment for ocean
-                                                // bathymetry.
-                                                // Recommendation: Add a
-                                                // function to "receivers.c" and
-                                                // a function to to "receiver.c"
-                                                // Place the implementation in
-                                                // "receiver.c" but call this
-                                                // function for each receiver
-                                                // component in "receivers.c"
+                        _prec h = grid.gridspacing;
+                        _prec hw = 0.5 * (input->degree + 1) * h; 
+                        for (size_t k = 0; k < src->lengths[j]; ++k)
+                        {
+                                switch (src->type[j][k])
+                                {
+                                // Map to parameter space
+                                case INPUT_VOLUME_COORD:
+                                    if (block_height + (src->z[j][k] - hw) <
+                                        overlap * h && grid_number[j] == 0) {
+                                        fprintf(stderr,
+                                                "Source/Receiver cannot exist "
+                                                "at the first two grid points on the "
+                                                "fine grid, id = %zu \n", k); // k is size_t, so %zu (C99) is the matching conversion
+                                        fprintf(stderr, "z = %g \n", src->z[j][k]);
+                                        fprintf(stderr, "This is a bug, please report it.\n");
+                                        exit(-1);
+                                    } else {
+                                        // Source / receiver is in the top part
+                                        // of the block that experiences the
+                                        // curvilinear grid transform
+
+                                        double h = grid.gridspacing;
+                                        double H = block_height - h * OVERLAP;
+                                        double Hf = f_interp[k] * H;
+                                        double x = (H + src->z[j][k]) / Hf;
+                                        double r = H * map_invert(x, map, MAPPING_INVERSION_TOL, MAPPING_MAX_ITER) + OVERLAP * h;
+                                        src->z[j][k] = r;
+                                    }
+                                        break;
+                                case INPUT_SURFACE_COORD:
+                                        src->z[j][k] = z1[z_grid.size - 2];
+                                        break;
                                 }
                         }
 
-                        // TODO: Add inversion step if grid stretching function
-                        // is used
 
                         free(f_interp);
                         free(x1);
                         free(y1);
                         free(z1);
-                } 
+                }
                 // Regular AWP
-                else {
-                      for (size_t k = 0; k < src->lengths[j]; ++k) {
-                              switch (src->type[j][k]) {
-                                      case INPUT_VOLUME_COORD:
-                                              src->z[j][k] = (src->z[j][k] - lower);
-                                              break;
-                                      // Map to parameter space
-                                      case INPUT_SURFACE_COORD:
-                                              // Only coordinates in the top
-                                              // block can be surface
-                                              // coordinates
-                                              assert(j == 0);
-                                              src->z[j][k] = block_height;
-                                              break;
-                              }
-                      }
+                else
+                {
+                        for (size_t k = 0; k < src->lengths[j]; ++k)
+                        {
+                                switch (src->type[j][k])
+                                {
+                                case INPUT_VOLUME_COORD:
+                                        src->z[j][k] = (src->z[j][k] - lower);
+                                        break;
+                                // Map to parameter space
+                                case INPUT_SURFACE_COORD:
+                                        // Only coordinates in the top
+                                        // block can be surface
+                                        // coordinates
+                                        assert(j == 0);
+                                        // Subtract h so the source location appears in the
+                                        // interior of the grid. This hack prevents the stencil from
+                                        // becoming one-sided.  The index gets correctly adjusted by
+                                        // changing the interpolation index below
+                                        src->z[j][k] = block_height - 1 * grid.gridspacing;
+                                        break;
+                                }
+                        }
                 }
 
                 overlap = grid.gridspacing * OVERLAP;
 
-                if (src->lengths[j] == 0) continue;
+                if (src->lengths[j] == 0)
+                        continue;
 
                 // Init grid that covers interior and halo regions
-                grid3_t full_grid = grid_init_metric_grid(
-                            grid.inner_size, grid.shift, grid.coordinate,
-                            grid.boundary1, grid.boundary2, grid.gridspacing);
+                grid3_t full_grid = grid_init_full_grid(
+                    grid.inner_size, grid.shift, grid.coordinate,
+                    grid.boundary1, grid.boundary2, grid.gridspacing);
                 grid_data_t xyz;
-                grid_data_init(&xyz, full_grid);
+                grid_data_init(&xyz, grid, j);
 
                 // Compute interpolation coefficients on the full grid
                 AWPCHK(cuinterp_init(&src->interpolation[j], xyz.x, xyz.y, xyz.z,
-                                        full_grid, src->x[j], src->y[j], src->z[j],
-                                        src->global_indices[j],
-                                        src->lengths[j], input->degree));
-
-
-		//Special treatment for sources located in the overlap zone
-		if(ngrids > 1)
-		{
- 			for (size_t k = 0; k < src->lengths[j]; ++k)
-                	{
-				//top block
-				if (j == 0)
-				{
-					//bottom two grids will not be used
-					if(src->interpolation[j].iz[k] < 2)
-					{
-					src->interpolation[j].iz[k]=2;
-					}
-				}
-				//blocks in between 
-				else if(j > 0 && j != ngrids-1)
-				{
-					//bottom two grids will not be used
-                                	if(src->interpolation[j].iz[k] < 2)
-                                	{
-                                	src->interpolation[j].iz[k]=2;
-                                	}
-					else if(src->interpolation[j].iz[k] > grid.size.z-3)
-					{
-					//the top two grids in the coarse grid will not be used
-					src->interpolation[j].iz[k]=grid.size.z-3;
-					}			
-				}
-				//bottom block
-				else if(j > 0 && j == ngrids - 1)
-				{
-                                        if(src->interpolation[j].iz[k] > grid.size.z-3)
-                                        {
-					//the top two grids in the coarse grid will not be used
-					src->interpolation[j].iz[k]=grid.size.z-3;
-					}
-				}
-			}//k loop
-		}
-		
-
-//------------------------------------------------------------------------------
-//Added by Te-Yang for printing purpose
-                grid3_t vel_grid = grid_init_stress_grid(
+                                     full_grid, src->x[j], src->y[j], src->z[j],
+                                     src->global_indices[j],
+                                     src->lengths[j], input->degree));
+
+                // Correct interpolation coefficients when the receivers appear on the free surface
+                if (f == NULL && j == 0) {
+                        for (size_t k = 0; k < src->lengths[j]; ++k) {
+                                switch (src->type[j][k]) {
+                                        case INPUT_SURFACE_COORD:
+                                                src->interpolation[j].iz[k] +=
+                                                    1;
+                                }
+                        }
+                }
+
+
+                cuinterp_htod(&src->interpolation[j]);
+
+#ifdef DEBUG_SOURCE
+                {
+                        grid3_t vel_grid = grid_init_stress_grid(
                             grid.inner_size, grid.shift, grid.coordinate,
                             grid.boundary1, grid.boundary2, grid.gridspacing);
                         grid1_t x_grid = grid_grid1_x(vel_grid);
@@ -436,59 +434,91 @@ void source_init_common(source_t *src, const char *filename,
                         prec *y1 = malloc(sizeof y1 * y_grid.size);
                         prec *z1 = malloc(sizeof z1 * z_grid.size);
 
-                        grid_fill1(x1, x_grid);
-                        grid_fill1(y1, y_grid);
-                        grid_fill1(z1, z_grid);
-
-
-                printf("rank = %d, shift = %d %d %d id = %d origin = %f %f %f h = %f\n",
-                                rank, grid.shift.x, grid.shift.y, grid.shift.z,
-                                j,
-                                x1[ngsl/2], y1[ngsl/2], z1[0],	
-                                grid.gridspacing);
-
-		for (size_t k = 0; k < src->lengths[j]; ++k)
-		{
-		printf("query int x y z = %f %f %f | nearest x y z = %f %f %f | index = %d %d %d\n", 
-			src->x[j][k], src->y[j][k], src->z[j][k],
-			x1[ngsl/2+src->interpolation[j].ix[k]-ngsl],
-                        y1[ngsl/2+src->interpolation[j].iy[k]-ngsl],
-                        z1[src->interpolation[j].iz[k]],
-                        src->interpolation[j].ix[k],
-                        src->interpolation[j].iy[k],
-                        src->interpolation[j].iz[k]);
-		}
-		fflush(stdout);
-//--------------------------------------------------------------------------------
-                                        
+                        grid_fill1(x1, x_grid, 1);
+                        grid_fill1(y1, y_grid, 0);
+                        grid_fill1(z1, z_grid, 0);
+
+                        for (int i = 120; i < 128; ++i) {
+                                printf("%3.2f ", z1[i]);
+                        }
+                        printf("\n");
+
+                        if (grid_type == SX )
+                        //if (grid_type == X || grid_type == Y || grid_type == Z  || grid_type == SX || grid_type == SY || grid_type == SZ || grid_type == XX || grid_type == XZ || grid_type == NODE)
+                        {
+                                fprintf(stderr, "rank = %d, grid_type = %s, shift = %d %d %d id = %d origin = %f %f %f h = %f\n",
+                                       rank, grid_typename(grid_type), grid.shift.x, grid.shift.y, grid.shift.z,
+                                       j,
+                                       x1[ngsl / 2], y1[ngsl / 2], z1[0],
+                                       grid.gridspacing);
+
+                                for (size_t k = 0; k < src->lengths[j]; ++k)
+                                {
+                                        fprintf(stderr, "query int x y z = %f %f %f | nearest x y z = %f %f %f | index = %d %d %d\n",
+                                               src->x[j][k], src->y[j][k], src->z[j][k],
+                                               x1[ngsl / 2 + src->interpolation[j].ix[k] - ngsl],
+                                               y1[ngsl / 2 + src->interpolation[j].iy[k] - ngsl],
+                                               z1[src->interpolation[j].iz[k]],
+                                               src->interpolation[j].ix[k],
+                                               src->interpolation[j].iy[k],
+                                               src->interpolation[j].iz[k]);
+                                        fprintf(stderr, "index-x: %d \n",
+                                               src->interpolation[j].ix[0]);
+                                        fprintf(stderr, "index-y: %d \n",
+                                               src->interpolation[j].iy[0]);
+                                        fprintf(stderr, "weights-x: %f %f %f %f \n",
+                                              src->interpolation[j].lx[0],
+                                              src->interpolation[j].lx[1],
+                                              src->interpolation[j].lx[2],
+                                              src->interpolation[j].lx[3]);
+                                        fprintf(stderr, "weights-y: %f %f %f %f \n",
+                                              src->interpolation[j].ly[0],
+                                              src->interpolation[j].ly[1],
+                                              src->interpolation[j].ly[2],
+                                              src->interpolation[j].ly[3]);
+                                        fprintf(stderr, "weights-z: %f %f %f %f \n",
+                                              src->interpolation[j].lz[0],
+                                              src->interpolation[j].lz[1],
+                                              src->interpolation[j].lz[2],
+                                              src->interpolation[j].lz[3]);
+                                        fprintf(stderr, "---------------------------------------\n\n");
+                                }
+                        }
+                        fflush(stdout);
+                }
+#endif
+                //--------------------------------------------------------------------------------
+
                 grid_data_free(&xyz);
         } // end loop j
 
+
         free(grid_number);
         free(x);
         free(y);
-        
 
         src->buffer = buffer_init(src->length,
-                                 input->gpu_buffer_size,
-                                 input->cpu_buffer_size, input->stride);
+                                  input->gpu_buffer_size,
+                                  input->cpu_buffer_size, input->stride);
 
         // Extra space for host buffer
         src->host_buffer_extra = malloc(src->buffer.h_buffer_bytes);
-
 }
 
 void source_init_indexed(source_t *src, const input_t *input, size_t num_reads)
 {
-        if (!src->use) return;
+        if (!src->use)
+                return;
         src->blocklen = malloc(sizeof(src->blocklen) * input->length);
         src->offsets = malloc(sizeof(src->offsets) * input->length);
         size_t num_elements = input->steps / num_reads;
         src->num_elements = num_elements;
-        for (size_t i = 0; i < src->length; ++i) {
+        for (size_t i = 0; i < src->length; ++i)
+        {
                 src->blocklen[i] = num_elements;
         }
-        for (size_t i = 0; i < src->length; ++i) {
+        for (size_t i = 0; i < src->length; ++i)
+        {
                 src->offsets[i] = src->indices[i] * num_elements;
         }
 }
@@ -497,64 +527,69 @@ void source_read(source_t *src, size_t step)
 {
         if (!src->use)
                 return;
-        if (buffer_is_host_empty(&src->buffer, step)) {
-             prec *host_ptr = buffer_get_host_ptr(&src->buffer, step);
-             mpi_io_idx_read(&src->io, host_ptr, src->filename);
-
-             // Transpose data from (index, time) to (time, index)
-             // (last index is contiguous)
-             size_t rows = src->length;
-             size_t cols = src->buffer.num_host * src->buffer.num_device;
-             array_transpose(src->host_buffer_extra, host_ptr, rows, cols);
-             SWAP(src->host_buffer_extra, src->buffer.h_buffer, prec*);
+        if (step > src->steps)
+        {
+                src->use = 0;
+                return;
+        }
+        if (buffer_is_host_empty(&src->buffer, step))
+        {
+                prec *host_ptr = buffer_get_host_ptr(&src->buffer, step);
+                mpi_io_idx_read(&src->io, host_ptr, src->filename);
+
+                // Transpose data from (index, time) to (time, index)
+                // (last index is contiguous)
+                size_t rows = src->length;
+                size_t cols = src->buffer.num_host * src->buffer.num_device;
+                array_transpose(src->host_buffer_extra, host_ptr, rows, cols);
+                SWAP(src->host_buffer_extra, src->buffer.h_buffer, prec *);
         }
-        
-        if (buffer_is_device_empty(&src->buffer, step)) {
+
+        if (buffer_is_device_empty(&src->buffer, step))
+        {
                 buffer_copy_to_device(&src->buffer, step);
         }
-        
 }
 
 void source_add_cartesian(prec *out, source_t *src, const size_t step,
                           const prec h, const prec dt, const int grid_num)
 {
         if (!src->use || !buffer_is_device_ready(&src->buffer, step) ||
-             src->lengths[grid_num] == 0) 
+            src->lengths[grid_num] == 0)
                 return;
 
-
         prec *source_data = buffer_get_device_ptr(&src->buffer, step);
-        cusource_add_cartesian_H(&src->interpolation[grid_num], 
+        cusource_add_cartesian_H(&src->interpolation[grid_num],
                                  out, source_data, h, dt);
 }
 
 void source_add_curvilinear(prec *out, source_t *src, const size_t step,
                             const prec h, const prec dt, const prec *f,
-                            const int ny, 
-                            const prec *dg, const int grid_num) 
+                            const int ny,
+                            const prec *dg, const int grid_num, const int zhat)
 {
         if (!src->use || !buffer_is_device_ready(&src->buffer, step) ||
-             src->lengths[grid_num] == 0) 
+            src->lengths[grid_num] == 0)
                 return;
 
         prec *source_data = buffer_get_device_ptr(&src->buffer, step);
         cusource_add_curvilinear_H(&src->interpolation[grid_num], out,
-                                   source_data, h, dt, f, ny, dg);
+                                   source_data, h, dt, f, ny, dg, zhat);
 }
 
 void source_add_force(prec *out, const prec *d1, source_t *src,
                       const size_t step, const prec h, const prec dt,
                       const prec quad_weight,
-                      const prec *f, const int nx, const int ny, const int nz, 
+                      const prec *f, const int nx, const int ny, const int nz,
                       const prec *dg,
-                      const int grid_num) 
+                      const int grid_num, const int sourcetype, const int dir)
 {
         if (!src->use || !buffer_is_device_ready(&src->buffer, step) ||
-             src->lengths[grid_num] == 0) 
+            src->lengths[grid_num] == 0)
                 return;
 
         prec *source_data = buffer_get_device_ptr(&src->buffer, step);
         cusource_add_force_H(&src->interpolation[grid_num], out,
-                                   source_data, d1, h, dt, quad_weight, f, nx, ny, nz, dg);
+                                     source_data, d1, h, dt, quad_weight, f, nx,
+                                     ny, nz, dg, sourcetype, dir);
 }
-
diff --git a/src/topography/sources/source.cu b/src/topography/sources/source.cu
index fd44d76..c76e040 100644
--- a/src/topography/sources/source.cu
+++ b/src/topography/sources/source.cu
@@ -3,11 +3,18 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stdint.h>
+#include <assert.h>
 
 #include <topography/sources/source.cuh>
 #include <interpolation/interpolation.cuh>
 #include <test/test.h>
 
+
+// Enable or disable atomic operations. If the sources are overlapping, disabling atomics causes
+// parallel synchronization issues. Only disable this macro if you know that the sources are
+// non-overlapping.
+#define USE_ATOMICS 1
+
 void cusource_add_cartesian_H(const cu_interp_t *I, prec *out, const prec *in,
                               const prec h, const prec dt)
 {
@@ -40,9 +47,16 @@ __global__ void cusource_add_cartesian(prec *out, const prec *in,
         for (int j = 0; j < num_basis; ++j) {
         for (int k = 0; k < num_basis; ++k) {
                 size_t pos = grid_index(grid, ix[q] + i, iy[q] + j, iz[q] + k);
-                out[pos] += - dth * lx[q * num_basis + i] *
+                prec value = - dth * lx[q * num_basis + i] *
                             ly[q * num_basis + j] * lz[q * num_basis + k] *
                             in[lidx[q]];
+#if USE_ATOMICS
+                atomicAdd(&out[pos], value);
+#else 
+                out[pos] += value; // accumulate like the atomic path; plain '=' would overwrite the field
+#endif
+
+
         }
         }
         }
@@ -50,7 +64,7 @@ __global__ void cusource_add_cartesian(prec *out, const prec *in,
 
 void cusource_add_curvilinear_H(const cu_interp_t *I, prec *out, const prec *in,
                                 const prec h, const prec dt, const prec *f,
-                                const int ny, const prec *dg) 
+                                const int ny, const prec *dg, const int zhat) 
 {
         dim3 block (INTERP_THREADS, 1, 1);
         dim3 grid((I->num_query + INTERP_THREADS - 1) / INTERP_THREADS,
@@ -58,7 +72,7 @@ void cusource_add_curvilinear_H(const cu_interp_t *I, prec *out, const prec *in,
 
         cusource_add_curvilinear<<<grid, block>>>(
             out, in, I->d_lx, I->d_ly, I->d_lz, I->num_basis, I->d_ix, I->d_iy,
-            I->d_iz, I->d_ridx, h, dt, I->num_query, I->grid, f, ny, dg);
+            I->d_iz, I->d_ridx, h, dt, I->num_query, I->grid, f, ny, dg, zhat);
         CUCHK(cudaGetLastError());
 }
 
@@ -69,30 +83,55 @@ __global__ void cusource_add_curvilinear(prec *out, const prec *in,
                                  const int *lidx,
                                  const prec h, const prec dt,
                                  const int num_query, const grid3_t grid,
-                                 const prec *f, const int ny, const prec *dg)
+                                 const prec *f, const int ny, const prec *dg, const int zhat)
 {
         int q = threadIdx.x + blockDim.x * blockIdx.x;
         if (q >= num_query) {
                 return;
         }
 
-#define _f(i, j)                                                             \
-  f[(j) + align +                                                     \
-      ((i) + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f(i, j) f[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
 #define _dg(k) dg[(k) + align]
 
         prec dth = dt / (h * h * h);
 
+        // Reciprocal quadrature weights near the top boundary in the z-direction. First weight is
+        // on the boundary
+        // hweights: weights at the nodal grid points
+        const prec hweights[4] = {3.55599789310935, 0.6905974224013051,
+                                  1.4771520525102637, 0.914256470417062};
+        // hhatweights: weights at the cell-centered grid points
+        const prec hhatweights[4] = {2.9022824945274315, 2.28681149230364,
+                                     0.7658753535345706, 1.0959408329892313};
+
+        int nx = grid.size.x - 4 - 2 * ngsl;
+        int nz = grid.size.z;
         for (int i = 0; i < num_basis; ++i) {
         for (int j = 0; j < num_basis; ++j) {
         for (int k = 0; k < num_basis; ++k) {
                prec Ji =
                    1.0 / (_f(i + ix[q], j + iy[q]) *
                           _dg(iz[q] + k));
-                size_t pos = grid_index(grid, ix[q] + i, iy[q] + j, iz[q] + k);
-                out[pos] += - dth * lx[q * num_basis + i] *
+                int pos =
+                    (iz[q] + k) + align +
+                    (2 * align + nz) * (ix[q] + i) * (2 * ngsl + ny + 4) +
+                    (2 * align + nz) * (iy[q] + j);
+                prec w = 1.0f;
+                int offset_z = nz - (iz[q] + k + 2);
+                int offset_zhat = nz - (iz[q] + k + 1);
+                if (zhat == 0 &&  offset_z  < 4 && offset_z >= 0)
+                        w = hweights[offset_z];
+                if (zhat == 1 &&  offset_zhat < 4 && offset_zhat >= 0)
+                        w = hhatweights[offset_zhat];
+                prec value = - dth * lx[q * num_basis + i] *
                             ly[q * num_basis + j] * lz[q * num_basis + k] *
-                            in[lidx[q]] * Ji;
+                            in[lidx[q]] * Ji * w;
+
+#if USE_ATOMICS
+                atomicAdd(&out[pos], value);
+#else 
+                out[pos] += value; // accumulate like the atomic path; plain '=' would overwrite the field
+#endif
         }
         }
         }
@@ -102,16 +141,31 @@ void cusource_add_force_H(const cu_interp_t *I, prec *out, const prec *in,
                           const prec *d1, const prec h, const prec dt,
                           const prec quad_weight,
                           const prec *f, const int nx, const int ny,
-                          const int nz, const prec *dg) 
+                          const int nz, const prec *dg, const int sourcetype, const int dir) 
 {
         dim3 block (INTERP_THREADS, 1, 1);
         dim3 grid((I->num_query + INTERP_THREADS - 1) / INTERP_THREADS,
                   1, 1);
 
+        if (sourcetype == 0) {
         cusource_add_force<<<grid, block>>>(
             out, in, d1, I->d_lx, I->d_ly, I->d_lz, I->num_basis, I->d_ix,
             I->d_iy, I->d_iz, I->d_ridx, h, dt, quad_weight, I->num_query,
             I->grid, f, nx, ny, nz, dg);
+        } 
+        else if (sourcetype == 1) {
+        cusource_add_force_stress<<<grid, block>>>(
+            out, in, d1, I->d_lx, I->d_ly, I->d_lz, I->num_basis, I->d_ix,
+            I->d_iy, I->d_iz, I->d_ridx, h, dt, quad_weight, I->num_query,
+            I->grid, f, nx, ny, nz, dg, dir);
+
+        }
+        else {
+            cusource_add_force_velocity<<<grid, block>>>(
+            out, in, d1, I->d_lx, I->d_ly, I->d_lz, I->num_basis, I->d_ix,
+            I->d_iy, I->d_iz, I->d_ridx, h, dt, quad_weight, I->num_query,
+            I->grid, f, nx, ny, nz, dg, dir);
+        }
         CUCHK(cudaGetLastError());
 }
 
@@ -130,29 +184,121 @@ __global__ void cusource_add_force(prec *out, const prec *in, const prec *d1,
                 return;
         }
 
-#define _f(i, j)                                                             \
-  f[(j) + align +                                                     \
-      ((i) + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f(i, j) f[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
 #define _dg(k) dg[(k) + align]
 
-#define _rho(i, j, k)                                                  \
-        d1[(k) + align +                                               \
-           (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-           (2 * align + nz) * ((j) + ngsl + 2)]
+#define _rho(i, j, k)                                                   \
+        d1[(k) + align + (2 * align + nz) * (i) * (2 * ngsl + ny + 4) + \
+           (2 * align + nz) * (j)]
 
         prec dth = dt / (h * h * h);
 
         for (int i = 0; i < num_basis; ++i) {
         for (int j = 0; j < num_basis; ++j) {
         for (int k = 0; k < num_basis; ++k) {
-                prec Ji =
-                    - quad_weight / (_f(i + ix[q], j + iy[q]) * _dg(iz[q] + k) *
-                                   _rho(i + ix[q], j + iy[q], iz[q] + k));
-                size_t pos = grid_index(grid, ix[q] + i, iy[q] + j, iz[q] + k);
-                out[pos] += -dth * lx[q * num_basis + i] *
+                // Do not apply stencil at halo points
+                if ( ix[q] + i >= 2 + nx + ngsl / 2 || ix[q] + i < 2 + ngsl / 2 ||
+                     iy[q] + j >= 2 + ny + ngsl / 2 || iy[q] + j < 2 + ngsl / 2 ) continue;
+
+                prec J =  _f(i + ix[q], j + iy[q]) * _dg(iz[q] + k);
+                prec Ji = - quad_weight /(J * d1[q]);
+                int pos =
+                    (iz[q] + k) + align +
+                    (2 * align + nz) * (ix[q] + i) * (2 * ngsl + ny + 4) +
+                    (2 * align + nz) * (iy[q] + j);
+                prec value = -dth * lx[q * num_basis + i] *
                             ly[q * num_basis + j] * lz[q * num_basis + k] * in[lidx[q]] * Ji;
+#if USE_ATOMICS
+                atomicAdd(&out[pos], value);
+#else 
+                out[pos] += value; // accumulate like the atomic path; plain '=' would overwrite the field
+#endif
+        }
+        }
+        }
+}
+
+__global__ void cusource_add_force_stress(prec *out, const prec *in, const prec *d1,
+                                   const prec *lx, const prec *ly,
+                                   const prec *lz, const int num_basis,
+                                   const int *ix, const int *iy, const int *iz,
+                                   const int *lidx, const prec h, const prec dt,
+                                   const prec quad_weight,
+                                   const int num_query, const grid3_t grid,
+                                   const prec *f, const int nx, const int ny,
+                                   const int nz, const prec *dg, const int dir) 
+{
+        int q = threadIdx.x + blockDim.x * blockIdx.x;
+        if (q >= num_query) {
+                return;
         }
+        // One thread per query q: computes value = in[lidx[q]] * lx * ly / h^2 and stores it
+        // (or values derived from it) in out around z-index k = nz - 1, depending on dir.
+        prec dth = 1.0 / (h * h);
+        int k = nz - 1; // fixed z-index; iz is not consulted
+        // d1, lz, iz, dt, quad_weight, grid, f and dg are not referenced in this kernel.
+        for (int i = 0; i < num_basis; ++i) {
+        for (int j = 0; j < num_basis; ++j) {
+                // Do not apply stencil at halo points (NOTE(review): bounds use ngsl here but ngsl / 2 in cusource_add_force - confirm)
+                if ( ix[q] + i >= 2 + nx + ngsl || ix[q] + i < 2 + ngsl ||
+                     iy[q] + j >= 2 + ny + ngsl || iy[q] + j < 2 + ngsl ) continue;
+
+                int pos =
+                    (k) + align +
+                    (2 * align + nz) * (ix[q] + i) * (2 * ngsl + ny + 4) +
+                    (2 * align + nz) * (iy[q] + j);
+                prec value = dth * lx[q * num_basis + i] *
+                            ly[q * num_basis + j] * in[lidx[q]];
+                if (dir == 1 || dir == 2) {
+                        out[pos] = value; // NOTE(review): plain store, not atomic - confirm queries never share a grid point
+                        out[pos+1] = 2 * value - out[pos-1]; // makes (out[pos-1] + out[pos+1]) / 2 == value
+                        out[pos+2] = 2 * value - out[pos-2]; // makes (out[pos-2] + out[pos+2]) / 2 == value
+                }
+                if (dir == 3) {
+                        out[pos+1] = 2 * value - out[pos]; // makes (out[pos] + out[pos+1]) / 2 == value
+                        out[pos+2] = 2 * value - out[pos-1]; // makes (out[pos-1] + out[pos+2]) / 2 == value
+                }
         }
         }
 }
 
+__global__ void cusource_add_force_velocity(prec *out, const prec *in, const prec *d1,
+                                   const prec *lx, const prec *ly,
+                                   const prec *lz, const int num_basis,
+                                   const int *ix, const int *iy, const int *iz,
+                                   const int *lidx, const prec h, const prec dt,
+                                   const prec quad_weight,
+                                   const int num_query, const grid3_t grid,
+                                   const prec *f, const int nx, const int ny,
+                                   const int nz, const prec *dg, const int dir) 
+{
+        int q = threadIdx.x + blockDim.x * blockIdx.x;
+        if (q >= num_query) {
+                return;
+        }
+        // One thread per query q: adds in[lidx[q]], weighted by lx * ly * dt / h^3 and divided
+        // by d1[q], to the (x, y) stencil points of out at the single z-index k = nz - 1.
+        prec dth = dt / (h * h * h);
+        int k = nz - 1; // fixed z-index; iz is not consulted
+        // lz, iz, quad_weight, grid, f, dg and dir are not referenced in this kernel.
+        for (int i = 0; i < num_basis; ++i) {
+        for (int j = 0; j < num_basis; ++j) {
+                // Do not apply stencil at halo points
+                if ( ix[q] + i >= 2 + nx + ngsl / 2 || ix[q] + i < 2 + ngsl / 2 ||
+                     iy[q] + j >= 2 + ny + ngsl / 2 || iy[q] + j < 2 + ngsl / 2 ) continue;
+
+                int pos =
+                    (k) + align +
+                    (2 * align + nz) * (ix[q] + i) * (2 * ngsl + ny + 4) +
+                    (2 * align + nz) * (iy[q] + j);
+                prec value = dth * lx[q * num_basis + i] *
+                            ly[q * num_basis + j] / d1[q] * in[lidx[q]];
+#if USE_ATOMICS
+                atomicAdd(&out[pos], 1.0 * value);
+                atomicAdd(&out[pos - 1], -0.0 * value); // weight is -0.0: adds a signed zero to the point below for finite value
+#else
+                    out[pos] += value;
+#endif
+        }
+        }
+}
diff --git a/src/topography/sources/sources.c b/src/topography/sources/sources.c
index 8c42ac9..6812e19 100644
--- a/src/topography/sources/sources.c
+++ b/src/topography/sources/sources.c
@@ -9,6 +9,8 @@
 #include <topography/sources/source.h>
 #include <topography/sources/sources.h>
 #include <readers/input.h>
+#include <grid/shift.h>
+#include "interpolation/interpolation.h"
 
 static int use;
 
@@ -24,8 +26,11 @@ static input_t input;
 
 static int myrank;
 
-void sources_init(const char *filename, const grids_t *grids, int ngrids,
-                  const f_grid_t *f, const MPI_Comm comm, const int rank,
+static float *F_interp;
+static float *d_F_interp;
+
+void sources_init(const char *filename, const grids_t *grids, const struct mapping *map, int ngrids,
+                  const f_grid_t *f, const g_grid_t *g, const MPI_Comm comm, const int rank,
                   const int size) 
 {
         use = strcmp(filename, "") != 0 ? 1 : 0;
@@ -34,20 +39,18 @@ void sources_init(const char *filename, const grids_t *grids, int ngrids,
 
         if (!use) return;
 
-        // FIXME: Add support for multiple grids
-
        if (rank == 0) { 
                AWPCHK(input_init(&input, filename));
        }
        AWPCHK(input_broadcast(&input, rank, 0, comm));
 
+       Mxx = source_init("xx", XX, &input, grids, map, ngrids, f, rank, comm, MOMENT_TENSOR);
+       Myy = source_init("yy", YY, &input, grids, map, ngrids, f, rank, comm, MOMENT_TENSOR);
+       Mzz = source_init("zz", ZZ, &input, grids, map, ngrids, f, rank, comm, MOMENT_TENSOR);
+       Mxy = source_init("xy", XY, &input, grids, map, ngrids, f, rank, comm, MOMENT_TENSOR);
+       Mxz = source_init("xz", XZ, &input, grids, map, ngrids, f, rank, comm, MOMENT_TENSOR);
+       Myz = source_init("yz", YZ, &input, grids, map, ngrids, f, rank, comm, MOMENT_TENSOR);
 
-       Mxx = source_init("xx", XX, &input, grids, ngrids, f, rank, comm);
-       Myy = source_init("yy", YY, &input, grids, ngrids, f, rank, comm);
-       Mzz = source_init("zz", ZZ, &input, grids, ngrids, f, rank, comm);
-       Mxy = source_init("xy", XY, &input, grids, ngrids, f, rank, comm);
-       Mxz = source_init("xz", XZ, &input, grids, ngrids, f, rank, comm);
-       Myz = source_init("yz", YZ, &input, grids, ngrids, f, rank, comm);
 }
 
 void sources_read(size_t step)
@@ -86,18 +89,19 @@ void sources_add_curvilinear(prec *d_xx, prec *d_yy, prec *d_zz, prec *d_xy,
         if (!use) return;
 
         int ny = f->size[1];
+        // last argument specifies if the grid is cell-centered in the z-direction
         source_add_curvilinear(d_xx, &Mxx, step, h, dt, f->d_f_c, ny, g->d_g3_c,
-                               grid_num);
+                               grid_num, 1);
         source_add_curvilinear(d_yy, &Myy, step, h, dt, f->d_f_c, ny, g->d_g3_c,
-                               grid_num);
+                               grid_num, 1);
         source_add_curvilinear(d_zz, &Mzz, step, h, dt, f->d_f_c, ny, g->d_g3_c,
-                               grid_num);
+                               grid_num, 1);
         source_add_curvilinear(d_xy, &Mxy, step, h, dt, f->d_f, ny, g->d_g3_c,
-                               grid_num);
+                               grid_num, 1);
         source_add_curvilinear(d_xz, &Mxz, step, h, dt, f->d_f_1, ny, g->d_g3,
-                               grid_num);
+                               grid_num, 0);
         source_add_curvilinear(d_yz, &Myz, step, h, dt, f->d_f_2, ny, g->d_g3,
-                               grid_num);
+                               grid_num, 0);
 }
 
 source_t sources_get_source(enum grid_types grid_type)
@@ -122,6 +126,15 @@ source_t sources_get_source(enum grid_types grid_type)
                 case YZ:
                         return Myz;
                         break;
+                case SX:
+                        fprintf(stderr, "No source can exist on grid SX\n");
+                        break;
+                case SY:
+                        fprintf(stderr, "No source can exist on grid SY\n");
+                        break;
+                case SZ:
+                        fprintf(stderr, "No source can exist on grid SZ\n");
+                        break;
                 case X:
                         fprintf(stderr, "No source can exist on grid X\n");
                         break;
@@ -149,5 +162,8 @@ void sources_finalize(void)
         source_finalize(&Mxy);
         source_finalize(&Mxz);
         source_finalize(&Myz);
+
+        if (F_interp != NULL )free(F_interp);
+        if (d_F_interp != NULL )cudaFree(d_F_interp);
 }
 
diff --git a/src/topography/stress.cu b/src/topography/stress.cu
index 20c79b8..a89301f 100644
--- a/src/topography/stress.cu
+++ b/src/topography/stress.cu
@@ -2,9 +2,86 @@
 #include <nvToolsExt.h>
 #include <stdio.h>
 
-#include <topography/kernels/optimized_stress.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
 #include <topography/stress.cuh>
+#include <test/test.h>
+
+
+// Thread-block dimensions (CUDA x, y, z) used to launch dtopo_str_111_index_unroll
+#ifndef STRIU_TX
+#define STRIU_TX 32
+#endif      
+            
+#ifndef STRIU_TY
+#define STRIU_TY 1
+#endif      
+            
+#ifndef STRIU_TZ
+#define STRIU_TZ 4
+#endif
+
+// Unroll factor in CUDA x (the launch divides the x extent by STRIU_RX * STRIU_TX)
+#ifndef STRIU_RX
+#define STRIU_RX 1
+#endif
+
+// Unroll factor in CUDA y (the launch divides the y extent by STRIU_RY * STRIU_TY)
+#ifndef STRIU_RY
+#define STRIU_RY 2
+#endif
+
+// Number of threads per block to use for interior stress kernel
+#ifndef STR_INT_X
+#define STR_INT_X 32
+#endif
+#ifndef STR_INT_Y
+#define STR_INT_Y 4
+#endif
+#ifndef STR_INT_Z
+#define STR_INT_Z 1
+#endif
+// Per-kernel block sizes below default to the STR_INT_* values above.
+#ifndef DTOPO_STR_110_X
+#define DTOPO_STR_110_X STR_INT_X
+#endif
+
+#ifndef DTOPO_STR_110_Y
+#define DTOPO_STR_110_Y STR_INT_Y
+#endif
+
+#ifndef DTOPO_STR_110_Z
+#define DTOPO_STR_110_Z STR_INT_Z
+#endif
+
+#ifndef DTOPO_STR_111_X
+#define DTOPO_STR_111_X STR_INT_X
+#endif
+
+#ifndef DTOPO_STR_111_Y
+#define DTOPO_STR_111_Y STR_INT_Y
+#endif
+
+#ifndef DTOPO_STR_111_Z
+#define DTOPO_STR_111_Z STR_INT_Z
+#endif
+
+#ifndef DTOPO_STR_112_X
+#define DTOPO_STR_112_X STR_INT_X
+#endif
+
+#ifndef DTOPO_STR_112_Y
+#define DTOPO_STR_112_Y STR_INT_Y
+#endif
+
+#ifndef DTOPO_STR_112_Z
+#define DTOPO_STR_112_Z STR_INT_Z
+#endif
+// NOTE(review): presumably the z entry of the `loop` argument to set_grid() - confirm
+#define DTOPO_STR_110_LOOP_Z 1
+#define DTOPO_STR_111_LOOP_Z 1
+#define DTOPO_STR_112_LOOP_Z 1
+
+#include "kernels/stress.cu"
+#include "kernels/stress_index_unroll.cu"
 
 inline dim3 set_grid(const dim3 block, const int3_t size, const dim3 loop)
 {
@@ -15,6 +92,12 @@ inline dim3 set_grid(const dim3 block, const int3_t size, const dim3 loop)
         return out;
 }
 
+void topo_set_constants(topo_t *T)
+{ // Forwards T's grid spacing, dth * gridspacing and nx/ny/nz to set_constants().
+        set_constants(T->gridspacing, T->dth * T->gridspacing, T->nx, T->ny,
+                      T->nz);
+}
+
 void topo_stress_interior_H(topo_t *T)
 {
 
@@ -23,20 +106,25 @@ void topo_stress_interior_H(topo_t *T)
                 printf("launching %s(%d)\n", __func__, T->rank);
         }
 
-        {
-        dim3 block(DTOPO_STR_111_X, DTOPO_STR_111_Y,
-                    DTOPO_STR_111_Z);
-        int3_t size = {(int)T->stress_bounds_right[0] - T->stress_bounds_left[1], 
-                       (int)T->stress_bounds_ydir[1] -  T->stress_bounds_ydir[0],
-                       (int)T->stress_grid_interior.z};
-        dim3 loop(0, 0, DTOPO_STR_111_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-        dtopo_str_111<<<grid, block, 0, T->stream_i>>>
+
+     int shift = ngsl + 2;
+     {
+     int3_t size = {T->stress_bounds_right[0] - T->stress_bounds_left[0], 
+                    T->stress_bounds_ydir[1] -  T->stress_bounds_ydir[0],
+                    (int)T->stress_grid_interior.z};
+
+        dim3 threads (STRIU_TX, STRIU_TY, STRIU_TZ);
+        dim3 blocks((size.z - 4) / (STRIU_RX * threads.x) + 1,
+                    (size.y - 1) / (STRIU_RY * threads.y) + 1,
+                    (size.x - 1) / (threads.z) + 1);
+
+        dtopo_str_111_index_unroll<STRIU_TX, STRIU_TY, STRIU_TZ, STRIU_RX, STRIU_RY><<<blocks, threads, 0, T->stream_i>>>
                          (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
+                          T->xx, T->yy, T->zz, 
+                          T->xy, T->xz, T->yz,
+                          T->r1, T->r2, T->r3,
+                          T->r4, T->r5, T->r6,
                           T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
                           T->metrics_f.d_f,
                           T->metrics_f.d_f1_1,
                           T->metrics_f.d_f1_2,
@@ -52,15 +140,24 @@ void topo_stress_interior_H(topo_t *T)
                           T->metrics_g.d_g3_c,
                           T->metrics_g.d_g_c,
                           T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[1]);
+                          T->mui, 
+                          T->qpi,
+                          T->coeff,
+                          T->qsi,
+                          T->dcrjx, T->dcrjy, T->dcrjz,
+                          T->vx1,
+                          T->vx2,
+                          T->ww,
+                          T->wwo,
+                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
+                          T->stress_bounds_left[1] + shift, 
+                          T->stress_bounds_right[0]+ shift, 
+                          T->stress_bounds_ydir[0] + shift, 
+                          T->stress_bounds_ydir[1] + shift);
+
         CUCHK(cudaGetLastError());
         }
 
-
         {
         dim3 block(DTOPO_STR_112_X, DTOPO_STR_112_Y,
                     DTOPO_STR_112_Z);
@@ -71,10 +168,11 @@ void topo_stress_interior_H(topo_t *T)
         dim3 grid = set_grid(block, size, loop);
         dtopo_str_112<<<grid, block, 0, T->stream_i>>>
                          (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
+                          T->xx, T->yy, T->zz, 
+                          T->xy, T->xz, T->yz,
+                          T->r1, T->r2, T->r3,
+                          T->r4, T->r5, T->r6,
                           T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
                           T->metrics_f.d_f,
                           T->metrics_f.d_f1_1,
                           T->metrics_f.d_f1_2,
@@ -90,50 +188,21 @@ void topo_stress_interior_H(topo_t *T)
                           T->metrics_g.d_g3_c,
                           T->metrics_g.d_g_c,
                           T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[1]);
-
-        CUCHK(cudaGetLastError());
-        }
-
-        if (TOPO_DBG) {
-        dim3 block(DTOPO_STR_110_X, DTOPO_STR_110_Y,
-                    DTOPO_STR_110_Z);
-        int3_t size = {(int)T->stress_bounds_right[0] - T->stress_bounds_left[0], 
-                       (int)T->stress_bounds_ydir[1] -  T->stress_bounds_ydir[0],
-                       TOP_BOUNDARY_SIZE};
-        dim3 loop(0, 0, DTOPO_STR_110_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-                dtopo_str_110<<<grid, block, 0, T->stream_i>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
+                          T->mui, 
+                          T->qpi,
+                          T->coeff,
+                          T->qsi,
                           T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[1]);
-                CUCHK(cudaGetLastError());
+                          T->vx1,
+                          T->vx2,
+                          T->ww,
+                          T->wwo,
+                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
+                          T->stress_bounds_left[1]  + shift, 
+                          T->stress_bounds_right[0] + shift, 
+                          T->stress_bounds_ydir[0]  + shift, 
+                          T->stress_bounds_ydir[1]  + shift);
+        CUCHK(cudaGetLastError());
         }
 }
 
@@ -148,20 +217,24 @@ void topo_stress_left_H(topo_t *T)
         if (TOPO_DBG) {
                 printf("launching %s(%d)\n", __func__, T->rank);
         }
-        dim3 block(DTOPO_STR_111_X, DTOPO_STR_111_Y,
-                    DTOPO_STR_111_Z);
         int3_t size = {(int)T->stress_bounds_left[1] - T->stress_bounds_left[0],
                        (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
                        (int)T->stress_grid_interior.z};
-        dim3 loop(0, 0, DTOPO_STR_111_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
 
-        dtopo_str_111<<<grid, block, 0, T->stream_1>>>
+        int shift = ngsl + 2;
+
+        dim3 threads (STRIU_TX, STRIU_TY, STRIU_TZ);
+        dim3 blocks((size.z - 4) / (STRIU_RX * threads.x) + 1,
+                    (size.y - 1) / (STRIU_RY * threads.y) + 1,
+                    (size.x - 1) / (threads.z) + 1);
+
+        dtopo_str_111_index_unroll<STRIU_TX, STRIU_TY, STRIU_TZ, STRIU_RX, STRIU_RY><<<blocks, threads, 0, T->stream_1>>>
                          (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
+                          T->xx, T->yy, T->zz, 
+                          T->xy, T->xz, T->yz,
+                          T->r1, T->r2, T->r3,
+                          T->r4, T->r5, T->r6,
                           T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
                           T->metrics_f.d_f,
                           T->metrics_f.d_f1_1,
                           T->metrics_f.d_f1_2,
@@ -177,11 +250,20 @@ void topo_stress_left_H(topo_t *T)
                           T->metrics_g.d_g3_c,
                           T->metrics_g.d_g_c,
                           T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[1]);
+                          T->mui, 
+                          T->qpi,
+                          T->coeff,
+                          T->qsi,
+                          T->dcrjx, T->dcrjy, T->dcrjz,
+                          T->vx1,
+                          T->vx2,
+                          T->ww,
+                          T->wwo,
+                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
+                          T->stress_bounds_left[0] + shift, 
+                          T->stress_bounds_left[1] + shift, 
+                          T->stress_bounds_ydir[0] + shift, 
+                          T->stress_bounds_ydir[1] + shift);
         CUCHK(cudaGetLastError());
 
 
@@ -195,10 +277,11 @@ void topo_stress_left_H(topo_t *T)
         dim3 grid = set_grid(block, size, loop);
         dtopo_str_112<<<grid, block, 0, T->stream_1>>>
                          (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
+                          T->xx, T->yy, T->zz, 
+                          T->xy, T->xz, T->yz,
+                          T->r1, T->r2, T->r3,
+                          T->r4, T->r5, T->r6,
                           T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
                           T->metrics_f.d_f,
                           T->metrics_f.d_f1_1,
                           T->metrics_f.d_f1_2,
@@ -214,49 +297,21 @@ void topo_stress_left_H(topo_t *T)
                           T->metrics_g.d_g3_c,
                           T->metrics_g.d_g_c,
                           T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-        }
-
-        if (TOPO_DBG) {
-                dim3 block(DTOPO_STR_110_X, DTOPO_STR_110_Y, DTOPO_STR_110_Z);
-                int3_t size = {
-                    (int)T->stress_bounds_left[1] - T->stress_bounds_left[0],
-                    (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
-                    (int)T->stress_grid_interior.z};
-                dim3 loop(0, 0, DTOPO_STR_110_LOOP_Z);
-                dim3 grid = set_grid(block, size, loop);
-                dtopo_str_110<<<grid, block, 0, T->stream_1>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
+                          T->mui, 
+                          T->qpi,
+                          T->coeff,
+                          T->qsi,
                           T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[1]);
-                CUCHK(cudaGetLastError());
+                          T->vx1,
+                          T->vx2,
+                          T->ww,
+                          T->wwo,
+                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
+                          T->stress_bounds_left[0] + shift, 
+                          T->stress_bounds_left[1] + shift, 
+                          T->stress_bounds_ydir[0] + shift, 
+                          T->stress_bounds_ydir[1] + shift);
+        CUCHK(cudaGetLastError());
         }
 }
 
@@ -271,20 +326,24 @@ void topo_stress_right_H(topo_t *T)
                 printf("launching %s(%d)\n", __func__, T->rank);
         }
 
+        int shift = ngsl + 2;
         {
-        dim3 block(DTOPO_STR_111_X, DTOPO_STR_111_Y,
-                    DTOPO_STR_111_Z);
         int3_t size = {(int)T->stress_bounds_right[1] - T->stress_bounds_left[0],
                        (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
                        (int)T->stress_grid_interior.z};
-        dim3 loop(0, 0, DTOPO_STR_111_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-        dtopo_str_111<<<grid, block, 0, T->stream_2>>>
+
+        dim3 threads (STRIU_TX, STRIU_TY, STRIU_TZ);
+        dim3 blocks((size.z - 4) / (STRIU_RX * threads.x) + 1,
+                    (size.y - 1) / (STRIU_RY * threads.y) + 1,
+                    (size.x - 1) / (threads.z) + 1);
+
+        dtopo_str_111_index_unroll<STRIU_TX, STRIU_TY, STRIU_TZ, STRIU_RX, STRIU_RY><<<blocks, threads, 0, T->stream_2>>>
                          (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
+                          T->xx, T->yy, T->zz, 
+                          T->xy, T->xz, T->yz,
+                          T->r1, T->r2, T->r3,
+                          T->r4, T->r5, T->r6,
                           T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
                           T->metrics_f.d_f,
                           T->metrics_f.d_f1_1,
                           T->metrics_f.d_f1_2,
@@ -300,11 +359,20 @@ void topo_stress_right_H(topo_t *T)
                           T->metrics_g.d_g3_c,
                           T->metrics_g.d_g_c,
                           T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[1], T->stress_bounds_ydir[1]);
+                          T->mui, 
+                          T->qpi,
+                          T->coeff,
+                          T->qsi,
+                          T->dcrjx, T->dcrjy, T->dcrjz,
+                          T->vx1,
+                          T->vx2,
+                          T->ww,
+                          T->wwo,
+                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
+                          T->stress_bounds_right[0] + shift, 
+                          T->stress_bounds_right[1] + shift, 
+                          T->stress_bounds_ydir[0]  + shift, 
+                          T->stress_bounds_ydir[1]  + shift);
         CUCHK(cudaGetLastError());
         }
 
@@ -318,10 +386,11 @@ void topo_stress_right_H(topo_t *T)
         dim3 grid = set_grid(block, size, loop);
         dtopo_str_112<<<grid, block, 0, T->stream_2>>>
                          (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
+                          T->xx, T->yy, T->zz, 
+                          T->xy, T->xz, T->yz,
+                          T->r1, T->r2, T->r3,
+                          T->r4, T->r5, T->r6,
                           T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
                           T->metrics_f.d_f,
                           T->metrics_f.d_f1_1,
                           T->metrics_f.d_f1_2,
@@ -337,48 +406,20 @@ void topo_stress_right_H(topo_t *T)
                           T->metrics_g.d_g3_c,
                           T->metrics_g.d_g_c,
                           T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[1], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-        }
-
-        if (TOPO_DBG) {
-                dim3 block(DTOPO_STR_110_X, DTOPO_STR_110_Y, DTOPO_STR_110_Z);
-                int3_t size = {
-                    (int)T->stress_bounds_right[1] - T->stress_bounds_left[0],
-                    (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
-                    TOP_BOUNDARY_SIZE};
-                dim3 loop(0, 0, DTOPO_STR_110_LOOP_Z);
-                dim3 grid = set_grid(block, size, loop);
-                dtopo_str_110<<<grid, block, 0, T->stream_2>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
+                          T->mui, 
+                          T->qpi,
+                          T->coeff,
+                          T->qsi,
                           T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[1], T->stress_bounds_ydir[1]);
-                CUCHK(cudaGetLastError());
+                          T->vx1,
+                          T->vx2,
+                          T->ww,
+                          T->wwo,
+                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
+                          T->stress_bounds_right[0] + shift, 
+                          T->stress_bounds_right[1] + shift, 
+                          T->stress_bounds_ydir[0]  + shift, 
+                          T->stress_bounds_ydir[1]  + shift);
+        CUCHK(cudaGetLastError());
         }
 }
diff --git a/src/topography/stress_attenuation.cu b/src/topography/stress_attenuation.cu
deleted file mode 100644
index 9063519..0000000
--- a/src/topography/stress_attenuation.cu
+++ /dev/null
@@ -1,531 +0,0 @@
-#include <cuda.h>
-#include <nvToolsExt.h>
-#include <stdio.h>
-
-#include <topography/kernels/stress_attenuation.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
-#include <topography/stress_attenuation.cuh>
-#include <test/test.h>
-
-inline dim3 set_grid(const dim3 block, const int3_t size, const dim3 loop)
-{
-        dim3 out;
-        out.x = ((1 - loop.x) * size.z + block.x - 1 + loop.x) / block.x;
-        out.y = ((1 - loop.y) * size.y + block.y - 1 + loop.y) / block.y;
-        out.z = ((1 - loop.z) * size.x + block.z - 1 + loop.z) / block.z;
-        return out;
-}
-
-void topo_set_constants(topo_t *T)
-{
-        set_constants(T->gridspacing, T->dth * T->gridspacing, T->nx, T->ny,
-                      T->nz);
-}
-
-void topo_stress_interior_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-
-
-     int shift = ngsl + 2;
-     {
-     dim3 block(DTOPO_STR_111_X, DTOPO_STR_111_Y,
-                 DTOPO_STR_111_Z);
-     int3_t size = {T->stress_bounds_right[0] - T->stress_bounds_left[0], 
-                    T->stress_bounds_ydir[1] -  T->stress_bounds_ydir[0],
-                    (int)T->stress_grid_interior.z};
-     dim3 loop(0, 0, DTOPO_STR_112_LOOP_Z);
-     dim3 grid = set_grid(block, size, loop);
-
-        dtopo_str_111<<<grid, block, 0, T->stream_i>>>
-                         (
-                          T->xx, T->yy, T->zz, 
-                          T->xy, T->xz, T->yz,
-                          T->r1, T->r2, T->r3,
-                          T->r4, T->r5, T->r6,
-                          T->u1, T->v1, T->w1, 
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, 
-                          T->qpi,
-                          T->coeff,
-                          T->qsi,
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->vx1,
-                          T->vx2,
-                          T->ww,
-                          T->wwo,
-                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
-                          T->stress_bounds_left[1] + shift, 
-                          T->stress_bounds_right[0]+ shift, 
-                          T->stress_bounds_ydir[0] + shift, 
-                          T->stress_bounds_ydir[1] + shift);
-
-        CUCHK(cudaGetLastError());
-        }
-
-        {
-        dim3 block(DTOPO_STR_112_X, DTOPO_STR_112_Y,
-                    DTOPO_STR_112_Z);
-        int3_t size = {(int)T->stress_bounds_right[0] - T->stress_bounds_left[0], 
-                       (int)T->stress_bounds_ydir[1] -  T->stress_bounds_ydir[0],
-                       TOP_BOUNDARY_SIZE};
-        dim3 loop(0, 0, DTOPO_STR_112_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-        dtopo_str_112<<<grid, block, 0, T->stream_i>>>
-                         (
-                          T->xx, T->yy, T->zz, 
-                          T->xy, T->xz, T->yz,
-                          T->r1, T->r2, T->r3,
-                          T->r4, T->r5, T->r6,
-                          T->u1, T->v1, T->w1, 
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, 
-                          T->qpi,
-                          T->coeff,
-                          T->qsi,
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->vx1,
-                          T->vx2,
-                          T->ww,
-                          T->wwo,
-                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
-                          T->stress_bounds_left[1]  + shift, 
-                          T->stress_bounds_right[0] + shift, 
-                          T->stress_bounds_ydir[0]  + shift, 
-                          T->stress_bounds_ydir[1]  + shift);
-        CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_velocity_interior_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->velocity_grid_interior.x+TBX-1)/TBX, 
-                   (T->velocity_grid_interior.y+TBY-1)/TBY,
-                   (T->velocity_grid_interior.z+TBZ-1)/TBZ);
-        // Compute velocities in the front send buffer region. 
-        dtopo_vel_111<<<grid, block, 0, T->stream_1>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_front[1]);
-        CUCHK(cudaGetLastError());
-
-        // Compute interior part excluding send buffer regions
-        dtopo_vel_111<<<grid, block, 0, T->stream_i>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[1], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[0]);
-        CUCHK(cudaGetLastError());
-
-        // Compute back send buffer region
-        dtopo_vel_111<<<grid, block, 0, T->stream_2>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_back[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[1]);
-        CUCHK(cudaGetLastError());
-
-        // Adjust grid size for boundary computation
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        // Boundary stencils near free surface
-        
-        dtopo_vel_112<<<grid, block, 0, T->stream_1>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_front[1]);
-        CUCHK(cudaGetLastError());
-
-        dtopo_vel_112<<<grid, block, 0, T->stream_i>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[1], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[0]);
-        CUCHK(cudaGetLastError());
-
-        dtopo_vel_112<<<grid, block, 0, T->stream_2>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_back[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[1]);
-        CUCHK(cudaGetLastError());
-}
-
-void topo_stress_left_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (T->x_rank_l < 0) {
-                return;
-        }
-
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        dim3 block(DTOPO_STR_111_X, DTOPO_STR_111_Y,
-                    DTOPO_STR_111_Z);
-        int3_t size = {(int)T->stress_bounds_left[1] - T->stress_bounds_left[0],
-                       (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
-                       (int)T->stress_grid_interior.z};
-        dim3 loop(0, 0, DTOPO_STR_111_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-
-        int shift = ngsl + 2;
-        dtopo_str_111<<<grid, block, 0, T->stream_1>>>
-                         (
-                          T->xx, T->yy, T->zz, 
-                          T->xy, T->xz, T->yz,
-                          T->r1, T->r2, T->r3,
-                          T->r4, T->r5, T->r6,
-                          T->u1, T->v1, T->w1, 
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, 
-                          T->qpi,
-                          T->coeff,
-                          T->qsi,
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->vx1,
-                          T->vx2,
-                          T->ww,
-                          T->wwo,
-                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
-                          T->stress_bounds_left[0] + shift, 
-                          T->stress_bounds_left[1] + shift, 
-                          T->stress_bounds_ydir[0] + shift, 
-                          T->stress_bounds_ydir[1] + shift);
-        CUCHK(cudaGetLastError());
-
-
-        {
-        dim3 block(DTOPO_STR_112_X, DTOPO_STR_112_Y,
-                    DTOPO_STR_112_Z);
-        int3_t size = {(int)T->stress_bounds_left[1] - T->stress_bounds_left[0],
-                       (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
-                       (int)T->stress_grid_interior.z};
-        dim3 loop(0, 0, DTOPO_STR_112_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-        dtopo_str_112<<<grid, block, 0, T->stream_1>>>
-                         (
-                          T->xx, T->yy, T->zz, 
-                          T->xy, T->xz, T->yz,
-                          T->r1, T->r2, T->r3,
-                          T->r4, T->r5, T->r6,
-                          T->u1, T->v1, T->w1, 
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, 
-                          T->qpi,
-                          T->coeff,
-                          T->qsi,
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->vx1,
-                          T->vx2,
-                          T->ww,
-                          T->wwo,
-                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
-                          T->stress_bounds_left[0] + shift, 
-                          T->stress_bounds_left[1] + shift, 
-                          T->stress_bounds_ydir[0] + shift, 
-                          T->stress_bounds_ydir[1] + shift);
-        CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_stress_right_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (T->x_rank_r < 0) {
-                return;
-        }
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-
-        int shift = ngsl + 2;
-        {
-        dim3 block(DTOPO_STR_111_X, DTOPO_STR_111_Y,
-                    DTOPO_STR_111_Z);
-        int3_t size = {(int)T->stress_bounds_right[1] - T->stress_bounds_left[0],
-                       (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
-                       (int)T->stress_grid_interior.z};
-        dim3 loop(0, 0, DTOPO_STR_111_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-        dtopo_str_111<<<grid, block, 0, T->stream_2>>>
-                         (
-                          T->xx, T->yy, T->zz, 
-                          T->xy, T->xz, T->yz,
-                          T->r1, T->r2, T->r3,
-                          T->r4, T->r5, T->r6,
-                          T->u1, T->v1, T->w1, 
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, 
-                          T->qpi,
-                          T->coeff,
-                          T->qsi,
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->vx1,
-                          T->vx2,
-                          T->ww,
-                          T->wwo,
-                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
-                          T->stress_bounds_right[0] + shift, 
-                          T->stress_bounds_right[1] + shift, 
-                          T->stress_bounds_ydir[0]  + shift, 
-                          T->stress_bounds_ydir[1]  + shift);
-        CUCHK(cudaGetLastError());
-        }
-
-        {
-        dim3 block(DTOPO_STR_112_X, DTOPO_STR_112_Y,
-                    DTOPO_STR_112_Z);
-        int3_t size = {(int)T->stress_bounds_right[1] - T->stress_bounds_left[0],
-                       (int)T->stress_bounds_ydir[1] - T->stress_bounds_ydir[0],
-                       TOP_BOUNDARY_SIZE};
-        dim3 loop(0, 0, DTOPO_STR_112_LOOP_Z);
-        dim3 grid = set_grid(block, size, loop);
-        dtopo_str_112<<<grid, block, 0, T->stream_2>>>
-                         (
-                          T->xx, T->yy, T->zz, 
-                          T->xy, T->xz, T->yz,
-                          T->r1, T->r2, T->r3,
-                          T->r4, T->r5, T->r6,
-                          T->u1, T->v1, T->w1, 
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, 
-                          T->qpi,
-                          T->coeff,
-                          T->qsi,
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->vx1,
-                          T->vx2,
-                          T->ww,
-                          T->wwo,
-                          T->nx, T->ny, T->nz, T->coord[0], T->coord[1], T->nz,
-                          T->stress_bounds_right[0] + shift, 
-                          T->stress_bounds_right[1] + shift, 
-                          T->stress_bounds_ydir[0]  + shift, 
-                          T->stress_bounds_ydir[1]  + shift);
-        CUCHK(cudaGetLastError());
-        }
-}
diff --git a/src/topography/topography.c b/src/topography/topography.c
index b24c90b..dc58bc0 100644
--- a/src/topography/topography.c
+++ b/src/topography/topography.c
@@ -6,13 +6,15 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <awp/error.h>
 #include <grid/grid_3d.h>
 #include <topography/geometry/geometry.h>
 #include <topography/topography.h>
+#include <topography/mapping.h>
 #include <topography/readers/serial_reader.h>
 #include <topography/topography.cuh>
+#include <test/test.h>
 
 topo_t topo_init(const int USETOPO, 
                  const char *INTOPO, 
@@ -29,6 +31,8 @@ topo_t topo_init(const int USETOPO,
                  int nzt,
                  const _prec dt,
                  const _prec h,
+                 const _prec hb,
+                 const _prec ht,
                  cudaStream_t stream_1,
                  cudaStream_t stream_2,
                  cudaStream_t stream_i
@@ -43,6 +47,7 @@ topo_t topo_init(const int USETOPO,
         int slice = myt * mzt;
         int line = mzt;
         int slice_gl = ngsl * mzt;
+        _prec block_height = h * (nzt - 2 - OVERLAP);
 
         topo_t T = {.use = USETOPO, .dbg = TOPO_DBG, 
                     .verbose = TOPO_VERBOSE,
@@ -59,19 +64,17 @@ topo_t topo_init(const int USETOPO,
                     .off_x = {2, 2 + ngsl, 2 + ngsl + nxt, 2 + ngsl2 + nxt},
                     .off_y = {2, 2 + ngsl, 2 + ngsl + nyt, 2 + ngsl2 + nyt},
                     .off_z = {0, align, align + nzt, 2*align + nzt},
-                    // Grid affinity
-                    .sxx = {0, 1, 1}, .syy = {0, 1, 1}, .szz = {0, 1, 1},
-                    .sxy = {1, 0, 1}, .sxz = {1, 1, 0}, .syz = {0, 0, 0},
-                    .su1 = {1, 1, 1}, .sv1 = {0, 0, 1}, .sw1 = {0, 1, 0},
                     .gridsize = gridsize,
                     .slice = slice, .line = line,
                     .slice_gl = slice_gl,
                     .dth = dt/h,
                     .timestep = 1,
                     .gridspacing = h,
+                    .block_height = block_height,
                     .stream_1 = stream_1,
                     .stream_2 = stream_2,
-                    .stream_i = stream_i
+                    .stream_i = stream_i,
+                    .map = map_init(hb / block_height, ht / block_height, h / block_height) 
                    };
 
         if (rank == 0 && T.verbose && T.use) printf("Topography:: enabled\n");
@@ -79,7 +82,7 @@ topo_t topo_init(const int USETOPO,
                 printf("Topography:: debugging enabled\n");
 
         if (T.dbg && rank == 0 && T.use)
-                printf("Topography block size:: %d %d %d\n", TBX, TBY, TBZ);
+                printf("Topography min. block size:: %d %d %d\n", TBX, TBY, TBZ);
 
         topo_set_bounds(&T);
 
@@ -207,25 +210,16 @@ void topo_d_free(topo_t *T)
         CUCHK(cudaFree(T->dcrjz));
 }
 
-void topo_init_metrics(topo_t *T)
-{
-        if (!T->use) return;
-
-        int size[3] = {T->nx, T->ny, T->nz};
-        T->metrics_f = metrics_init_f(size, T->gridspacing);
-        T->metrics_g = metrics_init_g(size, T->gridspacing);
-}
-
 void topo_init_geometry(topo_t *T)
 {
         int err = 0;
         int alloc = 0;
 
         err |= topo_read_serial(T->topography_file, T->rank, T->px, T->py,
-                                T->coord, T->nx, T->ny, alloc, &T->metrics_f.f);
-        geom_no_grid_stretching(&T->metrics_g);
-        geom_custom(&T->metrics_f, T->topography_grid, T->px, T->py,
-                    T->metrics_f.f);
+                                T->coord, T->nx, T->ny, alloc, &T->metrics_f_init.f);
+        geom_grid_stretching(&T->metrics_g, &T->map, T->block_height);
+        geom_custom(&T->metrics_f_init, T->topography_grid, T->px, T->py,
+                    T->metrics_f_init.f);
 
         if (err > 0) {
                 printf("%s \n", error_message(err));
@@ -234,11 +228,24 @@ void topo_init_geometry(topo_t *T)
         }
 }
 
+void topo_init_metrics(topo_t *T)
+{
+        if (!T->use) return;
+        int size[3] = {T->nx, T->ny, T->nz};
+        T->metrics_f_init = metrics_init_f(size, T->gridspacing, metrics_padding);
+        T->metrics_f = metrics_init_f(size, T->gridspacing, ngsl);
+        T->metrics_g = metrics_init_g(size, T->gridspacing);
+}
+
 void topo_build(topo_t *T)
 {
         if (!T->use) return;
 
-        metrics_build_f(&T->metrics_f);
+        metrics_build_f(&T->metrics_f_init);
+        metrics_shift_f(&T->metrics_f, &T->metrics_f_init);
+        metrics_d_copy_f(&T->metrics_f);
+        metrics_free_f(&T->metrics_f_init);
+
         metrics_build_g(&T->metrics_g);
 
         #if TOPO_USE_CONST_MATERIAL
diff --git a/src/topography/topography.cu b/src/topography/topography.cu
deleted file mode 100644
index dc881ec..0000000
--- a/src/topography/topography.cu
+++ /dev/null
@@ -1,780 +0,0 @@
-#include <cuda.h>
-#include <stdio.h>
-
-#include <topography/topography.h>
-#include <topography/topography.cuh>
-#include <topography/kernels/unoptimized.cuh>
-#include <awp/definitions.h>
-
-void topo_init_material_H(topo_t *T)
-{
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->mx+TBX-1)/TBX, 
-                   (T->my+TBY-1)/TBY,
-                   (T->mz+TBZ-1)/TBZ);
-
-
-        // Apply material properties inside and outside ghost region
-        dtopo_init_material_111<<<grid, block>>>(T->lami, T->mui, T->rho,
-                                                 T->mx, T->my, T->mz);
-
-        CUCHK(cudaGetLastError());
-}
-void topo_velocity_interior_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->velocity_grid_interior.x+TBX-1)/TBX, 
-                   (T->velocity_grid_interior.y+TBY-1)/TBY,
-                   (T->velocity_grid_interior.z+TBZ-1)/TBZ);
-
-        if (TOPO_DBG) {
-        printf("grid: %d %d %d block: %d %d %d \n", 
-                        grid.x, grid.y, grid.z,
-                        block.x, block.y, block.z);
-        printf("n = %d %d %d \n", T->nx, T->ny, T->nz);
-        }
-
-        // Compute velocities in the front send buffer region. 
-        dtopo_vel_111<<<grid, block, 0, T->stream_1>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_front[1]);
-        CUCHK(cudaGetLastError());
-
-        // Compute interior part excluding send buffer regions
-        dtopo_vel_111<<<grid, block, 0, T->stream_i>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[1], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[0]);
-        CUCHK(cudaGetLastError());
-
-        // Compute back send buffer region
-        dtopo_vel_111<<<grid, block, 0, T->stream_2>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_back[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[1]);
-        CUCHK(cudaGetLastError());
-
-        // Adjust grid size for boundary computation
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        // Boundary stencils near free surface
-        
-        dtopo_vel_112<<<grid, block, 0, T->stream_1>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_front[1]);
-        CUCHK(cudaGetLastError());
-
-        dtopo_vel_112<<<grid, block, 0, T->stream_i>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[1], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[0]);
-        CUCHK(cudaGetLastError());
-
-        dtopo_vel_112<<<grid, block, 0, T->stream_2>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_back[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[1]);
-        CUCHK(cudaGetLastError());
-
-        // This kernel only runs in debug mode because it applies one-sided
-        // stencils at depth
-        if (TOPO_DBG) { 
-                dtopo_vel_110<<<grid, block, 0, T->stream_i>>>(
-                                                   T->u1, T->v1, T->w1,
-                                                   T->dcrjx, T->dcrjy, T->dcrjz,
-                                                   T->metrics_f.d_f,
-                                                   T->metrics_f.d_f1_1,
-                                                   T->metrics_f.d_f1_2,
-                                                   T->metrics_f.d_f1_c,
-                                                   T->metrics_f.d_f2_1,
-                                                   T->metrics_f.d_f2_2,
-                                                   T->metrics_f.d_f2_c,
-                                                   T->metrics_f.d_f_1,
-                                                   T->metrics_f.d_f_2,
-                                                   T->metrics_f.d_f_c,
-                                                   T->metrics_g.d_g,
-                                                   T->metrics_g.d_g3,
-                                                   T->metrics_g.d_g3_c,
-                                                   T->metrics_g.d_g_c,
-                                                   T->rho,
-                                                   T->xx, T->xy, T->xz, 
-                                                   T->yy, T->yz, T->zz,
-                                                   T->timestep, T->dth,
-                                                   T->nx, T->ny, T->nz,
-                                                   T->velocity_bounds_left[0],
-                                                   T->velocity_bounds_front[0], 
-                                                   T->velocity_bounds_right[1],
-                                                   T->velocity_bounds_back[1]);
-                CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_velocity_front_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (T->y_rank_f < 0) {
-                return;
-        }
-
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        //FIXME: Adjust grid size for boundary
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->velocity_grid_front.x+TBX-1)/TBX, 
-                   (T->velocity_grid_front.y+TBY-1)/TBY,
-                   (T->velocity_grid_front.z+TBZ-1)/TBZ);
-
-        dtopo_buf_vel_111<<<grid, block, 0, T->stream_1>>>
-                         (T->f_u1, T->f_v1, T->f_w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->rho,
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->timestep, T->dth,
-                          T->nx, T->ny, T->nz,
-                          0, T->velocity_grid_front.y,
-                          T->velocity_bounds_front[0]);  
-        CUCHK(cudaGetLastError());
-
-        // Boundary stencils near free surface
-        // Adjust grid size for boundary computation
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        dtopo_buf_vel_112<<<grid, block, 0, T->stream_1>>>
-                         (T->f_u1, T->f_v1, T->f_w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->rho,
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->timestep, T->dth,
-                          T->nx, T->ny, T->nz,
-                          0, T->velocity_grid_front.y,
-                          T->velocity_bounds_front[0]);  
-
-        CUCHK(cudaGetLastError());
-
-        // This kernel only runs in debug mode because it applies one-sided
-        // stencils at depth
-        if (TOPO_DBG) { 
-                dtopo_buf_vel_110<<<grid, block, 0, T->stream_1>>>
-                         (T->f_u1, T->f_v1, T->f_w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->rho,
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->timestep, T->dth,
-                          T->nx, T->ny, T->nz,
-                          0, T->velocity_grid_front.y,
-                          T->velocity_bounds_front[0]);  
-                CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_velocity_back_H(topo_t *T)
-{
-        if (!T->use) return;
-        if (T->y_rank_b < 0) {
-                return;
-        }
-
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        //FIXME: Adjust grid size for boundary
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->velocity_grid_back.x+TBX-1)/TBX, 
-                   (T->velocity_grid_back.y+TBY-1)/TBY,
-                   (T->velocity_grid_back.z+TBZ-1)/TBZ);
-
-        dtopo_buf_vel_111<<<grid, block, 0, T->stream_2>>>
-                         (T->b_u1, T->b_v1, T->b_w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->rho,
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->timestep, T->dth,
-                          T->nx, T->ny, T->nz,
-                          0, T->velocity_grid_back.y,
-                          T->velocity_bounds_back[0]);  
-        CUCHK(cudaGetLastError());
-
-        // Boundary stencils near free surface
-        // Adjust grid size for boundary computation
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        dtopo_buf_vel_112<<<grid, block, 0, T->stream_2>>>
-                         (T->b_u1, T->b_v1, T->b_w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->rho,
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->timestep, T->dth,
-                          T->nx, T->ny, T->nz,
-                          0, T->velocity_grid_back.y,
-                          T->velocity_bounds_back[0]);  
-        CUCHK(cudaGetLastError());
-
-        // This kernel only runs in debug mode because it applies one-sided
-        // stencils at depth
-        if (TOPO_DBG) { 
-                dtopo_buf_vel_110<<<grid, block, 0, T->stream_2>>>
-                         (T->b_u1, T->b_v1, T->b_w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->rho,
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->timestep, T->dth,
-                          T->nx, T->ny, T->nz,
-                          0, T->velocity_grid_back.y,
-                          T->velocity_bounds_back[0]);  
-                CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_stress_interior_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-        //FIXME: Adjust grid size for boundary
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->stress_grid_interior.x+TBX-1)/TBX, 
-                   (T->stress_grid_interior.y+TBY-1)/TBY,
-                   (T->stress_grid_interior.z+TBZ-1)/TBZ);
-
-
-        dtopo_str_111<<<grid, block, 0, T->stream_i>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-
-        // Adjust grid size for boundary computation
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        dtopo_str_112<<<grid, block, 0, T->stream_i>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[1]);
-
-        CUCHK(cudaGetLastError());
-
-        if (TOPO_DBG) {
-                dtopo_str_110<<<grid, block, 0, T->stream_i>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[1]);
-                CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_stress_left_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (T->x_rank_l < 0) {
-                return;
-        }
-
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-
-        //FIXME: Adjust grid size for boundary
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->stress_grid_left.x+TBX-1)/TBX, 
-                   (T->stress_grid_left.y+TBY-1)/TBY,
-                   (T->stress_grid_left.z+TBZ-1)/TBZ);
-
-        dtopo_str_111<<<grid, block, 0, T->stream_1>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-
-
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        dtopo_str_112<<<grid, block, 0, T->stream_1>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-
-        if (TOPO_DBG) {
-                dtopo_str_110<<<grid, block, 0, T->stream_1>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_left[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_left[1], T->stress_bounds_ydir[1]);
-                CUCHK(cudaGetLastError());
-        }
-}
-
-void topo_stress_right_H(topo_t *T)
-{
-
-        if (!T->use) return;
-        if (T->x_rank_r < 0) {
-                return;
-        }
-        if (TOPO_DBG) {
-                printf("launching %s(%d)\n", __func__, T->rank);
-        }
-
-        //FIXME: Adjust grid size for boundary
-        dim3 block (TBX, TBY, TBZ);
-        dim3 grid ((T->stress_grid_right.x+TBX-1)/TBX, 
-                   (T->stress_grid_right.y+TBY-1)/TBY,
-                   (T->stress_grid_right.z+TBZ-1)/TBZ);
-
-
-        dtopo_str_111<<<grid, block, 0, T->stream_2>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[1], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-
-        grid.z = (TOP_BOUNDARY_SIZE+TBZ-1)/TBZ;
-        dtopo_str_112<<<grid, block, 0, T->stream_2>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[1], T->stress_bounds_ydir[1]);
-        CUCHK(cudaGetLastError());
-
-        if (TOPO_DBG) {
-                dtopo_str_110<<<grid, block, 0, T->stream_2>>>
-                         (
-                          T->xx, T->xy, T->xz, 
-                          T->yy, T->yz, T->zz,
-                          T->u1, T->v1, T->w1, 
-                          T->dcrjx, T->dcrjy, T->dcrjz,
-                          T->metrics_f.d_f,
-                          T->metrics_f.d_f1_1,
-                          T->metrics_f.d_f1_2,
-                          T->metrics_f.d_f1_c,
-                          T->metrics_f.d_f2_1,
-                          T->metrics_f.d_f2_2,
-                          T->metrics_f.d_f2_c,
-                          T->metrics_f.d_f_1,
-                          T->metrics_f.d_f_2,
-                          T->metrics_f.d_f_c,
-                          T->metrics_g.d_g,
-                          T->metrics_g.d_g3,
-                          T->metrics_g.d_g3_c,
-                          T->metrics_g.d_g_c,
-                          T->lami,
-                          T->mui, T->timestep,  
-                          T->dth, 
-                          T->nx, T->ny, T->nz,
-                          T->stress_bounds_right[0], T->stress_bounds_ydir[0], 
-                          T->stress_bounds_right[1], T->stress_bounds_ydir[1]);
-                CUCHK(cudaGetLastError());
-        }
-}
diff --git a/src/topography/velocity.cu b/src/topography/velocity.cu
index c7b4c6b..ece9be4 100644
--- a/src/topography/velocity.cu
+++ b/src/topography/velocity.cu
@@ -2,10 +2,137 @@
 #include <nvToolsExt.h>
 #include <stdio.h>
 
-#include <topography/kernels/optimized_velocity.cuh>
-#include <topography/kernels/optimized_launch_config.cuh>
+#include <awp/definitions.h>
 #include <topography/velocity.cuh>
+#include <topography/topography.cuh>
+
+
+// Kernel naming convention
+// 110: Bottom boundary (only used in debug mode)
+// 111: Interior
+// 112: Top boundary
+
+// Number of threads per block to use for interior velocity kernel
+#ifndef VEL_INT_X
+#define VEL_INT_X 64
+#endif
+#ifndef VEL_INT_Y
+#define VEL_INT_Y 4
+#endif
+#ifndef VEL_INT_Z
+#define VEL_INT_Z 4
+#endif
+
+// Number of threads per block to use for boundary velocity kernel
+#ifndef VEL_BND_X
+#define VEL_BND_X 7
+#endif
+#ifndef VEL_BND_Y
+#define VEL_BND_Y 8
+#endif
+#ifndef VEL_BND_Z
+#define VEL_BND_Z 1
+#endif
+
+// Number of threads per block for each kernel; every value can be overridden at compile time.
+// The X, Y, Z suffixes refer to the CUDA x, y, z dimensions.
+#ifndef DTOPO_VEL_110_X
+#define DTOPO_VEL_110_X VEL_BND_X
+#endif
+#ifndef DTOPO_VEL_110_Y
+#define DTOPO_VEL_110_Y VEL_BND_Y
+#endif
+#ifndef DTOPO_VEL_110_Z
+#define DTOPO_VEL_110_Z VEL_BND_Z
+#endif
+
+#ifndef DTOPO_VEL_111_X
+#define DTOPO_VEL_111_X VEL_INT_X
+#endif
+#ifndef DTOPO_VEL_111_Y
+#define DTOPO_VEL_111_Y VEL_INT_Y
+#endif
+#ifndef DTOPO_VEL_111_Z
+#define DTOPO_VEL_111_Z VEL_INT_Z
+#endif
+
+#ifndef DTOPO_VEL_112_X
+#define DTOPO_VEL_112_X VEL_BND_X
+#endif
+#ifndef DTOPO_VEL_112_Y
+#define DTOPO_VEL_112_Y VEL_BND_Y
+#endif
+#ifndef DTOPO_VEL_112_Z
+#define DTOPO_VEL_112_Z VEL_BND_Z
+#endif
+
+#ifndef DTOPO_BUF_VEL_111_X
+#define DTOPO_BUF_VEL_111_X VEL_INT_X
+#endif
+#ifndef DTOPO_BUF_VEL_111_Y
+#define DTOPO_BUF_VEL_111_Y VEL_INT_Y
+#endif
+#ifndef DTOPO_BUF_VEL_111_Z
+#define DTOPO_BUF_VEL_111_Z VEL_INT_Z
+#endif
+
+#ifndef DTOPO_BUF_VEL_112_X
+#define DTOPO_BUF_VEL_112_X VEL_BND_X
+#endif
+#ifndef DTOPO_BUF_VEL_112_Y
+#define DTOPO_BUF_VEL_112_Y VEL_BND_Y
+#endif
+#ifndef DTOPO_BUF_VEL_112_Z
+#define DTOPO_BUF_VEL_112_Z VEL_BND_Z
+#endif
+
+#ifndef DTOPO_BUF_VEL_110_X
+#define DTOPO_BUF_VEL_110_X VEL_BND_X
+#endif
+#ifndef DTOPO_BUF_VEL_110_Y
+#define DTOPO_BUF_VEL_110_Y VEL_BND_Y
+#endif
+#ifndef DTOPO_BUF_VEL_110_Z
+#define DTOPO_BUF_VEL_110_Z VEL_BND_Z
+#endif
+
+// Per-kernel thread limits. NOTE(review): default 110 block (VEL_BND_X*VEL_BND_Y*VEL_BND_Z = 7*8*1 = 56) exceeds the 32 below -- confirm how this limit is used.
+#ifndef DTOPO_VEL_110_MAX_THREADS_PER_BLOCK
+#define DTOPO_VEL_110_MAX_THREADS_PER_BLOCK 32
+#endif
+
+#ifndef DTOPO_VEL_111_MAX_THREADS_PER_BLOCK
+#define DTOPO_VEL_111_MAX_THREADS_PER_BLOCK 1024
+#endif
+
+#ifndef DTOPO_VEL_112_MAX_THREADS_PER_BLOCK
+#define DTOPO_VEL_112_MAX_THREADS_PER_BLOCK 64
+#endif
+
+#ifndef DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK
+#define DTOPO_BUF_VEL_110_MAX_THREADS_PER_BLOCK 1024
+#endif
+
+#ifndef DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK
+#define DTOPO_BUF_VEL_111_MAX_THREADS_PER_BLOCK 1024
+#endif
+
+#ifndef DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK
+#define DTOPO_BUF_VEL_112_MAX_THREADS_PER_BLOCK 1024
+#endif
+
+// Apply loop in kernel
+// This option must be compatible with the kernel. If there is no loop in the
+// kernel, turn off this option, and vice versa.
+#define DTOPO_VEL_110_LOOP_Z 1
+#define DTOPO_VEL_111_LOOP_Z 0
+#define DTOPO_VEL_112_LOOP_Z 0
+#define DTOPO_BUF_VEL_110_LOOP_Z 1
+#define DTOPO_BUF_VEL_111_LOOP_Z 0
+#define DTOPO_BUF_VEL_112_LOOP_Z 0
+
 #include "kernels/velocity_unroll.cu"
+#include "kernels/velocity.cu"
 
 inline dim3 set_grid(const dim3 block, const int3_t size, const dim3 loop)
 {
diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt
index 4dc6b74..fcd440f 100644
--- a/src/utils/CMakeLists.txt
+++ b/src/utils/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/utils/copy.h
+    ${AWP_SOURCE_DIR}/include/utils/copy.h
     )
 
 add_library(utils
@@ -8,5 +8,5 @@ add_library(utils
 
 target_include_directories(utils
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
diff --git a/src/vtk/CMakeLists.txt b/src/vtk/CMakeLists.txt
index 7133a85..d43fa45 100644
--- a/src/vtk/CMakeLists.txt
+++ b/src/vtk/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(HEADERS
-    ${AWP_MINI_SOURCE_DIR}/include/awp/definitions.h
-    ${AWP_MINI_SOURCE_DIR}/include/vtk/vtk.h
+    ${AWP_SOURCE_DIR}/include/awp/definitions.h
+    ${AWP_SOURCE_DIR}/include/vtk/vtk.h
     )
 
 add_library(vtk
@@ -9,5 +9,5 @@ add_library(vtk
 
 target_include_directories(vtk
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
diff --git a/src/vtk/vtk.c b/src/vtk/vtk.c
index 18cee13..8b01a93 100644
--- a/src/vtk/vtk.c
+++ b/src/vtk/vtk.c
@@ -3,7 +3,7 @@
 #include <assert.h>
 #include <math.h>
 
-#include <awp/definitions.h>
+#include <awp/pmcl3d_cons.h>
 #include <vtk/vtk.h>
 
 size_t vtk_write_grid(const char *fname, 
diff --git a/test111/Makefile b/test111/Makefile
index 2c2dcae..6bf95eb 100644
--- a/test111/Makefile
+++ b/test111/Makefile
@@ -3,9 +3,9 @@ nx=512
 ny=512
 nz=512
 nt=10
-arch=sm_70
+arch=sm_75
 log=prof.txt
-args=
+args=-lineinfo
 all: c p
 exe=test111.x
 
diff --git a/test111/stress.cu b/test111/stress.cu
index 007766c..9e13ecf 100644
--- a/test111/stress.cu
+++ b/test111/stress.cu
@@ -1,4 +1,4 @@
-#define CURVILINEAR
+//#define CURVILINEAR
 #define _f(i, j) f[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
 #define _f_1(i, j) f_1[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
 #define _f_2(i, j) f_2[(j) + align + (i) * (2 * align + 2 * ngsl + ny + 4)]
@@ -676,3 +676,4 @@ __global__ void dtopo_str_111(_prec*  RSTRCT xx, _prec*  RSTRCT yy, _prec*  RSTR
 #undef _g_c
 #undef _g
 #undef _g3
+
diff --git a/test111/test111.cu b/test111/test111.cu
index cc4cba9..2c1e425 100644
--- a/test111/test111.cu
+++ b/test111/test111.cu
@@ -45,6 +45,10 @@
 #ifndef USE_STRESS_MACRO_PLANES
 #define USE_STRESS_MACRO_PLANES 1
 #endif
+
+#ifndef USE_STRESS_CARTESIAN
+#define USE_STRESS_CARTESIAN 1
+#endif
 //-----------------------------------------------------------------------------
 
 //-----------------------------------------------------------------------------
@@ -52,11 +56,11 @@
 
 // Threads in x, y, z
 #ifndef STR_TX
-#define STR_TX 64
+#define STR_TX 32
 #endif
 
 #ifndef STR_TY
-#define STR_TY 8
+#define STR_TY 2
 #endif
 
 #ifndef STR_TZ
@@ -1780,6 +1784,7 @@ __global__ void chknan(const float *RSTRCT u1,
 #include "stress_macro_planes.cu"
 #include "stress_index.cu"
 #include "stress_index_unroll.cu"
+//#include "stress_cartesian.cu"
 
 #undef RSTRCT
 // *****************************************************************************
@@ -2338,6 +2343,7 @@ int main (int argc, char **argv) {
         dim3 blocks ((nz-4)/(threads.x)+1, 
                      (ny-1)/(threads.y)+1,
                      1);
+        CUCHK(cudaFuncSetCacheConfig(dtopo_str_111<STR_TX,STR_TY,STR_TZ>, cudaFuncCachePreferL1)); // check the returned status
         dtopo_str_111<STR_TX, STR_TY, STR_TZ><<<blocks, threads>>>(
             s11, s22, s33, s12, s13, s23, r1, r2, r3, r4, r5, r6, u1, u2, u3, f,
             f1_1, f1_2, f1_c, f2_1, f2_2, f2_c, f_1, f_2, f_c, g, g3, g3_c, g_c,
@@ -2595,6 +2601,8 @@ int main (int argc, char **argv) {
         dim3 blocks((nz - 4) / (na * threads.x) + 1,
                     (ny - 1) / (nb * threads.y) + 1,
                     (nx - 1) / (threads.z) + 1);
+        // The cache preference is only a hint; CUCHK still surfaces any error status the call returns.
+        CUCHK(cudaFuncSetCacheConfig(dtopo_str_111_index_unroll<STRIU_TX,STRIU_TY,STRIU_TZ, na, nb>, cudaFuncCachePreferL1));
         dtopo_str_111_index_unroll<STRIU_TX, STRIU_TY, STRIU_TZ, na, nb><<<blocks, threads>>>(
             t11, t22, t33, t12, t13, t23, p1, p2, p3, p4, p5, p6, u1, u2, u3, f,
             f1_1, f1_2, f1_c, f2_1, f2_2, f2_c, f_1, f_2, f_c, g, g3, g3_c, g_c,
@@ -2652,6 +2660,7 @@ int main (int argc, char **argv) {
 #endif
 
 
+
   }
 
   CUCHK(cudaDeviceSynchronize());
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f065e17..9aeaa67 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -24,42 +24,17 @@ set(OPT_LIBRARIES
         ${LIBRARIES}
         )
 
-# Unoptimized version
-add_executable(test_unoptimized_kernels topography_kernels.cu)
-
-target_link_libraries(test_unoptimized_kernels 
-        ${UNOPT_LIBRARIES} 
-        )
-
-target_include_directories(test_unoptimized_kernels
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-        )
-target_compile_definitions(test_unoptimized_kernels 
-        PUBLIC USE_OPTIMIZED_KERNELS=0)
-
-# Optimized version
-add_executable(test_optimized_kernels topography_kernels.cu)
-
-target_link_libraries(test_optimized_kernels 
-        ${LIBRARIES} 
-        opt_topography_attenuation
-        )
-
-target_compile_definitions(test_optimized_kernels 
-        PUBLIC USE_OPTIMIZED_KERNELS=1)
-
 # Attenuation test
 add_executable(test_attenuation test_attenuation.cu)
 
 target_link_libraries(test_attenuation 
         ${LIBRARIES} 
         awp
-        opt_topography_attenuation
+        topography
         )
 
 target_include_directories(test_attenuation
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
diff --git a/tests/buffers/CMakeLists.txt b/tests/buffers/CMakeLists.txt
index ff27b12..ef603b2 100644
--- a/tests/buffers/CMakeLists.txt
+++ b/tests/buffers/CMakeLists.txt
@@ -8,7 +8,7 @@ target_link_libraries(test_buffer
 
 target_include_directories(test_buffer
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_buffer COMMAND test_buffer)
diff --git a/tests/fixtures/source_dm.txt b/tests/fixtures/source_dm.txt
index 234ba22..0e7298d 100644
--- a/tests/fixtures/source_dm.txt
+++ b/tests/fixtures/source_dm.txt
@@ -13,6 +13,6 @@ num_writes=2
 # * the second source should map to i=0 j=0 in block 1 (bottom block)
 
 coordinates
-0 4.5 1.5 -2.0
-0 4.5 1.5 -11.0
-0 4.5 1.5 -34.0
+0 9.0 9.0 -2.0
+0 9.0 9.0 -11.0
+0 9.0 9.0 -34.0
diff --git a/tests/fixtures/source_x.txt b/tests/fixtures/source_x.txt
new file mode 100644
index 0000000..90b81e8
--- /dev/null
+++ b/tests/fixtures/source_x.txt
@@ -0,0 +1,14 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+
+coordinates
+0 9.0 8.5 -2.0
+0 9.0 9.5 -11.0
+0 9.0 7.5 -34.0
diff --git a/tests/fixtures/source_xx.txt b/tests/fixtures/source_xx.txt
new file mode 100644
index 0000000..32e7959
--- /dev/null
+++ b/tests/fixtures/source_xx.txt
@@ -0,0 +1,13 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+coordinates
+0 9.5 8.5 -2.0
+0 10.5 9.5 -11.0
+0 13.5 7.5 -34.0
diff --git a/tests/fixtures/source_xy.txt b/tests/fixtures/source_xy.txt
new file mode 100644
index 0000000..7041aa5
--- /dev/null
+++ b/tests/fixtures/source_xy.txt
@@ -0,0 +1,18 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+# If the finest grid has grid spacing h = 1.0 (top block),
+# then:
+# * the first source should map to i=1 j=2 in block 0 (top block)
+# * the second source should map to i=0 j=0 in block 1 (bottom block)
+
+coordinates
+0 9.0 9.0 -2.0
+0 9.0 10.0 -11.0
+0 9.0 12.0 -34.0
diff --git a/tests/fixtures/source_xz.txt b/tests/fixtures/source_xz.txt
new file mode 100644
index 0000000..90b81e8
--- /dev/null
+++ b/tests/fixtures/source_xz.txt
@@ -0,0 +1,14 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+
+coordinates
+0 9.0 8.5 -2.0
+0 9.0 9.5 -11.0
+0 9.0 7.5 -34.0
diff --git a/tests/fixtures/source_y.txt b/tests/fixtures/source_y.txt
new file mode 100644
index 0000000..279e6fa
--- /dev/null
+++ b/tests/fixtures/source_y.txt
@@ -0,0 +1,14 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+
+coordinates
+0 9.5 9.0 -2.0
+0 10.5 10.0 -11.0
+0 13.5 12.0 -34.0
diff --git a/tests/fixtures/source_yz.txt b/tests/fixtures/source_yz.txt
new file mode 100644
index 0000000..279e6fa
--- /dev/null
+++ b/tests/fixtures/source_yz.txt
@@ -0,0 +1,14 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+
+coordinates
+0 9.5 9.0 -2.0
+0 10.5 10.0 -11.0
+0 13.5 12.0 -34.0
diff --git a/tests/fixtures/source_z.txt b/tests/fixtures/source_z.txt
new file mode 100644
index 0000000..e36dad8
--- /dev/null
+++ b/tests/fixtures/source_z.txt
@@ -0,0 +1,14 @@
+2.1.0
+file=source
+length=3 
+steps=600
+stride=1
+degree=0
+gpu_buffer_size=2
+cpu_buffer_size=3
+num_writes=2
+
+coordinates
+0 9.5 8.5 -2.0
+0 10.5 9.5 -11.0
+0 13.5 7.5 -34.0
diff --git a/tests/grid/CMakeLists.txt b/tests/grid/CMakeLists.txt
index 884941d..35afa2b 100644
--- a/tests/grid/CMakeLists.txt
+++ b/tests/grid/CMakeLists.txt
@@ -8,8 +8,7 @@ target_link_libraries(test_grid_3d
 
 target_include_directories(test_grid_3d
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_grid_3d COMMAND test_grid_3d)
-
diff --git a/tests/grid/test_grid_3d.c b/tests/grid/test_grid_3d.c
index 7f9a251..b20e2c9 100644
--- a/tests/grid/test_grid_3d.c
+++ b/tests/grid/test_grid_3d.c
@@ -17,6 +17,11 @@ int test_grid_xyz(int rank, int size);
 int test_grid3_xyz(int rank, int size);
 int test_grid3_reduce(int rank, int size);
 int test_shift(int rank, int size);
+int test_global_to_local(int rank, int size);
+
+// test_global_to_local() is run from main() together with the other tests;
+// its return value is OR-ed into err.
+
 
 int main(int argc, char **argv)
 {
@@ -40,6 +45,7 @@ int main(int argc, char **argv)
         err |= test_grid3_xyz(rank, size);
         err |= test_grid3_reduce(rank, size);
         err |= test_shift(rank, size);
+        err |= test_global_to_local(rank, size);
 
         if (rank == 0) {
                 printf("Testing completed.\n");
@@ -64,7 +70,7 @@ int test_grid_fill(int rank, int size)
                         .boundary1 = 0, .boundary2 = 0};
 
         x = malloc(sizeof(x) * grid.size);
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 1);
         err |= mpi_assert(!err, rank);
         err |= mpi_assert(fabs(x[0] - (0.0 +  rank * n)) < FLTOL, rank);
         err |= mpi_assert(fabs(x[1] - (1.0 +  rank * n)) < FLTOL, rank);
@@ -78,7 +84,7 @@ int test_grid_fill(int rank, int size)
         grid1_t grid = {.id = rank, .shift = 0, .size = n, .gridspacing = h, 
                         .boundary1 = 1, .boundary2 = 0};
 
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 1);
         err |= mpi_assert(!err, rank);
         err |= mpi_assert(fabs(x[0] - (0.0 +  rank * n)) < FLTOL, rank);
         err |= mpi_assert(fabs(x[1] - (1.0 +  rank * n)) < FLTOL, rank);
@@ -92,7 +98,7 @@ int test_grid_fill(int rank, int size)
         grid1_t grid = {.id = rank, .shift = 0, .size = n, .gridspacing = h, 
                         .boundary1 = 0, .boundary2 = 1};
 
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 1);
         err |= mpi_assert(!err, rank);
         err |= mpi_assert(fabs(x[0] - (0.0 +  rank * n)) < FLTOL, rank);
         err |= mpi_assert(fabs(x[1] - (1.0 +  rank * n)) < FLTOL, rank);
@@ -106,11 +112,11 @@ int test_grid_fill(int rank, int size)
         grid1_t grid = {.id = rank, .shift = 1, .size = n, .gridspacing = h, 
                         .boundary1 = 0, .boundary2 = 0};
 
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 1);
         err |= mpi_assert(!err, rank);
-        err |= mpi_assert(fabs(x[0] - (-0.5 +  rank * n)) < FLTOL, rank);
-        err |= mpi_assert(fabs(x[1] - (+0.5 +  rank * n)) < FLTOL, rank);
-        err |= mpi_assert(fabs(x[n-1] - (n - 1  - 0.5 + rank * n) ) < FLTOL, 
+        err |= mpi_assert(fabs(x[0] - (0.5 +  rank * n)) < FLTOL, rank);
+        err |= mpi_assert(fabs(x[1] - (1.5 +  rank * n)) < FLTOL, rank);
+        err |= mpi_assert(fabs(x[n-1] - (n - 1  + 0.5 + rank * n) ) < FLTOL, 
                           rank);
 
         err |= test_finalize(&test, err);
@@ -121,7 +127,7 @@ int test_grid_fill(int rank, int size)
         grid1_t grid = {.id = rank, .shift = 1, .size = n, .gridspacing = h, 
                         .boundary1 = 1, .boundary2 = 0};
 
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 0);
         err |= mpi_assert(!err, rank);
         err |= mpi_assert(fabs(x[0] - (0.0 +  rank * n)) < FLTOL, rank);
         err |= mpi_assert(fabs(x[1] - (0.5 +  rank * n)) < FLTOL, rank);
@@ -136,7 +142,7 @@ int test_grid_fill(int rank, int size)
         grid1_t grid = {.id = rank, .shift = 1, .size = n, .gridspacing = h, 
                         .boundary1 = 0, .boundary2 = 1};
 
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 0);
         err |= mpi_assert(!err, rank);
         err |= mpi_assert(fabs(x[0] - (-0.5 +  rank * n)) < FLTOL, rank);
         err |= mpi_assert(fabs(x[1] - (+0.5 +  rank * n)) < FLTOL, rank);
@@ -159,7 +165,7 @@ int test_grid_in_bounds(int rank, int size)
 
         x = malloc(sizeof(x) * n);
 
-        int3_t shift = grid_yz();
+        int3_t shift = {0, 0, 0};
         
         int3_t coord = {.x = 0, .y = 0, .z = 0};
         int3_t asize = {gsize[0], gsize[1], gsize[2]};
@@ -173,7 +179,7 @@ int test_grid_in_bounds(int rank, int size)
                          .alignment = 2 + ngsl,
                          .padding = 0,
                          .gridspacing = 1.0};
-        grid_fill1(x, grid1);
+        grid_fill1(x, grid1, 0);
 
         test_t test = test_init(" * grid_in_bounds", rank, size);
         err |= mpi_assert(
@@ -193,7 +199,7 @@ int test_grid_in_bounds(int rank, int size)
                          .alignment = 1 + ngsl,
                          .padding = 1,
                          .gridspacing = 1.0};
-        grid_fill1(x, grid1);
+        grid_fill1(x, grid1, 0);
 
         test_t test = test_init(" * grid_in_bounds", rank, size);
         err |= mpi_assert(
@@ -237,7 +243,7 @@ int test_grid_xyz(int rank, int size)
         test_t test = test_init(" * grid_fill_x", rank, size);
 
         grid1_t grid1 = grid_grid1_x(grid);
-        grid_fill1(ans, grid1);
+        grid_fill1(ans, grid1, 1);
         grid_fill_x(x, grid);
 
         for (int i = 0; i < n; ++i) {
@@ -251,7 +257,7 @@ int test_grid_xyz(int rank, int size)
         test_t test = test_init(" * grid_fill_y", rank, size);
 
         grid1_t grid1 = grid_grid1_y(grid);
-        grid_fill1(ans, grid1);
+        grid_fill1(ans, grid1, 0);
         grid_fill_y(x, grid);
 
         for (int i = 0; i < n; ++i) {
@@ -265,7 +271,7 @@ int test_grid_xyz(int rank, int size)
         test_t test = test_init(" * grid_fill_z", rank, size);
         grid1_t grid1 = grid_grid1_z(grid);
 
-        grid_fill1(ans, grid1);
+        grid_fill1(ans, grid1, 0);
         grid_fill_z(x, grid);
 
         for (int i = 0; i < n; ++i) {
@@ -492,3 +498,69 @@ int test_shift(int rank, int size)
         return test_last_error();
 }
 
+int test_global_to_local(int rank, int size) {
+    // Checks that global_to_local() yields the expected block index and local z for several global z values.
+    int err = 0;
+    // Three blocks with nz[] grid points each; heights H[] are computed by grid_height() using spacings h, 3h, 9h.
+    // NOTE(review): the last grid_height() argument is 1 only for block 0 -- presumably a topography flag; confirm.
+    test_t test = test_init(" * global_to_local", rank, size);
+    const int num_grids = 3;
+    int nz[3] = {20, 10, 12};
+    _prec h = 1.0;
+    const prec H[3] = {grid_height(nz[0], h, 1), grid_height(nz[1], 3 * h, 0),
+                       grid_height(nz[2], 9 * h, 0)};
+
+    // Above free surface (in topo block): expect block 0 and zloc = zglb + H[0]
+    {
+        _prec zglb = 0.2;
+        _prec zloc = 0.0;
+        int block_index = -1;
+        int istopo = 1;
+        global_to_local(&zloc, &block_index, zglb, h, nz, num_grids, istopo);
+        err |= mpi_assert(block_index == 0, rank);
+        err |= mpi_assert(fabs(zloc - (zglb + H[0])) < FLTOL, rank);
+    }
+
+    // Below free surface (in topo block): same expectation, block 0 and zloc = zglb + H[0]
+    {
+        _prec zglb = -0.2;
+        _prec zloc = 0.0;
+        int block_index = -1;
+        int istopo = 1;
+        global_to_local(&zloc, &block_index, zglb, h, nz, num_grids, istopo);
+        err |= mpi_assert(block_index == 0, rank);
+        err |= mpi_assert(fabs(zloc - (zglb + H[0]) ) < FLTOL, rank);
+    }
+
+    // In the overlap zone (belongs to the second block): expect block 1
+    {
+        _prec zglb = -15.0;
+        _prec zloc = 0.0;
+        int block_index = -1;
+        int istopo = 1;
+        global_to_local(&zloc, &block_index, zglb, h, nz, num_grids, istopo);
+        err |= mpi_assert(block_index == 1, rank);
+
+        _prec zs = (zglb + H[0] + H[1] - grid_overlap(h) );
+        err |= mpi_assert(fabs(zloc - zs) < FLTOL, rank);
+    }
+
+    // The expected value below subtracts one grid_overlap() term per block interface crossed.
+    // In the overlap zone (belongs to the third block): expect block 2
+    {
+        _prec zglb = -19.0;
+        _prec zloc = 0.0;
+        int block_index = -1;
+        int istopo = 1;
+        global_to_local(&zloc, &block_index, zglb, h, nz, num_grids, istopo);
+        err |= mpi_assert(block_index == 2, rank);
+
+        _prec zs = (zglb + H[0] + H[1] + H[2] - grid_overlap(h) - grid_overlap(3 * h) );
+        err |= mpi_assert(fabs(zloc - zs) < FLTOL, rank);
+    }
+
+    err |= test_finalize(&test, err);
+    // The result is reported through test_last_error(), as test_shift() does.
+    return test_last_error();
+
+}   
diff --git a/tests/interpolation/CMakeLists.txt b/tests/interpolation/CMakeLists.txt
index 93676bb..5790cc1 100644
--- a/tests/interpolation/CMakeLists.txt
+++ b/tests/interpolation/CMakeLists.txt
@@ -10,7 +10,7 @@ target_link_libraries(test_interpolation
 
 target_include_directories(test_interpolation
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_interpolation COMMAND test_interpolation)
@@ -28,7 +28,7 @@ target_link_libraries(test_interpolationcu
 
 target_include_directories(test_interpolationcu
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_interpolationcu COMMAND test_interpolationcu)
@@ -46,7 +46,7 @@ target_link_libraries(test_lagrange
 
 target_include_directories(test_lagrange
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_lagrange COMMAND test_lagrange)
diff --git a/tests/interpolation/test_interpolation.c b/tests/interpolation/test_interpolation.c
index 2609fea..e98a07d 100644
--- a/tests/interpolation/test_interpolation.c
+++ b/tests/interpolation/test_interpolation.c
@@ -84,7 +84,7 @@ int test_argnearest(void)
         // Grid with closed right boundary 
         grid1_t grid = {.id = 0, .shift = 0, .size = n, .gridspacing = 1, 
                         .boundary1 = 0, .boundary2 = 1};
-        grid_fill1(x, grid);
+        grid_fill1(x, grid, 1);
         test_t test = test_init(" * grid_argnearest:bounds_except", 0, 0);
         int nearest = -1;
         err |= s_no_except(interp_grid_argnearest(&nearest, x, -1, grid) == 
@@ -130,8 +130,7 @@ int test_argnearest_range(void)
         xs = 4.3;
 
         err |= interp_argnearest(&nearest, x, n, xs);
-        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n,
-                                       xs);
+        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n);
         err |= s_assert(nearest == 4);
         err |= s_assert(lower == 4);
         err |= s_assert(upper == 5);
@@ -141,8 +140,7 @@ int test_argnearest_range(void)
         lower = 1;
         upper = 1;
         err |= interp_argnearest(&nearest, x, n, xs);
-        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n,
-                                       xs);
+        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n);
 
         err |= test_finalize(&test, err);
         err |= s_assert(nearest == 4);
@@ -158,8 +156,7 @@ int test_argnearest_range(void)
         int nearest = -1, lower = 2, upper = 2;
         xs = 0.1;
         err |= interp_argnearest(&nearest, x, n, xs);
-        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n,
-                                       xs);
+        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n);
         err |= s_assert(nearest == 0);
         err |= s_assert(lower == 0);
         err |= s_assert(upper == 5);
@@ -175,8 +172,7 @@ int test_argnearest_range(void)
         int nearest = -1, lower = 2, upper = 2;
         xs = 8.9;
         err |= interp_argnearest(&nearest, x, n, xs);
-        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n,
-                                       xs);
+        err |= interp_argnearest_range(&lower, &upper, lower, upper, nearest, n);
         err |= s_assert(nearest == 9);
         err |= s_assert(lower == 5);
         err |= s_assert(upper == 10);
@@ -303,7 +299,7 @@ int test_lagrange3(void)
         prec *x3, *y3, *z3, *fcn3;
 
         int gsize[3] = {n, n, n};
-        int3_t shift = grid_yz();
+        int3_t shift = {0, 0, 0};
         int3_t coord = {.x = 0, .y = 0, .z = 0};
         int3_t asize = {gsize[0], gsize[1], gsize[2]};
         int3_t bnd1 = {1, 1, 1};
@@ -328,11 +324,11 @@ int test_lagrange3(void)
         grid_fill_z(z1, grid);
 
         int m = 4;
-        prec qx[4] = {0.0, 0.2, 0.4, 0.9};
+        prec qx[4] = {0.0, 0.4, 0.6, 0.9};
         prec qy[4] = {0.0, 0.7, 0.4, 0.7};
         prec qz[4] = {0.0, 0.2, 0.3, 0.8};
 
-        prec ax[4] = {0.0, 0.2, 0.4, 0.9};
+        prec ax[4] = {0.0, 0.4, 0.6, 0.9};
         prec ay[4] = {0.0, 0.7, 0.4, 0.7};
         prec az[4] = {0.0, 0.2, 0.3, 0.8};
         prec out[4];
diff --git a/tests/mpi/CMakeLists.txt b/tests/mpi/CMakeLists.txt
index 39aa029..02194cc 100644
--- a/tests/mpi/CMakeLists.txt
+++ b/tests/mpi/CMakeLists.txt
@@ -8,30 +8,11 @@ target_link_libraries(test_indexed
 
 target_include_directories(test_indexed
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_indexed COMMAND test_indexed)
 
-# Distribute
-add_executable(test_mpi_distribute test_distribute.c)
-
-target_link_libraries(test_mpi_distribute 
-        testing
-        grid
-        mpi
-        ${MPI_C_LIBRARIES} 
-        )
-
-target_include_directories(test_mpi_distribute
-        PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
-        )
-
-add_test(NAME test_mpi_distribute COMMAND 
-        ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 --oversubscribe
-        test_mpi_distribute)
-
 # IO
 
 add_executable(test_mpi_io test_io.c)
@@ -45,7 +26,7 @@ target_link_libraries(test_mpi_io
 
 target_include_directories(test_mpi_io
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_mpi_io COMMAND 
diff --git a/tests/mpi/test_distribute.c b/tests/mpi/test_distribute.c
deleted file mode 100644
index 78d4846..0000000
--- a/tests/mpi/test_distribute.c
+++ /dev/null
@@ -1,194 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <math.h>
-#include <mpi.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-#define ADDLINENUM 1
-#define ADDRANK 1
-#define RANK rank
-
-#include <awp/definitions.h>
-#include <mpi/distribute.h>
-#include <test/test.h>
-#include <test/check.h>
-#include <grid/shift.h>
-
-int test_indices(int rank, int size, enum eshift shift); 
-
-int main(int argc, char **argv)
-{
-        int rank, size;
-        MPI_Init(&argc, &argv);
-        MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); 
-        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-        MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-        if (size != 4)
-        {
-                if (rank == 0) {
-                        printf("Test requires MPI size = 4.\n");
-                        fflush(stdout);
-                }
-                        MPI_Abort(MPI_COMM_WORLD, -1);
-                        return -1;
-        }
-
-        if (rank == 0) {
-                test_divider();
-                printf("Testing test_distribute.c\n");
-        }
-
-        test_indices(rank, size, GRID_U1);
-        test_indices(rank, size, GRID_U2);
-        test_indices(rank, size, GRID_U3);
-        //TODO: Add tests for stress grids
-        //err = test_indices(rank, size, GRID_XX);
-        //err = test_indices(rank, size, GRID_YY);
-        //err = test_indices(rank, size, GRID_ZZ);
-        //err = test_indices(rank, size, GRID_XY);
-        //err = test_indices(rank, size, GRID_XZ);
-        //err = test_indices(rank, size, GRID_YZ);
-
-        if (rank == 0) {
-                printf("Testing completed.\n");
-                test_divider();
-        }
-
-        MPI_Finalize();
-
-        return test_last_error();
-}
-
-int test_indices(int rank, int size, enum eshift shifttype)
-{
-        char msg[90];
-        sprintf(msg, " * indices: %s", grid_shift_label(shifttype)); 
-        test_t test = test_init(msg, rank,  size);
-        int err = 0;
-        int n = 11;
-        int blocks_x = 2;
-        int gsize[3] = {n, n, n};
-        prec h = 1.0/(n-1);
-        
-        prec *qx = malloc(sizeof qx * n);
-        prec *qy = malloc(sizeof qy * n);
-        prec *qz = malloc(sizeof qz * n);
-
-        int3_t shift = grid_shift(shifttype);
-        
-        int3_t coord = {.x = rank / blocks_x, .y = rank % blocks_x, .z = 0};
-        int3_t asize = {gsize[0], gsize[1], gsize[2]};
-
-        int3_t bnd1 = {0, 0, 1};
-        int3_t bnd2 = {0, 0, 1};
-
-        fcn_grid_t grid;
-        fcn_grid_t ref_grid;
-
-        if (shifttype == GRID_U1 || shifttype == GRID_U2 ||
-            shifttype == GRID_U3) {
-                // velocity grid
-                grid = grid_init(asize, shift, coord, bnd1, bnd2, 0, h);
-        } else {
-                // stress grid
-                grid = grid_init(asize, shift, coord, bnd1, bnd2, ngsl / 2, h);
-        }
-
-
-        // Reference grid
-        ref_grid = grid;
-        ref_grid.coordinate.x = 0;
-        ref_grid.coordinate.y = 0;
-
-        grid1_t grid_x = grid_grid1_x(ref_grid);
-        grid1_t grid_y = grid_grid1_y(ref_grid);
-
-        prec *x = malloc(sizeof(x) * grid_x.size);
-        prec *y = malloc(sizeof(y) * grid_y.size);
-
-        grid_fill1(x, grid_x);
-        grid_fill1(y, grid_y);
-
-        grid_x = grid_grid1_x(grid);
-        grid_y = grid_grid1_y(grid);
-
-        // local coordinates (not used)
-        prec *xloc = malloc(sizeof(x) * grid_x.size);
-        prec *yloc = malloc(sizeof(y) * grid_y.size);
-
-        grid_fill1(xloc, grid_x);
-        grid_fill1(yloc, grid_y);
-
-        h = grid.gridspacing;
-
-        n = 4;
-
-        // Query points below are placed at, or near the boundary of the
-        // partitions 
-
-        // bottom left
-        qx[0] = x[grid_x.size - 1];
-        qy[0] = y[0];
-        qz[0] = 0.0;
-
-        // bottom right
-        qx[1] = x[grid_x.size - 1] + h / 2 + 0.0001;
-        qy[1] = y[0];
-        qz[1] = 0.0;
-
-        // top left
-        qx[2] = x[0];
-        qy[2] = y[grid_y.size - 1] + h;
-        qz[2] = 0.0;
-
-        // top right
-        qx[3] = x[grid_x.size - 1] + h;
-        qy[3] = y[grid_y.size - 1] + h;
-        qz[3] = 0.0;
-
-        size_t nidx = 0;
-        int *indices;
-
-        dist_indices(&indices, &nidx, qx,  qy, n, grid);
-
-        if (coord.x == 0 && coord.y == 0) {
-                int ans[1] = {0};
-                err |= s_assert(nidx == 1);
-                err |= s_assert(chk_infi(ans, indices, nidx) == 0);
-        }
-
-        if (coord.x == 1 && coord.y == 0) {
-                int ans[1] = {1};
-                err |= s_assert(nidx == 1);
-                err |= s_assert(chk_infi(ans, indices, nidx) == 0);
-        }
-
-        if (coord.x == 0 && coord.y == 1) {
-                int ans[1] = {2};
-                err |= s_assert(nidx == 1);
-                err |= s_assert(chk_infi(ans, indices, nidx) == 0);
-        }
-
-        if (coord.x == 1 && coord.y == 1) {
-                int ans[1] = {3};
-                err |= s_assert(nidx == 1);
-                err |= s_assert(chk_infi(ans, indices, nidx) == 0);
-        }
-
-        free(x);
-        free(y);
-        free(qx);
-        free(qy);
-        free(qz);
-        free(indices);
-        free(xloc);
-        free(yloc);
-        err |= test_finalize(&test, err);
-
-        return test_last_error();
-
-}
-
diff --git a/tests/readers/CMakeLists.txt b/tests/readers/CMakeLists.txt
index 2989bf6..a332339 100644
--- a/tests/readers/CMakeLists.txt
+++ b/tests/readers/CMakeLists.txt
@@ -10,11 +10,11 @@ target_link_libraries(test_input
 
 target_include_directories(test_input
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_input COMMAND test_input
-        ${AWP_MINI_SOURCE_DIR}/tests/fixtures/input1.txt)
+        ${AWP_SOURCE_DIR}/tests/fixtures/input1.txt)
 
 
 # Version
@@ -28,7 +28,7 @@ target_link_libraries(test_version
 
 target_include_directories(test_version
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_version COMMAND test_version)
diff --git a/tests/readers/test_input.c b/tests/readers/test_input.c
index a22ebfd..f7bab8c 100644
--- a/tests/readers/test_input.c
+++ b/tests/readers/test_input.c
@@ -5,7 +5,7 @@
 
 #include <test/test.h>
 #include <readers/input.h>
-#include <readers/error.h>
+#include <awp/error.h>
 
 #define STR_LEN 2048
 
diff --git a/tests/readers/test_version.c b/tests/readers/test_version.c
index e67bd30..434b42f 100644
--- a/tests/readers/test_version.c
+++ b/tests/readers/test_version.c
@@ -5,7 +5,7 @@
 
 #include <test/test.h>
 #include <readers/version.h>
-#include <readers/error.h>
+#include <awp/error.h>
 
 
 int check_version(void);
diff --git a/tests/test_attenuation.cu b/tests/test_attenuation.cu
index d5c2b90..4db603e 100644
--- a/tests/test_attenuation.cu
+++ b/tests/test_attenuation.cu
@@ -21,7 +21,7 @@
 #include <awp/kernel.h>
 
 #include <topography/velocity.cuh>
-#include <topography/stress_attenuation.cuh>
+#include <topography/stress.cuh>
 #include <topography/geometry.h>
 #include <topography/host.h>
  
@@ -136,7 +136,7 @@ int main(int argc, char **argv)
 void init(topo_t *T)
 {
         *T = topo_init(1, "", rank, side.left, side.right, side.front,
-                              side.back, coord, px, py, nx, ny, nz, dt, h,
+                              side.back, coord, px, py, nx, ny, nz, dt, h, h, h,
                               stream_1, stream_2, stream_i);
         topo_d_malloc(T);
         topo_d_zero_init(T);
diff --git a/tests/topography/CMakeLists.txt b/tests/topography/CMakeLists.txt
index 3dabd7c..ce73aec 100644
--- a/tests/topography/CMakeLists.txt
+++ b/tests/topography/CMakeLists.txt
@@ -3,3 +3,5 @@ add_subdirectory(readers)
 add_subdirectory(geometry)
 add_subdirectory(sources)
 add_subdirectory(receivers)
+add_subdirectory(accuracy)
+add_subdirectory(mapping)
diff --git a/tests/topography/accuracy/CMakeLists.txt b/tests/topography/accuracy/CMakeLists.txt
new file mode 100644
index 0000000..29461d4
--- /dev/null
+++ b/tests/topography/accuracy/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_executable(test_convergence test_convergence.cu)  # truncation-error test executable (see README.md in this directory)
+target_link_libraries(
+        test_convergence topography_no_bc awp geometry grid functions)
+target_include_directories(test_convergence
+        PUBLIC
+        ${PROJECT_SOURCE_DIR}/include/
+        )
+add_test(NAME test_convergence COMMAND test_convergence ${PROJECT_SOURCE_DIR}/tests/topography/accuracy/data/)  # the data directory is passed as the single command-line argument
diff --git a/tests/topography/accuracy/README.md b/tests/topography/accuracy/README.md
new file mode 100644
index 0000000..87a3bd6
--- /dev/null
+++ b/tests/topography/accuracy/README.md
@@ -0,0 +1,23 @@
+# README
+
+This directory contains some additional tests that ensure that topography kernels are correctly
+implemented. 
+
+## Truncation error test
+The convergence test modifies the velocity kernels so that the free surface boundary condition is no
+longer imposed. This change makes it possible to check the rate at which the truncation errors go to
+zero everywhere in the domain. The idea is to take the discretized spatial elastic operator and apply it to a
+set of known trigonometric functions and compare it against an exact solution. The exact solution
+comes from symbolically evaluating the spatial derivatives in the elastic wave equation for the
+given test functions.
+
+Since all calculations are performed in single precision, it is quite difficult to assess if the
+implementation is correct. To improve the confidence, I tested not only with trigonometric functions
+but also with polynomials. These polynomials get differentiated to machine precision as long as
+their degree is below one, or two, depending on if the geometry is flat or not. Note that the
+last boundary point for some of the field components is always zero. This is because this point is
+not part of the actual computation.
+
+To run the test, the program expects a topography profile for each grid. The directory `data`
+contains topography profiles for a Gaussian hill geometry. By modifying the script `topography.py`
+you can investigate the truncation errors for a different geometry.
diff --git a/tests/topography/accuracy/build_topography.sh b/tests/topography/accuracy/build_topography.sh
new file mode 100755
index 0000000..0f0024e
--- /dev/null
+++ b/tests/topography/accuracy/build_topography.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+x=16   # size argument, passed twice to topography.py; doubled after every profile
+h=1.0  # last argument to topography.py; halved after every profile
+for i in 0 1 2 3; do  # writes data/topography_0.bin .. data/topography_3.bin (paths relative to the current directory)
+python3 topography.py data/topography_$i.bin $x $x $h
+let x=x*2;  # `let` is a bash builtin, hence the bash interpreter line above
+h=`python3 -c "print(${h}/2)"`;  # the division is delegated to python3
+done
diff --git a/tests/topography/accuracy/cupolynomial.cu b/tests/topography/accuracy/cupolynomial.cu
new file mode 100644
index 0000000..732d323
--- /dev/null
+++ b/tests/topography/accuracy/cupolynomial.cu
@@ -0,0 +1,102 @@
+#include "cupolynomial.cuh"
+
+__global__ void poly_xy(_prec *out,  // kernel: stores the sum of three power terms (one per index direction) into out
+                        const int wi0, const int win,
+                        const int wj0, const int wjn,
+                        const int wk0, const int wkn,
+                        const int ri0, const int rin,
+                        const int rj0, const int rjn,
+                        const int rk0, const int rkn,
+                        const int nx, const int ny, const int nz,
+                        const int line, const int slice,
+                        const int rx, const int ry,
+                        const _prec a0, const _prec a1, const _prec a2,
+                        const _prec p0, const _prec p1, const _prec p2, 
+                        const _prec s0, const _prec s1, const _prec s2)
+{
+     // Write indices: thread index plus the wi0/wj0/wk0 offsets; threads at or beyond win/wjn/wkn do nothing
+     const int wk = threadIdx.x + blockIdx.x*blockDim.x + wk0;
+     if ( wk >= wkn) return;
+     const int wj = threadIdx.y + blockIdx.y*blockDim.y + wj0;
+     if ( wj >= wjn) return;
+     const int wi = threadIdx.z + blockIdx.z*blockDim.z + wi0;
+     if ( wi >= win) return;
+
+     // Read indices: the same thread index plus the ri0/rj0/rk0 offsets; these are the values fed to pow()
+     const int rk = threadIdx.x + blockIdx.x*blockDim.x + rk0;
+     if ( rk >= rkn) return;
+     const int rj = threadIdx.y + blockIdx.y*blockDim.y + rj0;
+     if ( rj >= rjn) return;
+     const int ri = threadIdx.z + blockIdx.z*blockDim.z + ri0;
+     if ( ri >= rin) return;
+     // Linear offset into out: wk has unit stride, wj has stride `line`, wi has stride `slice`
+     const int pos = wk + wj*line + wi*slice;
+     out[pos] = a0*pow(ri + nx*rx - 0.5*s0, p0)  // NOTE(review): nx*rx and ny*ry look like block offsets to a global index - confirm
+              + a1*pow(rj + ny*ry - 0.5*s1, p1) 
+              + a2*pow(rk         - 0.5*s2, p2);  // no block offset in this term; nz is not used in this kernel
+}
+
+
+__global__ void poly_z(_prec *out,  // kernel: like poly_xy, but the third term is special-cased at the first and last read index
+                       const int wi0, const int win,
+                       const int wj0, const int wjn,
+                       const int wk0, const int wkn,
+                       const int ri0, const int rin,
+                       const int rj0, const int rjn,
+                       const int rk0, const int rkn,
+                       const int nx, const int ny, const int nz,
+                       const int line, const int slice,
+                       const int rx, const int ry,
+                       const _prec a0, const _prec a1, const _prec a2,
+                       const _prec p0, const _prec p1, const _prec p2, 
+                       const _prec s0, const _prec s1, const _prec s2)
+{
+     // Write indices: thread index plus the wi0/wj0/wk0 offsets; threads at or beyond win/wjn/wkn do nothing
+     const int wk = threadIdx.x + blockIdx.x*blockDim.x + wk0;
+     if ( wk >= wkn) return;
+     const int wj = threadIdx.y + blockIdx.y*blockDim.y + wj0;
+     if ( wj >= wjn) return;
+     const int wi = threadIdx.z + blockIdx.z*blockDim.z + wi0;
+     if ( wi >= win) return;
+
+     // Read indices: the same thread index plus the ri0/rj0/rk0 offsets; these are the values fed to pow()
+     const int rk = threadIdx.x + blockIdx.x*blockDim.x + rk0;
+     if ( rk >= rkn) return;
+     const int rj = threadIdx.y + blockIdx.y*blockDim.y + rj0;
+     if ( rj >= rjn) return;
+     const int ri = threadIdx.z + blockIdx.z*blockDim.z + ri0;
+     if ( ri >= rin) return;
+
+     // The sketch below shows the point layout that the special cases for the third term refer to.
+
+/*
+ *                                       n-4  n-3   n-2  n-1  
+ *   z    ------o-----o-|---o-----o--|---o----o-----o---*
+ *                      |            |     
+ *                      |            |     
+ *   zh   ---o-----o----|o-----o-----|^----o-----o--o
+ *                      |            |n-4  n-3   n-2 n-1
+ *
+ *           Bottom           Interior           Top 
+ */
+
+     // zkp is the third power term; it stands in for pow(rk - 0.5*s2, p2) as used in poly_xy
+     _prec zkp = 0.0; 
+     if (rk == rkn - 1 && s2 == 1) {  // last read index with s2 == 1: rkn - 2 is used as the coordinate
+           zkp = pow(rkn - 2, p2);
+     } 
+     else if (rk == rk0) {  // first read index: coordinate is used without the 0.5*s2 shift
+        zkp = pow(rk, p2);
+     }   
+     else if (rk == rkn - 1 && s2 == 0) {  // last read index with s2 == 0: the term is set to zero
+           zkp = 0;
+     } 
+     else {  // every other read index: coordinate shifted by 0.5*s2
+        zkp = pow(rk- 0.5*s2, p2);
+     }
+     // Linear offset into out: wk has unit stride, wj has stride `line`, wi has stride `slice`
+     const int pos = wk + wj*line + wi*slice;
+     out[pos] = a0*pow(ri + nx*rx - 0.5*s0, p0) 
+              + a1*pow(rj + ny*ry - 0.5*s1, p1) 
+              + a2*zkp;
+}
diff --git a/tests/topography/accuracy/cupolynomial.cuh b/tests/topography/accuracy/cupolynomial.cuh
new file mode 100644
index 0000000..3bf6c6a
--- /dev/null
+++ b/tests/topography/accuracy/cupolynomial.cuh
@@ -0,0 +1,39 @@
+#ifndef _POLYNOMIAL_H
+#define _POLYNOMIAL_H
+// Declares the polynomial test kernels; the guard opened above is closed on the last line of this file.
+#include <awp/definitions.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+__global__ void poly_xy(_prec *out,  // stores the sum of three power terms (one per index direction) into out
+                                const int wi0, const int win,
+                                const int wj0, const int wjn,
+                                const int wk0, const int wkn,
+                                const int ri0, const int rin,
+                                const int rj0, const int rjn,
+                                const int rk0, const int rkn,
+                                const int nx, const int ny, const int nz,
+                                const int line, const int slice,
+                                const int rx, const int ry,
+                                const _prec a0, const _prec a1, const _prec a2,
+                                const _prec p0, const _prec p1, const _prec p2, 
+                                const _prec s0, const _prec s1, const _prec s2);
+// poly_z takes the same arguments as poly_xy; its definition special-cases the third term at the first and last read index.
+__global__ void poly_z(_prec *out, 
+                       const int wi0, const int win,
+                       const int wj0, const int wjn,
+                       const int wk0, const int wkn,
+                       const int ri0, const int rin,
+                       const int rj0, const int rjn,
+                       const int rk0, const int rkn,
+                       const int nx, const int ny, const int nz,
+                       const int line, const int slice,
+                       const int rx, const int ry,
+                       const _prec a0, const _prec a1, const _prec a2,
+                       const _prec p0, const _prec p1, const _prec p2, 
+                       const _prec s0, const _prec s1, const _prec s2);
+#ifdef __cplusplus
+}
+#endif
+#endif  // _POLYNOMIAL_H
diff --git a/src/topography/kernels/unoptimized.cu b/tests/topography/accuracy/cutopography_kernel.cu
similarity index 97%
rename from src/topography/kernels/unoptimized.cu
rename to tests/topography/accuracy/cutopography_kernel.cu
index 28feb48..7e9ce60 100644
--- a/src/topography/kernels/unoptimized.cu
+++ b/tests/topography/accuracy/cutopography_kernel.cu
@@ -1,24 +1,15 @@
-#include <awp/definitions.h>
-#include <topography/kernels/unoptimized.cuh>
-#include <stdio.h>
+#include "cutopography_kernel.cuh"
 
-__global__ void
-dtopo_vel_110(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f, const float *__restrict__ f1_1,
-              const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-              const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-              const float *__restrict__ f2_c, const float *__restrict__ f_1,
-              const float *__restrict__ f_2, const float *__restrict__ f_c,
-              const float *__restrict__ g, const float *__restrict__ g3,
-              const float *__restrict__ g3_c, const float *__restrict__ g_c,
-              const float *__restrict__ rho, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej) {
+__global__ void dtopo_vel_110(
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *rho,
+    const float *s11, const float *s12, const float *s13, const float *s22,
+    const float *s23, const float *s33, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bi, const int bj,
+    const int ei, const int ej) {
   const float phzl[6][7] = {
       {0.8338228784688313, 0.1775123316429260, 0.1435067013076542,
        -0.1548419114194114, 0.0000000000000000, 0.0000000000000000,
@@ -143,76 +134,76 @@ dtopo_vel_110(float *__restrict__ u1, float *__restrict__ u2,
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= 6)
     return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _rho(i, j, k)                                                          \
+  rho[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_c(i, j)                                                             \
   f_c[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
 #define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _rho(i, j, k)                                                          \
-  rho[(k) + align +                                                            \
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
+#define _f(i, j)                                                               \
+  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s13(i, j, k)                                                          \
+  s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
+#define _g_c(k) g_c[(k) + align]
+#define _s23(i, j, k)                                                          \
+  s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s22(i, j, k)                                                          \
   s22[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s33(i, j, k)                                                          \
   s33[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
@@ -485,52 +476,45 @@ dtopo_vel_110(float *__restrict__ u1, float *__restrict__ u2,
                       phy[1] * _s23(i, j - 1, 8) +
                       phy[3] * _s23(i, j + 1, 8))))) *
       f_dcrj;
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
+#undef _rho
 #undef _f_1
+#undef _g3_c
 #undef _f_2
 #undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _rho
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
+#undef _f
+#undef _s13
+#undef _u1
 #undef _s11
+#undef _f2_1
+#undef _f1_1
 #undef _s12
-#undef _s13
-#undef _s22
+#undef _g_c
 #undef _s23
-#undef _s33
-#undef _u1
+#undef _f2_2
+#undef _s22
+#undef _f1_2
 #undef _u2
+#undef _g
+#undef _s33
+#undef _f2_c
+#undef _f1_c
 #undef _u3
 }
 
-__global__ void
-dtopo_vel_111(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f, const float *__restrict__ f1_1,
-              const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-              const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-              const float *__restrict__ f2_c, const float *__restrict__ f_1,
-              const float *__restrict__ f_2, const float *__restrict__ f_c,
-              const float *__restrict__ g, const float *__restrict__ g3,
-              const float *__restrict__ g3_c, const float *__restrict__ g_c,
-              const float *__restrict__ rho, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej) {
+__global__ void dtopo_vel_111(
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *rho,
+    const float *s11, const float *s12, const float *s13, const float *s22,
+    const float *s23, const float *s33, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bi, const int bj,
+    const int ei, const int ej) {
   const float phz[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
   const float phy[4] = {-0.0625000000000000, 0.5625000000000000,
@@ -574,76 +558,76 @@ dtopo_vel_111(float *__restrict__ u1, float *__restrict__ u2,
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= nz - 12)
     return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _rho(i, j, k)                                                          \
+  rho[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_c(i, j)                                                             \
   f_c[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
 #define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _rho(i, j, k)                                                          \
-  rho[(k) + align +                                                            \
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
+#define _f(i, j)                                                               \
+  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s13(i, j, k)                                                          \
+  s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
+#define _g_c(k) g_c[(k) + align]
+#define _s23(i, j, k)                                                          \
+  s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s22(i, j, k)                                                          \
   s22[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s33(i, j, k)                                                          \
   s33[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
@@ -945,52 +929,45 @@ dtopo_vel_111(float *__restrict__ u1, float *__restrict__ u2,
                                   phy[1] * _s23(i, j - 1, k + 9) +
                                   phy[3] * _s23(i, j + 1, k + 9))))) *
       f_dcrj;
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
+#undef _rho
 #undef _f_1
+#undef _g3_c
 #undef _f_2
 #undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _rho
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
+#undef _f
+#undef _s13
+#undef _u1
 #undef _s11
+#undef _f2_1
+#undef _f1_1
 #undef _s12
-#undef _s13
-#undef _s22
+#undef _g_c
 #undef _s23
-#undef _s33
-#undef _u1
+#undef _f2_2
+#undef _s22
+#undef _f1_2
 #undef _u2
+#undef _g
+#undef _s33
+#undef _f2_c
+#undef _f1_c
 #undef _u3
 }
 
-__global__ void
-dtopo_vel_112(float *__restrict__ u1, float *__restrict__ u2,
-              float *__restrict__ u3, const float *__restrict__ dcrjx,
-              const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-              const float *__restrict__ f, const float *__restrict__ f1_1,
-              const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-              const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-              const float *__restrict__ f2_c, const float *__restrict__ f_1,
-              const float *__restrict__ f_2, const float *__restrict__ f_c,
-              const float *__restrict__ g, const float *__restrict__ g3,
-              const float *__restrict__ g3_c, const float *__restrict__ g_c,
-              const float *__restrict__ rho, const float *__restrict__ s11,
-              const float *__restrict__ s12, const float *__restrict__ s13,
-              const float *__restrict__ s22, const float *__restrict__ s23,
-              const float *__restrict__ s33, const float a, const float nu,
-              const int nx, const int ny, const int nz, const int bi,
-              const int bj, const int ei, const int ej) {
+__global__ void dtopo_vel_112(
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *rho,
+    const float *s11, const float *s12, const float *s13, const float *s22,
+    const float *s23, const float *s33, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bi, const int bj,
+    const int ei, const int ej) {
   const float phzr[6][8] = {
       {0.0000000000000000, 0.8338228784688313, 0.1775123316429260,
        0.1435067013076542, -0.1548419114194114, 0.0000000000000000,
@@ -1115,76 +1092,76 @@ dtopo_vel_112(float *__restrict__ u1, float *__restrict__ u2,
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= 6)
     return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _rho(i, j, k)                                                          \
+  rho[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_c(i, j)                                                             \
   f_c[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
 #define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _rho(i, j, k)                                                          \
-  rho[(k) + align +                                                            \
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
+#define _f(i, j)                                                               \
+  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s13(i, j, k)                                                          \
+  s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
+#define _g_c(k) g_c[(k) + align]
+#define _s23(i, j, k)                                                          \
+  s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s22(i, j, k)                                                          \
   s22[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s33(i, j, k)                                                          \
   s33[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
@@ -1596,53 +1573,46 @@ dtopo_vel_112(float *__restrict__ u1, float *__restrict__ u2,
                                   phy[1] * _s23(i, j - 1, nz - 1) +
                                   phy[3] * _s23(i, j + 1, nz - 1))))) *
       f_dcrj;
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
+#undef _rho
 #undef _f_1
+#undef _g3_c
 #undef _f_2
 #undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _rho
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
+#undef _f
+#undef _s13
+#undef _u1
 #undef _s11
+#undef _f2_1
+#undef _f1_1
 #undef _s12
-#undef _s13
-#undef _s22
+#undef _g_c
 #undef _s23
-#undef _s33
-#undef _u1
+#undef _f2_2
+#undef _s22
+#undef _f1_2
 #undef _u2
+#undef _g
+#undef _s33
+#undef _f2_c
+#undef _f1_c
 #undef _u3
 }
 
 __global__ void dtopo_buf_vel_110(
-    float *__restrict__ buf_u1, float *__restrict__ buf_u2,
-    float *__restrict__ buf_u3, const float *__restrict__ dcrjx,
-    const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-    const float *__restrict__ f, const float *__restrict__ f1_1,
-    const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-    const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-    const float *__restrict__ f2_c, const float *__restrict__ f_1,
-    const float *__restrict__ f_2, const float *__restrict__ f_c,
-    const float *__restrict__ g, const float *__restrict__ g3,
-    const float *__restrict__ g3_c, const float *__restrict__ g_c,
-    const float *__restrict__ rho, const float *__restrict__ s11,
-    const float *__restrict__ s12, const float *__restrict__ s13,
-    const float *__restrict__ s22, const float *__restrict__ s23,
-    const float *__restrict__ s33, const float *__restrict__ u1,
-    const float *__restrict__ u2, const float *__restrict__ u3, const float a,
-    const float nu, const int nx, const int ny, const int nz, const int bj,
-    const int ej, const int rj0) {
+    float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx,
+    const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1,
+    const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2,
+    const float *f2_c, const float *f_1, const float *f_2, const float *f_c,
+    const float *g, const float *g3, const float *g3_c, const float *g_c,
+    const float *rho, const float *s11, const float *s12, const float *s13,
+    const float *s22, const float *s23, const float *s33, const float *u1,
+    const float *u2, const float *u3, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bj, const int ej,
+    const int rj0) {
   const float phzl[6][7] = {
       {0.8338228784688313, 0.1775123316429260, 0.1435067013076542,
        -0.1548419114194114, 0.0000000000000000, 0.0000000000000000,
@@ -1765,88 +1735,88 @@ __global__ void dtopo_buf_vel_110(
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= 6)
     return;
-#define _buf_u1(i, j, k)                                                       \
-  buf_u1[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _buf_u2(i, j, k)                                                       \
-  buf_u2[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _buf_u3(i, j, k)                                                       \
-  buf_u3[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _rho(i, j, k)                                                          \
+  rho[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_c(i, j)                                                             \
   f_c[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
 #define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _rho(i, j, k)                                                          \
-  rho[(k) + align +                                                            \
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
+#define _f(i, j)                                                               \
+  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s13(i, j, k)                                                          \
+  s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g_c(k) g_c[(k) + align]
 #define _s23(i, j, k)                                                          \
   s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s22(i, j, k)                                                          \
+  s22[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s33(i, j, k)                                                          \
   s33[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _buf_u1(i, j, k)                                                       \
+  buf_u1[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
+#define _buf_u2(i, j, k)                                                       \
+  buf_u2[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
+#define _buf_u3(i, j, k)                                                       \
+  buf_u3[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
   float rho1 =
       phzl[k][0] *
           (phy[2] * _rho(i, j + rj0, 0) + phy[0] * _rho(i, j + rj0 - 2, 0) +
@@ -2250,56 +2220,49 @@ __global__ void dtopo_buf_vel_110(
                                       phy[1] * _s23(i, j + rj0 - 1, 8) +
                                       phy[3] * _s23(i, j + rj0 + 1, 8))))) *
       f_dcrj;
-#undef _buf_u1
-#undef _buf_u2
-#undef _buf_u3
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
+#undef _rho
 #undef _f_1
+#undef _g3_c
 #undef _f_2
 #undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _rho
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
+#undef _f
+#undef _s13
+#undef _u1
 #undef _s11
+#undef _f2_1
+#undef _f1_1
 #undef _s12
-#undef _s13
-#undef _s22
+#undef _g_c
 #undef _s23
-#undef _s33
-#undef _u1
+#undef _f2_2
+#undef _s22
+#undef _f1_2
 #undef _u2
+#undef _g
+#undef _s33
+#undef _f2_c
+#undef _f1_c
 #undef _u3
+#undef _buf_u1
+#undef _buf_u2
+#undef _buf_u3
 }
 
 __global__ void dtopo_buf_vel_111(
-    float *__restrict__ buf_u1, float *__restrict__ buf_u2,
-    float *__restrict__ buf_u3, const float *__restrict__ dcrjx,
-    const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-    const float *__restrict__ f, const float *__restrict__ f1_1,
-    const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-    const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-    const float *__restrict__ f2_c, const float *__restrict__ f_1,
-    const float *__restrict__ f_2, const float *__restrict__ f_c,
-    const float *__restrict__ g, const float *__restrict__ g3,
-    const float *__restrict__ g3_c, const float *__restrict__ g_c,
-    const float *__restrict__ rho, const float *__restrict__ s11,
-    const float *__restrict__ s12, const float *__restrict__ s13,
-    const float *__restrict__ s22, const float *__restrict__ s23,
-    const float *__restrict__ s33, const float *__restrict__ u1,
-    const float *__restrict__ u2, const float *__restrict__ u3, const float a,
-    const float nu, const int nx, const int ny, const int nz, const int bj,
-    const int ej, const int rj0) {
+    float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx,
+    const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1,
+    const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2,
+    const float *f2_c, const float *f_1, const float *f_2, const float *f_c,
+    const float *g, const float *g3, const float *g3_c, const float *g_c,
+    const float *rho, const float *s11, const float *s12, const float *s13,
+    const float *s22, const float *s23, const float *s33, const float *u1,
+    const float *u2, const float *u3, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bj, const int ej,
+    const int rj0) {
   const float phz[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
   const float phy[4] = {-0.0625000000000000, 0.5625000000000000,
@@ -2341,88 +2304,88 @@ __global__ void dtopo_buf_vel_111(
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= nz - 12)
     return;
-#define _buf_u1(i, j, k)                                                       \
-  buf_u1[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _buf_u2(i, j, k)                                                       \
-  buf_u2[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _buf_u3(i, j, k)                                                       \
-  buf_u3[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _rho(i, j, k)                                                          \
+  rho[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_c(i, j)                                                             \
   f_c[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
 #define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _rho(i, j, k)                                                          \
-  rho[(k) + align +                                                            \
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
+#define _f(i, j)                                                               \
+  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s13(i, j, k)                                                          \
+  s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
+#define _g_c(k) g_c[(k) + align]
+#define _s23(i, j, k)                                                          \
+  s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s22(i, j, k)                                                          \
   s22[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s33(i, j, k)                                                          \
   s33[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _buf_u1(i, j, k)                                                       \
+  buf_u1[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
+#define _buf_u2(i, j, k)                                                       \
+  buf_u2[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
+#define _buf_u3(i, j, k)                                                       \
+  buf_u3[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
   float rho1 = phz[0] * (phy[2] * _rho(i, j + rj0, k + 4) +
                          phy[0] * _rho(i, j + rj0 - 2, k + 4) +
                          phy[1] * _rho(i, j + rj0 - 1, k + 4) +
@@ -2752,56 +2715,49 @@ __global__ void dtopo_buf_vel_111(
                                       phy[1] * _s23(i, j + rj0 - 1, k + 9) +
                                       phy[3] * _s23(i, j + rj0 + 1, k + 9))))) *
       f_dcrj;
-#undef _buf_u1
-#undef _buf_u2
-#undef _buf_u3
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
+#undef _rho
 #undef _f_1
+#undef _g3_c
 #undef _f_2
 #undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _rho
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
+#undef _f
+#undef _s13
+#undef _u1
 #undef _s11
+#undef _f2_1
+#undef _f1_1
 #undef _s12
-#undef _s13
-#undef _s22
+#undef _g_c
 #undef _s23
-#undef _s33
-#undef _u1
+#undef _f2_2
+#undef _s22
+#undef _f1_2
 #undef _u2
+#undef _g
+#undef _s33
+#undef _f2_c
+#undef _f1_c
 #undef _u3
+#undef _buf_u1
+#undef _buf_u2
+#undef _buf_u3
 }
 
 __global__ void dtopo_buf_vel_112(
-    float *__restrict__ buf_u1, float *__restrict__ buf_u2,
-    float *__restrict__ buf_u3, const float *__restrict__ dcrjx,
-    const float *__restrict__ dcrjy, const float *__restrict__ dcrjz,
-    const float *__restrict__ f, const float *__restrict__ f1_1,
-    const float *__restrict__ f1_2, const float *__restrict__ f1_c,
-    const float *__restrict__ f2_1, const float *__restrict__ f2_2,
-    const float *__restrict__ f2_c, const float *__restrict__ f_1,
-    const float *__restrict__ f_2, const float *__restrict__ f_c,
-    const float *__restrict__ g, const float *__restrict__ g3,
-    const float *__restrict__ g3_c, const float *__restrict__ g_c,
-    const float *__restrict__ rho, const float *__restrict__ s11,
-    const float *__restrict__ s12, const float *__restrict__ s13,
-    const float *__restrict__ s22, const float *__restrict__ s23,
-    const float *__restrict__ s33, const float *__restrict__ u1,
-    const float *__restrict__ u2, const float *__restrict__ u3, const float a,
-    const float nu, const int nx, const int ny, const int nz, const int bj,
-    const int ej, const int rj0) {
+    float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx,
+    const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1,
+    const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2,
+    const float *f2_c, const float *f_1, const float *f_2, const float *f_c,
+    const float *g, const float *g3, const float *g3_c, const float *g_c,
+    const float *rho, const float *s11, const float *s12, const float *s13,
+    const float *s22, const float *s23, const float *s33, const float *u1,
+    const float *u2, const float *u3, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bj, const int ej,
+    const int rj0) {
   const float phzr[6][8] = {
       {0.0000000000000000, 0.8338228784688313, 0.1775123316429260,
        0.1435067013076542, -0.1548419114194114, 0.0000000000000000,
@@ -2912,100 +2868,100 @@ __global__ void dtopo_buf_vel_112(
        0.0000000000000000},
       {0.0000000000000000, 0.0000000000000000, 0.0000000000000000,
        -0.0416666666666667, 1.1250000000000000, -1.1250000000000000,
-       0.0416666666666667}};
-  const int i = threadIdx.x + blockIdx.x * blockDim.x;
-  if (i >= nx)
-    return;
-  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
-  if (j >= ny)
-    return;
-  if (j >= ej)
-    return;
-  const int k = threadIdx.z + blockIdx.z * blockDim.z;
-  if (k >= 6)
-    return;
-#define _buf_u1(i, j, k)                                                       \
-  buf_u1[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _buf_u2(i, j, k)                                                       \
-  buf_u2[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _buf_u3(i, j, k)                                                       \
-  buf_u3[(j) * (2 * align + nz) + (k) + align +                                \
-         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-#define _f(i, j)                                                               \
-  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+       0.0416666666666667}};
+  const int i = threadIdx.x + blockIdx.x * blockDim.x;
+  if (i >= nx)
+    return;
+  const int j = threadIdx.y + blockIdx.y * blockDim.y + bj;
+  if (j >= ny)
+    return;
+  if (j >= ej)
+    return;
+  const int k = threadIdx.z + blockIdx.z * blockDim.z;
+  if (k >= 6)
+    return;
+#define _rho(i, j, k)                                                          \
+  rho[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_c(i, j)                                                             \
   f_c[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
 #define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
-#define _rho(i, j, k)                                                          \
-  rho[(k) + align +                                                            \
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
+#define _f(i, j)                                                               \
+  f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _s13(i, j, k)                                                          \
+  s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s13(i, j, k)                                                          \
-  s13[(k) + align +                                                            \
+#define _g_c(k) g_c[(k) + align]
+#define _s23(i, j, k)                                                          \
+  s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s22(i, j, k)                                                          \
   s22[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s23(i, j, k)                                                          \
-  s23[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s33(i, j, k)                                                          \
   s33[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _u3(i, j, k)                                                           \
   u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _buf_u1(i, j, k)                                                       \
+  buf_u1[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
+#define _buf_u2(i, j, k)                                                       \
+  buf_u2[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
+#define _buf_u3(i, j, k)                                                       \
+  buf_u3[(j) * (2 * align + nz) + (k) + align +                                \
+         ngsl * (2 * align + nz) * ((i) + ngsl + 2)]
   float rho1 = phzr[k][7] * (phy[2] * _rho(i, j + rj0, nz - 8) +
                              phy[0] * _rho(i, j + rj0 - 2, nz - 8) +
                              phy[1] * _rho(i, j + rj0 - 1, nz - 8) +
@@ -3447,54 +3403,47 @@ __global__ void dtopo_buf_vel_112(
                         phy[1] * _s23(i, j + rj0 - 1, nz - 1) +
                         phy[3] * _s23(i, j + rj0 + 1, nz - 1))))) *
       f_dcrj;
-#undef _buf_u1
-#undef _buf_u2
-#undef _buf_u3
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
-#undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
+#undef _rho
 #undef _f_1
+#undef _g3_c
 #undef _f_2
 #undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
-#undef _rho
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
+#undef _f
+#undef _s13
+#undef _u1
 #undef _s11
+#undef _f2_1
+#undef _f1_1
 #undef _s12
-#undef _s13
-#undef _s22
+#undef _g_c
 #undef _s23
-#undef _s33
-#undef _u1
+#undef _f2_2
+#undef _s22
+#undef _f1_2
 #undef _u2
+#undef _g
+#undef _s33
+#undef _f2_c
+#undef _f1_c
 #undef _u3
+#undef _buf_u1
+#undef _buf_u2
+#undef _buf_u3
 }
 
 __global__ void dtopo_str_110(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej) {
+    float *s11, float *s12, float *s13, float *s22, float *s23, float *s33,
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *lami,
+    const float *mui, const float a, const float nu, const int nx, const int ny,
+    const int nz, const int bi, const int bj, const int ei, const int ej) {
   const float phzl[6][7] = {
       {0.8338228784688313, 0.1775123316429260, 0.1435067013076542,
        -0.1548419114194114, 0.0000000000000000, 0.0000000000000000,
@@ -3619,42 +3568,19 @@ __global__ void dtopo_str_110(
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= 6)
     return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
+#define _f_c(i, j)                                                             \
+  f_c[(j) + align + ngsl +                                                     \
+      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f(i, j)                                                               \
   f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3(k) g3[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_c(i, j)                                                             \
-  f_c[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
-#define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
 #define _lami(i, j, k)                                                         \
   lami[(k) + align +                                                           \
        (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +             \
@@ -3663,39 +3589,62 @@ __global__ void dtopo_str_110(
   mui[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u3(i, j, k)                                                           \
+  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g_c(k) g_c[(k) + align]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _s22(i, j, k)                                                          \
+  s22[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _s33(i, j, k)                                                          \
+  s33[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s13(i, j, k)                                                          \
   s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s23(i, j, k)                                                          \
   s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s33(i, j, k)                                                          \
-  s33[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u3(i, j, k)                                                           \
-  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
   float Jii = _f_c(i, j) * _g3_c(k);
   Jii = 1.0 * 1.0 / Jii;
   float J12i = _f(i, j) * _g3_c(k);
@@ -4280,52 +4229,45 @@ __global__ void dtopo_str_110(
                       pdhzl[k][7] * _u3(i, j + 2, 7) +
                       pdhzl[k][8] * _u3(i, j + 2, 8))))) *
       f_dcrj;
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
+#undef _f_c
+#undef _g3_c
 #undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
 #undef _f_1
-#undef _f_2
-#undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
+#undef _f_2
 #undef _lami
 #undef _mui
+#undef _u2
+#undef _f2_2
+#undef _u3
+#undef _g_c
+#undef _f1_1
+#undef _u1
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
 #undef _s11
+#undef _s22
+#undef _s33
+#undef _f1_2
+#undef _f2_1
 #undef _s12
+#undef _g
 #undef _s13
-#undef _s22
+#undef _f1_c
 #undef _s23
-#undef _s33
-#undef _u1
-#undef _u2
-#undef _u3
+#undef _f2_c
 }
 
 __global__ void dtopo_str_111(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej) {
+    float *s11, float *s12, float *s13, float *s22, float *s23, float *s33,
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *lami,
+    const float *mui, const float a, const float nu, const int nx, const int ny,
+    const int nz, const int bi, const int bj, const int ei, const int ej) {
   const float phz[4] = {-0.0625000000000000, 0.5625000000000000,
                         0.5625000000000000, -0.0625000000000000};
   const float phy[4] = {-0.0625000000000000, 0.5625000000000000,
@@ -4369,43 +4311,19 @@ __global__ void dtopo_str_111(
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= nz - 12)
     return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
-
+#define _f_c(i, j)                                                             \
+  f_c[(j) + align + ngsl +                                                     \
+      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f(i, j)                                                               \
   f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3(k) g3[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_c(i, j)                                                             \
-  f_c[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
-#define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
 #define _lami(i, j, k)                                                         \
   lami[(k) + align +                                                           \
        (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +             \
@@ -4414,40 +4332,62 @@ __global__ void dtopo_str_111(
   mui[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u3(i, j, k)                                                           \
+  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g_c(k) g_c[(k) + align]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _s22(i, j, k)                                                          \
+  s22[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _s33(i, j, k)                                                          \
+  s33[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s13(i, j, k)                                                          \
   s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s23(i, j, k)                                                          \
   s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s33(i, j, k)                                                          \
-  s33[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u3(i, j, k)                                                           \
-  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
   float Jii = _f_c(i, j) * _g3_c(k + 6);
   Jii = 1.0 * 1.0 / Jii;
   float J12i = _f(i, j) * _g3_c(k + 6);
@@ -4884,53 +4824,45 @@ __global__ void dtopo_str_111(
                       pdhz[5] * _u3(i, j + 2, k + 8) +
                       pdhz[6] * _u3(i, j + 2, k + 9))))) *
       f_dcrj;
-
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
+#undef _f_c
+#undef _g3_c
 #undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
 #undef _f_1
-#undef _f_2
-#undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
+#undef _f_2
 #undef _lami
 #undef _mui
+#undef _u2
+#undef _f2_2
+#undef _u3
+#undef _g_c
+#undef _f1_1
+#undef _u1
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
 #undef _s11
+#undef _s22
+#undef _s33
+#undef _f1_2
+#undef _f2_1
 #undef _s12
+#undef _g
 #undef _s13
-#undef _s22
+#undef _f1_c
 #undef _s23
-#undef _s33
-#undef _u1
-#undef _u2
-#undef _u3
+#undef _f2_c
 }
 
 __global__ void dtopo_str_112(
-    float *__restrict__ s11, float *__restrict__ s12, float *__restrict__ s13,
-    float *__restrict__ s22, float *__restrict__ s23, float *__restrict__ s33,
-    float *__restrict__ u1, float *__restrict__ u2, float *__restrict__ u3,
-    const float *__restrict__ dcrjx, const float *__restrict__ dcrjy,
-    const float *__restrict__ dcrjz, const float *__restrict__ f,
-    const float *__restrict__ f1_1, const float *__restrict__ f1_2,
-    const float *__restrict__ f1_c, const float *__restrict__ f2_1,
-    const float *__restrict__ f2_2, const float *__restrict__ f2_c,
-    const float *__restrict__ f_1, const float *__restrict__ f_2,
-    const float *__restrict__ f_c, const float *__restrict__ g,
-    const float *__restrict__ g3, const float *__restrict__ g3_c,
-    const float *__restrict__ g_c, const float *__restrict__ lami,
-    const float *__restrict__ mui, const float a, const float nu, const int nx,
-    const int ny, const int nz, const int bi, const int bj, const int ei,
-    const int ej) {
+    float *s11, float *s12, float *s13, float *s22, float *s23, float *s33,
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *lami,
+    const float *mui, const float a, const float nu, const int nx, const int ny,
+    const int nz, const int bi, const int bj, const int ei, const int ej) {
   const float phzr[6][8] = {
       {0.0000000000000000, 0.8338228784688313, 0.1775123316429260,
        0.1435067013076542, -0.1548419114194114, 0.0000000000000000,
@@ -5055,42 +4987,19 @@ __global__ void dtopo_str_112(
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= 6)
     return;
-#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
-#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
-#define _dcrjz(k) dcrjz[(k) + align]
+#define _f_c(i, j)                                                             \
+  f_c[(j) + align + ngsl +                                                     \
+      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3_c(k) g3_c[(k) + align]
 #define _f(i, j)                                                               \
   f[(j) + align + ngsl + ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_1(i, j)                                                            \
-  f1_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_2(i, j)                                                            \
-  f1_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f1_c(i, j)                                                            \
-  f1_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_1(i, j)                                                            \
-  f2_1[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_2(i, j)                                                            \
-  f2_2[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f2_c(i, j)                                                            \
-  f2_c[(j) + align + ngsl +                                                    \
-       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _f_1(i, j)                                                             \
   f_1[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _g3(k) g3[(k) + align]
 #define _f_2(i, j)                                                             \
   f_2[(j) + align + ngsl +                                                     \
       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _f_c(i, j)                                                             \
-  f_c[(j) + align + ngsl +                                                     \
-      ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
-#define _g(k) g[(k) + align]
-#define _g3(k) g3[(k) + align]
-#define _g3_c(k) g3_c[(k) + align]
-#define _g_c(k) g_c[(k) + align]
 #define _lami(i, j, k)                                                         \
   lami[(k) + align +                                                           \
        (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +             \
@@ -5099,39 +5008,62 @@ __global__ void dtopo_str_112(
   mui[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _u2(i, j, k)                                                           \
+  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_2(i, j)                                                            \
+  f2_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u3(i, j, k)                                                           \
+  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g_c(k) g_c[(k) + align]
+#define _f1_1(i, j)                                                            \
+  f1_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _u1(i, j, k)                                                           \
+  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
+     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _dcrjx(i) dcrjx[(i) + ngsl + 2]
+#define _dcrjz(k) dcrjz[(k) + align]
+#define _dcrjy(j) dcrjy[(j) + ngsl + 2]
 #define _s11(i, j, k)                                                          \
   s11[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _s22(i, j, k)                                                          \
+  s22[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _s33(i, j, k)                                                          \
+  s33[(k) + align +                                                            \
+      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
+      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_2(i, j)                                                            \
+  f1_2[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
+#define _f2_1(i, j)                                                            \
+  f2_1[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s12(i, j, k)                                                          \
   s12[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
+#define _g(k) g[(k) + align]
 #define _s13(i, j, k)                                                          \
   s13[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s22(i, j, k)                                                          \
-  s22[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f1_c(i, j)                                                            \
+  f1_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
 #define _s23(i, j, k)                                                          \
   s23[(k) + align +                                                            \
       (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
       (2 * align + nz) * ((j) + ngsl + 2)]
-#define _s33(i, j, k)                                                          \
-  s33[(k) + align +                                                            \
-      (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) +              \
-      (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u1(i, j, k)                                                           \
-  u1[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u2(i, j, k)                                                           \
-  u2[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
-#define _u3(i, j, k)                                                           \
-  u3[(k) + align + (2 * align + nz) * ((i) + ngsl + 2) * (2 * ngsl + ny + 4) + \
-     (2 * align + nz) * ((j) + ngsl + 2)]
+#define _f2_c(i, j)                                                            \
+  f2_c[(j) + align + ngsl +                                                    \
+       ((i) + ngsl + 2) * (2 * align + 2 * ngsl + ny + 4) + 2]
   float Jii = _f_c(i, j) * _g3_c(nz - 1 - k);
   Jii = 1.0 * 1.0 / Jii;
   float J12i = _f(i, j) * _g3_c(nz - 1 - k);
@@ -5807,41 +5739,39 @@ __global__ void dtopo_str_112(
                       pdhzr[k][1] * _u3(i, j + 2, nz - 2) +
                       pdhzr[k][0] * _u3(i, j + 2, nz - 1))))) *
       f_dcrj;
-
-#undef _dcrjx
-#undef _dcrjy
-#undef _dcrjz
+#undef _f_c
+#undef _g3_c
 #undef _f
-#undef _f1_1
-#undef _f1_2
-#undef _f1_c
-#undef _f2_1
-#undef _f2_2
-#undef _f2_c
 #undef _f_1
-#undef _f_2
-#undef _f_c
-#undef _g
 #undef _g3
-#undef _g3_c
-#undef _g_c
+#undef _f_2
 #undef _lami
 #undef _mui
+#undef _u2
+#undef _f2_2
+#undef _u3
+#undef _g_c
+#undef _f1_1
+#undef _u1
+#undef _dcrjx
+#undef _dcrjz
+#undef _dcrjy
 #undef _s11
+#undef _s22
+#undef _s33
+#undef _f1_2
+#undef _f2_1
 #undef _s12
+#undef _g
 #undef _s13
-#undef _s22
+#undef _f1_c
 #undef _s23
-#undef _s33
-#undef _u1
-#undef _u2
-#undef _u3
+#undef _f2_c
 }
 
-__global__ void dtopo_init_material_111(float *__restrict__ lami,
-                                        float *__restrict__ mui,
-                                        float *__restrict__ rho, const int nx,
-                                        const int ny, const int nz) {
+__global__ void dtopo_init_material_111(float *lami, float *mui, float *rho,
+                                        const int nx, const int ny,
+                                        const int nz) {
   const int i = threadIdx.x + blockIdx.x * blockDim.x;
   if (i >= nx)
     return;
@@ -5851,13 +5781,13 @@ __global__ void dtopo_init_material_111(float *__restrict__ lami,
   const int k = threadIdx.z + blockIdx.z * blockDim.z;
   if (k >= nz)
     return;
+#define _rho(i, j, k) rho[(i)*ny * nz + (j)*nz + (k)]
 #define _lami(i, j, k) lami[(i)*ny * nz + (j)*nz + (k)]
 #define _mui(i, j, k) mui[(i)*ny * nz + (j)*nz + (k)]
-#define _rho(i, j, k) rho[(i)*ny * nz + (j)*nz + (k)]
   _rho(i, j, k) = 1.0;
   _lami(i, j, k) = 1.0;
   _mui(i, j, k) = 1.0;
+#undef _rho
 #undef _lami
 #undef _mui
-#undef _rho
 }
diff --git a/tests/topography/accuracy/cutopography_kernel.cuh b/tests/topography/accuracy/cutopography_kernel.cuh
new file mode 100644
index 0000000..1ae8395
--- /dev/null
+++ b/tests/topography/accuracy/cutopography_kernel.cuh
@@ -0,0 +1,99 @@
+#ifndef CUTOPOGRAPHY_KERNEL_H
+#define CUTOPOGRAPHY_KERNEL_H
+#include "definitions.h"
+#include <math.h>
+
+__global__ void dtopo_vel_110(
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *rho,
+    const float *s11, const float *s12, const float *s13, const float *s22,
+    const float *s23, const float *s33, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bi, const int bj,
+    const int ei, const int ej);
+__global__ void dtopo_vel_111(
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *rho,
+    const float *s11, const float *s12, const float *s13, const float *s22,
+    const float *s23, const float *s33, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bi, const int bj,
+    const int ei, const int ej);
+__global__ void dtopo_vel_112(
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *rho,
+    const float *s11, const float *s12, const float *s13, const float *s22,
+    const float *s23, const float *s33, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bi, const int bj,
+    const int ei, const int ej);
+__global__ void dtopo_buf_vel_110(
+    float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx,
+    const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1,
+    const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2,
+    const float *f2_c, const float *f_1, const float *f_2, const float *f_c,
+    const float *g, const float *g3, const float *g3_c, const float *g_c,
+    const float *rho, const float *s11, const float *s12, const float *s13,
+    const float *s22, const float *s23, const float *s33, const float *u1,
+    const float *u2, const float *u3, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bj, const int ej,
+    const int rj0);
+__global__ void dtopo_buf_vel_111(
+    float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx,
+    const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1,
+    const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2,
+    const float *f2_c, const float *f_1, const float *f_2, const float *f_c,
+    const float *g, const float *g3, const float *g3_c, const float *g_c,
+    const float *rho, const float *s11, const float *s12, const float *s13,
+    const float *s22, const float *s23, const float *s33, const float *u1,
+    const float *u2, const float *u3, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bj, const int ej,
+    const int rj0);
+__global__ void dtopo_buf_vel_112(
+    float *buf_u1, float *buf_u2, float *buf_u3, const float *dcrjx,
+    const float *dcrjy, const float *dcrjz, const float *f, const float *f1_1,
+    const float *f1_2, const float *f1_c, const float *f2_1, const float *f2_2,
+    const float *f2_c, const float *f_1, const float *f_2, const float *f_c,
+    const float *g, const float *g3, const float *g3_c, const float *g_c,
+    const float *rho, const float *s11, const float *s12, const float *s13,
+    const float *s22, const float *s23, const float *s33, const float *u1,
+    const float *u2, const float *u3, const float a, const float nu,
+    const int nx, const int ny, const int nz, const int bj, const int ej,
+    const int rj0);
+__global__ void dtopo_str_110(
+    float *s11, float *s12, float *s13, float *s22, float *s23, float *s33,
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *lami,
+    const float *mui, const float a, const float nu, const int nx, const int ny,
+    const int nz, const int bi, const int bj, const int ei, const int ej);
+__global__ void dtopo_str_111(
+    float *s11, float *s12, float *s13, float *s22, float *s23, float *s33,
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *lami,
+    const float *mui, const float a, const float nu, const int nx, const int ny,
+    const int nz, const int bi, const int bj, const int ei, const int ej);
+__global__ void dtopo_str_112(
+    float *s11, float *s12, float *s13, float *s22, float *s23, float *s33,
+    float *u1, float *u2, float *u3, const float *dcrjx, const float *dcrjy,
+    const float *dcrjz, const float *f, const float *f1_1, const float *f1_2,
+    const float *f1_c, const float *f2_1, const float *f2_2, const float *f2_c,
+    const float *f_1, const float *f_2, const float *f_c, const float *g,
+    const float *g3, const float *g3_c, const float *g_c, const float *lami,
+    const float *mui, const float a, const float nu, const int nx, const int ny,
+    const int nz, const int bi, const int bj, const int ei, const int ej);
+__global__ void dtopo_init_material_111(float *lami, float *mui, float *rho,
+                                        const int nx, const int ny,
+                                        const int nz);
+#endif
\ No newline at end of file
diff --git a/tests/topography/accuracy/cutopography_test.cu b/tests/topography/accuracy/cutopography_test.cu
new file mode 100644
index 0000000..dd8007a
--- /dev/null
+++ b/tests/topography/accuracy/cutopography_test.cu
@@ -0,0 +1,595 @@
+#include <cuda.h>
+#include <stdio.h>
+
+
+#include "cutopography_test.cuh"
+
+#define BLOCK_SIZE_X 1
+#define BLOCK_SIZE_Y 1
+#define BLOCK_SIZE_Z 32
+#define TBX 1
+#define TBY 1
+#define TBZ 32
+
+
+void topo_test_diffx_H(topo_t *T, _prec *out, const _prec *in)
+{ // Host wrapper: launches dtopo_test_diffx(out, in, ...) on stream T->stream_i.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X); // threadIdx.x runs along k in the kernel
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        dtopo_test_diffx<<<grid, block, 0, T->stream_i>>>(
+                                               out, in, 
+                                               T->off_x[1], T->off_x[2], // write range [wi0, win)
+                                               T->off_y[1], T->off_y[2],
+                                               T->off_z[1], T->off_z[2],
+                                               T->off_x[1], T->off_x[2], // read range: same as write
+                                               T->off_y[1], T->off_y[2],
+                                               T->off_z[1], T->off_z[2],
+                                               T->line, T->slice,        // write strides
+                                               T->line, T->slice         // read strides
+                                              );
+        CUCHK(cudaGetLastError());
+        // Only the launch status is checked; the stream is not synchronized here.
+        return;
+}
+
+void topo_test_cgdiffx_H(topo_t *T, _prec *out, const _prec *in)
+{ // Host wrapper: launches dtopo_test_diffx_111(out, in, nx, ny, nz) on T->stream_i.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (TBX, TBY, TBZ); // threadIdx.x -> i, threadIdx.z -> k in the kernel
+        dim3 grid ((T->nx+TBX-1)/TBX, 
+                   (T->ny+TBY-1)/TBY,
+                   (T->nz+TBZ-1)/TBZ);
+        CUCHK(cudaGetLastError()); // check for an error pending from earlier runtime calls
+        if (TOPO_DBG > 1) { 
+                printf("Grid: %d %d %d \n", grid.x, grid.y, grid.z);
+        }
+        dtopo_test_diffx_111<<<grid, block, 0, T->stream_i>>>(
+                                               out, in, 
+                                               T->nx, T->ny, T->nz);
+        CUCHK(cudaGetLastError()); // check the launch itself
+        // The stream is not synchronized here.
+        return;
+}
+
+void topo_test_diffy_H(topo_t *T, _prec *out, const _prec *in)
+{ // Host wrapper: launches dtopo_test_diffy(out, in, ...) on stream T->stream_i.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X); // threadIdx.x runs along k in the kernel
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        dtopo_test_diffy<<<grid, block, 0, T->stream_i>>>(
+                                               out, in, 
+                                               T->off_x[1], T->off_x[2], // write range [wi0, win)
+                                               T->off_y[1], T->off_y[2],
+                                               T->off_z[1], T->off_z[2],
+                                               T->off_x[1], T->off_x[2], // read range: same as write
+                                               T->off_y[1], T->off_y[2],
+                                               T->off_z[1], T->off_z[2],
+                                               T->line, T->slice,        // write strides
+                                               T->line, T->slice         // read strides
+                                              );
+        CUCHK(cudaGetLastError());
+        // Only the launch status is checked; the stream is not synchronized here.
+        return;
+}
+
+void topo_test_diffz_H(topo_t *T, _prec *out, const _prec *in)
+{ // Host wrapper: z-difference of `in` into `out` using two kernels on T->stream_i.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (TBX, TBY, TBZ);
+        dim3 grid ((T->nx+TBX-1)/TBX, 
+                   (T->ny+TBY-1)/TBY,
+                   (T->nz+TBZ-1)/TBZ);
+        CUCHK(cudaGetLastError());
+        dtopo_test_diffz_111<<<grid, block, 0, T->stream_i>>>( // writes 3 <= k <= nz-6
+                                               out, in, 
+                                               T->nx, T->ny, T->nz);
+        dtopo_test_diffz_112<<<grid, block, 0, T->stream_i>>>( // writes nz-5 <= k <= nz-1
+                                               out, in, 
+                                               T->nx, T->ny, T->nz);
+        CUCHK(cudaGetLastError());
+        // k = 0..2 is written by neither kernel; the stream is not synchronized here.
+        return;
+}
+
+void topo_test_poly_H(topo_t *T, _prec *out, const _prec *coef,
+                      const _prec *deg, const int *shift)
+{ // Host wrapper: fills `out` with the polynomial evaluated by dtopo_test_poly.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        // coef, deg and shift are indexed [0..2] on the host below, so they must be host arrays.
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        // Check for an error pending from earlier runtime calls before launching.
+        CUCHK(cudaGetLastError());
+        // Write and read ranges are identical: [off_*[1], off_*[2]) in each direction.
+        // NOTE: this launch names no stream, so it runs on the default stream
+        // rather than on T->stream_i.
+        dtopo_test_poly<<<grid, block>>>(
+                                         out, 
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[1], T->off_y[2],
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[1], T->off_y[2],
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice,
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+}
+
+void topo_test_polystr_H(topo_t *T, _prec *out, const _prec *coef,
+                      const _prec *deg, const int *shift)
+{ // Host wrapper: like topo_test_poly_H, with x/y ranges widened by ngsl/2 per side.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        // coef, deg and shift are indexed [0..2] on the host below, so they must be host arrays.
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+ngsl+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+ngsl+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        // The grid is sized for ny+ngsl and nx+ngsl points to cover the widened ranges.
+        CUCHK(cudaGetLastError());
+        // Check above is for an error pending from earlier runtime calls.
+        // NOTE: this launch names no stream, so it runs on the default stream
+        // rather than on T->stream_i.
+        dtopo_test_poly<<<grid, block>>>(
+                                         out, 
+                                         T->off_x[1]-ngsl/2, T->off_x[2]+ngsl/2,
+                                         T->off_y[1]-ngsl/2, T->off_y[2]+ngsl/2,
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1]-ngsl/2, T->off_x[2]+ngsl/2,
+                                         T->off_y[1]-ngsl/2, T->off_y[2]+ngsl/2,
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice,
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+}
+
+void topo_test_polyzbnd_H(topo_t *T, _prec *out, const _prec *coef,
+                      const _prec *deg, const int *shift)
+{ // Host wrapper: launches dtopo_test_polyzbnd over [off_*[1], off_*[2]) in x, y, z.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        // coef, deg and shift are indexed [0..2] on the host below, so they must be host arrays.
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        // Check for an error pending from earlier runtime calls before launching.
+        CUCHK(cudaGetLastError());
+        // NOTE: the launch names no stream, so it runs on the default stream.
+        dtopo_test_polyzbnd<<<grid, block>>>(
+                                         out, 
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[1], T->off_y[2],
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[1], T->off_y[2],
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice,
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+}
+
+void topo_test_polystrzbnd_H(topo_t *T, _prec *out, const _prec *coef,
+                      const _prec *deg, const int *shift)
+{ // Host wrapper: launches dtopo_test_polyzbnd with x/y ranges widened by ngsl/2 per side.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        // coef, deg and shift are indexed [0..2] on the host below, so they must be host arrays.
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+ngsl+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+ngsl+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        // The grid is sized for ny+ngsl and nx+ngsl points to cover the widened ranges.
+        CUCHK(cudaGetLastError());
+        // NOTE: the launch names no stream, so it runs on the default stream.
+        dtopo_test_polyzbnd<<<grid, block>>>(
+                                         out, 
+                                         T->off_x[1]-ngsl/2, T->off_x[2]+ngsl/2,
+                                         T->off_y[1]-ngsl/2, T->off_y[2]+ngsl/2,
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1]-ngsl/2, T->off_x[2]+ngsl/2,
+                                         T->off_y[1]-ngsl/2, T->off_y[2]+ngsl/2,
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice,
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+}
+
+void topo_test_polyf_H(topo_t *T, _prec *out, const _prec *coef, const _prec
+                *deg, const int *shift)
+{ // Host wrapper: dtopo_test_poly on T->stream_1, writing y-rows [0, ngsl) of `out`.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        dtopo_test_poly<<<grid, block, 0, T->stream_1>>>(
+                                         out,
+                                         T->off_x[1], T->off_x[2],
+                                         0, ngsl,                         // write j range
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[1], T->off_y[1] + ngsl, // read j range: first ngsl rows from off_y[1]
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice_gl,            // slice_gl, not slice, is the i-stride here
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],       // read on the host: host arrays required
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+        // Only the launch status is checked; the stream is not synchronized here.
+        return;
+}
+
+void topo_test_polyzbndf_H(topo_t *T, _prec *out, const _prec *coef, const _prec
+                *deg, const int *shift)
+{ // Host wrapper: dtopo_test_polyzbnd on T->stream_1, writing y-rows [0, ngsl) of `out`.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        dtopo_test_polyzbnd<<<grid, block, 0, T->stream_1>>>(
+                                         out,
+                                         T->off_x[1], T->off_x[2],
+                                         0, ngsl,                         // write j range
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[1], T->off_y[1] + ngsl, // read j range: first ngsl rows from off_y[1]
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice_gl,            // slice_gl, not slice, is the i-stride here
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],       // read on the host: host arrays required
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+        // Only the launch status is checked; the stream is not synchronized here.
+        return;
+}
+
+void topo_test_polyb_H(topo_t *T, _prec *out, const _prec *coef, const _prec
+                       *deg, const int *shift)
+{ // Host wrapper: dtopo_test_poly on T->stream_2, writing y-rows [0, ngsl) of `out`.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        // Evaluate the polynomial (no differentiation is performed by this kernel).
+        dtopo_test_poly<<<grid, block, 0, T->stream_2>>>(
+                                         out, 
+                                         T->off_x[1], T->off_x[2],
+                                         0, ngsl,                         // write j range
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[2] - ngsl, T->off_y[2], // read j range: last ngsl rows before off_y[2]
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice_gl,            // slice_gl, not slice, is the i-stride here
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],       // read on the host: host arrays required
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+        // Only the launch status is checked; the stream is not synchronized here.
+        return;
+}
+
+void topo_test_polyzbndb_H(topo_t *T, _prec *out, const _prec *coef, const _prec
+                       *deg, const int *shift)
+{ // Host wrapper: dtopo_test_polyzbnd on T->stream_2, writing y-rows [0, ngsl) of `out`.
+        if (TOPO_DBG) {
+                printf("launching %s(%d)\n", __func__, T->rank);
+        }
+        dim3 block (BLOCK_SIZE_Z, BLOCK_SIZE_Y, BLOCK_SIZE_X);
+        dim3 grid ((T->nz+BLOCK_SIZE_Z-1)/BLOCK_SIZE_Z, 
+                   (T->ny+BLOCK_SIZE_Y-1)/BLOCK_SIZE_Y,
+                   (T->nx+BLOCK_SIZE_X-1)/BLOCK_SIZE_X);
+        // Launch dtopo_test_polyzbnd (kernel body is defined further down in this file).
+        dtopo_test_polyzbnd<<<grid, block, 0, T->stream_2>>>(
+                                         out, 
+                                         T->off_x[1], T->off_x[2],
+                                         0, ngsl,                         // write j range
+                                         T->off_z[1], T->off_z[2],
+                                         T->off_x[1], T->off_x[2],
+                                         T->off_y[2] - ngsl, T->off_y[2], // read j range: last ngsl rows before off_y[2]
+                                         T->off_z[1], T->off_z[2],
+                                         T->nx, T->ny, T->nz,
+                                         T->line, T->slice_gl,            // slice_gl, not slice, is the i-stride here
+                                         T->coord[0], T->coord[1],
+                                         coef[0], coef[1], coef[2],       // read on the host: host arrays required
+                                         deg[0], deg[1], deg[2],
+                                         shift[0], shift[1], shift[2]
+                                         );
+        CUCHK(cudaGetLastError());
+        // Only the launch status is checked; the stream is not synchronized here.
+        return;
+}
+
+__global__ void dtopo_test_diffx(_prec *xx, const _prec *u1,
+                                    const int wi0, const int win,
+                                    const int wj0, const int wjn,
+                                    const int wk0, const int wkn,
+                                    const int ri0, const int rin,
+                                    const int rj0, const int rjn,
+                                    const int rk0, const int rkn,
+                                    const int wline, const int wslice,
+                                    const int rline, const int rslice)
+// Test kernel: xx(wi) = dx[0]*(u1(ri+2) - u1(ri-1)) + dx[1]*(u1(ri+1) - u1(ri)).
+{
+     const _prec dx[2] = {-0.0416666666666667, 1.1250000000000000}; // -1/24, 9/8
+     // Write (w*) and read (r*) indices share the thread offset; each has its own origin.
+     const int wk = threadIdx.x + blockIdx.x*blockDim.x + wk0;
+     const int wj = threadIdx.y + blockIdx.y*blockDim.y + wj0;
+     const int wi = threadIdx.z + blockIdx.z*blockDim.z + wi0;
+     const int rk = threadIdx.x + blockIdx.x*blockDim.x + rk0; // was wk0: read origin was ignored
+     const int rj = threadIdx.y + blockIdx.y*blockDim.y + rj0; // was wj0
+     const int ri = threadIdx.z + blockIdx.z*blockDim.z + ri0; // was wi0
+     if (wk >= wkn || wj >= wjn || wi >= win) {
+             return;
+     }
+     if (rk >= rkn || rj >= rjn || ri >= rin) {
+             return;
+     }
+     // u1 is read at ri-1 .. ri+2, so the caller must provide that halo along i.
+     int pos = wk + wline*wj + wslice*wi;
+     xx[pos] = dx[0]*( u1[rk + rline*rj + rslice*(ri + 2)] 
+                     - u1[rk + rline*rj + rslice*(ri - 1)]
+                     )
+             + dx[1]*(  u1[rk + rline*rj + rslice*(ri + 1)]       
+                      - u1[rk + rline*rj + rslice*(ri + 0)] 
+                     );                                   
+}
+
+__global__ void dtopo_test_diffx_111(_prec *xx, const _prec *u1, const int nx, const int ny, const int nz)
+{ // Test kernel: 4-point difference of u1 along i, one thread per (i, j, k) with i<nx, j<ny, k<nz.
+     const _prec dx[4] = {0.0416666666666667, -1.1250000000000000, 1.1250000000000000, -0.0416666666666667};
+     const int i = threadIdx.x + blockIdx.x*blockDim.x;
+     if ( i >= nx) return;
+     const int j = threadIdx.y + blockIdx.y*blockDim.y;
+     if ( j >= ny) return;
+     const int k = threadIdx.z + blockIdx.z*blockDim.z;
+     if ( k >= nz) return;
+     xx[align + k + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2)] = 
+           + dx[0]*u1[align + k + (2*align + nz)*(i + ngsl + 1)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2)] 
+           + dx[1]*u1[align + k + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2)] 
+           + dx[2]*u1[align + k + (2*align + nz)*(i + ngsl + 3)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2)] 
+           + dx[3]*u1[align + k + (2*align + nz)*(i + ngsl + 4)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2)];
+     // Output sits at i-offset ngsl+2; inputs at ngsl+1..ngsl+4, i.e. i-1 .. i+2 relative to it.
+}
+
+
+__global__ void dtopo_test_diffy(_prec *yy, const _prec *v1,
+                                    const int wi0, const int win,
+                                    const int wj0, const int wjn,
+                                    const int wk0, const int wkn,
+                                    const int ri0, const int rin,
+                                    const int rj0, const int rjn,
+                                    const int rk0, const int rkn,
+                                    const int wline, const int wslice,
+                                    const int rline, const int rslice)
+// Test kernel: yy(wj) = dy[0]*(v1(rj+2) - v1(rj-1)) + dy[1]*(v1(rj+1) - v1(rj)).
+{
+     const _prec dy[2] = {-0.0416666666666667, 1.1250000000000000}; // -1/24, 9/8
+     // Write (w*) and read (r*) indices share the thread offset; each has its own origin.
+     const int wk = threadIdx.x + blockIdx.x*blockDim.x + wk0;
+     const int wj = threadIdx.y + blockIdx.y*blockDim.y + wj0;
+     const int wi = threadIdx.z + blockIdx.z*blockDim.z + wi0;
+     const int rk = threadIdx.x + blockIdx.x*blockDim.x + rk0; // was wk0: read origin was ignored
+     const int rj = threadIdx.y + blockIdx.y*blockDim.y + rj0; // was wj0
+     const int ri = threadIdx.z + blockIdx.z*blockDim.z + ri0; // was wi0
+     if (wk >= wkn || wj >= wjn || wi >= win) {
+             return;
+     }
+     if (rk >= rkn || rj >= rjn || ri >= rin) {
+             return;
+     }
+     // v1 is read at rj-1 .. rj+2, so the caller must provide that halo along j.
+     int pos = wk + wline*wj + wslice*wi;
+     yy[pos] = dy[0]*( v1[rk + rline*(rj + 2) + rslice*ri] 
+                     - v1[rk + rline*(rj - 1) + rslice*ri]
+                     )
+             + dy[1]*(  v1[rk + rline*(rj + 1) + rslice*ri]       
+                      - v1[rk + rline*(rj + 0) + rslice*ri] 
+                     );                                   
+}
+__global__ void dtopo_test_diffz_111(_prec *xz, const _prec *u1, const int nx, const int ny, const int nz)
+{ // Test kernel: 4-point difference of u1 along k, written only for 3 <= k <= nz-6.
+     const _prec dz[4] = {0.0416666666666667, -1.1250000000000000, 1.1250000000000000, -0.0416666666666667};
+     const int i = threadIdx.x + blockIdx.x*blockDim.x;
+     if ( i >= nx) return;
+     const int j = threadIdx.y + blockIdx.y*blockDim.y;
+     if ( j >= ny) return;
+     const int k = threadIdx.z + blockIdx.z*blockDim.z;
+     if ( k >= nz - 5) return;
+     // Hack used to only update the interior point for which there is data.
+     if ( k <= 2) return;
+     #define _xz(p,q,r) xz[align + (r) + (2*align + nz)*((p) + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*((q) + ngsl + 2)]
+     #define _u1(p,q,r) u1[align + (r) + (2*align + nz)*((p) + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*((q) + ngsl + 2)]
+     // Both accessors use the same padded layout; the stencil reads u1 at k-1 .. k+2.
+     _xz(i,j,k) = dz[0]*_u1(i,j,k-1) + dz[1]*_u1(i,j,k) \
+                + dz[2]*_u1(i,j,k+1) + dz[3]*_u1(i,j,k+2);
+     // The accessor macros are local to this kernel and are removed again below.
+     #undef _xz
+     #undef _u1
+     
+}
+
+__global__ void dtopo_test_diffz_112(_prec *xz, const _prec *u1, const int nx, const int ny, const int nz)
+{ // Test kernel: for k = 0..4 writes xz at z = nz-1-k from u1 at z = nz-1 .. nz-6 weighted by dzr[k][0..5].
+     const _prec dzr[5][6] = {{0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.0000000000000000, 0.0000000000000000}, {2.4843703320382104, -2.6581943725716441, 0.1054629150477628, 0.0683611254856712, 0.0000000000000000, 0.0000000000000000}, {0.0788758473205719, 0.8521077862739277, -0.9014051908492852, -0.0295784427452145, 0.0000000000000000, 0.0000000000000000}, {-0.0147185348696016, -0.0162224835422866, 1.1130610406813668, -1.1259397586922681, 0.0438197364227896, 0.0000000000000000}, {-0.0040598373854470, 0.0051290309438727, -0.0391885057638776, 1.1187625510387915, -1.1222064403269296, 0.0415632014935900}};
+     const int i = threadIdx.x + blockIdx.x*blockDim.x;
+     if ( i >= nx) return;
+     const int j = threadIdx.y + blockIdx.y*blockDim.y;
+     if ( j >= ny) return;
+     const int k = threadIdx.z + blockIdx.z*blockDim.z;
+     if ( k >= 5) return;
+     xz[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 1 - k] = 
+     dzr[k][5]*u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 6] + dzr[k][4]*u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 5] + dzr[k][3]*u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 4] + dzr[k][2]*u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 3] + dzr[k][1]*u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 2] + dzr[k][0]*u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 1];
+     // Row dzr[0] is all zeros, so the k = 0 output (z = nz-1) is always 0. Debug read-back follows.
+     _prec out = xz[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 1 - k];
+     if (TOPO_DBG > 1 && i == 10 && j == 10) {
+             printf("out[%d] = %g in = %g %g %g %g %g %g \n", k, out,
+                             u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 6],
+                             u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 5],
+                             u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 4],
+                             u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 3],
+                             u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 2],
+                             u1[align + nz + (2*align + nz)*(i + ngsl + 2)*(2*ngsl + ny + 4) + (2*align + nz)*(j + ngsl + 2) - 1]
+                             );
+     }
+     // The debug print fires only for the single column i == 10, j == 10.
+}
+
+__global__ void dtopo_test_poly(_prec *out, 
+                                const int wi0, const int win,
+                                const int wj0, const int wjn,
+                                const int wk0, const int wkn,
+                                const int ri0, const int rin,
+                                const int rj0, const int rjn,
+                                const int rk0, const int rkn,
+                                const int nx, const int ny, const int nz,
+                                const int line, const int slice,
+                                const int rx, const int ry,
+                                const _prec a0, const _prec a1, const _prec a2,
+                                const _prec p0, const _prec p1, const _prec p2, 
+                                const _prec s0, const _prec s1, const _prec s2)
+{ // Test kernel: out(w) = a0*X^p0 + a1*Y^p1 + a2*Z^p2 with X, Y, Z built from the read indices.
+     // Indices used for output
+     const int wk = threadIdx.x + blockIdx.x*blockDim.x + wk0;
+     if ( wk >= wkn) return;
+     const int wj = threadIdx.y + blockIdx.y*blockDim.y + wj0;
+     if ( wj >= wjn) return;
+     const int wi = threadIdx.z + blockIdx.z*blockDim.z + wi0;
+     if ( wi >= win) return;
+     // The r* indices are not used to read memory; they only supply the coordinates below.
+     // Indices used for input
+     const int rk = threadIdx.x + blockIdx.x*blockDim.x + rk0;
+     if ( rk >= rkn) return;
+     const int rj = threadIdx.y + blockIdx.y*blockDim.y + rj0;
+     if ( rj >= rjn) return;
+     const int ri = threadIdx.z + blockIdx.z*blockDim.z + ri0;
+     if ( ri >= rin) return;
+     // X = ri + nx*rx - s0/2, Y = rj + ny*ry - s1/2, Z = rk - s2/2; nz is not used.
+     const int pos = wk + wj*line + wi*slice;
+     out[pos] = a0*pow((_prec)(ri + nx*rx - 0.5*s0), (_prec)p0) 
+              + a1*pow((_prec)(rj + ny*ry - 0.5*s1), (_prec)p1) 
+              + a2*pow((_prec)(rk         - 0.5*s2), (_prec)p2);
+}
+
+
+/*
+ * Test kernel: like a separable polynomial fill, but with special handling of
+ * the z-term at the first and last read index in z.
+ *
+ * Write indices (wi, wj, wk) and read indices (ri, rj, rk) are derived from
+ * the same thread/block coordinates with offsets (wi0, wj0, wk0) and
+ * (ri0, rj0, rk0). A thread does nothing if its write index reaches
+ * (win, wjn, wkn) or its read index reaches (rin, rjn, rkn).
+ *
+ * The value stored at out[wk + wj*line + wi*slice] is
+ *   a0*(ri + nx*rx - 0.5*s0)^p0 + a1*(rj + ny*ry - 0.5*s1)^p1 + a2*zkp
+ * where zkp is chosen by the first matching rule:
+ *   rk == rkn - 1 and s2 == 1  ->  (rkn - 2)^p2
+ *   rk == rk0                  ->  rk^p2
+ *   rk == rkn - 1 and s2 == 0  ->  0
+ *   otherwise                  ->  (rk - 0.5*s2)^p2
+ *
+ * `nz` is accepted but not used by this kernel.
+ */
+__global__ void dtopo_test_polyzbnd(_prec *out, 
+                                    const int wi0, const int win,
+                                    const int wj0, const int wjn,
+                                    const int wk0, const int wkn,
+                                    const int ri0, const int rin,
+                                    const int rj0, const int rjn,
+                                    const int rk0, const int rkn,
+                                    const int nx, const int ny, const int nz,
+                                    const int line, const int slice,
+                                    const int rx, const int ry,
+                                    const _prec a0, const _prec a1, const _prec a2,
+                                    const _prec p0, const _prec p1, const _prec p2, 
+                                    const _prec s0, const _prec s1, const _prec s2)
+{
+     // Write (output) indices; leave if any of them is past its upper bound.
+     const int wk = threadIdx.x + blockIdx.x*blockDim.x + wk0;
+     const int wj = threadIdx.y + blockIdx.y*blockDim.y + wj0;
+     const int wi = threadIdx.z + blockIdx.z*blockDim.z + wi0;
+     if (wk >= wkn || wj >= wjn || wi >= win) return;
+
+     // Read (input) indices; same thread coordinates, different offsets.
+     const int rk = threadIdx.x + blockIdx.x*blockDim.x + rk0;
+     const int rj = threadIdx.y + blockIdx.y*blockDim.y + rj0;
+     const int ri = threadIdx.z + blockIdx.z*blockDim.z + ri0;
+     if (rk >= rkn || rj >= rjn || ri >= rin) return;
+
+/*
+ *                                       n-4  n-3   n-2  n-1  
+ *   z    ------o-----o-|---o-----o--|---o----o-----o---*
+ *                      |            |     
+ *                      |            |     
+ *   zh   ---o-----o----|o-----o-----|^----o-----o--o
+ *                      |            |n-4  n-3   n-2 n-1
+ *
+ *           Bottom           Interior           Top 
+ */
+
+     // The order of the tests matters: the first matching rule wins.
+     const bool last_in_z = (rk == rkn - 1);
+     _prec zkp = 0.0;
+     if (last_in_z && s2 == 1) {
+        zkp = pow((_prec)(rkn - 2), (_prec)p2);
+     }
+     else if (rk == rk0) {
+        zkp = pow((_prec)rk, (_prec)p2);
+     }
+     else if (last_in_z && s2 == 0) {
+        zkp = 0;
+     }
+     else {
+        zkp = pow((_prec)(rk- 0.5*s2), (_prec)p2);
+     }
+
+     const int offset = wk + wj*line + wi*slice;
+     out[offset] = a0*pow((_prec)(ri + nx*rx - 0.5*s0),(_prec)p0) 
+                 + a1*pow((_prec)(rj + ny*ry - 0.5*s1),(_prec)p1) 
+                 + a2*zkp;
+}
diff --git a/tests/topography/accuracy/cutopography_test.cuh b/tests/topography/accuracy/cutopography_test.cuh
new file mode 100644
index 0000000..7c82dee
--- /dev/null
+++ b/tests/topography/accuracy/cutopography_test.cuh
@@ -0,0 +1,93 @@
+#ifndef _TOPOGRAPHY_TEST_H
+#define _TOPOGRAPHY_TEST_H
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void topo_test_diffx_H(topo_t *T, _prec *out, const _prec *in);
+//This function differs from the previous in that it calls an auto-generated
+// compute kernel
+void topo_test_cgdiffx_H(topo_t *T, _prec *out, const _prec *in);
+void topo_test_diffy_H(topo_t *T, _prec *out, const _prec *in);
+void topo_test_diffz_H(topo_t *T, _prec *out, const _prec *in);
+// Construct polynomial on velocity grid
+void topo_test_poly_H(topo_t *T, _prec *out, const _prec *coef,
+                      const _prec *deg, const int *shift);
+// Construct polynomial on stress grid
+void topo_test_polystr_H(topo_t *T, _prec *out, const _prec *coef,
+                      const _prec *deg, const int *shift);
+void topo_test_polyzbnd_H(topo_t *T, _prec *out, const _prec *coef,
+                       const _prec *deg, const int *shift);
+void topo_test_polyf_H(topo_t *T, _prec *out, const _prec *coef,
+                       const _prec *deg, const int *shift);
+void topo_test_polyzbndf_H(topo_t *T, _prec *out, const _prec *coef,
+                       const _prec *deg, const int *shift);
+void topo_test_polyb_H(topo_t *T, _prec *out, const _prec *coef, 
+                       const _prec *deg, const int *shift);
+void topo_test_polyzbndb_H(topo_t *T, _prec *out, const _prec *coef, 
+                       const _prec *deg, const int *shift);
+void topo_test_polystrzbnd_H(topo_t *T, _prec *out, const _prec *coef,
+                          const _prec *deg, const int *shift);
+#ifdef __cplusplus
+}
+#endif
+
+__global__ void dtopo_test_diffx(_prec *xx, const _prec *u1,
+                                    const int wi0, const int win,
+                                    const int wj0, const int wjn,
+                                    const int wk0, const int wkn,
+                                    const int ri0, const int rin,
+                                    const int rj0, const int rjn,
+                                    const int rk0, const int rkn,
+                                    const int wline, const int wslice,
+                                    const int rline, const int rslice);
+
+__global__ void dtopo_test_diffx_111(_prec *xx, const _prec *u1, 
+                                     const int nx, const int ny, const int nz);
+
+__global__ void dtopo_test_diffy(_prec *xx, const _prec *u1,
+                                    const int wi0, const int win,
+                                    const int wj0, const int wjn,
+                                    const int wk0, const int wkn,
+                                    const int ri0, const int rin,
+                                    const int rj0, const int rjn,
+                                    const int rk0, const int rkn,
+                                    const int wline, const int wslice,
+                                    const int rline, const int rslice);
+
+__global__ void dtopo_test_diffz_111(_prec *xz, const _prec *u1, const int nx, const int ny, const int nz);
+__global__ void dtopo_test_diffz_112(_prec *xz, const _prec *u1, const int nx, const int ny, const int nz);
+
+__global__ void dtopo_test_poly(_prec *out, 
+                                const int wi0, const int win,
+                                const int wj0, const int wjn,
+                                const int wk0, const int wkn,
+                                const int ri0, const int rin,
+                                const int rj0, const int rjn,
+                                const int rk0, const int rkn,
+                                const int nx, const int ny, const int nz,
+                                const int line, const int slice,
+                                const int rx, const int ry,
+                                const _prec a0, const _prec a1, const _prec a2,
+                                const _prec p0, const _prec p1, const _prec p2, 
+                                const _prec s0, const _prec s1, const _prec s2);
+
+__global__ void dtopo_test_polyzbnd(_prec *out, 
+                                    const int wi0, const int win,
+                                    const int wj0, const int wjn,
+                                    const int wk0, const int wkn,
+                                    const int ri0, const int rin,
+                                    const int rj0, const int rjn,
+                                    const int rk0, const int rkn,
+                                    const int nx, const int ny, const int nz,
+                                    const int line, const int slice,
+                                    const int rx, const int ry,
+                                    const _prec a0, const _prec a1, const _prec a2,
+                                    const _prec p0, const _prec p1, const _prec p2, 
+                                    const _prec s0, const _prec s1, const _prec s2);
+
+#endif
diff --git a/tests/topography/accuracy/data/topography_0.bin b/tests/topography/accuracy/data/topography_0.bin
new file mode 100644
index 0000000..6b9bfc6
Binary files /dev/null and b/tests/topography/accuracy/data/topography_0.bin differ
diff --git a/tests/topography/accuracy/data/topography_1.bin b/tests/topography/accuracy/data/topography_1.bin
new file mode 100644
index 0000000..97538a6
Binary files /dev/null and b/tests/topography/accuracy/data/topography_1.bin differ
diff --git a/tests/topography/accuracy/data/topography_2.bin b/tests/topography/accuracy/data/topography_2.bin
new file mode 100644
index 0000000..9be1218
Binary files /dev/null and b/tests/topography/accuracy/data/topography_2.bin differ
diff --git a/tests/topography/accuracy/data/topography_3.bin b/tests/topography/accuracy/data/topography_3.bin
new file mode 100644
index 0000000..78b1f6c
Binary files /dev/null and b/tests/topography/accuracy/data/topography_3.bin differ
diff --git a/tests/topography/accuracy/functions.c b/tests/topography/accuracy/functions.c
new file mode 100644
index 0000000..cf0f686
--- /dev/null
+++ b/tests/topography/accuracy/functions.c
@@ -0,0 +1,261 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+
+#include <grid/grid_3d.h>
+#include "functions.h"
+
+/*
+ * Fill a grid with grid point values for a given axis.
+ *
+ * Only the term belonging to `axis` (0 = x, 1 = y, 2 = z) contributes; the
+ * other two are multiplied by zero. Values are scaled by `grid.gridspacing`.
+ * The loops cover [grid.offset1, grid.offset2) in each direction and write to
+ * out[k + j*grid.line + i*grid.slice].
+ *
+ * The z-coordinate index is chosen by the first matching rule:
+ *   k is the last index and shift.z == 1  ->  offset2.z - 2
+ *   k is the first index                  ->  offset1.z
+ *   k is the last index and shift.z == 0  ->  offset1.z
+ *   otherwise                             ->  k - 0.5*shift.z
+ *
+ * `axis` must be 0, 1 or 2; it is used to index a 3-element array unchecked.
+ *
+ * Example:
+ * `fcn_fill_grid(out, grid, shift, 0)` constructs the x-axis.
+ */
+void fcn_fill_grid(_prec *out, const fcn_grid_t grid, const int3_t shift,
+                   const int axis) {
+        const int ibeg = grid.offset1.x;
+        const int jbeg = grid.offset1.y;
+        const int kbeg = grid.offset1.z;
+        const int iend = grid.offset2.x;
+        const int jend = grid.offset2.y;
+        const int kend = grid.offset2.z;
+
+        // Selector: 1 for the requested axis, 0 for the other two.
+        _prec sel[3] = {0.0, 0.0, 0.0};
+        sel[axis] = 1;
+
+        const _prec spacing = grid.gridspacing;
+        // Extra unit offset applied only to a shifted x-axis.
+        const int xextra = (axis == 0 && shift.x) ? 1 : 0;
+
+        for (int i = ibeg; i < iend; ++i) {
+        for (int j = jbeg; j < jend; ++j) {
+        for (int k = kbeg; k < kend; ++k) {
+                const int is_last = (k == kend - 1);
+                _prec zidx = 0.0;
+                if (is_last && shift.z == 1) {
+                        zidx = kend - 2;
+                }
+                else if (k == kbeg) {
+                        zidx = kbeg;
+                }
+                else if (is_last && shift.z == 0) {
+                        zidx = kbeg;
+                }
+                else {
+                        zidx = k - 0.5*shift.z;
+                }
+
+                const int idx = k + j*grid.line + i*grid.slice;
+                out[idx] =
+                    spacing * sel[0] * (i - ibeg + grid.coordinate.x * grid.inner_size.x -
+                                0.5 * shift.x + xextra) +
+                    spacing * sel[1] * (j - jbeg + grid.coordinate.y * grid.inner_size.y -
+                                0.5 * shift.y) +
+                    spacing * sel[2] * (zidx - kbeg);
+        }
+        }
+        }
+}
+
+
+/*
+ * Add the constant `shift` to every value of `in` and store it in `out`.
+ *
+ * Covers grid.size points per direction starting at grid.offset1; the array
+ * index is k + j*grid.line + i*grid.slice for absolute indices (i, j, k).
+ */
+void fcn_shift(_prec *out, _prec *in, const fcn_grid_t grid, const _prec shift)
+{
+        const int iend = grid.offset1.x + grid.size.x;
+        const int jend = grid.offset1.y + grid.size.y;
+        const int kend = grid.offset1.z + grid.size.z;
+
+        for (int i = grid.offset1.x; i < iend; ++i) {
+        for (int j = grid.offset1.y; j < jend; ++j) {
+        for (int k = grid.offset1.z; k < kend; ++k) {
+                const int idx = k + j * grid.line + i * grid.slice;
+                out[idx] = in[idx] + shift;
+        }
+        }
+        }
+}
+
+/*
+ * Raise every value of `in` to the power `exponent` and store it in `out`.
+ *
+ * Covers grid.size points per direction starting at grid.offset1; the array
+ * index is k + j*grid.line + i*grid.slice for absolute indices (i, j, k).
+ */
+void fcn_power(_prec *out, _prec *in, const fcn_grid_t grid,
+               const _prec exponent) {
+        const int iend = grid.offset1.x + grid.size.x;
+        const int jend = grid.offset1.y + grid.size.y;
+        const int kend = grid.offset1.z + grid.size.z;
+
+        for (int i = grid.offset1.x; i < iend; ++i) {
+        for (int j = grid.offset1.y; j < jend; ++j) {
+        for (int k = grid.offset1.z; k < kend; ++k) {
+                const int idx = k + j * grid.line + i * grid.slice;
+                out[idx] = pow(in[idx], exponent);
+        }
+        }
+        }
+}
+
+/*
+ * Rescale `in` so that its value at the first point of the index box maps to
+ * 0 and its value at the last point maps to 1, and store the result in `out`.
+ *
+ * The box is [offset1, offset2) in x and y and
+ * [offset1.z, offset2.z - exclude_top_row) in z.
+ *
+ * The two reference values are read before anything is written, so the call
+ * is also valid in place (out == in).
+ *
+ * No check is made for in[last] == in[first]; in that case the scale factor
+ * is the result of a division by zero.
+ */
+void fcn_normalize(_prec *out, _prec *in, const fcn_grid_t grid)
+{
+
+        int i1 = grid.offset1.x;
+        int j1 = grid.offset1.y;
+        int k1 = grid.offset1.z;
+        int i2 = grid.offset2.x;
+        int j2 = grid.offset2.y;
+        int k2 = grid.offset2.z - grid.exclude_top_row;
+        int pos1 = k1 + j1 * grid.line + i1 * grid.slice;
+        int pos2 = (k2 - 1) + (j2 - 1) * grid.line + (i2 - 1) * grid.slice;
+
+        // Cache the lower reference value: out[pos1] is the first element
+        // written, so re-reading in[pos1] inside the loop would pick up the
+        // already normalized value when out and in alias.
+        const _prec first = in[pos1];
+        const _prec normalization = 1.0/(in[pos2] - first);
+        for (int i = i1; i < i2; ++i) {
+        for (int j = j1; j < j2; ++j) {
+        for (int k = k1; k < k2; ++k) {
+                int pos = k + j * grid.line + i * grid.slice;
+                out[pos] = (in[pos] - first)*normalization;
+        }
+        }
+        }
+}
+
+
+/*
+ * Store the pointwise difference in1 - in2 in `out`.
+ *
+ * The index box is [offset1, offset2) in x and y and
+ * [offset1.z, offset2.z - exclude_top_row) in z.
+ */
+void fcn_difference(_prec *out, _prec *in1, _prec *in2, const fcn_grid_t grid)
+{
+        const int kend = grid.offset2.z - grid.exclude_top_row;
+
+        for (int i = grid.offset1.x; i < grid.offset2.x; ++i) {
+        for (int j = grid.offset1.y; j < grid.offset2.y; ++j) {
+        for (int k = grid.offset1.z; k < kend; ++k) {
+                const int idx = k + j * grid.line + i * grid.slice;
+                out[idx] = in1[idx] - in2[idx];
+        }
+        }
+        }
+}
+
+/*
+ * Evaluate `fcn` at every grid point and store the result in `out`.
+ *
+ * For each point, `fcn` receives the values of `x`, `y` and `z` at that point
+ * together with the unchanged `properties` pointer. Covers grid.size points
+ * per direction starting at grid.offset1; the array index is
+ * k + j*grid.line + i*grid.slice for absolute indices (i, j, k).
+ */
+void fcn_apply(_prec *out, fcn_gridp fcn, const _prec *x, const _prec *y,
+               const _prec *z, const _prec *properties, const fcn_grid_t grid) {
+        const int iend = grid.offset1.x + grid.size.x;
+        const int jend = grid.offset1.y + grid.size.y;
+        const int kend = grid.offset1.z + grid.size.z;
+
+        for (int i = grid.offset1.x; i < iend; ++i) {
+        for (int j = grid.offset1.y; j < jend; ++j) {
+        for (int k = grid.offset1.z; k < kend; ++k) {
+                const int idx = k + j * grid.line + i * grid.slice;
+                out[idx] = fcn(x[idx], y[idx], z[idx], properties); 
+        }
+        }
+        }
+}
+
+/*
+ * Store the pointwise absolute value of `in` in `out`.
+ *
+ * The index box is [offset1, offset2) in x and y and
+ * [offset1.z, offset2.z - exclude_top_row) in z.
+ */
+void fcn_abs(_prec *out, _prec *in, const fcn_grid_t grid)
+{
+        const int kend = grid.offset2.z - grid.exclude_top_row;
+
+        for (int i = grid.offset1.x; i < grid.offset2.x; ++i) {
+        for (int j = grid.offset1.y; j < grid.offset2.y; ++j) {
+        for (int k = grid.offset1.z; k < kend; ++k) {
+                const int idx = k + j * grid.line + i * grid.slice;
+                out[idx] = fabs(in[idx]);
+        }
+        }
+        }
+}
+
+/*
+ * Set out[k + j*line + i*slice] to args[0] for all i in [i0, in),
+ * j in [j0, jn) and k in [k0, kn). Only the first entry of `args` is read.
+ */
+void fcn_constant(_prec *out, 
+                  const int i0, const int in, 
+                  const int j0, const int jn, 
+                  const int k0, const int kn, 
+                  const int line, const int slice,
+                  const _prec *args)
+{
+        for (int i = i0; i < in; ++i) {
+        for (int j = j0; j < jn; ++j) {
+                // Start of the k-run for this (i, j) pair.
+                const int row = j*line + i*slice;
+                for (int k = k0; k < kn; ++k) {
+                        out[k + row] = args[0];
+                }
+        }
+        }
+}
+/*
+ * Fill out[k + j*line + i*slice] for i in [i0, in), j in [j0, jn),
+ * k in [k0, kn) with
+ *   a0*(i + rx*nx - 0.5*s0)^p0
+ * + a1*(j + ry*ny - 0.5*s1)^p1
+ * + a2*(k         - 0.5*s2)^p2.
+ *
+ * Layout of `args` (13 entries are read):
+ *   [0..2]  a0, a1, a2
+ *   [3..5]  p0, p1, p2
+ *   [6..8]  s0, s1, s2
+ *   [9..12] nx, ny, rx, ry (each converted to int)
+ */
+void fcn_poly(_prec *out, 
+              const int i0, const int in, 
+              const int j0, const int jn, 
+              const int k0, const int kn, 
+              const int line, const int slice,
+              const _prec *args)
+{
+        const _prec a0 = args[0], a1 = args[1], a2 = args[2];
+        const _prec p0 = args[3], p1 = args[4], p2 = args[5];
+        const _prec s0 = args[6], s1 = args[7], s2 = args[8];
+        const int nx = (int)args[9];
+        const int ny = (int)args[10];
+        const int rx = (int)args[11];
+        const int ry = (int)args[12];
+
+        for (int i = i0; i < in; ++i) {
+        for (int j = j0; j < jn; ++j) {
+        for (int k = k0; k < kn; ++k) {
+                const int idx = k + j*line + i*slice;
+                out[idx] =  a0*pow(i + rx*nx - 0.5*s0, p0)
+                          + a1*pow(j + ry*ny - 0.5*s1, p1)
+                          + a2*pow(k         - 0.5*s2, p2);
+        }
+        }
+        }
+}
+
+/* TODO: Deprecate this function. Use `fcn_polynomial` instead.
+ *
+ * Fill out[k + j*line + i*slice] for i in [i0, in), j in [j0, jn),
+ * k in [k0, kn) with
+ *   a0*(i + rx*nx + 0.5*s0)^p0 + a1*(j + ry*ny + 0.5*s1)^p1 + a2*zkp
+ * where zkp is chosen by the first matching rule:
+ *   k == kn - 1 and s2 == 1  ->  (kn - 2)^p2
+ *   k == k0                  ->  k0^p2
+ *   k == kn - 1 and s2 == 0  ->  0
+ *   otherwise                ->  (k - 0.5*s2)^p2
+ *
+ * Layout of `args` (13 entries are read): a0..a2, p0..p2, s0..s2, then
+ * nx, ny, rx, ry (the last four converted to int).
+ *
+ * NOTE(review): the x and y terms add 0.5*s0 / 0.5*s1, whereas `fcn_poly` in
+ * this file subtracts them. Confirm which sign is intended.
+ */
+void fcn_polybndz(_prec *out, 
+                  const int i0, const int in, 
+                  const int j0, const int jn, 
+                  const int k0, const int kn, 
+                  const int line, const int slice,
+                  const _prec *args)
+{
+        const _prec a0 = args[0];
+        const _prec a1 = args[1];
+        const _prec a2 = args[2];
+        const _prec p0 = args[3];
+        const _prec p1 = args[4];
+        const _prec p2 = args[5];
+        const _prec s0 = args[6];
+        const _prec s1 = args[7];
+        const _prec s2 = args[8];
+        const int   nx = (int)args[9];
+        const int   ny = (int)args[10];
+        const int   rx = (int)args[11];
+        const int   ry = (int)args[12];
+        for (int i = i0; i < in; ++i) {
+        for (int j = j0; j < jn; ++j) {
+        for (int k = k0; k < kn; ++k) {
+                _prec zkp = 0.0; 
+                if (k == kn - 1 && s2 == 1) { /* last k, shifted in z */
+                      zkp = pow(kn - 2, p2);
+                } 
+                else if ( k == k0) { /* first k */
+                      zkp = pow(k0, p2);
+                }
+                else if ( k == kn - 1 && s2 == 0) { /* last k, not shifted in z */
+                      zkp = 0;
+                } 
+                else { /* all remaining k */
+                   zkp = pow(k - 0.5*s2, p2);
+                }
+                int pos = k + j*line + i*slice; 
+                out[pos] =  a0*pow(i + rx*nx + 0.5*s0, p0)
+                          + a1*pow(j + ry*ny + 0.5*s1, p1)
+                          + a2*zkp;
+        }
+        }
+        }
+}
diff --git a/tests/topography/accuracy/functions.h b/tests/topography/accuracy/functions.h
new file mode 100644
index 0000000..f83aaa0
--- /dev/null
+++ b/tests/topography/accuracy/functions.h
@@ -0,0 +1,71 @@
+#ifndef FUNCTIONS_H
+#define FUNCTIONS_H
+
+#include "cuda.h"
+
+typedef void (*fcnp)(_prec *,
+                     const int, const int, 
+                     const int, const int, 
+                     const int, const int, 
+                     const int, const int, 
+                     const _prec *);
+
+typedef _prec (*fcn_gridp)(const _prec, const _prec, const _prec, const _prec *);
+
+//FIXME: remove
+//typedef struct
+//{
+//        int3_t size;
+//        int3_t inner_size;
+//        int3_t mem;
+//        int3_t coordinate;
+//        int3_t offset1;
+//        int3_t offset2;
+//        int3_t alignment;
+//        _prec gridspacing;
+//        int num_bytes;
+//        int line;
+//        int slice;
+//        int exclude_top_row;
+//} fcn_grid_t;
+
+
+void fcn_fill_grid(_prec *out, const fcn_grid_t grid, const int3_t shift, const int axis);
+
+void fcn_shift(_prec *out, _prec *in, const fcn_grid_t grid, const _prec shift);
+void fcn_power(_prec *out, _prec *in, const fcn_grid_t grid,
+               const _prec exponent);
+void fcn_normalize(_prec *out, _prec *in, const fcn_grid_t grid);
+void fcn_difference(_prec *out, _prec *in1, _prec *in2, const fcn_grid_t grid);
+void fcn_apply(_prec *out, fcn_gridp fcn, const _prec *x, const _prec *y,
+               const _prec *z, const _prec *properties, const fcn_grid_t grid);
+void fcn_abs(_prec *out, _prec *in, const fcn_grid_t grid);
+
+void fcn_constant(_prec *out, 
+                  const int i0, const int in, 
+                  const int j0, const int jn, 
+                  const int k0, const int kn, 
+                  const int line, const int slice, 
+                  const _prec *args);
+
+void fcn_poly(_prec *out, 
+              const int i0, const int in, 
+              const int j0, const int jn, 
+              const int k0, const int kn, 
+              const int line, const int slice, 
+              const _prec *args);
+
+void fcn_polynomial(_prec *out, const fcn_grid_t grid, const _prec *coef,
+                    const _prec *deg, const int *shift, const int i0,
+                    const int in, const int j0, const int jn, const int k0,
+                    const int kn);
+
+void fcn_polybndz(_prec *out, 
+              const int i0, const int in, 
+              const int j0, const int jn, 
+              const int k0, const int kn, 
+              const int line, const int slice, 
+              const _prec *args);
+
+
+#endif
diff --git a/tests/topography/accuracy/grid_check.c b/tests/topography/accuracy/grid_check.c
new file mode 100644
index 0000000..a362a2e
--- /dev/null
+++ b/tests/topography/accuracy/grid_check.c
@@ -0,0 +1,161 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "grid_check.h"
+
+/*
+ * L1-error: sum of |u - v| over i in [i0, in), j in [j0, jn), k in [k0, kn),
+ * with array index k + j*line + i*slice. The sum is not divided by the number
+ * of points. Returns 0.0 for an empty index range.
+ */
+double check_fl1err(const _prec *u, const _prec *v, 
+                    const int i0, const int in, 
+                    const int j0, const int jn, 
+                    const int k0, const int kn, 
+                    const int line, const int slice)
+{
+        double err = 0.0;
+        for (int i = i0; i < in; ++i) {
+        for (int j = j0; j < jn; ++j) {
+        for (int k = k0; k < kn; ++k) {
+                int pos = k + j*line + i*slice; 
+                err += fabs(u[pos] - v[pos]);
+        }
+        }
+        }
+        return err;
+
+}
+
+/*
+ * Root-mean-square error: sqrt(sum((u - v)^2) / N) over i in [i0, in),
+ * j in [j0, jn), k in [k0, kn), where N is the number of points visited and
+ * the array index is k + j*line + i*slice.
+ *
+ * For an empty index range N is 0 and the result is sqrt(0.0/0), which is NaN
+ * under IEEE-754 arithmetic.
+ */
+double check_fl2err(const _prec *u, const _prec *v, 
+                    const int i0, const int in, 
+                    const int j0, const int jn, 
+                    const int k0, const int kn, 
+                    const int line, const int slice)
+{
+        double sum_sq = 0.0;
+        int count = 0;
+        for (int i = i0; i < in; ++i) {
+        for (int j = j0; j < jn; ++j) {
+        for (int k = k0; k < kn; ++k) {
+                const int idx = k + j*line + i*slice;
+                sum_sq += pow(u[idx] - v[idx], 2);
+                ++count;
+        }
+        }
+        }
+        return sqrt(sum_sq/count);
+
+}
+
+/*
+ * L-infinity error: largest |u - v| over i in [i0, in), j in [j0, jn),
+ * k in [k0, kn), with array index k + j*line + i*slice. Returns 0.0 for an
+ * empty index range.
+ */
+double check_flinferr(const _prec *u, const _prec *v, 
+                      const int i0, const int in, 
+                      const int j0, const int jn, 
+                      const int k0, const int kn, 
+                      const int line, const int slice)
+{
+        double largest = 0.0;
+        for (int i = i0; i < in; ++i) {
+        for (int j = j0; j < jn; ++j) {
+        for (int k = k0; k < kn; ++k) {
+                const int idx = k + j*line + i*slice;
+                const double diff = fabs(u[idx] - v[idx]);
+                // Keep the running value only when it is strictly larger.
+                largest = largest > diff ? largest : diff;
+        }
+        }
+        }
+        return largest;
+
+}
+
+/*
+ * Apply the error function `fp` to every selected region of an nx-by-ny
+ * region layout and compare each result against `tol`.
+ *
+ * Region (i, j) has linear index pos = i + nx*j and spans
+ * [off_x[i], off_x[i + 1]) in x, [off_y[j], off_y[j + 1]) in y and
+ * [off_z[1], off_z[2]) in z. A region is evaluated only if regions[pos] is
+ * non-zero. If `regions_out` is not NULL, the error of each evaluated region
+ * is stored in regions_out[pos]; entries of skipped regions are left
+ * untouched.
+ *
+ * Returns 1 if the error of any evaluated region exceeds `tol`, 0 otherwise.
+ *
+ * The per-region error is held in a scalar rather than a fixed-size scratch
+ * array, so the number of regions nx*ny is not limited.
+ */
+int check_all(check_fun fp, 
+              const _prec *field, const _prec *result, 
+              const int *off_x, const int *off_y, const int *off_z, 
+              const int nx, const int ny,
+              const int line, const int slice, 
+              const _prec tol,
+              const int *regions,
+              _prec *regions_out
+              )
+{
+        int err = 0;
+
+        for (int i = 0; i < nx; ++i) {
+        for (int j = 0; j < ny; ++j) {
+                int pos = i + nx * j;
+                if (!regions[pos]) {
+                        continue;
+                }
+                const double region_err =
+                    fp(field, result, off_x[i], off_x[i + 1], off_y[j],
+                       off_y[j + 1], off_z[1], off_z[2], line, slice);
+                if (region_err > tol) {
+                      err = 1;
+                }
+                if(regions_out) regions_out[pos] = region_err;
+        }
+        }
+
+        return err;
+}
+
+/*
+ * Print a 3-by-3 table of region errors to stdout.
+ *
+ * The header line is "<fcn>(<rank>) Errors for <field_str>.". The rows are
+ * printed in the order err[6..8], err[3..5], err[0..2], so `err` must hold at
+ * least 9 entries. stdout is flushed before and after printing.
+ *
+ * The text is written straight to stdout instead of being staged in a
+ * fixed-size buffer, so long `fcn` / `field_str` strings cannot overflow.
+ */
+void check_printerr(const char *fcn, const int rank, const char *field_str, 
+                    const _prec *err)
+{
+
+        fflush(stdout);
+        fprintf(stdout,
+                "%s(%d) Errors for %s.\n"
+                "%e \t %e \t %e \n"
+                "%e \t %e \t %e \n"
+                "%e \t %e \t %e \n",
+                fcn, rank, field_str,
+                err[6], err[7], err[8],
+                err[3], err[4], err[5],
+                err[0], err[1], err[2]
+                );
+        fflush(stdout);
+}
+
+/*
+ * Print a table of region errors with 3 rows of 5 values to stdout.
+ *
+ * The header line is "<fcn>(<rank>) Errors for <field_str>.". The rows are
+ * printed in the order err[10..14], err[5..9], err[0..4], so `err` must hold
+ * at least 15 entries. stdout is flushed before and after printing.
+ *
+ * The text is written straight to stdout instead of being staged in a
+ * fixed-size buffer, so long `fcn` / `field_str` strings cannot overflow.
+ */
+void check_printerr53(const char *fcn, const int rank, const char *field_str, 
+                    const _prec *err)
+{
+
+        fflush(stdout);
+        fprintf(stdout,
+                "%s(%d) Errors for %s.\n"
+                "%e \t %e \t %e \t %e \t %e \n"
+                "%e \t %e \t %e \t %e \t %e \n"
+                "%e \t %e \t %e \t %e \t %e \n",
+                fcn, rank, field_str,
+                err[10], err[11], err[12], err[13], err[14],
+                err[5], err[6], err[7], err[8], err[9],
+                err[0], err[1], err[2], err[3], err[4]
+                );
+        fflush(stdout);
+}
+
+/*
+ * Print a 5-by-5 table of region errors to stdout.
+ *
+ * The header line is "<fcn>(<rank>) Errors for <field_str>.". The rows are
+ * printed in the order err[20..24], err[15..19], err[10..14], err[5..9],
+ * err[0..4], so `err` must hold at least 25 entries. stdout is flushed before
+ * and after printing.
+ *
+ * The text is written straight to stdout instead of being staged in a
+ * 512-byte buffer: 25 "%e" fields plus the header come close to that size, so
+ * moderately long `fcn` / `field_str` strings could overflow it.
+ */
+void check_printerr55(const char *fcn, const int rank, const char *field_str, 
+                    const _prec *err)
+{
+
+        fflush(stdout);
+        fprintf(stdout,
+                "%s(%d) Errors for %s.\n"
+                "%e \t %e \t %e \t %e \t %e \n"
+                "%e \t %e \t %e \t %e \t %e \n"
+                "%e \t %e \t %e \t %e \t %e \n"
+                "%e \t %e \t %e \t %e \t %e \n"
+                "%e \t %e \t %e \t %e \t %e \n",
+                fcn, rank, field_str,
+                err[20], err[21], err[22], err[23], err[24],
+                err[15], err[16], err[17], err[18], err[19],
+                err[10], err[11], err[12], err[13], err[14],
+                err[5], err[6], err[7], err[8], err[9],
+                err[0], err[1], err[2], err[3], err[4]
+                );
+        fflush(stdout);
+}
diff --git a/tests/topography/accuracy/grid_check.h b/tests/topography/accuracy/grid_check.h
new file mode 100644
index 0000000..0e115a4
--- /dev/null
+++ b/tests/topography/accuracy/grid_check.h
@@ -0,0 +1,91 @@
+#ifndef GRID_CHECK_H
+#define GRID_CHECK_H
+/*
+ * This module is used to compare two arrays that are allocated on the host that
+ * have a memory layout that matches the 2D grid decomposition of a grid. In
+ * map view, this layout can be represented by:
+ *
+ *            back 
+ *
+ *        | 0 | 1 | 2 |
+ *        |   |---|   |
+ *  left  | 3 | 4 | 5 |  right
+ *        |   |---|   |
+ *        | 6 | 7 | 8 |
+ *
+ *            front
+ *
+ * All sections except for section `4` are ghost regions. 
+ *
+ * To use this module, first prepare two arrays that you want to compare. Then
+ * call any of the comparison functions to compute the error between the arrays
+ * and specify where you want the error to be computed.
+ * The comparison functions are:
+ *  - check_fl1err  : L1-error (sum of absolute value of all terms)
+ *  - check_fl2err  : L2-error
+ *  - check_flinferr : L-infinity-error (maximum absolute value)
+ *
+ * Where you want the error to be computed is specified by the offsets `off_x`,
+ * `off_y`, and `off_z`, one for each direction. Each offset contains two
+ * values: the starting index and the exclusive ending index. 
+ *
+ * Use `check_all` to compute the error for each region in the above figure. It
+ * returns `1` if the error in any selected region exceeds the tolerance `tol`,
+ * and `0` otherwise. The per-region errors are written to `regions_out` when
+ * it is not NULL. The index of the error array maps to indices in the figure
+ * above. Pass `check_fl2err` as the `fp` argument if you want to use that
+ * function for the comparison.
+ *
+ * The function `check_printerr` can be used to print a figure, like the one
+ * shown above, that shows which regions contain errors and what the errors are.
+ *
+ * The function `check_printerr53` includes 2 additional regions in the
+ * x-direction. 
+ *
+ * The function `check_printerr55` includes 2 additional regions in both the
+ * x and y-directions. 
+ *
+ *
+ */
+
+typedef double (*check_fun)(const _prec *, const _prec *, 
+                            const int, const int, 
+                            const int, const int, 
+                            const int, const int, 
+                            const int, const int);
+
+double check_fl1err(const _prec *u, const _prec *v, 
+                    const int i0, const int in, 
+                    const int j0, const int jn, 
+                    const int k0, const int kn, 
+                    const int line, const int slice);
+
+double check_fl2err(const _prec *u, const _prec *v, 
+                    const int i0, const int in, 
+                    const int j0, const int jn, 
+                    const int k0, const int kn, 
+                    const int line, const int slice);
+
+double check_flinferr(const _prec *u, const _prec *v, 
+                      const int i0, const int in, 
+                      const int j0, const int jn, 
+                      const int k0, const int kn, 
+                      const int line, const int slice);
+
+int check_all(check_fun fp, 
+              const _prec *field, const _prec *result, 
+              const int *off_x, const int *off_y, const int *off_z, 
+              const int nx, const int ny,
+              const int line, const int slice, 
+              const _prec tol,
+              const int *regions,
+              _prec *regions_out);
+
+void check_printerr(const char *fcn, const int rank, const char *field_str, 
+                     const _prec *err);
+void check_printerr53(const char *fcn, const int rank, const char *field_str, 
+                      const _prec *err);
+void check_printerr55(const char *fcn, const int rank, const char *field_str, 
+                      const _prec *err);
+
+
+#endif
diff --git a/tests/topography/accuracy/mms.c b/tests/topography/accuracy/mms.c
new file mode 100644
index 0000000..6a5bbd4
--- /dev/null
+++ b/tests/topography/accuracy/mms.c
@@ -0,0 +1,109 @@
+#include "mms.h"
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 1.2)/k with k = properties[0]. */
+_prec mms_init_vx(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 1.2)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.25)/k with k = properties[0]. */
+_prec mms_init_vy(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.25)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.4)/k with k = properties[0]. */
+_prec mms_init_vz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.40000000000000002)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.7)/k with k = properties[0]. */
+_prec mms_init_sxx(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.69999999999999996)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.3)/k with k = properties[0]. */
+_prec mms_init_syy(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.29999999999999999)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.12)/k with k = properties[0]. */
+_prec mms_init_szz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.12)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.02)/k with k = properties[0]. */
+_prec mms_init_sxy(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.02)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.47)/k with k = properties[0]. */
+_prec mms_init_sxz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.46999999999999997)/wn;
+}
+
+/* Evaluates sin(k*x)*sin(k*y)*sin(k*z + 0.33)/k with k = properties[0]. */
+_prec mms_init_syz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*sin(wn*z + 0.33000000000000002)/wn;
+}
+
+/* Equals d/dz(mms_init_sxz) + d/dy(mms_init_sxy) + d/dx(mms_init_sxx), k = properties[0]. */
+_prec mms_final_vx(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 0.46999999999999997) + sin(wn*x)*sin(wn*z + 0.02)*cos(wn*y) + sin(wn*y)*sin(wn*z + 0.69999999999999996)*cos(wn*x);
+}
+
+/* Equals d/dz(mms_init_syz) + d/dy(mms_init_syy) + d/dx(mms_init_sxy), k = properties[0]. */
+_prec mms_final_vy(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 0.33000000000000002) + sin(wn*x)*sin(wn*z + 0.29999999999999999)*cos(wn*y) + sin(wn*y)*sin(wn*z + 0.02)*cos(wn*x);
+}
+
+/* Equals d/dz(mms_init_szz) + d/dy(mms_init_syz) + d/dx(mms_init_sxz), k = properties[0]. */
+_prec mms_final_vz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 0.12) + sin(wn*x)*sin(wn*z + 0.33000000000000002)*cos(wn*y) + sin(wn*y)*sin(wn*z + 0.46999999999999997)*cos(wn*x);
+}
+
+/* Equals d/dz(mms_init_vz) + d/dy(mms_init_vy) + 3*d/dx(mms_init_vx), k = properties[0]. */
+_prec mms_final_sxx(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 0.40000000000000002) + sin(wn*x)*sin(wn*z + 0.25)*cos(wn*y) + 3*sin(wn*y)*sin(wn*z + 1.2)*cos(wn*x);
+}
+
+/* Equals d/dz(mms_init_vz) + 3*d/dy(mms_init_vy) + d/dx(mms_init_vx), k = properties[0]. */
+_prec mms_final_syy(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 0.40000000000000002) + 3*sin(wn*x)*sin(wn*z + 0.25)*cos(wn*y) + sin(wn*y)*sin(wn*z + 1.2)*cos(wn*x);
+}
+
+/* Equals 3*d/dz(mms_init_vz) + d/dy(mms_init_vy) + d/dx(mms_init_vx), k = properties[0]. */
+_prec mms_final_szz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return 3*sin(wn*x)*sin(wn*y)*cos(wn*z + 0.40000000000000002) + sin(wn*x)*sin(wn*z + 0.25)*cos(wn*y) + sin(wn*y)*sin(wn*z + 1.2)*cos(wn*x);
+}
+
+/* Equals d/dy(mms_init_vx) + d/dx(mms_init_vy), k = properties[0]. */
+_prec mms_final_sxy(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*z + 1.2)*cos(wn*y) + sin(wn*y)*sin(wn*z + 0.25)*cos(wn*x);
+}
+
+/* Equals d/dz(mms_init_vx) + d/dx(mms_init_vz), k = properties[0]. */
+_prec mms_final_sxz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 1.2) + sin(wn*y)*sin(wn*z + 0.40000000000000002)*cos(wn*x);
+}
+
+/* Equals d/dz(mms_init_vy) + d/dy(mms_init_vz), k = properties[0]. */
+_prec mms_final_syz(const _prec x, const _prec y, const _prec z, const _prec *properties) {
+     const _prec wn = properties[0];
+     return sin(wn*x)*sin(wn*y)*cos(wn*z + 0.25) + sin(wn*x)*sin(wn*z + 0.40000000000000002)*cos(wn*y);
+}
+
diff --git a/tests/topography/accuracy/mms.h b/tests/topography/accuracy/mms.h
new file mode 100644
index 0000000..2c8b40a
--- /dev/null
+++ b/tests/topography/accuracy/mms.h
@@ -0,0 +1,27 @@
+#ifndef MMS_H
+#define MMS_H
+#include <awp/definitions.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+_prec mms_init_vx(const _prec x, const _prec y, const _prec z, const _prec *properties); /* mms_init_*: sin(kx)sin(ky)sin(kz + phase)/k, k = properties[0] */
+_prec mms_init_vy(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_vz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_sxx(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_syy(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_szz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_sxy(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_sxz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_init_syz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_vx(const _prec x, const _prec y, const _prec z, const _prec *properties); /* mms_final_*: sums of first derivatives of the mms_init_* fields (see mms.c) */
+_prec mms_final_vy(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_vz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_sxx(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_syy(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_szz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_sxy(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_sxz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+_prec mms_final_syz(const _prec x, const _prec y, const _prec z, const _prec *properties);
+
+#endif
diff --git a/tests/topography/accuracy/test_accuracy.cu b/tests/topography/accuracy/test_accuracy.cu
new file mode 100644
index 0000000..e9e8848
--- /dev/null
+++ b/tests/topography/accuracy/test_accuracy.cu
@@ -0,0 +1,313 @@
+#include <stdio.h>
+
+#include <awp/definitions.h>
+#include <topography/topography.h>
+#include <topography/metrics/metrics.h>
+#include <topography/velocity.cuh>
+#include <topography/geometry/geometry.h>
+#include <topography/geometry.h>
+#include "cupolynomial.cu"
+#include "cutopography_test.cu"
+#include "grid_check.c"
+#include "functions.c"
+#include <grid/shift.h>
+using _prec=float;
+
+typedef struct // Fixture shared by the tests in this file; filled by test_initialize()
+{
+        _prec tol; // set to 1e-6 by test_initialize()
+        int verbose; // nonzero: test_initialize() prints the x/y/z offsets
+        int deg[3]; // not referenced anywhere else in this file
+        int coef[3]; // not referenced anywhere else in this file
+        int num_bytes; // sizeof(_prec) * T.gridsize; size of output/answer/error
+        int size[3]; // T.nx, T.ny, T.nz
+        int mem[3]; // T.mx, T.my, T.mz
+        int3_t shift; // {0, 0, 1}; passed to fcn_init_grid() and fcn_fill_grid()
+        _prec *output; // host buffer filled by copy_output_to_host()
+        _prec *answer; // host buffer filled by copy_answer_to_host()
+        _prec *error; // host buffer; write_vtk() stores |answer - output| here
+        fcn_grid_t velocity_grid; // fcn_init_grid(size, coord, shift, 0, h)
+        fcn_grid_t interior_grid; // fcn_init_grid(size, coord, shift, -ngsl/2, h)
+        fcn_grid_t stress_grid; // fcn_init_grid(size, coord, shift, ngsl/2, h)
+        fcn_grid_t topography_grid; // fcn_init_grid(size, coord, shift, ngsl, h)
+        topo_t T; // topography object returned by topo_init()
+        int velocity_offset_x[2]; // {T.off_x[1], T.off_x[2]}
+        int velocity_offset_y[2]; // {T.off_y[1], T.off_y[2]}
+        int stress_offset_x[2]; // {T.off_x[1] - ngsl/2, T.off_x[3] + ngsl/2}
+        int stress_offset_y[2]; // {T.off_y[1] - ngsl/2, T.off_y[3] + ngsl/2}
+        int offset_z[2]; // {T.off_z[1], T.off_z[2]}
+} testdata_t;
+
+// Copy test->num_bytes bytes from device memory at `input` into test->output.
+void copy_output_to_host(testdata_t *test, const _prec *input) {
+        cudaMemcpy(test->output, input,
+                   test->num_bytes, cudaMemcpyDeviceToHost);
+}
+
+// Copy test->num_bytes bytes from device memory at `input` into test->answer.
+void copy_answer_to_host(testdata_t *test, const _prec *input) {
+        cudaMemcpy(test->answer, input,
+                   test->num_bytes, cudaMemcpyDeviceToHost);
+}
+
+// Return check_flinferr(test->output, test->answer, ...) over the index box
+// given by offset_x, offset_y and offset_z (each a two-element {lower, upper}
+// array). When shift[2] == 0 the upper z bound is lowered by one so that the
+// ghost point on the nodal grid is not checked.
+double check_answer(const testdata_t *test, const int *shift, const int *offset_x,
+                    const int *offset_y, const int *offset_z) {
+        const int skip_ghost = (shift[2] == 0) ? 1 : 0;
+        const int z_upper = offset_z[1] - skip_ghost;
+
+        return check_flinferr(test->output, test->answer,
+                              offset_x[0], offset_x[1],
+                              offset_y[0], offset_y[1],
+                              offset_z[0], z_upper,
+                              test->T.line,
+                              test->T.slice);
+}
+
+void test_free(testdata_t *test) { // releases what test_initialize() acquired
+        topo_free(&test->T);
+        free(test->output);
+        free(test->answer);
+        free(test->error);  // malloc'd in test_initialize(); was never freed
+        cudaStreamDestroy(test->T.stream_1);
+        cudaStreamDestroy(test->T.stream_2);
+        cudaStreamDestroy(test->T.stream_i);
+        topo_d_free(&test->T);
+}
+
+// Write "output.vtk", "answer.vtk" and "error.vtk" for the current test. Each
+// file holds test->interior_grid with coordinate arrays filled by
+// fcn_fill_grid() on test->topography_grid, plus one scalar field. The error
+// field |answer - output| is computed into test->error.
+void write_vtk(const testdata_t *test) {
+        _prec *x = (_prec*)malloc(test->topography_grid.num_bytes);
+        _prec *y = (_prec*)malloc(test->topography_grid.num_bytes);
+        _prec *z = (_prec*)malloc(test->topography_grid.num_bytes);
+        fcn_fill_grid(x, test->topography_grid, test->shift, 0);
+        fcn_fill_grid(y, test->topography_grid, test->shift, 1);
+        fcn_fill_grid(z, test->topography_grid, test->shift, 2);
+
+        fcn_grid_t grid = test->interior_grid;
+        vtk_write_grid("output.vtk", x, y, z, grid);
+        vtk_append_scalar("output.vtk", "output", test->output, grid);
+
+        vtk_write_grid("answer.vtk", x, y, z, grid);
+        vtk_append_scalar("answer.vtk", "answer", test->answer, grid);
+
+        fcn_difference(test->error, test->answer, test->output, grid);
+        fcn_abs(test->error, test->error, grid);
+        vtk_write_grid("error.vtk", x, y, z, grid);
+        vtk_append_scalar("error.vtk", "error", test->error, grid);
+
+        free(x);  // the coordinate buffers were previously leaked on every call
+        free(y);
+        free(z); }
+
+// Build a single-process (px = py = 1) fixture in *test; reads test->verbose.
+void test_initialize(testdata_t *test) {
+        int rank = 0;
+        int x_rank_l = -1;
+        int x_rank_r = -1;
+        int y_rank_f = -1;
+        int y_rank_b = -1;
+        int coord[2] = {0, 0};
+        int size[3] = {132, 132, 32};
+        cudaStream_t stream_1, stream_2, stream_i;
+        cudaStreamCreate(&stream_1);
+        cudaStreamCreate(&stream_2);
+        cudaStreamCreate(&stream_i);
+        test->tol = 1e-6;
+        _prec dt = 1.0;
+        _prec h  = 1.0;
+        int px = 1;
+        int py = 1;
+        test->T = topo_init(1, "topography.bin", rank, x_rank_l, x_rank_r, y_rank_f,
+                            y_rank_b, coord, px, py, size[0], size[1], size[2], dt, h,
+                            stream_1, stream_2, stream_i);
+        test->size[0] = test->T.nx;
+        test->size[1] = test->T.ny;
+        test->size[2] = test->T.nz;
+        test->mem[0] = test->T.mx;
+        test->mem[1] = test->T.my;
+        test->mem[2] = test->T.mz;
+        topo_d_malloc(&test->T);
+
+        topo_init_metrics(&test->T);
+        topo_init_grid(&test->T);
+        topo_init_geometry(&test->T);
+        topo_build(&test->T);
+
+        test->velocity_offset_x[0] = test->T.off_x[1];
+        test->velocity_offset_x[1] = test->T.off_x[2];
+        test->velocity_offset_y[0] = test->T.off_y[1];
+        test->velocity_offset_y[1] = test->T.off_y[2];
+        test->stress_offset_x[0] = test->T.off_x[1] - ngsl/2;
+        test->stress_offset_x[1] = test->T.off_x[3] + ngsl/2;
+        test->stress_offset_y[0] = test->T.off_y[1] - ngsl/2;
+        test->stress_offset_y[1] = test->T.off_y[3] + ngsl/2;
+        test->offset_z[0] = test->T.off_z[1];
+        test->offset_z[1] = test->T.off_z[2];
+        // Host buffers, one value per grid point; released in test_free().
+        int num_bytes = sizeof(_prec)*test->T.gridsize;
+        test->num_bytes = num_bytes;
+        test->output = (_prec*)malloc(num_bytes);  // cast matches the _prec * fields
+        test->answer = (_prec*)malloc(num_bytes);
+        test->error = (_prec*)malloc(num_bytes);
+
+        if (test->verbose) {
+                //metrics_print_info_f(&test->T.metrics_f);
+                printf("offset x: %d %d \n", test->velocity_offset_x[0],
+                                             test->velocity_offset_x[1]);
+                printf("offset y: %d %d \n", test->velocity_offset_y[0],
+                                             test->velocity_offset_y[1]);
+                printf("offset z: %d %d \n", test->offset_z[0],
+                                             test->offset_z[1]);
+        }
+
+        int3_t sizet = {.x = test->size[0],
+                     .y = test->size[1],
+                     .z = test->size[2]};
+        int3_t coordt = {0, 0, 0};
+        int3_t shift = {0, 0, 1};
+        // The four grids differ only in the fourth fcn_init_grid() argument.
+        test->shift = shift;
+        test->velocity_grid = fcn_init_grid(sizet, coordt, shift, 0, h);
+        test->interior_grid = fcn_init_grid(sizet, coordt, shift, -ngsl/2, h);
+        test->stress_grid = fcn_init_grid(sizet, coordt, shift, ngsl/2, h);
+        test->topography_grid = fcn_init_grid(sizet, coordt, shift, ngsl, h);
+}
+
+// Call topo_test_polystrzbnd_H() on `input`, run topo_velocity_interior_H(),
+// call topo_test_polystrzbnd_H() on `answer`, then copy `output` and `answer`
+// to the host and return the check_answer() error. The checked box uses the
+// velocity x/y offsets and starts at test->offset_z[0] + 8 in z.
+double test_velocity_kernel(testdata_t *test, _prec *input, const _prec *input_coef,
+                          const _prec *input_deg, const int *input_shift,
+                          _prec *output, _prec *answer,
+                          const _prec *answer_coef, _prec *answer_deg,
+                          const int *answer_shift) {
+        topo_t *topo = &test->T;
+        topo_test_polystrzbnd_H(topo, input, input_coef, input_deg,
+                                input_shift);
+        topo_velocity_interior_H(topo);
+        topo_test_polystrzbnd_H(topo, answer, answer_coef, answer_deg,
+                                answer_shift);
+        cudaDeviceSynchronize();
+
+        copy_output_to_host(test, output);
+        copy_answer_to_host(test, answer);
+
+        // x and y use the stored velocity offsets unchanged; only the lower z
+        // bound is moved.
+        const int z_range[2] = {test->offset_z[0]+8, test->offset_z[1]};
+        return check_answer(test, answer_shift, test->velocity_offset_x,
+                            test->velocity_offset_y, z_range);
+}
+
+void test_velocity_mod(testdata_t *test)
+{ // Runs the two u1 cases (s11 and s12 inputs); each case builds and frees its own fixture.
+        printf("Testing velocity update kernel (kernels must be generated with debug=1, debug_ops=1... \n");
+        printf(" * Testing u1 update equation. \n");
+        {
+        printf("    -- Testing DczPx*s11. \n");
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 1};
+        int input_shift[3];
+        shift_xx(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 1};
+        _prec answer_deg[3] = {0, 0, 0};
+        int answer_shift[3];
+        shift_u1(answer_shift);
+        test_initialize(test);
+        double err = test_velocity_kernel(
+            test, test->T.xx, input_coef, input_deg, input_shift, test->T.u1,
+            test->T.yy, answer_coef, answer_deg, answer_shift);
+
+        printf("   Error: %g \n", err);
+        write_vtk(test);
+        test_free(test);
+        }
+        {
+        printf("    -- Testing DczPy*s12. \n");
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 1};
+        int input_shift[3];
+        shift_xy(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 1};
+        _prec answer_deg[3] = {0, 0, 0};
+        int answer_shift[3];
+        shift_u1(answer_shift);
+        test_initialize(test);
+        double err = test_velocity_kernel(test, test->T.xy, input_coef, input_deg,
+                             input_shift, test->T.u1, test->T.yy, answer_coef,
+                             answer_deg, answer_shift);
+
+        printf("   Error: %g \n", err);
+        write_vtk(test);
+        test_free(test);
+        }
+        // NOTE(review): the unconditional return below makes the u2 and u3
+        // cases that follow unreachable; they are kept but never run.
+
+        return;
+
+        printf(" * Testing u2 update equation. \n");
+        {
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 2};
+        int input_shift[3];
+        shift_xy(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 2};
+        _prec answer_deg[3] = {0, 0, 1};
+        int answer_shift[3];
+        shift_u2(answer_shift);
+        test_initialize(test);
+        test_velocity_kernel(test, test->T.xy, input_coef, input_deg,
+                             input_shift, test->T.v1, test->T.yy, answer_coef,
+                             answer_deg, answer_shift);
+
+        write_vtk(test);
+        test_free(test);
+        }
+
+        printf(" * Testing u3 update equation. \n");
+        {
+        printf("    -- Testing quadratic function in x-direction. \n");
+        // Only linear functions can be used in the test because the
+        // interpolation operators are only first-order accurate near the boundary
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 2};
+        int input_shift[3];
+        shift_xz(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 2};
+        _prec answer_deg[3] = {0, 0, 1};
+        int answer_shift[3];
+        shift_u3(answer_shift);
+        test_initialize(test);
+        double err = test_velocity_kernel(
+            test, test->T.xz, input_coef, input_deg, input_shift, test->T.w1,
+            test->T.yy, answer_coef, answer_deg, answer_shift);
+
+        printf("       Error: %g \n", err);
+        write_vtk(test);
+        test_free(test);
+        }
+}
+
+int main(int argc, char **argv) {
+    // Entry point: prints a banner and runs test_velocity_mod(). Always
+    // returns 0; the errors are only printed, not checked.
+    printf("Testing topography.c, cutopography.c, cutopography_kernels.cu\n");
+    testdata_t test;
+    test.verbose = 1;
+    // No test_initialize() here: test_velocity_mod() builds and frees its own fixture per case.
+    test_velocity_mod(&test);
+    return 0;
+}
diff --git a/tests/topography/accuracy/test_convergence.cu b/tests/topography/accuracy/test_convergence.cu
new file mode 100644
index 0000000..d747c35
--- /dev/null
+++ b/tests/topography/accuracy/test_convergence.cu
@@ -0,0 +1,893 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+
+#define APPLY_BC 0
+#define ERROR_TOLERANCE 2.0
+#include <topography/topography.h>
+#include <topography/metrics/metrics.h>
+#include <topography/velocity.cuh>
+#include <topography/stress.cuh>
+#include <topography/geometry.h>
+#include <topography/mapping.h>
+#include <grid/shift.h>
+#include "functions.c"
+#include "grid_check.c"
+#include "mms.c"
+
+using _prec=float;
+
+// Fill `out` over the grid.size box (placed at grid.offset1 with strides
+// grid.line and grid.slice). For k >= MAPPING_START_POINT the value is
+// g[k] * f[j, i]; below that index it is g[k] alone. `shift` selects which
+// of the metric arrays in metrics_g and metrics_f play the roles of g and f.
+void geom_mapping_z(_prec *out, const fcn_grid_t grid, const int3_t shift,
+                    const f_grid_t *metrics_f,
+                    const g_grid_t *metrics_g) {
+        // 1-D array: g when shift.z == 0, g_c otherwise.
+        _prec *g = (shift.z == 0) ? metrics_g->g : metrics_g->g_c;
+
+        // 2-D array: picked by comparing the (x, y) shift with the node, u1
+        // and u2 grids; any other shift falls back to f_c.
+        const int3_t nodes = grid_node();
+        const int3_t u1 = grid_u1();
+        const int3_t u2 = grid_u2();
+        _prec *f = metrics_f->f_c;
+        if (shift.x == nodes.x && shift.y == nodes.y) {
+                f = metrics_f->f;
+        } else if (shift.x == u1.x && shift.y == u1.y) {
+                f = metrics_f->f_1;
+        } else if (shift.x == u2.x && shift.y == u2.y) {
+                f = metrics_f->f_2;
+        }
+
+        // Index of the first f entry that is read, in x and in y.
+        const int f_x0 = metrics_f->offset[0] + metrics_f->bounds_stress_x[0];
+        const int f_y0 = metrics_f->offset[1] + metrics_f->bounds_stress_y[0];
+
+        for (int i = 0; i < grid.size.x; ++i) {
+        for (int j = 0; j < grid.size.y; ++j) {
+                // Neither the f index nor the (i, j) part of the output index
+                // depends on k, so both are formed once per column.
+                const int pos_f = f_y0 + j + (i + f_x0) * metrics_f->slice;
+                const int column = (grid.offset1.y + j) * grid.line +
+                                   (grid.offset1.x + i) * grid.slice;
+
+                for (int k = 0; k < grid.size.z; ++k) {
+                        const int pos = grid.offset1.z + k + column;
+                        const int pos_g = k + metrics_g->offset;
+                        if (k >= MAPPING_START_POINT) {
+                                out[pos] = g[pos_g] * f[pos_f];
+                        } else {
+                                out[pos] = g[pos_g];
+                        }
+                }
+        }
+        }
+
+}
+
+
+typedef struct // One set of field arrays; used for testdata_t.input, .output and .answer
+{
+        _prec *vx; // vx, vy, vz: velocity fields (names match the mms_*_v* functions)
+        _prec *vy;
+        _prec *vz;
+        _prec *sxx; // sxx .. syz: stress fields (names match the mms_*_s* functions)
+        _prec *syy;
+        _prec *szz;
+        _prec *sxy;
+        _prec *sxz;
+        _prec *syz;
+        _prec *rho; // NOTE(review): rho .. coeff are not used in the visible part of
+        _prec *lami; // this file; presumably material and attenuation arrays of
+        _prec *mui; // the solver - confirm against vars_init()
+        _prec *qpi;
+        _prec *qsi;
+        _prec *r1;
+        _prec *r2;
+        _prec *r3;
+        _prec *r4;
+        _prec *r5;
+        _prec *r6;
+        _prec *wwo;
+        _prec *vx1, *vx2, *coeff;
+        int *ww;
+        int num_bytes; // NOTE(review): presumably the byte size passed to vars_init()
+} variables_t;
+
+typedef struct // Per-refinement test state; main() sets `size`, test_initialize() the rest
+{
+        _prec tol; // set to 1e-6 by test_initialize()
+        _prec grid_spacing; // read by main() into grid_spacings[] after test_initialize()
+        int write_vtk;
+        int verbose;
+        int num_bytes;
+        int3_t size; // grid size for this refinement, from refine()
+        int3_t coord3;
+        topo_t T;
+        variables_t input;
+        variables_t output;
+        variables_t answer;
+        _prec mms_wavenumber; // NOTE(review): presumably properties[0] of the mms_* functions
+} testdata_t;
+
+typedef struct // Coordinate arrays; see test_grid_data_init() / test_grid_data_free()
+{
+        // parameter coordinates
+        _prec *x, *y, *z;
+        // physical coordinate
+        _prec *zp;
+} grid_t;
+
+typedef struct // Errors of one field on one grid
+{
+        _prec interior; // printed by main() under "Interior truncation errors"
+        _prec boundary[TOP_BOUNDARY_SIZE]; // one entry per boundary index (the "z" column in main())
+} err_t;
+
+typedef struct // One err_t per velocity and stress field; main() keeps one per refinement
+{
+        err_t vx;
+        err_t vy;
+        err_t vz;
+        err_t sxx;
+        err_t syy;
+        err_t szz;
+        err_t sxy;
+        err_t sxz;
+        err_t syz;
+} vars_err_t;
+
+
+int3_t refine(const int3_t initial_size, const int grid); // initial_size scaled by 2^grid
+prec max_error(vars_err_t *err, const int num_refinements);
+void convergence_rates(vars_err_t *rates, const vars_err_t *err, const _prec *h,
+                       const int num_refinements);
+_prec convergence_rate(const _prec err1, const _prec err2, const _prec h1, const _prec h2);
+void test_initialize(testdata_t *test, const int grid, const char *topography_dir);
+void test_velocity(testdata_t *test, vars_err_t *err);
+void test_stress(testdata_t *test, vars_err_t *err);
+void test_free(testdata_t *test);
+void vars_init(variables_t *vars, const int num_bytes);
+void vars_copy_to_device(topo_t *topo, const variables_t *vars);
+void vars_copy_to_host(variables_t *vars, const topo_t *topo);
+void vars_free(variables_t *vars);
+void test_grid_data_init(grid_t *data, const testdata_t *test, const fcn_grid_t grid,
+                    const int3_t shift);
+void test_grid_data_free(grid_t *data);
+err_t check_answer(const _prec *u, const _prec *v, const fcn_grid_t grid);
+void init_sponge(topo_t *topo, const int num_bytes);
+
+
+// Driver: run the velocity and stress tests per refinement, then print errors and rates.
+int main(int argc, char **argv) {
+        const int num_refinements = 4;  // const, so the arrays below are not VLAs
+
+        testdata_t test;
+        int3_t initial_size = {16, 16, 24};
+
+        vars_err_t err[num_refinements];
+        int grid_sizes[num_refinements];
+        _prec grid_spacings[num_refinements];
+        // NOTE(review): argv[1] is NULL when no argument is given - confirm test_initialize() copes.
+        const char *topography_dir = argv[1];
+
+        printf("Convergence rate test\n");
+        printf("-----------------------------------------------------\n");
+        for (int grid = 0; grid < num_refinements; ++grid) {
+                test.size = refine(initial_size, grid);
+                grid_sizes[grid] = test.size.x;
+                test_initialize(&test, grid, topography_dir);
+                grid_spacings[grid] = test.grid_spacing;
+                printf("Grid refinement: %d  grid size: {%d, %d, %d} \n", 
+                        grid, test.size.x, test.size.y, test.size.z);
+                test_velocity(&test, &err[grid]);
+                printf("Testing stresses\n");
+                test_free(&test);
+                test_initialize(&test, grid, topography_dir);
+                test_stress(&test, &err[grid]);
+                test_free(&test);
+        }
+        printf("-----------------------------------------------------\n");
+
+        vars_err_t rates[num_refinements - 1]; 
+        // convergence_rates() fills every rates[i] in a single call, so it is
+        // called once instead of num_refinements - 1 times.
+        convergence_rates(rates, err, grid_spacings, num_refinements);
+
+        const int show_velocity = 1;
+        const int show_stress = 1;
+
+
+        if (show_velocity) {
+        printf("Interior truncation errors\n");
+        printf("N \t vx        \t vy          \t vz \n");
+        for (int i = 0; i < num_refinements; ++i) {
+                printf("%d \t %e \t %e \t %e \n", grid_sizes[i],
+                       err[i].vx.interior, err[i].vy.interior,
+                       err[i].vz.interior);
+        }
+
+
+        printf("Interior error rates\n");
+        printf("N \t vx        \t vy          \t vz \n");
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                printf("%d \t %e \t %e \t %e \n", grid_sizes[i+1],
+                       rates[i].vx.interior, rates[i].vy.interior,
+                       rates[i].vz.interior);
+        }
+        printf("\n");
+        printf("\n");
+        printf("Boundary truncation errors\n");
+        printf("N \t z \t  vx        \t vy          \t vz \n");
+        for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+        for (int i = 0; i < num_refinements; ++i) {
+                printf("%d \t %d \t %e \t %e \t %e \n", grid_sizes[i], j ,
+                       err[i].vx.boundary[j], err[i].vy.boundary[j],
+                       err[i].vz.boundary[j]);
+        }
+        printf("\n");
+        }
+
+
+        printf("\n");
+        printf("Boundary error rates\n");
+        printf("N \t z \t vx        \t vy          \t vz \n");
+        for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                printf("%d \t %d \t %e \t %e \t %e \n", grid_sizes[i+1], j,
+                       rates[i].vx.boundary[j], rates[i].vy.boundary[j],
+                       rates[i].vz.boundary[j]);
+        }
+        printf("\n");
+        }
+
+
+        }
+
+        if (show_stress) {
+        printf("Interior truncation error\n");
+        printf("N \t sxx        \t syy          \t szz \n");
+        for (int i = 0; i < num_refinements; ++i) {
+                printf("%d \t %e \t %e \t %e \n", grid_sizes[i],
+                       err[i].sxx.interior, err[i].syy.interior,
+                       err[i].szz.interior);
+        }
+
+        printf("N \t sxy        \t sxz          \t syz \n");
+        for (int i = 0; i < num_refinements; ++i) {
+                printf("%d \t %e \t %e \t %e \n", grid_sizes[i],
+                       err[i].sxy.interior, err[i].sxz.interior,
+                       err[i].syz.interior);
+        }
+        printf("\n");
+        printf("\n");
+
+        printf("Interior error rates\n");
+
+
+        printf("N \t sxx        \t syy          \t szz \n");
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                printf("%d \t %e \t %e \t %e \n", grid_sizes[i+1],
+                       rates[i].sxx.interior, rates[i].syy.interior,
+                       rates[i].szz.interior);
+        }
+
+        printf("N \t sxy        \t sxz          \t syz \n");
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                printf("%d \t %e \t %e \t %e \n", grid_sizes[i+1],
+                       rates[i].sxy.interior, rates[i].sxz.interior,
+                       rates[i].syz.interior);
+        }
+
+        printf("\n");
+        printf("\n");
+
+        printf("Boundary truncation errors\n");
+
+        printf("N \t z \t sxx        \t syy          \t szz \n");
+        for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+        for (int i = 0; i < num_refinements; ++i) {
+                printf("%d \t %d \t %e \t %e \t %e \n", grid_sizes[i], j,
+                       err[i].sxx.boundary[j], err[i].syy.boundary[j],
+                       err[i].szz.boundary[j]);
+        }
+        printf("\n");
+        }
+
+        printf("N \t z \t sxy        \t sxz          \t syz \n");
+        for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+        for (int i = 0; i < num_refinements; ++i) {
+                printf("%d \t %d \t %e \t %e \t %e \n", grid_sizes[i], j,
+                       err[i].sxy.boundary[j], err[i].sxz.boundary[j],
+                       err[i].syz.boundary[j]);
+        }
+        printf("\n");
+        }
+
+        printf("Boundary error rates\n");
+
+
+        printf("N \t z \t sxx        \t syy          \t szz \n");
+        for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                printf("%d \t %d \t %e \t %e \t %e \n", grid_sizes[i+1], j,
+                       rates[i].sxx.boundary[j], rates[i].syy.boundary[j],
+                       rates[i].szz.boundary[j]);
+        }
+        printf("\n");
+        }
+
+        printf("N \t z \t sxy        \t sxz          \t syz \n");
+        for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                printf("%d \t %d \t %e \t %e \t %e \n", grid_sizes[i+1], j,
+                       rates[i].sxy.boundary[j], rates[i].sxz.boundary[j],
+                       rates[i].syz.boundary[j]);
+        }
+        printf("\n");
+        }
+
+
+        }
+
+        // Exit status is 0 only when max_error() stays below ERROR_TOLERANCE.
+
+        return !(max_error(err, num_refinements) < ERROR_TOLERANCE);
+}
+
+int3_t refine(const int3_t initial_size, const int grid) { // size scaled by 2^grid
+        const double scale = pow(2, grid);
+        int3_t refined;
+        refined.x = initial_size.x * scale;
+        refined.y = initial_size.y * scale;
+        refined.z = initial_size.z * scale;
+        return refined;
+}
+
+// For each pair of consecutive refinements (i, i + 1), store in rates[i] the
+// convergence_rate() of every field's interior error and of each of its
+// TOP_BOUNDARY_SIZE boundary errors. Reads err[0 .. num_refinements - 1] and
+// h[0 .. num_refinements - 1]; writes rates[0 .. num_refinements - 2].
+void convergence_rates(vars_err_t *rates, const vars_err_t *err, const _prec *h,
+                       const int num_refinements) {
+        for (int i = 0; i < num_refinements - 1; ++i) {
+                const vars_err_t *cur = &err[i];
+                const vars_err_t *next = &err[i + 1];
+                vars_err_t *rate = &rates[i];
+                const _prec h_cur = h[i];
+                const _prec h_next = h[i + 1];
+
+                // Interior errors.
+                rate->vx.interior = convergence_rate(
+                    cur->vx.interior, next->vx.interior, h_cur, h_next);
+                rate->vy.interior = convergence_rate(
+                    cur->vy.interior, next->vy.interior, h_cur, h_next);
+                rate->vz.interior = convergence_rate(
+                    cur->vz.interior, next->vz.interior, h_cur, h_next);
+                rate->sxx.interior = convergence_rate(
+                    cur->sxx.interior, next->sxx.interior, h_cur, h_next);
+                rate->syy.interior = convergence_rate(
+                    cur->syy.interior, next->syy.interior, h_cur, h_next);
+                rate->szz.interior = convergence_rate(
+                    cur->szz.interior, next->szz.interior, h_cur, h_next);
+                rate->sxy.interior = convergence_rate(
+                    cur->sxy.interior, next->sxy.interior, h_cur, h_next);
+                rate->sxz.interior = convergence_rate(
+                    cur->sxz.interior, next->sxz.interior, h_cur, h_next);
+                rate->syz.interior = convergence_rate(
+                    cur->syz.interior, next->syz.interior, h_cur, h_next);
+
+                // Boundary errors, one rate per boundary index j.
+                for (int j = 0; j < TOP_BOUNDARY_SIZE; ++j) {
+                        rate->vx.boundary[j] = convergence_rate(
+                            cur->vx.boundary[j], next->vx.boundary[j], h_cur, h_next);
+                        rate->vy.boundary[j] = convergence_rate(
+                            cur->vy.boundary[j], next->vy.boundary[j], h_cur, h_next);
+                        rate->vz.boundary[j] = convergence_rate(
+                            cur->vz.boundary[j], next->vz.boundary[j], h_cur, h_next);
+                        rate->sxx.boundary[j] = convergence_rate(
+                            cur->sxx.boundary[j], next->sxx.boundary[j], h_cur, h_next);
+                        rate->syy.boundary[j] = convergence_rate(
+                            cur->syy.boundary[j], next->syy.boundary[j], h_cur, h_next);
+                        rate->szz.boundary[j] = convergence_rate(
+                            cur->szz.boundary[j], next->szz.boundary[j], h_cur, h_next);
+                        rate->sxy.boundary[j] = convergence_rate(
+                            cur->sxy.boundary[j], next->sxy.boundary[j], h_cur, h_next);
+                        rate->sxz.boundary[j] = convergence_rate(
+                            cur->sxz.boundary[j], next->sxz.boundary[j], h_cur, h_next);
+                        rate->syz.boundary[j] = convergence_rate(
+                            cur->syz.boundary[j], next->syz.boundary[j], h_cur, h_next);
+                }
+        }
+}
+
+
+prec max_error(vars_err_t *err, const int num_refinements) {
+        // Largest boundary[] error over the nine fields, scanning entries
+        // err[0] .. err[num_refinements - 2]; interior errors are not used.
+        double largest = 0.0;
+        for (int level = 0; level < num_refinements - 1; ++level) {
+                const err_t *field[9] = {
+                    &err[level].vx,  &err[level].vy,  &err[level].vz,
+                    &err[level].sxx, &err[level].syy, &err[level].szz,
+                    &err[level].sxy, &err[level].sxz, &err[level].syz};
+                // Rows outermost, fields innermost, in declaration order.
+                for (int row = 0; row < TOP_BOUNDARY_SIZE; ++row) {
+                        for (int f = 0; f < 9; ++f) {
+                                largest = max(largest,
+                                              field[f]->boundary[row]);
+                        }
+                }
+        }
+        return largest;
+}
+
+_prec convergence_rate(const _prec err1, const _prec err2, const _prec h1,
+                       const _prec h2) {  // errors err1, err2 paired with h1, h2
+        return log(err1/err2)/log(h1/h2);  // unguarded: err2 == 0 or h1 == h2 yields inf/NaN
+}
+
+void test_initialize(testdata_t *test, const int grid, const char *topography_dir)
+{
+        // Single-rank fixture (rank 0, neighbour ranks -1, 1 x 1 process grid).
+        int rank = 0, px = 1, py = 1;
+        int x_rank_l = -1, x_rank_r = -1, y_rank_f = -1, y_rank_b = -1;
+        int coord[2] = {0, 0};
+        cudaStream_t stream_1, stream_2, stream_i;
+        cudaStreamCreate(&stream_1);
+        cudaStreamCreate(&stream_2);
+        cudaStreamCreate(&stream_i);
+        test->tol = 1e-6;
+        _prec dt = 1.0;
+        _prec h  = 1.0/(test->size.x - 2);  // test->size is an input
+        printf("Test size: %d %d %d \n", test->size.x, test->size.y, test->size.z);
+        // Bounded formatting: sprintf could overrun gridname for a long directory.
+        char gridname[2048];
+        int len = snprintf(gridname, sizeof(gridname), "%s/topography_%d.bin",
+                           topography_dir, grid);
+        if (len < 0 || (size_t)len >= sizeof(gridname)) {
+                fprintf(stderr, "test_initialize: topography path too long\n");
+                exit(EXIT_FAILURE);
+        }
+        test->T = topo_init(1, gridname, rank, x_rank_l, x_rank_r, y_rank_f,
+                            y_rank_b, coord, px, py, test->size.x, test->size.y,
+                            test->size.z, dt, h, h, h, stream_1, stream_2, stream_i);
+        test->T.timestep = 0;
+        topo_d_malloc(&test->T);
+        test->coord3.x = coord[0];
+        test->coord3.y = coord[1];
+        test->grid_spacing = h;
+        test->write_vtk = 0;
+        test->mms_wavenumber = 2 * M_PI * 4;
+
+        topo_init_metrics(&test->T);
+        topo_init_geometry(&test->T);
+        topo_build(&test->T);
+        topo_set_constants(&test->T);
+        // Host copies of the fields: mx*my*mz values per array.
+        int num_items = test->T.mx*test->T.my*test->T.mz;
+        vars_init(&test->input, num_items);
+        vars_init(&test->output,num_items);
+        vars_init(&test->answer,num_items);
+        init_sponge(&test->T, sizeof(_prec)*num_items);
+}
+
+void test_velocity(testdata_t *test, vars_err_t *err)
+{
+        // Run topo_velocity_interior_H on the mms_init_* stress input and store
+        // the vx, vy, vz errors against the mms_final_* answer in *err.
+        int3_t shift = {0, 0, 0};
+        fcn_grid_t velocity_grid = fcn_init_grid(
+            test->size, shift, test->coord3, 0, test->grid_spacing);
+        fcn_grid_t stress_grid = fcn_init_grid(test->size, shift, test->coord3,
+                                               ngsl / 2, test->grid_spacing);
+        grid_t gvx, gvy, gvz;
+        grid_t gsxx, gsyy, gszz;
+        grid_t gsxy, gsxz, gsyz;
+        test_grid_data_init(&gvx,  test, stress_grid, grid_u1());
+        test_grid_data_init(&gvy,  test, stress_grid, grid_u2());
+        test_grid_data_init(&gvz,  test, stress_grid, grid_u3());
+        test_grid_data_init(&gsxx, test, stress_grid, grid_xx());
+        test_grid_data_init(&gsyy, test, stress_grid, grid_yy());
+        test_grid_data_init(&gszz, test, stress_grid, grid_zz());
+        test_grid_data_init(&gsxy, test, stress_grid, grid_xy());
+        test_grid_data_init(&gsxz, test, stress_grid, grid_xz());
+        test_grid_data_init(&gsyz, test, stress_grid, grid_yz());
+
+        // Input
+        _prec properties[2] = {test->mms_wavenumber, 0};
+        fcn_apply(test->input.sxx, mms_init_sxx, gsxx.x, gsxx.y, gsxx.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.syy, mms_init_syy, gsyy.x, gsyy.y, gsyy.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.szz, mms_init_szz, gszz.x, gszz.y, gszz.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.sxy, mms_init_sxy, gsxy.x, gsxy.y, gsxy.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.sxz, mms_init_sxz, gsxz.x, gsxz.y, gsxz.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.syz, mms_init_syz, gsyz.x, gsyz.y, gsyz.zp,
+                  properties, stress_grid);
+
+        vars_copy_to_device(&test->T, &test->input);
+
+        topo_velocity_interior_H(&test->T);
+
+        // Output
+        vars_copy_to_host(&test->output, &test->T);
+
+        // Check answer
+        fcn_apply(test->answer.vx, mms_final_vx, gvx.x, gvx.y, gvx.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.vy, mms_final_vy, gvy.x, gvy.y, gvy.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.vz, mms_final_vz, gvz.x, gvz.y, gvz.zp,
+                  properties, velocity_grid);
+
+        err_t tmp = check_answer(test->output.vx, test->answer.vx, velocity_grid);
+        err->vx = tmp;
+
+        tmp = check_answer(test->output.vy, test->answer.vy, velocity_grid);
+        err->vy = tmp;
+
+        tmp = check_answer(test->output.vz, test->answer.vz, velocity_grid);
+        err->vz = tmp;
+        // Exclude solution at ghost point
+        err->vz.boundary[TOP_BOUNDARY_SIZE-1] = 0.0;
+
+        char vtk_file[128];
+        if (test->write_vtk) {
+                sprintf(vtk_file, "input_sxx.vtk");
+                vtk_write_grid(vtk_file, gsxx.x, gsxx.y, gsxx.zp,
+                               velocity_grid);
+                vtk_append_scalar(vtk_file, "z", test->input.sxx,
+                                  velocity_grid);
+
+                sprintf(vtk_file, "output_vx.vtk");
+                vtk_write_grid(vtk_file, gvx.x, gvx.y, gvx.zp, velocity_grid);
+                vtk_append_scalar(vtk_file, "z", test->output.vx,
+                                  velocity_grid);
+
+                sprintf(vtk_file, "answer_vx.vtk");
+                vtk_write_grid(vtk_file, gvx.x, gvx.y, gvx.zp, velocity_grid);
+                vtk_append_scalar(vtk_file, "z", test->answer.vx,
+                                  velocity_grid);
+        }
+
+        // Release the coordinate arrays malloc'ed by test_grid_data_init; without
+        // this every call leaks four arrays for each of the nine grids.
+        grid_t *grids[] = {&gvx, &gvy, &gvz, &gsxx, &gsyy,
+                           &gszz, &gsxy, &gsxz, &gsyz};
+        for (size_t k = 0; k < sizeof(grids)/sizeof(grids[0]); ++k)
+                test_grid_data_free(grids[k]);
+}
+
+void test_stress(testdata_t *test, vars_err_t *err)
+{
+        // Run topo_stress_interior_H on the mms_init_* velocity input and store
+        // the six stress errors against the mms_final_* answer in *err.
+        int3_t shift = {0, 0, 0};
+        fcn_grid_t velocity_grid = fcn_init_grid(
+            test->size, shift, test->coord3, 0, test->grid_spacing);
+        fcn_grid_t stress_grid = fcn_init_grid(test->size, shift, test->coord3,
+                                               ngsl / 2, test->grid_spacing);
+        grid_t gvx, gvy, gvz;
+        grid_t gsxx, gsyy, gszz;
+        grid_t gsxy, gsxz, gsyz;
+        test_grid_data_init(&gvx,  test, stress_grid, grid_u1());
+        test_grid_data_init(&gvy,  test, stress_grid, grid_u2());
+        test_grid_data_init(&gvz,  test, stress_grid, grid_u3());
+        test_grid_data_init(&gsxx, test, stress_grid, grid_xx());
+        test_grid_data_init(&gsyy, test, stress_grid, grid_yy());
+        test_grid_data_init(&gszz, test, stress_grid, grid_zz());
+        test_grid_data_init(&gsxy, test, stress_grid, grid_xy());
+        test_grid_data_init(&gsxz, test, stress_grid, grid_xz());
+        test_grid_data_init(&gsyz, test, stress_grid, grid_yz());
+
+        // Input
+        _prec properties[2] = {test->mms_wavenumber, 0};
+        fcn_apply(test->input.vx, mms_init_vx, gvx.x, gvx.y, gvx.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.vy, mms_init_vy, gvy.x, gvy.y, gvy.zp,
+                  properties, stress_grid);
+        fcn_apply(test->input.vz, mms_init_vz, gvz.x, gvz.y, gvz.zp,
+                  properties, stress_grid);
+
+        vars_copy_to_device(&test->T, &test->input);
+
+        topo_stress_interior_H(&test->T);
+
+        // Output
+        vars_copy_to_host(&test->output, &test->T);
+
+        // Answer
+        fcn_apply(test->answer.sxx, mms_final_sxx, gsxx.x, gsxx.y, gsxx.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.syy, mms_final_syy, gsyy.x, gsyy.y, gsyy.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.szz, mms_final_szz, gszz.x, gszz.y, gszz.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.sxy, mms_final_sxy, gsxy.x, gsxy.y, gsxy.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.sxz, mms_final_sxz, gsxz.x, gsxz.y, gsxz.zp,
+                  properties, velocity_grid);
+        fcn_apply(test->answer.syz, mms_final_syz, gsyz.x, gsyz.y, gsyz.zp,
+                  properties, velocity_grid);
+
+        // Exclude solution at ghost point. NOTE(review): this writes the vz
+        // error although only stress errors are computed here -- confirm intent.
+        err->vz.boundary[TOP_BOUNDARY_SIZE-1] = 0.0;
+        err_t tmp;
+        tmp = check_answer(test->output.sxx, test->answer.sxx, velocity_grid);
+        err->sxx = tmp;
+        tmp = check_answer(test->output.syy, test->answer.syy, velocity_grid);
+        err->syy = tmp;
+        tmp = check_answer(test->output.szz, test->answer.szz, velocity_grid);
+        err->szz = tmp;
+        tmp = check_answer(test->output.sxy, test->answer.sxy, velocity_grid);
+        err->sxy = tmp;
+        tmp = check_answer(test->output.sxz, test->answer.sxz, velocity_grid);
+        err->sxz = tmp;
+        tmp = check_answer(test->output.syz, test->answer.syz, velocity_grid);
+        err->syz = tmp;
+
+        // Exclude solution at ghost point
+        err->sxz.boundary[TOP_BOUNDARY_SIZE-1] = 0.0;
+        err->syz.boundary[TOP_BOUNDARY_SIZE-1] = 0.0;
+        // Release the arrays malloc'ed by test_grid_data_init (otherwise leaked).
+        grid_t *grids[] = {&gvx, &gvy, &gvz, &gsxx, &gsyy,
+                           &gszz, &gsxy, &gsxz, &gsyz};
+        for (size_t k = 0; k < sizeof(grids)/sizeof(grids[0]); ++k)
+                test_grid_data_free(grids[k]);
+}
+
+void test_free(testdata_t *test)
+{
+        // Order: topo_free, the three streams held in test->T, then host arrays.
+        variables_t *host_vars[] = {&test->input, &test->output, &test->answer};
+        topo_free(&test->T);
+        cudaStreamDestroy(test->T.stream_1);
+        cudaStreamDestroy(test->T.stream_2);
+        cudaStreamDestroy(test->T.stream_i);
+        for (int k = 0; k < 3; ++k) vars_free(host_vars[k]);
+}
+
+void vars_init(variables_t *vars, const int num_items)
+{
+        // Give every host array num_items zero-initialised entries, record the
+        // per-array byte count in num_bytes, then set rho, lami, mui, ww and
+        // wwo to one. Allocation results are not checked.
+        int item_size = sizeof(_prec);
+
+        // All _prec arrays, allocated in the order listed.
+        _prec **real_fields[] = {
+            &vars->vx,  &vars->vy,  &vars->vz,
+            &vars->sxx, &vars->syy, &vars->szz,
+            &vars->sxy, &vars->sxz, &vars->syz,
+            &vars->rho, &vars->lami, &vars->mui,
+            &vars->qpi, &vars->qsi,
+            &vars->r1, &vars->r2, &vars->r3,
+            &vars->r4, &vars->r5, &vars->r6,
+            &vars->wwo, &vars->vx1, &vars->vx2, &vars->coeff};
+        const int num_fields = sizeof(real_fields)/sizeof(real_fields[0]);
+        for (int f = 0; f < num_fields; ++f) {
+                *real_fields[f] = (_prec*)calloc(num_items, item_size);
+        }
+
+        // ww holds ints but is sized with sizeof(_prec) per entry; the copy
+        // routines transfer num_bytes for it as well.
+        vars->ww = (int*)calloc(num_items, item_size);
+        vars->num_bytes = num_items*item_size;
+
+        // Non-zero defaults.
+        int n = 0;
+        while (n < num_items) {
+                vars->rho[n] = 1.0;
+                vars->lami[n] = 1.0;
+                vars->mui[n] = 1.0;
+                vars->ww[n] = 1;
+                vars->wwo[n] = 1.0;
+                ++n;
+        }
+}
+
+void vars_copy_to_device(topo_t *topo, const variables_t *vars)
+{
+        // Upload every host array in vars to its counterpart in topo with
+        // cudaMemcpy (host to device). Each transfer moves vars->num_bytes
+        // bytes; the calls are issued in table order and their return codes
+        // are not checked.
+
+        // Each entry is a (device destination, host source) pair.
+        const struct {
+                void *device;
+                const void *host;
+        } transfers[] = {
+            // vx, vy, vz are named u1, v1, w1 on the device
+            {topo->u1, vars->vx},
+            {topo->v1, vars->vy},
+            {topo->w1, vars->vz},
+
+            // sxx .. syz are named xx .. yz on the device
+            {topo->xx, vars->sxx},
+            {topo->yy, vars->syy},
+            {topo->zz, vars->szz},
+            {topo->xy, vars->sxy},
+            {topo->xz, vars->sxz},
+            {topo->yz, vars->syz},
+
+            // identical names on host and device
+            {topo->rho, vars->rho},
+            {topo->lami, vars->lami},
+            {topo->mui, vars->mui},
+            {topo->qpi, vars->qpi},
+            {topo->qsi, vars->qsi},
+            {topo->r1, vars->r1},
+            {topo->r2, vars->r2},
+            {topo->r3, vars->r3},
+            {topo->r4, vars->r4},
+            {topo->r5, vars->r5},
+            {topo->r6, vars->r6},
+            {topo->wwo, vars->wwo},
+            {topo->vx1, vars->vx1},
+            {topo->vx2, vars->vx2},
+            {topo->coeff, vars->coeff},
+
+            // ww is the int array; it is copied with num_bytes like the others
+            {topo->ww, vars->ww},
+        };
+        const int num_transfers = sizeof(transfers)/sizeof(transfers[0]);
+
+        // One cudaMemcpy per table entry.
+        for (int k = 0; k < num_transfers; ++k) {
+                cudaMemcpy(transfers[k].device, transfers[k].host,
+                           vars->num_bytes, cudaMemcpyHostToDevice);
+        }
+}
+
+void vars_copy_to_host(variables_t *vars, const topo_t *topo)
+{
+        // Download every device array in topo into its counterpart in vars
+        // with cudaMemcpy (device to host). Each transfer moves
+        // vars->num_bytes bytes; the calls are issued in table order and
+        // their return codes are not checked.
+
+        // Each entry is a (host destination, device source) pair.
+        const struct {
+                void *host;
+                const void *device;
+        } transfers[] = {
+            // u1, v1, w1 on the device land in vx, vy, vz
+            {vars->vx, topo->u1},
+            {vars->vy, topo->v1},
+            {vars->vz, topo->w1},
+
+            // xx .. yz on the device land in sxx .. syz
+            {vars->sxx, topo->xx},
+            {vars->syy, topo->yy},
+            {vars->szz, topo->zz},
+            {vars->sxy, topo->xy},
+            {vars->sxz, topo->xz},
+            {vars->syz, topo->yz},
+
+            // identical names on host and device
+            {vars->rho, topo->rho},
+            {vars->lami, topo->lami},
+            {vars->mui, topo->mui},
+            {vars->qpi, topo->qpi},
+            {vars->qsi, topo->qsi},
+            {vars->r1, topo->r1},
+            {vars->r2, topo->r2},
+            {vars->r3, topo->r3},
+            {vars->r4, topo->r4},
+            {vars->r5, topo->r5},
+            {vars->r6, topo->r6},
+            {vars->wwo, topo->wwo},
+            {vars->vx1, topo->vx1},
+            {vars->vx2, topo->vx2},
+            {vars->coeff, topo->coeff},
+
+            // ww is the int array; it is copied with num_bytes like the others
+            {vars->ww, topo->ww},
+        };
+        const int num_transfers = sizeof(transfers)/sizeof(transfers[0]);
+
+        // One cudaMemcpy per table entry.
+        for (int k = 0; k < num_transfers; ++k) {
+                cudaMemcpy(transfers[k].host, transfers[k].device,
+                           vars->num_bytes, cudaMemcpyDeviceToHost);
+        }
+}
+void vars_free(variables_t *vars)
+{
+        // Release every host array that vars_init allocates. wwo, vx1, vx2,
+        // coeff and ww are part of the list so that they are not leaked.
+
+        void *buffers[] = {
+            vars->vx,  vars->vy,  vars->vz,
+            vars->sxx, vars->syy, vars->szz,
+            vars->sxy, vars->sxz, vars->syz,
+            vars->rho, vars->lami, vars->mui,
+            vars->qpi, vars->qsi,
+            vars->r1, vars->r2, vars->r3,
+            vars->r4, vars->r5, vars->r6,
+            vars->wwo, vars->vx1,
+            vars->vx2, vars->coeff,
+            vars->ww};
+        const int num_buffers = sizeof(buffers)/sizeof(buffers[0]);
+
+        // free(NULL) is a no-op, so arrays whose allocation failed are fine.
+        for (int k = 0; k < num_buffers; ++k) {
+                free(buffers[k]);
+        }
+}
+
+void test_grid_data_init(grid_t *data, const testdata_t *test, const fcn_grid_t grid,
+                    const int3_t shift) {
+        // Allocate and fill x, y, z and zp; release with test_grid_data_free().
+        data->x = (_prec*)malloc(grid.num_bytes);
+        fcn_fill_grid(data->x, grid, shift, 0);
+        fcn_shift(data->x, data->x, grid, -ngsl*grid.gridspacing);
+
+        data->y = (_prec*)malloc(grid.num_bytes);
+        fcn_fill_grid(data->y, grid, shift, 1);
+        fcn_shift(data->y, data->y, grid, -ngsl*grid.gridspacing);
+
+        data->z = (_prec*)malloc(grid.num_bytes);  // z is filled but not shifted
+        fcn_fill_grid(data->z, grid, shift, 2);
+        data->zp = (_prec*)malloc(grid.num_bytes);
+        geom_mapping_z(data->zp, grid, shift, &test->T.metrics_f,
+                       &test->T.metrics_g);
+}
+
+void test_grid_data_free(grid_t *data)
+{
+        // Release the four arrays malloc'ed by test_grid_data_init.
+        _prec *arrays[] = {data->x, data->y, data->z, data->zp};
+        for (int k = 0; k < 4; ++k)
+                free(arrays[k]);
+}
+
+err_t check_answer(const _prec *u, const _prec *v, const fcn_grid_t grid)
+{
+        // Compare u against v with check_flinferr: one value per plane for the
+        // last TOP_BOUNDARY_SIZE z-planes, plus one value for the interior.
+        const int nb = TOP_BOUNDARY_SIZE;
+        const int x_lo = grid.offset1.x + nb, x_hi = grid.offset2.x - nb;
+        const int y_lo = grid.offset1.y + nb, y_hi = grid.offset2.y - nb;
+        const int z_top = grid.offset2.z - nb;  // first boundary plane
+        err_t result;
+
+        // Maximum truncation error at the boundary points
+        for (int plane = 0; plane < nb; ++plane) {
+                result.boundary[plane] = check_flinferr(
+                    u, v, x_lo, x_hi, y_lo, y_hi, z_top + plane,
+                    z_top + plane + 1, grid.line, grid.slice);
+        }
+
+        // Maximum truncation error in the interior of the domain
+        result.interior = check_flinferr(
+            u, v, x_lo, x_hi, y_lo, y_hi, grid.offset1.z + OVERLAP + nb,
+            z_top - grid.exclude_top_row, grid.line, grid.slice);
+
+        return result;
+}
+
+void init_sponge(topo_t *topo, const int num_bytes)
+{
+        // Fill the device arrays dcrjx, dcrjy, dcrjz with ones (num_bytes each).
+        // NOTE(review): assumes each dcrj* array holds num_bytes bytes -- confirm.
+        _prec *ones = (_prec*)malloc(num_bytes);
+        if (ones == NULL) { perror("init_sponge: malloc"); exit(EXIT_FAILURE); }
+        for (size_t i = 0; i < num_bytes/(sizeof(_prec)); ++i) ones[i] = 1.0;
+        cudaMemcpy(topo->dcrjx, ones, num_bytes, cudaMemcpyHostToDevice);
+        cudaMemcpy(topo->dcrjy, ones, num_bytes, cudaMemcpyHostToDevice);
+        cudaMemcpy(topo->dcrjz, ones, num_bytes, cudaMemcpyHostToDevice);
+        free(ones);
+}
+
diff --git a/tests/topography/accuracy/test_topography_kernels.c b/tests/topography/accuracy/test_topography_kernels.c
new file mode 100644
index 0000000..6523494
--- /dev/null
+++ b/tests/topography/accuracy/test_topography_kernels.c
@@ -0,0 +1,490 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "topography.h"
+#include "cutopography.cuh"
+#include "cutopography_test.cuh"
+#include "cutopography_test.cuh"
+#include "check.h"
+#include "grid_check.h"
+#include "vtk.h"
+#include "functions.h"
+#include "metrics.h"
+#include "geometry.h"
+#include "shift.h"
+
+typedef struct  // state shared by the kernel tests in this file
+{
+        _prec tol;  // set to 1e-6 by test_initialize
+        int verbose;  // non-zero: test_initialize prints metrics and offsets
+        int deg[3];  // not set by test_initialize
+        int coef[3];  // not set by test_initialize
+        int num_bytes;  // sizeof(_prec) * T.gridsize
+        int size[3];  // T.nx, T.ny, T.nz
+        int mem[3];  // T.mx, T.my, T.mz
+        int3_t shift;  // {0, 0, 1} after test_initialize
+        _prec *output;  // output, answer, error: host buffers of num_bytes bytes
+        _prec *answer;
+        _prec *error;
+        fcn_grid_t velocity_grid;  // fcn_init_grid with 4th argument 0
+        fcn_grid_t interior_grid;  // fcn_init_grid with 4th argument -ngsl/2
+        fcn_grid_t stress_grid;  // fcn_init_grid with 4th argument ngsl/2
+        fcn_grid_t topography_grid;  // fcn_init_grid with 4th argument ngsl
+        topo_t T;  // value returned by topo_init
+        int velocity_offset_x[2];  // T.off_x[1], T.off_x[2]
+        int velocity_offset_y[2];  // T.off_y[1], T.off_y[2]
+        int stress_offset_x[2];  // T.off_x[1] - ngsl/2, T.off_x[3] + ngsl/2
+        int stress_offset_y[2];  // T.off_y[1] - ngsl/2, T.off_y[3] + ngsl/2
+        int offset_z[2];  // T.off_z[1], T.off_z[2]
+} testdata_t;
+
+void test_initialize(testdata_t *test);  // fills *test; each sub-test pairs it with test_free
+void test_velocity(testdata_t *test);  // its call in main is commented out
+void test_velocity_mod(testdata_t *test);  // run by main
+void test_stress(testdata_t *test);  // run by main
+void test_free(testdata_t *test);
+double test_velocity_kernel(testdata_t *test, _prec *input, const _prec *input_coef,  // result is printed as "Error"
+                   const _prec *input_deg, const int *input_shift,
+                   _prec *output, _prec *answer, const _prec *answer_coef,
+                   _prec *answer_deg, const int *answer_shift);
+double test_stress_kernel(testdata_t *test, _prec *input, const _prec *input_coef,  // result is printed as "Error"
+                   const _prec *input_deg, const int *input_shift,
+                   _prec *output, _prec *answer, const _prec *answer_coef,
+                   _prec *answer_deg, const int *answer_shift);
+void copy_output_to_host(testdata_t *test, const _prec *input);
+void copy_answer_to_host(testdata_t *test, const _prec *input);
+double check_answer(const testdata_t *test, const int *shift, const int *offset_x,
+                    const int *offset_y, const int *offset_z);
+void write_vtk(const testdata_t *test);  // called after each sub-test's kernel run
+
+int main(int argc, char **argv)
+{
+        testdata_t test;  // only verbose is set here; test_initialize fills the rest
+        printf("Testing topography.c, cutopography.c, cutopography_kernels.cu\n");
+        test.verbose = 0;
+        test_velocity_mod(&test);  // test_velocity() is not part of the run
+        test_stress(&test);
+        return 0;
+}
+
+void test_initialize(testdata_t *test)
+{  // Builds the topo_t fixture, host scratch buffers and function grids in *test.
+        int rank = 0;
+        int x_rank_l = -1;
+        int x_rank_r = -1;
+        int y_rank_f = -1;
+        int y_rank_b = -1;
+        int coord[2] = {0, 0};
+        int size[3] = {132, 132, 32};
+        cudaStream_t stream_1, stream_2, stream_i;
+        cudaStreamCreate(&stream_1);
+        cudaStreamCreate(&stream_2);
+        cudaStreamCreate(&stream_i);
+        test->tol = 1e-6;
+        _prec dt = 1.0;
+        _prec h  = 1.0;
+        test->T = topo_init(1, "topo", rank, x_rank_l, x_rank_r, y_rank_f,
+                            y_rank_b, coord, size[0], size[1], size[2], dt, h,
+                            stream_1, stream_2, stream_i);
+        test->size[0] = test->T.nx;
+        test->size[1] = test->T.ny;
+        test->size[2] = test->T.nz;
+        test->mem[0] = test->T.mx;
+        test->mem[1] = test->T.my;
+        test->mem[2] = test->T.mz;
+        topo_d_malloc(&test->T);
+        // NOTE(review): confirm topo_init / topo_init_grid still match topography.h.
+        topo_init_metrics(&test->T);
+        topo_init_grid(&test->T);
+        topo_build(&test->T);
+        // Offsets copied from T.off_*; the stress ones are padded by ngsl/2.
+        test->velocity_offset_x[0] = test->T.off_x[1];
+        test->velocity_offset_x[1] = test->T.off_x[2];
+        test->velocity_offset_y[0] = test->T.off_y[1];
+        test->velocity_offset_y[1] = test->T.off_y[2];
+        test->stress_offset_x[0] = test->T.off_x[1] - ngsl/2;
+        test->stress_offset_x[1] = test->T.off_x[3] + ngsl/2;
+        test->stress_offset_y[0] = test->T.off_y[1] - ngsl/2;
+        test->stress_offset_y[1] = test->T.off_y[3] + ngsl/2;
+        test->offset_z[0] = test->T.off_z[1];
+        test->offset_z[1] = test->T.off_z[2];
+        // Three host buffers of sizeof(_prec)*T.gridsize bytes; malloc is unchecked.
+        int num_bytes = sizeof(_prec)*test->T.gridsize;
+        test->num_bytes = num_bytes;
+        test->output = malloc(num_bytes);
+        test->answer = malloc(num_bytes);
+        test->error = malloc(num_bytes);
+
+        if (test->verbose) {
+                metrics_print_info_f(&test->T.metrics_f);
+                printf("offset x: %d %d \n", test->velocity_offset_x[0],
+                                             test->velocity_offset_x[1]);
+                printf("offset y: %d %d \n", test->velocity_offset_y[0],
+                                             test->velocity_offset_y[1]);
+                printf("offset z: %d %d \n", test->offset_z[0],
+                                             test->offset_z[1]);
+        }
+
+        int3_t sizet = {.x = test->size[0],
+                     .y = test->size[1],
+                     .z = test->size[2]};
+        int3_t coordt = {0, 0, 0};
+        int3_t shift = {0, 0, 1};
+        // NOTE(review): fcn_init_grid gets (size, coord, shift, ...) below -- confirm the order.
+        test->shift = shift;
+        test->velocity_grid = fcn_init_grid(sizet, coordt, shift, 0, h);
+        test->interior_grid = fcn_init_grid(sizet, coordt, shift, -ngsl/2, h);
+        test->stress_grid = fcn_init_grid(sizet, coordt, shift, ngsl/2, h);
+        test->topography_grid = fcn_init_grid(sizet, coordt, shift, ngsl, h);
+}
+
+void test_velocity(testdata_t *test)
+{
+        // One sub-test: xx is the input field, u1 the output, and T.yy is
+        // handed to the kernel driver as the answer buffer.
+        printf("Testing velocity update kernel... \n");
+        printf(" * Testing u1 update equation. \n");
+        printf("    -- Testing quadratic function in x-direction. \n");
+
+        _prec input_coef[3] = {1, 0, 0};
+        _prec answer_coef[3] = {2, 0, 0};
+        _prec input_deg[3] = {2, 0, 0};
+        _prec answer_deg[3] = {1, 0, 0};
+        int input_shift[3], answer_shift[3];
+        shift_xx(input_shift);
+        shift_u1(answer_shift);
+
+        test_initialize(test);
+        test_velocity_kernel(test, test->T.xx, input_coef, input_deg,
+                             input_shift, test->T.u1, test->T.yy, answer_coef,
+                             answer_deg, answer_shift);
+        test_free(test);
+}
+
+void test_velocity_mod(testdata_t *test)
+{  // Runs two u1 sub-tests; each one initializes, runs the kernel driver, prints, frees.
+        printf("Testing velocity update kernel (kernels must be generated with debug=1, debug_ops=1... \n");
+        printf(" * Testing u1 update equation. \n");
+        {
+        printf("    -- Testing DczPx*s11. \n");
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 1};
+        int input_shift[3];
+        shift_xx(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 1};
+        _prec answer_deg[3] = {0, 0, 0};
+        int answer_shift[3];
+        shift_u1(answer_shift);
+        test_initialize(test);
+        double err = test_velocity_kernel(
+            test, test->T.xx, input_coef, input_deg, input_shift, test->T.u1,
+            test->T.yy, answer_coef, answer_deg, answer_shift);
+
+        printf("   Error: %g \n", err);
+        write_vtk(test);
+        test_free(test);
+        }
+        {
+        printf("    -- Testing DczPy*s12. \n");
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 1};
+        int input_shift[3];
+        shift_xy(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 1};
+        _prec answer_deg[3] = {0, 0, 0};
+        int answer_shift[3];
+        shift_u1(answer_shift);
+        test_initialize(test);
+        double err = test_velocity_kernel(test, test->T.xy, input_coef, input_deg,
+                             input_shift, test->T.u1, test->T.yy, answer_coef,
+                             answer_deg, answer_shift);
+
+        printf("   Error: %g \n", err);
+        write_vtk(test);
+        test_free(test);
+        }
+        // NOTE(review): the unconditional return below makes the u2 and u3
+        // sub-tests that follow unreachable; it looks like a temporary
+        // switch-off -- confirm before relying on u2/u3 coverage.
+        return;
+
+        printf(" * Testing u2 update equation. \n");
+        {
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 2};
+        int input_shift[3];
+        shift_xy(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 2};
+        _prec answer_deg[3] = {0, 0, 1};
+        int answer_shift[3];
+        shift_u2(answer_shift);
+        test_initialize(test);
+        test_velocity_kernel(test, test->T.xy, input_coef, input_deg,
+                             input_shift, test->T.v1, test->T.yy, answer_coef,
+                             answer_deg, answer_shift);
+
+        write_vtk(test);
+        test_free(test);
+        }
+        // NOTE(review): the message below says x-direction, but only the third coef/deg entry is non-zero.
+        printf(" * Testing u3 update equation. \n");
+        {
+        printf("    -- Testing quadratic function in x-direction. \n");
+        // Only linear functions can be used in the test because the interpolation
+        // operators are only first order accurate near the boundary
+        _prec input_coef[3] = {0, 0, 1};
+        _prec input_deg[3] = {0, 0, 2};
+        int input_shift[3];
+        shift_xz(input_shift);
+
+        _prec answer_coef[3] = {0, 0, 2};
+        _prec answer_deg[3] = {0, 0, 1};
+        int answer_shift[3];
+        shift_u3(answer_shift);
+        test_initialize(test);
+        double err = test_velocity_kernel(
+            test, test->T.xz, input_coef, input_deg, input_shift, test->T.w1,
+            test->T.yy, answer_coef, answer_deg, answer_shift);
+
+        printf("       Error: %g \n", err);
+        write_vtk(test);
+        test_free(test);
+        }
+}
+
+// Exercise the stress update through test_stress_kernel() for five cases
+// (labelled s11, s22, s12, s13, s23). Every case declares coefficient, degree
+// and grid-shift triples for the input field and for the expected answer,
+// re-initializes the test data, prints the error returned by the kernel test,
+// writes the VTK dumps and frees the test data again.
+// NOTE(review): coef/deg presumably describe a per-axis polynomial - confirm.
+void test_stress(testdata_t *test) {
+        printf("Testing stress update kernel (kernels must be generated with debug=1, debug_ops=1... \n");
+        printf(" * Testing s11 := PzDcx*u3. \n");
+        {
+                // Input w1 (u3 shift); output xx is checked against the answer in yy.
+                int in_shift[3];
+                shift_u3(in_shift);
+                _prec in_coef[3] = {1, 0, 0};
+                _prec in_deg[3] = {2, 0, 0};
+                int ref_shift[3];
+                shift_xx(ref_shift);
+                _prec ref_coef[3] = {2, 0, 0};
+                _prec ref_deg[3] = {1, 0, 0};
+                test_initialize(test);
+                double stress_err = test_stress_kernel(
+                    test, test->T.w1, in_coef, in_deg, in_shift, test->T.xx,
+                    test->T.yy, ref_coef, ref_deg, ref_shift);
+                printf("   Error: %g \n", stress_err);
+                write_vtk(test);
+                test_free(test);
+        }
+
+        printf(" * Testing s22 := PzDcy*u3. \n");
+        {
+                // Input w1 (u3 shift); output yy is checked against the answer in xz.
+                int in_shift[3];
+                shift_u3(in_shift);
+                _prec in_coef[3] = {0, 1, 0};
+                _prec in_deg[3] = {0, 2, 0};
+                int ref_shift[3];
+                shift_yy(ref_shift);
+                _prec ref_coef[3] = {0, 2, 0};
+                _prec ref_deg[3] = {0, 1, 0};
+                test_initialize(test);
+                double stress_err = test_stress_kernel(
+                    test, test->T.w1, in_coef, in_deg, in_shift, test->T.yy,
+                    test->T.xz, ref_coef, ref_deg, ref_shift);
+                printf("   Error: %g \n", stress_err);
+                write_vtk(test);
+                test_free(test);
+        }
+
+        printf(" * Testing s12 := PxDcz*u2. \n");
+        {
+                // Input v1 (u2 shift); output xy is checked against the answer in yy.
+                int in_shift[3];
+                shift_u2(in_shift);
+                _prec in_coef[3] = {0, 0, 1};
+                _prec in_deg[3] = {0, 0, 2};
+                int ref_shift[3];
+                shift_xy(ref_shift);
+                _prec ref_coef[3] = {0, 0, 2};
+                _prec ref_deg[3] = {0, 0, 1};
+                test_initialize(test);
+                double stress_err = test_stress_kernel(
+                    test, test->T.v1, in_coef, in_deg, in_shift, test->T.xy,
+                    test->T.yy, ref_coef, ref_deg, ref_shift);
+                printf("   Error: %g \n", stress_err);
+                write_vtk(test);
+                test_free(test);
+        }
+
+        printf(" * Testing s13 := PxDcz*u3. \n");
+        {
+                // Input w1 (u3 shift); output xz is checked against the answer in yy.
+                int in_shift[3];
+                shift_u3(in_shift);
+                _prec in_coef[3] = {0, 0, 1};
+                _prec in_deg[3] = {0, 0, 2};
+                int ref_shift[3];
+                shift_xz(ref_shift);
+                _prec ref_coef[3] = {0, 0, 2};
+                _prec ref_deg[3] = {0, 0, 1};
+                test_initialize(test);
+                double stress_err = test_stress_kernel(
+                    test, test->T.w1, in_coef, in_deg, in_shift, test->T.xz,
+                    test->T.yy, ref_coef, ref_deg, ref_shift);
+                printf("   Error: %g \n", stress_err);
+                write_vtk(test);
+                test_free(test);
+        }
+
+        printf(" * Testing s23 := PyDcz*u3. \n");
+        {
+                // Input w1 (u3 shift); output yz is checked against the answer in yy.
+                int in_shift[3];
+                shift_u3(in_shift);
+                _prec in_coef[3] = {0, 1, 1};
+                _prec in_deg[3] = {0, 1, 2};
+                int ref_shift[3];
+                shift_yz(ref_shift);
+                _prec ref_coef[3] = {0, 0, 2};
+                _prec ref_deg[3] = {0, 0, 1};
+                test_initialize(test);
+                double stress_err = test_stress_kernel(
+                    test, test->T.w1, in_coef, in_deg, in_shift, test->T.yz,
+                    test->T.yy, ref_coef, ref_deg, ref_shift);
+                printf("   Error: %g \n", stress_err);
+                write_vtk(test);
+                test_free(test);
+        }
+}
+
+// Run one velocity-kernel check. `input` and `answer` are each handed to
+// topo_test_polystrzbnd_H() with their coefficient/degree/shift triples,
+// topo_velocity_interior_H() runs in between, and after a device sync `output`
+// and `answer` are copied to the host. Returns the value of check_answer().
+double test_velocity_kernel(testdata_t *test, _prec *input, const _prec *input_coef,
+                          const _prec *input_deg, const int *input_shift,
+                          _prec *output, _prec *answer,
+                          const _prec *answer_coef, _prec *answer_deg,
+                          const int *answer_shift) {
+        topo_test_polystrzbnd_H(&test->T, input,
+                                input_coef, input_deg, input_shift);
+        topo_velocity_interior_H(&test->T);
+        topo_test_polystrzbnd_H(&test->T, answer,
+                                answer_coef, answer_deg, answer_shift);
+        cudaDeviceSynchronize();
+        copy_output_to_host(test, output);
+        copy_answer_to_host(test, answer);
+        // x/y use the velocity offsets unchanged; the lower z bound is raised by 8.
+        const int x_range[2] = {test->velocity_offset_x[0],
+                                test->velocity_offset_x[1]};
+        const int y_range[2] = {test->velocity_offset_y[0],
+                                test->velocity_offset_y[1]};
+        const int z_range[2] = {test->offset_z[0] + 8, test->offset_z[1]};
+        return check_answer(test, answer_shift, x_range, y_range, z_range);
+}
+
+// Run one stress-kernel check. `input` and `answer` are each handed to
+// topo_test_polystrzbnd_H() with their coefficient/degree/shift triples,
+// topo_stress_interior_H() runs in between, and after a device sync `output`
+// and `answer` are copied to the host. Returns the value of check_answer().
+double test_stress_kernel(testdata_t *test, _prec *input, const _prec *input_coef,
+                          const _prec *input_deg, const int *input_shift,
+                          _prec *output, _prec *answer,
+                          const _prec *answer_coef, _prec *answer_deg,
+                          const int *answer_shift) {
+        topo_test_polystrzbnd_H(&test->T, input,
+                                input_coef, input_deg, input_shift);
+        topo_stress_interior_H(&test->T);
+        topo_test_polystrzbnd_H(&test->T, answer,
+                                answer_coef, answer_deg, answer_shift);
+        cudaDeviceSynchronize();
+        copy_output_to_host(test, output);
+        copy_answer_to_host(test, answer);
+        //FIXME: should be stress_offset
+        // The velocity offsets are narrowed by 8 on both ends in x and y.
+        const int x_range[2] = {test->velocity_offset_x[0] + 8,
+                                test->velocity_offset_x[1] - 8};
+        const int y_range[2] = {test->velocity_offset_y[0] + 8,
+                                test->velocity_offset_y[1] - 8};
+        const int z_range[2] = {test->offset_z[0] + 8, test->offset_z[1]};
+        return check_answer(test, answer_shift, x_range, y_range, z_range);
+}
+
+// Tear down a test case: topo_free(), host buffers, CUDA streams, topo_d_free().
+void test_free(testdata_t *test) {
+        topo_free(&test->T);
+        free(test->answer);
+        free(test->output);
+        cudaStreamDestroy(test->T.stream_1);
+        cudaStreamDestroy(test->T.stream_2);
+        cudaStreamDestroy(test->T.stream_i);
+        topo_d_free(&test->T);
+}
+
+// Copy test->num_bytes bytes from device pointer `input` into test->output
+// (cudaMemcpyDeviceToHost); the cudaMemcpy status is not checked.
+void copy_output_to_host(testdata_t *test, const _prec *input) {
+        cudaMemcpy(test->output, input, test->num_bytes, cudaMemcpyDeviceToHost);
+}
+
+// Copy test->num_bytes bytes from device pointer `input` into test->answer
+// (cudaMemcpyDeviceToHost); the cudaMemcpy status is not checked.
+void copy_answer_to_host(testdata_t *test, const _prec *input) {
+        cudaMemcpy(test->answer, input, test->num_bytes, cudaMemcpyDeviceToHost);
+}
+
+// Compare test->output with test->answer by calling check_flinferr() on the
+// [0]/[1] bounds of each offset array plus test->T.line and test->T.slice.
+// When shift[2] == 0 the upper z bound is lowered by one so that the ghost
+// point on the nodal grid is not checked. Returns check_flinferr()'s result.
+double check_answer(const testdata_t *test, const int *shift, const int *offset_x,
+                    const int *offset_y, const int *offset_z) {
+        const int ghost = (shift[2] == 0) ? 1 : 0;
+        const int z_upper = offset_z[1] - ghost;
+
+        return check_flinferr(test->output, test->answer,
+                              offset_x[0], offset_x[1],
+                              offset_y[0], offset_y[1],
+                              offset_z[0], z_upper,
+                              test->T.line,
+                              test->T.slice);
+}
+
+void write_vtk(const testdata_t *test)
+{
+
+        _prec *x = malloc(test->topography_grid.num_bytes);
+        _prec *y = malloc(test->topography_grid.num_bytes);
+        _prec *z = malloc(test->topography_grid.num_bytes);
+
+        fcn_fill_grid(x, test->topography_grid, test->shift, 0);
+        fcn_fill_grid(y, test->topography_grid, test->shift, 1);
+        fcn_fill_grid(z, test->topography_grid, test->shift, 2);
+
+        fcn_grid_t grid = test->interior_grid;
+        const char *vtk_file = "output.vtk";
+        vtk_write_grid(vtk_file, x, y, z, grid);
+        size_t count = vtk_append_scalar(vtk_file, "output", test->output, grid);
+
+        const char *vtk_file2 = "answer.vtk";
+        vtk_write_grid(vtk_file2, x, y, z, grid);
+        count = vtk_append_scalar(vtk_file2, "answer", test->answer, grid);
+
+        const char *vtk_file3 = "error.vtk";
+        fcn_difference(test->error, test->answer, test->output, grid); 
+        fcn_abs(test->error, test->error, grid); 
+        vtk_write_grid(vtk_file3, x, y, z, grid);
+        count = vtk_append_scalar(vtk_file3, "error", test->error, grid);
+
+}
diff --git a/tests/topography/accuracy/topography.bin b/tests/topography/accuracy/topography.bin
new file mode 100644
index 0000000..88ea8b6
Binary files /dev/null and b/tests/topography/accuracy/topography.bin differ
diff --git a/tests/topography/accuracy/topography.py b/tests/topography/accuracy/topography.py
new file mode 100644
index 0000000..47ed145
--- /dev/null
+++ b/tests/topography/accuracy/topography.py
@@ -0,0 +1,43 @@
+"""usage: topography.py refine=int plot=int save=int
+Generate Gaussian hill topography file.
+
+Args:
+        refine int      Level of grid refinement
+
+Optional args:
+    plot int        Show plot
+    save int        Save figure to file
+    peaks int       Use peaks elevation map data instead of Gaussian hill
+    """
+import numpy as np
+import sys
+import pyawp
+
+plot = 0
+
+filename = sys.argv[1]
+nx = int(sys.argv[2])
+ny = int(sys.argv[3])
+h = float(sys.argv[4])
+
+
+ngsl = 8
+T = pyawp.Topography(nx, ny, h, ngsl)
+
+a = 0.2
+b = 8
+xc = 8
+yc = 8
+gaussian = lambda x, y: a * np.exp(-b**-2*(x - xc) ** 2 -b**-2*(y - yc) ** 2)
+z = T.map(gaussian)
+Z = T.reshape(z)
+
+
+T.write(Z, filename)
+print("Wrote topography file: %s" % filename)
+
+if plot:
+    import matplotlib.pyplot as plt
+    T.imshow(Z)
+    plt.show()
+
diff --git a/tests/topography/accuracy/topography_test.c b/tests/topography/accuracy/topography_test.c
new file mode 100644
index 0000000..e5eb70e
--- /dev/null
+++ b/tests/topography/accuracy/topography_test.c
@@ -0,0 +1,1833 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "topography_test.h"
+#include "cutopography.cuh"
+#include "cutopography_test.cuh"
+#include "topography.h"
+#include "functions.h"
+#include "grid_check.h"
+
+
+topo_test_t topo_test_init(topo_t *T)
+{
+        topo_test_t Tt = {.use = TOPO_TEST, .tol = TOPO_TEST_TOLERANCE};
+        // Descriptor for the compile-time selected test; unnamed fields start zeroed.
+        if (Tt.use && T->rank == 0) printf("Topography:: testing enabled\n");
+        // Only the sections whose TOPO_TEST_* macro is non-zero are compiled in.
+#if TOPO_TEST_CONSTX || TOPO_TEST_CONSTY
+        Tt.out = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->su1[i];
+        }
+        Tt.cu1[0] = 1;
+        Tt.cv1[0] = 2;
+        Tt.cw1[0] = 3;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, Tt.out_shift);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, Tt.out_shift);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_LINX
+        Tt.coef[0] = 1.0;
+        Tt.deg[0] = 1.0;
+        Tt.out = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->su1[i];
+        }
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_LINY
+        Tt.coef[1] = 1.0;
+        Tt.deg[1] = 1.0;
+        Tt.out = T->u1;
+        Tt.velf = T->f_u1;
+        Tt.velb = T->b_u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->su1[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+        // DIFF* cases: Tt.in / Tt.out name the source field and the target field.
+#if TOPO_TEST_DIFFCONSTX
+        Tt.coef[0] = 1.0;
+        Tt.out = T->xx;
+        Tt.in = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->sxx[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_poly_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFCONSTY
+        Tt.coef[1] = 1.0;
+        Tt.out = T->yy;
+        Tt.velf = T->f_v1;
+        Tt.velb = T->b_v1;
+        Tt.in = T->v1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->syy[i];
+                Tt.in_shift[i] = T->sv1[i];
+        }
+        topo_test_poly_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFCONSTZ
+        Tt.coef[2] = 1.0;
+        Tt.out = T->xz;
+        Tt.in = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->sxz[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_polyzbnd_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+#endif
+
+#if TOPO_TEST_DIFFLINX
+        Tt.coef[0] = 1.0;
+        Tt.deg[0] = 1.0;
+        Tt.out = T->xx;
+        Tt.in = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->sxx[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_poly_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFLINY
+        Tt.coef[1] = 1.0;
+        Tt.deg[1] = 1.0;
+        Tt.out = T->yy;
+        Tt.in = T->v1;
+        Tt.velf = T->f_v1;
+        Tt.velb = T->b_v1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->syy[i];
+                Tt.in_shift[i] = T->sv1[i];
+        }
+        topo_test_poly_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFLINZ
+        Tt.coef[2] = 1.0;
+        Tt.deg[2] = 1.0;
+        Tt.out = T->xz;
+        Tt.in = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->sxz[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_polyzbnd_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        // Plug in answer in advance to make sure that points that do
+        // not get updated have the correct answer (instead of adjusting
+        // bounds of test function)
+        _prec deg[3] = {0, 0, 0};
+        _prec coef[3] = {0, 0, 1};
+        topo_test_polyzbnd_H(T, Tt.out, coef, deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFQUADX || TOPO_TEST_CGDIFFQUADX
+        Tt.coef[0] = 1.0;
+        Tt.deg[0] = 2.0;
+        Tt.out = T->xx;
+        Tt.in = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->sxx[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_poly_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFQUADY
+        Tt.coef[1] = 1.0;
+        Tt.deg[1] = 2.0;
+        Tt.out = T->yz;
+        Tt.in = T->w1;
+        Tt.velf = T->f_w1;
+        Tt.velb = T->b_w1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->syz[i];
+                Tt.in_shift[i] = T->sw1[i];
+        }
+        topo_test_poly_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        topo_test_poly_H(T, Tt.out, Tt.coef, Tt.deg, Tt.out_shift);
+#endif
+
+#if TOPO_TEST_DIFFQUADZ
+        Tt.coef[2] = 1.0;
+        Tt.deg[2] = 2.0;
+        Tt.out = T->xz;
+        Tt.in = T->u1;
+        for (int i = 0; i < 3; ++i) {
+                Tt.out_shift[i] = T->sxz[i];
+                Tt.in_shift[i] = T->su1[i];
+        }
+        topo_test_polyzbnd_H(T, Tt.in, Tt.coef, Tt.deg, Tt.in_shift);
+        // Plug in answer in advance to make sure that points that do
+        // not get updated have the correct answer (instead of adjusting
+        // bounds of test function)
+        _prec deg[3] = {0, 0, 1};
+        _prec coef[3] = {0, 0, 2};
+        topo_test_polyzbnd_H(T, Tt.out, coef, deg, Tt.out_shift);
+#endif
+        // VEL* cases: the six stress fields are the input, u1/v1/w1 the output.
+#if TOPO_TEST_VELCONST
+        Tt.cxx[0] = 1.0;
+        Tt.cyy[0] = 1.0;
+        Tt.czz[0] = 1.0;
+        Tt.cxy[0] = 1.0;
+        Tt.cxz[0] = 1.0;
+        Tt.cyz[0] = 1.0;
+        // Input
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELLINX
+        Tt.cxx[0] = 1.0;
+        Tt.cyy[0] = 1.0;
+        Tt.czz[0] = 1.0;
+        Tt.cxy[0] = 1.0;
+        Tt.cxz[0] = 1.0;
+        Tt.cyz[0] = 1.0;
+        Tt.deg[0] = 1.0;
+        // Input
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0.0;
+        Tt.cv1[0] = 0.0;
+        Tt.cw1[0] = 0.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELLINY
+        Tt.cxx[1] = 1.0;
+        Tt.cyy[1] = 1.0;
+        Tt.czz[1] = 1.0;
+        Tt.cxy[1] = 1.0;
+        Tt.cxz[1] = 1.0;
+        Tt.cyz[1] = 1.0;
+        Tt.deg[1] = 1.0;
+        // Input
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0.0;
+        Tt.cv1[0] = 0.0;
+        Tt.cw1[0] = 0.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELLINZ
+        Tt.cxx[2] = 1.0;
+        Tt.cyy[2] = 1.0;
+        Tt.czz[2] = 1.0;
+        Tt.cxy[2] = 1.0;
+        Tt.cxz[2] = 1.0;
+        Tt.cyz[2] = 1.0;
+        Tt.deg[2] = 1.0;
+        // Input
+        topo_test_polystrzbnd_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystrzbnd_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystrzbnd_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystrzbnd_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystrzbnd_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystrzbnd_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0;
+        Tt.cv1[0] = 0;
+        Tt.cw1[0] = 0;
+        topo_test_polyzbnd_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_polyzbnd_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_polyzbnd_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELQUADX
+        Tt.cxx[0] = 1.0;
+        Tt.cyy[0] = 1.0;
+        Tt.czz[0] = 1.0;
+        Tt.cxy[0] = 1.0;
+        Tt.cxz[0] = 1.0;
+        Tt.cyz[0] = 1.0;
+        Tt.deg[0] = 2.0;
+        // Input
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0;
+        Tt.cv1[0] = 0;
+        Tt.cw1[0] = 0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELQUADY
+        Tt.cxx[1] = 1.0;
+        Tt.cyy[1] = 1.0;
+        Tt.czz[1] = 1.0;
+        Tt.cxy[1] = 1.0;
+        Tt.cxz[1] = 1.0;
+        Tt.cyz[1] = 1.0;
+        Tt.deg[1] = 2.0;
+        // Input
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0;
+        Tt.cv1[0] = 0;
+        Tt.cw1[0] = 0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELQUADZ
+        Tt.cxx[2] = 1.0;
+        Tt.cyy[2] = 1.0;
+        Tt.czz[2] = 1.0;
+        Tt.cxy[2] = 1.0;
+        Tt.cxz[2] = 1.0;
+        Tt.cyz[2] = 1.0;
+        Tt.deg[2] = 2.0;
+        // Input
+        topo_test_polystrzbnd_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystrzbnd_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystrzbnd_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystrzbnd_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystrzbnd_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystrzbnd_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0;
+        Tt.cv1[0] = 0;
+        Tt.cw1[0] = 0;
+        topo_test_polyzbnd_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_polyzbnd_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_polyzbnd_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+
+#if TOPO_TEST_VELFRONTBACK
+        Tt.cxx[1] = 1.0;
+        Tt.cyy[1] = 1.0;
+        Tt.czz[1] = 1.0;
+        Tt.cxy[1] = 1.0;
+        Tt.cxz[1] = 1.0;
+        Tt.cyz[1] = 1.0;
+        Tt.deg[1] = 2.0;
+        // Input
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+
+        // Output
+        Tt.cu1[0] = 0;
+        Tt.cv1[0] = 0;
+        Tt.cw1[0] = 0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+#endif
+        // STR* cases: u1/v1/w1 are the input, the six stress fields the output.
+#if TOPO_TEST_STRCONST
+        // Input
+        Tt.cu1[0] = 0;
+        Tt.cv1[0] = 0;
+        Tt.cw1[0] = 0;
+        Tt.deg[0] = 0.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+#endif
+
+#if TOPO_TEST_STRLINX
+        // Input
+        Tt.cu1[0] = 1;
+        Tt.cv1[0] = 1;
+        Tt.cw1[0] = 1;
+        Tt.deg[0] = 1.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+        // Output
+        Tt.cxx[0] = 0.0;
+        Tt.cyy[0] = 0.0;
+        Tt.czz[0] = 0.0;
+        Tt.cxy[0] = 0.0;
+        Tt.cxz[0] = 0.0;
+        Tt.cyz[0] = 0.0;
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+#endif
+
+#if TOPO_TEST_STRLINY
+        // Input
+        Tt.cu1[1] = 1;
+        Tt.cv1[1] = 1;
+        Tt.cw1[1] = 1;
+        Tt.deg[1] = 1.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+        // Output
+        Tt.cxx[0] = 0.0;
+        Tt.cyy[0] = 0.0;
+        Tt.czz[0] = 0.0;
+        Tt.cxy[0] = 0.0;
+        Tt.cxz[0] = 0.0;
+        Tt.cyz[0] = 0.0;
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+#endif
+
+#if TOPO_TEST_STRLINZ
+        // Input
+        Tt.cu1[2] = 1;
+        Tt.cv1[2] = 1;
+        Tt.cw1[2] = 1;
+        Tt.deg[2] = 1.0;
+        topo_test_polyzbnd_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_polyzbnd_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_polyzbnd_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+        // Output
+        Tt.cxx[0] = 0.0;
+        Tt.cyy[0] = 0.0;
+        Tt.czz[0] = 0.0;
+        Tt.cxy[0] = 0.0;
+        Tt.cxz[0] = 0.0;
+        Tt.cyz[0] = 0.0;
+        topo_test_polystrzbnd_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystrzbnd_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystrzbnd_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystrzbnd_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystrzbnd_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystrzbnd_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+#endif
+
+#if TOPO_TEST_STRQUADX
+        // Input
+        Tt.cu1[0] = 1;
+        Tt.cv1[0] = 1;
+        Tt.cw1[0] = 1;
+        Tt.deg[0] = 2.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+        // Output
+        Tt.cxx[0] = 0.0;
+        Tt.cyy[0] = 0.0;
+        Tt.czz[0] = 0.0;
+        Tt.cxy[0] = 0.0;
+        Tt.cxz[0] = 0.0;
+        Tt.cyz[0] = 0.0;
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+#endif
+
+#if TOPO_TEST_STRQUADY
+        // Input
+        Tt.cu1[1] = 1;
+        Tt.cv1[1] = 1;
+        Tt.cw1[1] = 1;
+        Tt.deg[1] = 2.0;
+        topo_test_poly_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_poly_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_poly_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+        // Output
+        Tt.cxx[0] = 0.0;
+        Tt.cyy[0] = 0.0;
+        Tt.czz[0] = 0.0;
+        Tt.cxy[0] = 0.0;
+        Tt.cxz[0] = 0.0;
+        Tt.cyz[0] = 0.0;
+        topo_test_polystr_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystr_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystr_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystr_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystr_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystr_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+#endif
+
+#if TOPO_TEST_STRQUADZ
+        // Input
+        Tt.cu1[2] = 1;
+        Tt.cv1[2] = 1;
+        Tt.cw1[2] = 1;
+        Tt.deg[2] = 2.0;
+        topo_test_polyzbnd_H(T, T->u1, Tt.cu1, Tt.deg, T->su1);
+        topo_test_polyzbnd_H(T, T->v1, Tt.cv1, Tt.deg, T->sv1);
+        topo_test_polyzbnd_H(T, T->w1, Tt.cw1, Tt.deg, T->sw1);
+
+        // Output
+        Tt.cxx[0] = 0.0;
+        Tt.cyy[0] = 0.0;
+        Tt.czz[0] = 0.0;
+        Tt.cxy[0] = 0.0;
+        Tt.cxz[0] = 0.0;
+        Tt.cyz[0] = 0.0;
+        topo_test_polystrzbnd_H(T, T->xx, Tt.cxx, Tt.deg, T->sxx);
+        topo_test_polystrzbnd_H(T, T->yy, Tt.cyy, Tt.deg, T->syy);
+        topo_test_polystrzbnd_H(T, T->zz, Tt.czz, Tt.deg, T->szz);
+        topo_test_polystrzbnd_H(T, T->xy, Tt.cxy, Tt.deg, T->sxy);
+        topo_test_polystrzbnd_H(T, T->xz, Tt.cxz, Tt.deg, T->sxz);
+        topo_test_polystrzbnd_H(T, T->yz, Tt.cyz, Tt.deg, T->syz);
+#endif
+        // The configured descriptor is handed back by value.
+        return Tt;
+}
+
+// Velocity step of the compile-time selected test for the f_* (front) buffers.
+// Returns immediately when T->y_rank_f is negative; otherwise only the
+// sections whose TOPO_TEST_* flag is non-zero are compiled in.
+void topo_test_velfront(topo_test_t *Tt, topo_t *T) {
+        if (T->y_rank_f < 0) return;
+
+#if TOPO_TEST_CONSTY
+        topo_test_polyf_H(T, T->f_u1, Tt->cu1, Tt->deg, Tt->in_shift);
+        topo_test_polyf_H(T, T->f_v1, Tt->cv1, Tt->deg, Tt->in_shift);
+        topo_test_polyf_H(T, T->f_w1, Tt->cw1, Tt->deg, Tt->in_shift);
+#endif
+        // Y stress tests: each f_* velocity buffer is passed with its own shift.
+#if TOPO_TEST_STRQUADY || TOPO_TEST_STRLINY
+        topo_test_polyf_H(T, T->f_u1, Tt->cu1, Tt->deg, T->su1);
+        topo_test_polyf_H(T, T->f_v1, Tt->cv1, Tt->deg, T->sv1);
+        topo_test_polyf_H(T, T->f_w1, Tt->cw1, Tt->deg, T->sw1);
+#endif
+        // Z stress tests: same arguments, but through the *zbndf variant.
+#if TOPO_TEST_STRQUADZ || TOPO_TEST_STRLINZ
+        topo_test_polyzbndf_H(T, T->f_u1, Tt->cu1, Tt->deg, T->su1);
+        topo_test_polyzbndf_H(T, T->f_v1, Tt->cv1, Tt->deg, T->sv1);
+        topo_test_polyzbndf_H(T, T->f_w1, Tt->cw1, Tt->deg, T->sw1);
+#endif
+        // Single-field tests: only the buffer stored in Tt->velf is passed on.
+#if TOPO_TEST_DIFFQUADY || TOPO_TEST_DIFFLINY || TOPO_TEST_DIFFCONSTY || \
+    TOPO_TEST_LINY
+        topo_test_polyf_H(T, Tt->velf, Tt->coef, Tt->deg, Tt->in_shift);
+#endif
+        // Front/back test: call topo_velocity_front_H() itself.
+#if TOPO_TEST_VELFRONTBACK
+        topo_velocity_front_H(T);
+#endif
+}
+
+// Velocity step of the compile-time selected test for the b_* (back) buffers.
+// Returns immediately when T->y_rank_b is negative; otherwise only the
+// sections whose TOPO_TEST_* flag is non-zero are compiled in.
+void topo_test_velback(topo_test_t *Tt, topo_t *T) {
+        if (T->y_rank_b < 0) return;
+
+#if TOPO_TEST_CONSTY
+        topo_test_polyb_H(T, T->b_u1, Tt->cu1, Tt->deg, Tt->in_shift);
+        topo_test_polyb_H(T, T->b_v1, Tt->cv1, Tt->deg, Tt->in_shift);
+        topo_test_polyb_H(T, T->b_w1, Tt->cw1, Tt->deg, Tt->in_shift);
+#endif
+        // Y stress tests: each b_* velocity buffer is passed with its own shift.
+#if TOPO_TEST_STRQUADY || TOPO_TEST_STRLINY
+        topo_test_polyb_H(T, T->b_u1, Tt->cu1, Tt->deg, T->su1);
+        topo_test_polyb_H(T, T->b_v1, Tt->cv1, Tt->deg, T->sv1);
+        topo_test_polyb_H(T, T->b_w1, Tt->cw1, Tt->deg, T->sw1);
+#endif
+        // Z stress tests: same arguments, but through the *zbndb variant.
+#if TOPO_TEST_STRQUADZ || TOPO_TEST_STRLINZ
+        topo_test_polyzbndb_H(T, T->b_u1, Tt->cu1, Tt->deg, T->su1);
+        topo_test_polyzbndb_H(T, T->b_v1, Tt->cv1, Tt->deg, T->sv1);
+        topo_test_polyzbndb_H(T, T->b_w1, Tt->cw1, Tt->deg, T->sw1);
+#endif
+        // Single-field tests: only the buffer stored in Tt->velb is passed on.
+#if TOPO_TEST_DIFFQUADY || TOPO_TEST_DIFFLINY || TOPO_TEST_DIFFCONSTY || \
+    TOPO_TEST_LINY
+        topo_test_polyb_H(T, Tt->velb, Tt->coef, Tt->deg, Tt->in_shift);
+#endif
+        // Front/back test: call topo_velocity_back_H() itself.
+#if TOPO_TEST_VELFRONTBACK
+        topo_velocity_back_H(T);
+#endif
+}
+
+// Calls topo_velocity_interior_H() when one of the listed VEL* tests is enabled.
+void topo_test_velx(const topo_test_t *Tt, topo_t *T) {
+#if TOPO_TEST_VELCONST || TOPO_TEST_VELLINX || TOPO_TEST_VELQUADX || \
+    TOPO_TEST_VELLINY || TOPO_TEST_VELQUADY || TOPO_TEST_VELLINZ ||  \
+    TOPO_TEST_VELQUADZ
+        topo_velocity_interior_H(T);
+#endif
+}
+
+// Calls the topo_test_diff*_H() routine that belongs to the enabled DIFF* test.
+void topo_test_stress(const topo_test_t *Tt, topo_t *T) {
+#if TOPO_TEST_DIFFQUADX || TOPO_TEST_DIFFLINX || TOPO_TEST_DIFFCONSTX
+        topo_test_diffx_H(T, T->xx, T->u1);
+#endif
+        // The z tests take their fields from the descriptor (Tt->out, Tt->in).
+#if TOPO_TEST_DIFFQUADZ || TOPO_TEST_DIFFLINZ || TOPO_TEST_DIFFCONSTZ
+        topo_test_diffz_H(T, Tt->out, Tt->in);
+#endif
+
+#if TOPO_TEST_CGDIFFQUADX
+        topo_test_cgdiffx_H(T, T->xx, T->u1);
+#endif
+
+#if TOPO_TEST_DIFFLINY || TOPO_TEST_DIFFCONSTY
+        topo_test_diffy_H(T, T->yy, T->v1);
+#endif
+        // The quadratic y test works on yz / w1 instead of yy / v1.
+#if TOPO_TEST_DIFFQUADY
+        topo_test_diffy_H(T, T->yz, T->w1);
+#endif
+}
+
+// Calls topo_stress_interior_H() when one of the STR* tests is enabled.
+void topo_test_stress_interior(const topo_test_t *Tt, topo_t *T) {
+#if TOPO_TEST_STRCONST || TOPO_TEST_STRLINX || TOPO_TEST_STRQUADX || \
+    TOPO_TEST_STRLINY || TOPO_TEST_STRQUADY || TOPO_TEST_STRLINZ ||  \
+    TOPO_TEST_STRQUADZ
+        topo_stress_interior_H(T);
+#endif
+}
+
+// Calls topo_stress_left_H() then topo_stress_right_H() for any enabled STR* test.
+void topo_test_stress_sides(const topo_test_t *Tt, topo_t *T) {
+#if TOPO_TEST_STRCONST || TOPO_TEST_STRLINX || TOPO_TEST_STRQUADX || \
+    TOPO_TEST_STRLINY || TOPO_TEST_STRQUADY || TOPO_TEST_STRLINZ ||  \
+    TOPO_TEST_STRQUADZ
+        topo_stress_left_H(T);
+        topo_stress_right_H(T);
+#endif
+}
+
+// Run the check routine of every enabled TOPO_TEST_* case and return the
+// bitwise OR of their results; returns 0 at once when Tt->use is zero.
+int topo_test_finalize(const topo_test_t *Tt, topo_t *T) {
+        if (!Tt->use) return 0;
+        int status = 0;
+
+#if TOPO_TEST_CONSTX
+        status |= topo_test_constx(Tt, T);
+#endif
+
+#if TOPO_TEST_CONSTY
+        status |= topo_test_consty(Tt, T);
+#endif
+
+#if TOPO_TEST_LINX
+        status |= topo_test_linx(Tt, T);
+#endif
+
+#if TOPO_TEST_LINY
+        status |= topo_test_liny(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFCONSTX
+        status |= topo_test_diffconstx(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFCONSTY
+        status |= topo_test_diffconsty(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFCONSTZ
+        status |= topo_test_diffconstz(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFLINX
+        status |= topo_test_difflinx(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFLINY
+        status |= topo_test_diffliny(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFLINZ
+        status |= topo_test_difflinz(Tt, T);
+#endif
+
+#if TOPO_TEST_CGDIFFQUADX || TOPO_TEST_DIFFQUADX
+        status |= topo_test_diffquadx(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFQUADY
+        status |= topo_test_diffquady(Tt, T);
+#endif
+
+#if TOPO_TEST_DIFFQUADZ
+        status |= topo_test_diffquadz(Tt, T);
+#endif
+
+#if TOPO_TEST_VELCONST
+        status |= topo_test_velconst(Tt, T);
+#endif
+
+#if TOPO_TEST_VELLINX
+        status |= topo_test_vellinx(Tt, T);
+#endif
+
+#if TOPO_TEST_VELLINY
+        status |= topo_test_velliny(Tt, T);
+#endif
+
+#if TOPO_TEST_VELLINZ
+        status |= topo_test_vellinz(Tt, T);
+#endif
+
+#if TOPO_TEST_VELQUADX
+        status |= topo_test_velquadx(Tt, T);
+#endif
+
+#if TOPO_TEST_VELQUADY
+        status |= topo_test_velquady(Tt, T);
+#endif
+
+#if TOPO_TEST_VELQUADZ
+        status |= topo_test_velquadz(Tt, T);
+#endif
+
+#if TOPO_TEST_VELFRONTBACK
+        status |= topo_test_velfrontback(Tt, T);
+#endif
+
+#if TOPO_TEST_STRCONST
+        status |= topo_test_strconst(Tt, T);
+#endif
+
+#if TOPO_TEST_STRLINX
+        status |= topo_test_strlinx(Tt, T);
+#endif
+
+#if TOPO_TEST_STRLINY
+        status |= topo_test_strliny(Tt, T);
+#endif
+
+#if TOPO_TEST_STRLINZ
+        status |= topo_test_strlinz(Tt, T);
+#endif
+
+#if TOPO_TEST_STRQUADX
+        status |= topo_test_strquadx(Tt, T);
+#endif
+
+#if TOPO_TEST_STRQUADY
+        status |= topo_test_strquady(Tt, T);
+#endif
+
+#if TOPO_TEST_STRQUADZ
+        status |= topo_test_strquadz(Tt, T);
+#endif
+
+        return status;
+}
+
+int topo_test_constx(const topo_test_t *Tt, const topo_t *T)
+{
+        // Mask of regions to test: 1 = test the region, 0 = skip it.
+        // This test uses only two processes, so MPI send-recv takes place
+        // in the x-direction only.
+        // Entry 3 is additionally cleared on rank 0 and entry 5 on rank 1
+        // (see the branch below).
+        int mask[9] = {0, 0, 0,
+                       1, 1, 1,
+                       0, 0, 0};
+
+        if (T->rank == 0) {
+                mask[3] = 0;
+        } else if (T->rank == 1) {
+                mask[5] = 0;
+        }
+
+        // One constant per field, passed through to fcn_constant.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        char *labels[3] = {"u1", "v1", "w1"};
+        _prec expected[3] = {1.0, 2.0, 3.0};
+
+        int status = 0;
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[1] = {expected[k]};
+                status |= topo_test_fcn(fcn_constant, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_consty(const topo_test_t *Tt, const topo_t *T)
+{
+        // Entries 1, 4, 7 start enabled; rank 0 clears 7, rank 1 clears 1.
+        int mask[9] = {0, 1, 0,
+                       0, 1, 0,
+                       0, 1, 0};
+
+        if (T->rank == 0) {
+                mask[7] = 0;
+        } else if (T->rank == 1) {
+                mask[1] = 0;
+        }
+
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        char *labels[3] = {"u1", "v1", "w1"};
+        _prec expected[3] = {1.0, 2.0, 3.0};
+
+        int status = 0;
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[1] = {expected[k]};
+                status |= topo_test_fcn(fcn_constant, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_linx(const topo_test_t *Tt, const topo_t *T)
+{
+        // Entries 3, 4, 5 start enabled; rank 0 clears 3, rank 1 clears 5.
+        int mask[9] = {0, 0, 0,
+                       1, 1, 1,
+                       0, 0, 0};
+
+        if (T->rank == 0) {
+                mask[3] = 0;
+        } else if (T->rank == 1) {
+                mask[5] = 0;
+        }
+
+        _prec *targets[3] = {T->u1};
+        char *labels[3] = {"u1"};
+
+        int status = 0;
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {1.0, 0.0, 0.0,
+                                    1.0, 0.0, 0.0,
+                                    T->su1[0], T->su1[1], T->su1[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_liny(const topo_test_t *Tt, const topo_t *T)
+{
+        // Entries 1, 4, 7 start enabled; rank 0 clears 7, rank 1 clears 1.
+        int mask[9] = {0, 1, 0,
+                       0, 1, 0,
+                       0, 1, 0};
+
+        if (T->rank == 0) {
+                mask[7] = 0;
+        } else if (T->rank == 1) {
+                mask[1] = 0;
+        }
+
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        char *labels[3] = {"u1", "v1", "w1"};
+        // NOTE(review): only index 0 (u1) is visited by the loop below.
+        int status = 0;
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 1.0, 0.0,
+                                    0.0, 1.0, 0.0,
+                                    T->su1[0], T->su1[1], T->su1[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffconstx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       1, 1, 1,
+                       0, 0, 0};
+
+        _prec *targets[1] = {T->xx};
+        char *labels[1] = {"xx"};
+
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[1] = {0.0};
+                status |= topo_test_fcn(fcn_constant, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffconsty(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 1, 0,
+                       0, 1, 0,
+                       0, 1, 0};
+
+        _prec *targets[1] = {T->yy};
+        char *labels[1] = {"yy"};
+
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[1] = {0.0};
+                status |= topo_test_fcn(fcn_constant, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffconstz(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 1, 0,
+                       0, 1, 0,
+                       0, 1, 0};
+
+        _prec *targets[1] = {Tt->out};
+        char *labels[1] = {"xz"};
+        // NOTE(review): the buffer is Tt->out but it is reported as "xz".
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[1] = {0.0};
+                status |= topo_test_fcn(fcn_constant, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_difflinx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+
+        _prec *targets[1] = {T->xx};
+        char *labels[1] = {"xx"};
+
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {1.0, 0.0, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    T->su1[0], T->su1[1], T->su1[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffliny(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+
+        _prec *targets[1] = {T->yy};
+        char *labels[1] = {"yy"};
+
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 1.0, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    T->sv1[0], T->sv1[1], T->sv1[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_difflinz(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+
+        _prec *targets[1] = {Tt->out};
+        char *labels[1] = {"xz"};
+
+        int status = 0;
+        // No rank filter here: the loop below runs on every rank.
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 0.0, 1.0,
+                                    0.0, 0.0, 0.0,
+                                    Tt->out_shift[0],
+                                    Tt->out_shift[1], Tt->out_shift[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_polybndz, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffquadx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+
+        _prec *targets[1] = {T->xx};
+        char *labels[1] = {"xx"};
+
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {2.0, 0.0, 0.0,
+                                    1.0, 0.0, 0.0,
+                                    Tt->out_shift[0],
+                                    Tt->out_shift[1], Tt->out_shift[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffquady(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+
+        _prec *targets[1] = {T->yz};
+        char *labels[1] = {"yz"};
+
+        int status = 0;
+
+        // Only rank 1 (the rank in the middle) is checked: the ranks on the
+        // boundary apply an interior stencil on the boundary and therefore
+        // do not compute the stencil correctly.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 2.0, 0.0,
+                                    0.0, 1.0, 0.0,
+                                    Tt->out_shift[0],
+                                    Tt->out_shift[1], Tt->out_shift[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_diffquadz(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+
+        _prec *targets[1] = {Tt->out};
+        char *labels[1] = {"xz"};
+
+        int status = 0;
+        // No rank filter here: the loop below runs on every rank.
+        for (int k = 0; k != 1; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 0.0, 2.0,
+                                    0.0, 0.0, 1.0,
+                                    Tt->out_shift[0],
+                                    Tt->out_shift[1], Tt->out_shift[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_polybndz, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_velconst(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {1, 1, 1,
+                       1, 1, 1,
+                       1, 1, 1};
+
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 0.0, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    Tt->out_shift[0],
+                                    Tt->out_shift[1], Tt->out_shift[2],
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_vellinx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {T->dth, 0.0, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_velliny(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, T->dth, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_vellinz(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 0.0, T->dth,
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_polybndz, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_velquadx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {2 * T->dth, 0.0, 0.0,
+                                    1.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_velquady(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 2 * T->dth, 0.0,
+                                    0.0, 1.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_poly, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_velquadz(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[9] = {0, 0, 0,
+                       0, 1, 0,
+                       0, 0, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[9] = {0};
+                _prec params[13] = {0.0, 0.0, 2 * T->dth,
+                                    0.0, 0.0, 1.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_fcn(fcn_polybndz, T, targets[k],
+                                        Tt->tol, params, mask, region_err);
+                check_printerr(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+
+int topo_test_velfrontback(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[25] = {0, 1, 1, 1, 0,
+                        0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0,
+                        0, 1, 1, 1, 0};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[3] = {T->u1, T->v1, T->w1};
+        xyz offs[3] = {
+                {.x = T->su1[0], .y = T->su1[1], .z = T->su1[2]},
+                {.x = T->sv1[0], .y = T->sv1[1], .z = T->sv1[2]},
+                {.x = T->sw1[0], .y = T->sw1[1], .z = T->sw1[2]}
+        };
+        char *labels[3] = {"u1", "v1", "w1"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 3; k++) {
+                _prec region_err[25] = {0};
+                _prec params[13] = {0.0, 2 * T->dth, 0.0,
+                                    0.0, 1.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_velocity_fcn(fcn_poly, check_flinferr,
+                                                 T, targets[k], Tt->tol,
+                                                 params, mask, region_err);
+                check_printerr55(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_strconst(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[15] = {1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz offs[6] = {
+                {.x = T->sxx[0], .y = T->sxx[1], .z = T->sxx[2]},
+                {.x = T->syy[0], .y = T->syy[1], .z = T->syy[2]},
+                {.x = T->szz[0], .y = T->szz[1], .z = T->szz[2]},
+                {.x = T->sxy[0], .y = T->sxy[1], .z = T->sxy[2]},
+                {.x = T->sxz[0], .y = T->sxz[1], .z = T->sxz[2]},
+                {.x = T->syz[0], .y = T->syz[1], .z = T->syz[2]}
+        };
+        char *labels[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        for (int k = 0; k != 6; k++) {
+                _prec region_err[15] = {0};
+                _prec params[13] = {0.0, 0.0, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_stress_fcn(fcn_poly, check_fl1err, T,
+                        targets[k], Tt->tol, params, mask, region_err);
+                check_printerr53(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_strlinx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[15] = {1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz offs[6] = {
+                {.x = T->sxx[0], .y = T->sxx[1], .z = T->sxx[2]},
+                {.x = T->syy[0], .y = T->syy[1], .z = T->syy[2]},
+                {.x = T->szz[0], .y = T->szz[1], .z = T->szz[2]},
+                {.x = T->sxy[0], .y = T->sxy[1], .z = T->sxy[2]},
+                {.x = T->sxz[0], .y = T->sxz[1], .z = T->sxz[2]},
+                {.x = T->syz[0], .y = T->syz[1], .z = T->syz[2]}
+        };
+        char *labels[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        _prec coef[6] = {3 * T->dth, T->dth, T->dth, T->dth, T->dth, 0};
+        for (int k = 0; k != 6; k++) {
+                _prec region_err[15] = {0};
+                _prec params[13] = {coef[k], 0.0, 0.0,
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_stress_fcn(
+                    fcn_poly, check_flinferr, T, targets[k], Tt->tol,
+                    params, mask, region_err);
+                check_printerr53(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_strliny(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[15] = {1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz offs[6] = {
+                {.x = T->sxx[0], .y = T->sxx[1], .z = T->sxx[2]},
+                {.x = T->syy[0], .y = T->syy[1], .z = T->syy[2]},
+                {.x = T->szz[0], .y = T->szz[1], .z = T->szz[2]},
+                {.x = T->sxy[0], .y = T->sxy[1], .z = T->sxy[2]},
+                {.x = T->sxz[0], .y = T->sxz[1], .z = T->sxz[2]},
+                {.x = T->syz[0], .y = T->syz[1], .z = T->syz[2]}
+        };
+        char *labels[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        _prec coef[6] = {T->dth, 3 * T->dth, T->dth, T->dth, 0, T->dth};
+        for (int k = 0; k != 6; k++) {
+                _prec region_err[15] = {0};
+                _prec params[13] = {0.0, coef[k], 0.0,
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_stress_fcn(
+                    fcn_poly, check_flinferr, T, targets[k], Tt->tol,
+                    params, mask, region_err);
+                check_printerr53(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_strlinz(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[15] = {1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz offs[6] = {
+                {.x = T->sxx[0], .y = T->sxx[1], .z = T->sxx[2]},
+                {.x = T->syy[0], .y = T->syy[1], .z = T->syy[2]},
+                {.x = T->szz[0], .y = T->szz[1], .z = T->szz[2]},
+                {.x = T->sxy[0], .y = T->sxy[1], .z = T->sxy[2]},
+                {.x = T->sxz[0], .y = T->sxz[1], .z = T->sxz[2]},
+                {.x = T->syz[0], .y = T->syz[1], .z = T->syz[2]}
+        };
+        char *labels[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        _prec coef[6] = {T->dth, T->dth, 3 * T->dth, 0, T->dth, T->dth};
+        for (int k = 0; k != 6; k++) {
+                _prec region_err[15] = {0};
+                _prec params[13] = {0.0, 0.0, coef[k],
+                                    0.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_stress_fcn(
+                    fcn_polybndz, check_flinferr, T, targets[k], Tt->tol,
+                    params, mask, region_err);
+                check_printerr53(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_strquadx(const topo_test_t *Tt, const topo_t *T)
+{
+        int mask[15] = {1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1,
+                        1, 1, 1, 1, 1};
+        // Per-field buffers, shifts (forwarded in params) and labels.
+        _prec *targets[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz offs[6] = {
+                {.x = T->sxx[0], .y = T->sxx[1], .z = T->sxx[2]},
+                {.x = T->syy[0], .y = T->syy[1], .z = T->syy[2]},
+                {.x = T->szz[0], .y = T->szz[1], .z = T->szz[2]},
+                {.x = T->sxy[0], .y = T->sxy[1], .z = T->sxy[2]},
+                {.x = T->sxz[0], .y = T->sxz[1], .z = T->sxz[2]},
+                {.x = T->syz[0], .y = T->syz[1], .z = T->syz[2]}
+        };
+        char *labels[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+
+        int status = 0;
+        // Only rank 1 performs the check; every other rank returns 0.
+        if (T->rank != 1) {
+                return 0;
+        }
+
+        _prec coef[6] = {3 * T->dth, T->dth, T->dth, T->dth, T->dth, 0};
+        for (int k = 0; k != 6; k++) {
+                _prec region_err[15] = {0};
+                _prec params[13] = {2 * coef[k], 0.0, 0.0,
+                                    1.0, 0.0, 0.0,
+                                    offs[k].x, offs[k].y, offs[k].z,
+                                    T->coord[0], T->coord[1],
+                                    T->nx, T->ny};
+                status |= topo_test_stress_fcn(
+                    fcn_poly, check_flinferr, T, targets[k], Tt->tol,
+                    params, mask, region_err);
+                check_printerr53(__func__, T->rank, labels[k], region_err);
+        }
+
+        return status;
+}
+
+int topo_test_strquady(const topo_test_t *Tt, const topo_t *T)
+{
+        // Runs topo_test_stress_fcn with fcn_poly and check_flinferr on each of
+        // the six stress arrays; all 15 region flags passed along are 1.
+        int regions[15];
+        for (int r = 0; r < 15; ++r) {
+                regions[r] = 1;
+        }
+        // Stress arrays with their labels and shifts, all in the order
+        // xx, yy, zz, xy, xz, yz
+        char *label[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+        _prec *stress[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz shift[6] = {{T->sxx[0], T->sxx[1], T->sxx[2]},
+                        {T->syy[0], T->syy[1], T->syy[2]},
+                        {T->szz[0], T->szz[1], T->szz[2]},
+                        {T->sxy[0], T->sxy[1], T->sxy[2]},
+                        {T->sxz[0], T->sxz[1], T->sxz[2]},
+                        {T->syz[0], T->syz[1], T->syz[2]}};
+
+        // Any rank other than 1 reports success without checking anything
+        if (T->rank != 1) {
+                return 0;
+        }
+        _prec err_coef[6] = {T->dth, 3 * T->dth, T->dth, T->dth, 0, T->dth};
+        int status = 0;
+        for (int f = 0; f < 6; ++f) {
+                _prec ferr[15] = {0.0};
+                // NOTE(review): the slot layout is defined by fcn_poly; the
+                // first six look like x/y/z coefficients and degrees - confirm
+                _prec args[13] = {0.0, 2 * err_coef[f], 0.0, 0.0, 1.0, 0.0,
+                                  shift[f].x, shift[f].y, shift[f].z,
+                                  T->coord[0], T->coord[1], T->nx, T->ny};
+                status |= topo_test_stress_fcn(fcn_poly, check_flinferr, T,
+                                               stress[f], Tt->tol, args,
+                                               regions, ferr);
+                check_printerr53(__func__, T->rank, label[f], ferr);
+        }
+        return status;
+}
+
+int topo_test_strquadz(const topo_test_t *Tt, const topo_t *T)
+{
+        // Runs topo_test_stress_fcn with fcn_polybndz and check_flinferr on each
+        // of the six stress arrays; all 15 region flags passed along are 1.
+        int regions[15];
+        for (int r = 0; r < 15; ++r) {
+                regions[r] = 1;
+        }
+        // Stress arrays with their labels and shifts, all in the order
+        // xx, yy, zz, xy, xz, yz
+        char *label[6] = {"xx", "yy", "zz", "xy", "xz", "yz"};
+        _prec *stress[6] = {T->xx, T->yy, T->zz, T->xy, T->xz, T->yz};
+        xyz shift[6] = {{T->sxx[0], T->sxx[1], T->sxx[2]},
+                        {T->syy[0], T->syy[1], T->syy[2]},
+                        {T->szz[0], T->szz[1], T->szz[2]},
+                        {T->sxy[0], T->sxy[1], T->sxy[2]},
+                        {T->sxz[0], T->sxz[1], T->sxz[2]},
+                        {T->syz[0], T->syz[1], T->syz[2]}};
+
+        // Any rank other than 1 reports success without checking anything
+        if (T->rank != 1) {
+                return 0;
+        }
+        _prec err_coef[6] = {T->dth, T->dth, 3 * T->dth, 0, T->dth, T->dth};
+        int status = 0;
+        for (int f = 0; f < 6; ++f) {
+                _prec ferr[15] = {0.0};
+                // NOTE(review): the slot layout is defined by fcn_polybndz; the
+                // first six look like x/y/z coefficients and degrees - confirm
+                _prec args[13] = {0.0, 0.0, 2 * err_coef[f], 0.0, 0.0, 1.0,
+                                  shift[f].x, shift[f].y, shift[f].z,
+                                  T->coord[0], T->coord[1], T->nx, T->ny};
+                status |= topo_test_stress_fcn(fcn_polybndz, check_flinferr, T,
+                                               stress[f], Tt->tol, args,
+                                               regions, ferr);
+                check_printerr53(__func__, T->rank, label[f], ferr);
+        }
+        return status;
+}
+
+
+/* Copy the device array `dres` to the host, evaluate `fp` into a host buffer
+ * and compare the two with check_all(check_fl1err, ...). Returns check_all's
+ * result, or 1 if a host allocation or the device-to-host copy fails. */
+int topo_test_fcn(fcnp fp, const topo_t *T, const _prec *dres, const _prec tol,
+                  const _prec *args, const int *regions, _prec *ferr)
+{
+        // size_t: an `int` byte count can overflow on large grids
+        size_t size = sizeof(_prec) * (size_t)T->gridsize;
+        _prec *res = malloc(size);
+        _prec *ans = malloc(size);
+        if (!res || !ans || cudaMemcpy(res, dres, size,
+                                       cudaMemcpyDeviceToHost) != cudaSuccess) {
+                free(res);
+                free(ans);
+                return 1;
+        }
+        // Apply function everywhere (excluding alignment space and bottom
+        // in z-direction region)
+        fp(ans, T->off_x[0], T->off_x[3], T->off_y[0], T->off_y[3],
+           T->off_z[1], T->off_z[2], T->line, T->slice, args);
+        int err = check_all(check_fl1err, res, ans,
+                            T->off_x, T->off_y, T->off_z, 3, 3,
+                            T->line, T->slice, tol, regions, ferr);
+        free(res);
+        free(ans);
+
+        return err;
+}
+
+/* Compare device array `dres` with the host evaluation of `fp` using `check_fp`
+ * over the stress offsets; returns check_all's result, or 1 on alloc/copy failure. */
+int topo_test_stress_fcn(fcnp fp, check_fun check_fp,
+                         const topo_t *T, const _prec *dres,
+                         const _prec tol, const _prec *args, const int *regions,
+                         _prec *ferr) 
+{
+        // size_t: an `int` byte count can overflow on large grids
+        size_t size = sizeof(_prec) * (size_t)T->gridsize;
+        _prec *res = malloc(size);
+        _prec *ans = malloc(size);
+        if (!res || !ans || cudaMemcpy(res, dres, size,
+                                       cudaMemcpyDeviceToHost) != cudaSuccess) {
+                free(res);
+                free(ans);
+                return 1;
+        }
+
+        fp(ans, T->stress_offset_x[1], T->stress_offset_x[4],
+           T->stress_offset_y[1], T->stress_offset_y[2],
+           T->off_z[1], T->off_z[2], T->line, T->slice, args);
+        int err = check_all(check_fp, res, ans,
+                            T->stress_offset_x, T->stress_offset_y, T->off_z,
+                            5, 3, T->line, T->slice, tol, regions, ferr);
+        free(res);
+        free(ans);
+
+        return err;
+}
+
+/* Compare the device array `dres` with the host evaluation of `fp`, using
+ * `check_fp` over the velocity offsets. Returns check_all's result, or 1 if
+ * a host allocation or the device-to-host copy fails. */
+int topo_test_velocity_fcn(fcnp fp, check_fun check_fp, const topo_t *T,
+                           const _prec *dres, const _prec tol,
+                           const _prec *args, const int *regions, _prec *ferr) 
+{
+        // size_t: an `int` byte count can overflow on large grids
+        size_t size = sizeof(_prec) * (size_t)T->gridsize;
+        _prec *res = malloc(size);
+        _prec *ans = malloc(size);
+        if (!res || !ans || cudaMemcpy(res, dres, size,
+                                       cudaMemcpyDeviceToHost) != cudaSuccess) {
+                free(res);
+                free(ans);
+                return 1;
+        }
+
+        fp(ans, T->off_x[0], T->off_x[3], T->off_y[0], T->off_y[3],
+           T->off_z[1], T->off_z[2], T->line, T->slice, args);
+        int err = check_all(check_fp, res, ans,
+                            T->velocity_offset_x, T->velocity_offset_y,
+                            T->off_z, 5, 5, T->line, T->slice, tol, regions, ferr);
+
+        free(res);
+        free(ans);
+
+        return err;
+}
diff --git a/tests/topography/accuracy/topography_test.h b/tests/topography/accuracy/topography_test.h
new file mode 100644
index 0000000..4dc2c99
--- /dev/null
+++ b/tests/topography/accuracy/topography_test.h
@@ -0,0 +1,333 @@
+#ifndef TOPOGRAPHY_TEST_H
+#define TOPOGRAPHY_TEST_H
+/* 
+ *  This module is used to test the module `topography`.
+ *  While this module provides a number of tests (see below), only one test can
+ *  run at a time. A test that fails will return an error code `err`. 
+ *  If `err > 0` then the test has failed.
+ *
+ *  The testing procedure works in the following three steps:
+ *  1. The test is initialized, and the particular chosen test is configured.
+ *  This action takes place when calling `topo_test_init()`, and this function
+ *  is called before entering the time stepping loop.
+ *  For example, `TOPO_TEST_CONSTX` enables a test that checks if a constant
+ *  function is correctly handled. The initialization step calls a CUDA kernel
+ *  that will initialize one or more of the device arrays `u1, v1, w1, etc`. In
+ *  this case, the kernel launch function is named `topo_test_const_H`.
+ *
+ *  2. Outside code then runs and modifies the device arrays as desired. To
+ *  mimic the behavior of the velocity communication and computation, the
+ *  functions `topo_test_velfront` and `topo_test_velback` are called when the
+ *  compute kernels for the front and back parts of the velocity field are
+ *  called. Similarly, `topo_test_velx` is called when the kernel for the
+ *  interior computation of the velocity field takes place.
+ *
+ *  3. The test is finalized by copying the device data to host and comparing
+ *  the result to an a priori known answer. The l-2 norm is used for the
+ *  comparison. This action takes place when calling `topo_test_finalize` and is
+ *  executed after the time stepping loop has completed.
+ *
+ */ 
+
+
+// TOPO_TEST: Enable testing
+#ifndef TOPO_TEST
+#define TOPO_TEST 0
+#endif 
+
+/*
+ * Tests to run (choose only one).
+ * These tests can be activated by specifying the flag `-D` and
+ * variable name. For example,`-DTOPO_TEST_CONSTX=1` will run the test
+ * `TOPO_TEST_CONSTX` (see below for available tests). 
+ *
+ * Description:
+ *      TOPO_TEST_CONSTX: Check that a constant function is handled correctly
+ *      with communication in the x-direction.
+ *
+ *      TOPO_TEST_CONSTY: Check that a constant function is handled correctly
+ *      with communication in the y-direction.
+ *
+ *      TOPO_TEST_LINX: Check that a linear function is handled correctly
+ *      with communication in the x-direction.
+ *
+ *      TOPO_TEST_DIFFCONSTX: Differentiate a constant function in the
+ *      x-direction on the device and check that the zero function is produced
+ *      on the host.
+ *
+ *      TOPO_TEST_DIFFCONSTY: Differentiate a constant function in the
+ *      y-direction on the device and check that the zero function is produced
+ *      on the host.
+ *
+ *      TOPO_TEST_DIFFCONSTZ: Differentiate a constant function in the
+ *      z-direction on the device and check that the zero function is produced
+ *      on the host. This test includes stencils for the top boundary.
+ *
+ *      TOPO_TEST_DIFFLINX: Differentiate a linear function in the x-direction
+ *      on the device and check that the correct constant function is produced
+ *      on the host.
+ *
+ *      TOPO_TEST_DIFFLINY: Differentiate a linear function in the y-direction
+ *      on the device and check that the correct constant function is produced
+ *      on the host.
+ *
+ *      TOPO_TEST_DIFFLINZ: Differentiate a linear function in the
+ *      z-direction on the device and check that the correct constant function
+ *      is produced on the host. This test includes stencils for the top
+ *      boundary.
+ *
+ *      TOPO_TEST_DIFFQUADX: Differentiate a quadratic function in the
+ *      x-direction on the device and check that the correct linear function is
+ *      produced on the host.
+ *
+ *      TOPO_TEST_DIFFQUADY: Differentiate a quadratic function in the
+ *      y-direction on the device and check that the correct linear function is
+ *      produced on the host.
+ *
+ *      TOPO_TEST_DIFFQUADZ: Differentiate a quadratic function in the
+ *      z-direction on the device and check that the correct linear function is
+ *      produced on the host.
+ *
+ *      TOPO_TEST_VELCONST: Test the topography velocity kernel by using a
+ *      constant function. 
+ *
+ *      TOPO_TEST_VELLINX: Test the topography velocity kernel by using a linear
+ *      function in the x-direction. 
+ *
+ *      TOPO_TEST_VELLINY: Test the topography velocity kernel by using a linear
+ *      function in the y-direction. 
+ *
+ *      TOPO_TEST_VELLINZ: Test the topography velocity kernel by using a linear
+ *      function in the z-direction. 
+ *
+ *      TOPO_TEST_VELQUADX: Test the topography velocity kernel by using a
+ *      quadratic function in the x-direction. 
+ *
+ *      TOPO_TEST_VELQUADY: Test the topography velocity kernel by using a
+ *      quadratic function in the y-direction. 
+ *
+ *      TOPO_TEST_VELQUADZ: Test the topography velocity kernel by using a
+ *      quadratic function in the z-direction. 
+ *
+ *      TOPO_TEST_STRCONST: Test the topography stress kernel by using a
+ *      constant function. 
+ * 
+ *      TOPO_TEST_STRLINX: Test the topography stress kernel by using a linear
+ *      function in the x-direction. 
+ * 
+ *      TOPO_TEST_STRLINY: Test the topography stress kernel by using a linear
+ *      function in the y-direction. 
+ *
+ *      TOPO_TEST_STRLINZ: Test the topography stress kernel by using a linear
+ *      function in the z-direction. 
+ *
+ *      TOPO_TEST_STRQUADX: Test the topography stress kernel by using a
+ *      quadratic function in the x-direction. 
+ *
+ *      TOPO_TEST_STRQUADY: Test the topography stress kernel by using a
+ *      quadratic function in the y-direction. 
+ *
+ *      TOPO_TEST_STRQUADZ: Test the topography stress kernel by using a
+ *      quadratic function in the z-direction. 
+ *
+ *      TOPO_TEST_VELFRONTBACK: Test the topography front and back velocity
+ *      kernels by using a quadratic function in the y-direction. 
+ */
+#ifndef TOPO_TEST_CONSTX
+#define TOPO_TEST_CONSTX 0
+#endif
+
+#ifndef TOPO_TEST_CONSTY
+#define TOPO_TEST_CONSTY 0
+#endif
+
+#ifndef TOPO_TEST_LINX
+#define TOPO_TEST_LINX 0
+#endif
+
+#ifndef TOPO_TEST_DIFFCONSTX
+#define TOPO_TEST_DIFFCONSTX 0
+#endif
+
+#ifndef TOPO_TEST_DIFFCONSTY
+#define TOPO_TEST_DIFFCONSTY 0
+#endif
+
+#ifndef TOPO_TEST_DIFFCONSTZ
+#define TOPO_TEST_DIFFCONSTZ 0
+#endif
+
+#ifndef TOPO_TEST_DIFFLINX
+#define TOPO_TEST_DIFFLINX 0
+#endif
+
+#ifndef TOPO_TEST_DIFFLINY
+#define TOPO_TEST_DIFFLINY 0
+#endif
+
+#ifndef TOPO_TEST_DIFFLINZ
+#define TOPO_TEST_DIFFLINZ 0
+#endif
+
+#ifndef TOPO_TEST_DIFFQUADX
+#define TOPO_TEST_DIFFQUADX 0
+#endif
+
+#ifndef TOPO_TEST_DIFFQUADY
+#define TOPO_TEST_DIFFQUADY 0
+#endif
+
+#ifndef TOPO_TEST_DIFFQUADZ
+#define TOPO_TEST_DIFFQUADZ 0
+#endif
+
+#ifndef TOPO_TEST_VELCONST
+#define TOPO_TEST_VELCONST 0
+#endif
+
+#ifndef TOPO_TEST_VELLINX
+#define TOPO_TEST_VELLINX 0
+#endif
+
+#ifndef TOPO_TEST_VELLINY
+#define TOPO_TEST_VELLINY 0
+#endif
+
+#ifndef TOPO_TEST_VELLINZ
+#define TOPO_TEST_VELLINZ 0
+#endif
+
+#ifndef TOPO_TEST_VELQUADX
+#define TOPO_TEST_VELQUADX 0
+#endif
+
+#ifndef TOPO_TEST_VELQUADY
+#define TOPO_TEST_VELQUADY 0
+#endif
+
+#ifndef TOPO_TEST_VELQUADZ
+#define TOPO_TEST_VELQUADZ 0
+#endif
+
+#ifndef TOPO_TEST_VELFRONTBACK
+#define TOPO_TEST_VELFRONTBACK 0
+#endif
+
+#ifndef TOPO_TEST_STRCONST
+#define TOPO_TEST_STRCONST 0
+#endif
+
+#ifndef TOPO_TEST_STRLINX
+#define TOPO_TEST_STRLINX 0
+#endif
+
+#ifndef TOPO_TEST_STRLINY
+#define TOPO_TEST_STRLINY 0
+#endif
+
+#ifndef TOPO_TEST_STRLINZ
+#define TOPO_TEST_STRLINZ 0
+#endif
+
+#ifndef TOPO_TEST_STRQUADX
+#define TOPO_TEST_STRQUADX 0
+#endif
+
+#ifndef TOPO_TEST_STRQUADY
+#define TOPO_TEST_STRQUADY 0
+#endif
+
+#ifndef TOPO_TEST_STRQUADZ
+#define TOPO_TEST_STRQUADZ 0
+#endif
+
+#define TOPO_TEST_TOLERANCE 1e-6
+
+#include "functions.h"
+#include "grid_check.h"
+#include "topography.h"
+
+typedef struct {  // integer (x, y, z) triple
+        int x;    // first component
+        int y;    // second component
+        int z;    // third component
+} xyz;
+
+typedef struct {  // test state: returned by topo_test_init(), passed to each test
+        int use;
+        _prec tol;        // tolerance argument for the topo_test_*_fcn helpers
+        _prec coef[3];    // NOTE(review): presumably per-direction coefficients
+        _prec deg[3];     // NOTE(review): presumably per-direction degrees
+        _prec *out;
+        _prec *velf;
+        _prec *velb;
+        _prec *in;
+        int out_shift[3];
+        int in_shift[3];
+        _prec cxx[3];
+        _prec cyy[3];
+        _prec czz[3];
+        _prec cxy[3];
+        _prec cxz[3];
+        _prec cyz[3];
+        _prec cu1[3];
+        _prec cv1[3];
+        _prec cw1[3];
+} topo_test_t;
+
+topo_test_t topo_test_init(topo_t *T);
+void topo_test_velfront(topo_test_t *Tt, topo_t *T);
+void topo_test_velback(topo_test_t *Tt, topo_t *T);
+void topo_test_velx(const topo_test_t *Tt, topo_t *T);
+void topo_test_stress(const topo_test_t *Tt, topo_t *T);
+void topo_test_stress_interior(const topo_test_t *Tt, topo_t *T);
+void topo_test_stress_sides(const topo_test_t *Tt, topo_t *T);
+int topo_test_finalize(const topo_test_t *Tt, topo_t *T);
+
+// Tests
+int topo_test_constx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_consty(const topo_test_t *Tt, const topo_t *T);
+int topo_test_linx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_liny(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffconstx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffconsty(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffconstz(const topo_test_t *Tt, const topo_t *T);
+int topo_test_difflinx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffliny(const topo_test_t *Tt, const topo_t *T);
+int topo_test_difflinz(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffquadx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffquady(const topo_test_t *Tt, const topo_t *T);
+int topo_test_diffquadz(const topo_test_t *Tt, const topo_t *T);
+int topo_test_velconst(const topo_test_t *Tt, const topo_t *T);
+int topo_test_vellinx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_velliny(const topo_test_t *Tt, const topo_t *T);
+int topo_test_vellinz(const topo_test_t *Tt, const topo_t *T);
+int topo_test_velquadx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_velquady(const topo_test_t *Tt, const topo_t *T);
+int topo_test_velquadz(const topo_test_t *Tt, const topo_t *T);
+int topo_test_velfrontback(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strconst(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strlinx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strliny(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strlinz(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strquadx(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strquady(const topo_test_t *Tt, const topo_t *T);
+int topo_test_strquadz(const topo_test_t *Tt, const topo_t *T);
+
+// Test helper functions
+int topo_test_fcn(fcnp fp, const topo_t *T, const _prec *dres, const _prec tol,
+                  const _prec *args, const int *regions, _prec *ferr);
+
+int topo_test_stress_fcn(fcnp fp, check_fun check_fp,
+                         const topo_t *T, const _prec *dres,
+                         const _prec tol, 
+                         const _prec *args,
+                         const int *regions, _prec *ferr);
+
+int topo_test_velocity_fcn(fcnp fp, check_fun check_fp,
+                         const topo_t *T, const _prec *dres,
+                         const _prec tol, 
+                         const _prec *args,
+                         const int *regions, _prec *ferr);
+
+#endif
diff --git a/tests/topography/geometry/CMakeLists.txt b/tests/topography/geometry/CMakeLists.txt
index efb6058..05b9e16 100644
--- a/tests/topography/geometry/CMakeLists.txt
+++ b/tests/topography/geometry/CMakeLists.txt
@@ -11,7 +11,7 @@ target_link_libraries(test_topography_geometry
 
 target_include_directories(test_topography_geometry
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_topography_geometry 
diff --git a/tests/topography/geometry/test_geometry.c b/tests/topography/geometry/test_geometry.c
index 4261863..a7ddf51 100644
--- a/tests/topography/geometry/test_geometry.c
+++ b/tests/topography/geometry/test_geometry.c
@@ -53,7 +53,7 @@ void test_gaussian(_prec **x, _prec **y, _prec **z, const int write_vtk,
         _prec h = gridspacing;
         int gsize[3] = {128, 128, 32};
 
-        f_grid_t metrics_f = metrics_init_f(gsize, gridspacing);
+        f_grid_t metrics_f = metrics_init_f(gsize, gridspacing, 8);
         g_grid_t metrics_g = metrics_init_g(gsize, gridspacing);
 
         int3_t shift = grid_xx();
@@ -119,7 +119,7 @@ void test_incline_plane(const int write_vtk, const int rank)
 
         _prec gridspacing = 1.0 / (gsize[2] - 2);
 
-        f_grid_t metrics_f = metrics_init_f(gsize, gridspacing);
+        f_grid_t metrics_f = metrics_init_f(gsize, gridspacing, 8);
         g_grid_t metrics_g = metrics_init_g(gsize, gridspacing);
 
         int3_t shift = grid_u3();
@@ -149,9 +149,9 @@ void test_incline_plane(const int write_vtk, const int rank)
         _prec *y1 = malloc(sizeof(y1) * y1_grid.size);
         _prec *z1 = malloc(sizeof(z1) * z1_grid.size);
 
-        grid_fill1(x1, x1_grid);
-        grid_fill1(y1, y1_grid);
-        grid_fill1(z1, z1_grid);
+        grid_fill1(x1, x1_grid, 1);
+        grid_fill1(y1, y1_grid, 0);
+        grid_fill1(z1, z1_grid, 0);
 
         _prec *x = malloc(topography_grid.num_bytes);
         _prec *y = malloc(topography_grid.num_bytes);
diff --git a/tests/topography/mapping/CMakeLists.txt b/tests/topography/mapping/CMakeLists.txt
new file mode 100644
index 0000000..81202b7
--- /dev/null
+++ b/tests/topography/mapping/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Test executable built from test_mapping.c and registered with CTest.
+add_executable(test_mapping test_mapping.c)
+
+target_link_libraries(test_mapping
+        PRIVATE
+        topography
+        )
+
+target_include_directories(test_mapping
+        PRIVATE
+        "${AWP_SOURCE_DIR}/include/"
+        )
+
+add_test(NAME test_mapping COMMAND test_mapping)
diff --git a/tests/topography/mapping/test_mapping.c b/tests/topography/mapping/test_mapping.c
new file mode 100644
index 0000000..0221b08
--- /dev/null
+++ b/tests/topography/mapping/test_mapping.c
@@ -0,0 +1,60 @@
+/* NDEBUG must be cleared BEFORE <assert.h> is included, otherwise assert()
+ * expands to nothing in release builds and every check below is skipped. */
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <assert.h>
+
+void test_convergence(const double dzb, const double dzt, const int n, const double eps);
+/* For the n points z = k / (n - 1), k = 0..n-1, asserts that
+ * map_eval(map_invert(z)) reproduces z to within eps. */
+void test_convergence(const double dzb, const double dzt, const int n, const double eps) {
+    printf("Testing dzb=%f, dzt=%f n=%d eps=%f \n", dzb, dzt, n, eps);
+    const double spacing = 1.0 / (n - 1);
+    struct mapping m = map_init(dzb, dzt, spacing);
+    for (int k = 0; k < n; ++k) {
+        const double target = k * spacing;
+        const double recovered = map_eval(map_invert(target, &m, 0.5 * eps, 10000), &m);
+        assert(fabs(recovered - target) < eps);
+    }
+}
+
+/* Exercises the mapping built with dzb = dzt = h first, then runs
+ * test_convergence for several spacing / grid-size combinations. */
+int main(int argc, char **argv)
+{
+    const double eps = 1e-4;
+    const int n = 4;
+    const double h = 1.0 / (n - 1);
+
+    // Check that if the mapping is linear then, z = r
+    struct mapping map = map_init(h, h, h);
+    assert(map_find_cell_r(0.2 * h, &map) == 0);
+    assert(map_find_cell_r(1.1 * h, &map) == 1);
+    assert(map_find_cell_r(1.0 - 0.5 * h, &map) == 2);
+    for (int k = 0; k < n; ++k) {
+        const double r = k * h;
+        assert(fabs(r - map_eval(r, &map)) < eps * h);
+    }
+    for (int k = 0; k < n; ++k) {
+        const double r = k * h;
+        assert(fabs(r - map_invert(r, &map, 0.5 * eps, 1000)) < eps);
+    }
+
+    // {dzb, dzt, n} triples handed to test_convergence, in this order
+    const struct { double dzb, dzt; int n; } cases[] = {
+        {0.1, 0.01, 11},    {0.1, 0.1, 10},      {1e-2, 0.1, 10},
+        {1e-2, 0.1, 100},   {5e-3, 0.1, 100},    {0.01, 0.1, 1000},
+        {1e-2, 1e-2, 1000}, {1e-4, 1e-5, 10000}, {1e-3, 1e-6, 10000},
+    };
+    const int ncases = (int)(sizeof cases / sizeof cases[0]);
+    for (int c = 0; c < ncases; ++c) {
+        test_convergence(cases[c].dzb, cases[c].dzt, cases[c].n, eps);
+    }
+
+    return 0;
+}
+
diff --git a/tests/topography/metrics/CMakeLists.txt b/tests/topography/metrics/CMakeLists.txt
index b9794a0..2b8398d 100644
--- a/tests/topography/metrics/CMakeLists.txt
+++ b/tests/topography/metrics/CMakeLists.txt
@@ -9,7 +9,7 @@ target_link_libraries(test_metrics
 
 target_include_directories(test_metrics
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_metrics COMMAND test_metrics)
diff --git a/tests/topography/metrics/test_metrics.c b/tests/topography/metrics/test_metrics.c
index defd10a..cd533a7 100644
--- a/tests/topography/metrics/test_metrics.c
+++ b/tests/topography/metrics/test_metrics.c
@@ -77,7 +77,7 @@ f_grid_t test_f_init(test_t *test)
 {
         int size[3] = {32, 16, 1};
         _prec gridspacing = 1.0;
-        f_grid_t out = metrics_init_f(size, gridspacing);
+        f_grid_t out = metrics_init_f(size, gridspacing, 8);
         assert(out.size[0] == size[0]);
         assert(out.size[1] == size[1]);
         assert(out.size[2] == size[2]);
diff --git a/tests/topography/readers/CMakeLists.txt b/tests/topography/readers/CMakeLists.txt
index 0899b93..8356d1c 100644
--- a/tests/topography/readers/CMakeLists.txt
+++ b/tests/topography/readers/CMakeLists.txt
@@ -11,7 +11,7 @@ target_link_libraries(test_topography_serial_reader
 
 target_include_directories(test_topography_serial_reader
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_topography_serial_reader COMMAND
diff --git a/tests/topography/readers/test_serial_reader.c b/tests/topography/readers/test_serial_reader.c
index b71857a..234506b 100644
--- a/tests/topography/readers/test_serial_reader.c
+++ b/tests/topography/readers/test_serial_reader.c
@@ -13,8 +13,8 @@
 #include <topography/geometry/geometry.h>
 #include <topography/metrics/metrics.h>
 
-void init_geometry(prec **f, const int *gsize, const int3_t coord,
-                   const int rank, int px, int py);
+void init_global_grid(prec **f, const int *gsize);
+void init_local_grid(prec **f, const int *lsize, const int3_t coord, const int gmy);
 void write_geometry(const _prec *f, const int *gsize, int rank);
 int test_read_grid(int rank, const _prec *local_f, const int *local_size,
                    const int px, const int py, const int3_t coord);
@@ -30,14 +30,14 @@ int main(int argc, char **argv)
         MPI_Init(&argc, &argv);
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
-        int px = 2; 
-        int py = 3; 
+        int px = 3; 
+        int py = 2; 
         assert(mpi_size == px * py);
 
         int3_t coord = { .x = rank % px, .y = rank / px, .z = 0};
 
         int err = 0;
-        int local_grid[3] = {4, 8, 10};
+        int local_grid[3] = {32, 64, 4};
         int global_grid[3] = {local_grid[0] * px, local_grid[1] * py,
                               local_grid[2]};
         prec * global_f;
@@ -50,11 +50,15 @@ int main(int argc, char **argv)
                 printf("===========================\n");
         }
         if (rank == 0) {
-                init_geometry(&global_f, global_grid, coord, rank, 1, 1);
+                init_global_grid(&global_f, global_grid);
         }
-        init_geometry(&local_f, local_grid, coord, rank, px, py);
+            
+        int gmy = global_grid[1] + 4 + 2 * align + 2 * metrics_padding;
+
+        init_local_grid(&local_f, local_grid, coord, gmy);
         write_geometry(global_f, global_grid, rank);
 
+        MPI_Barrier(MPI_COMM_WORLD);
         err = test_read_grid(rank, local_f, local_grid, px, py, coord);
         if (rank == 0) {
                 free(global_f);
@@ -64,90 +68,65 @@ int main(int argc, char **argv)
         return err;
 }
 
-void init_geometry(prec **f, const int *gsize, const int3_t coord,
-                   const int rank, int px, int py) {
-        _prec gridspacing = 0.1; 
-
-        f_grid_t metrics_f = metrics_init_f(gsize, gridspacing);
-        g_grid_t metrics_g = metrics_init_g(gsize, gridspacing);
-
-        int3_t shift = grid_u3();
-        
-        int3_t size = {gsize[0], gsize[1], gsize[2]};
-
-        int3_t boundary1 = {.x = 0, .y = 0, .z = 0};
-        int3_t boundary2 = {.x = 0, .y = 0, .z = 1};
+void init_global_grid(prec **f, const int *gsize) {
 
-        grid3_t topography_grid = grid_init_metric_grid(
-            size, shift, coord, boundary1, boundary2, gridspacing);
-
-        grid1_t x1_grid = grid_grid1_x(topography_grid);
-        grid1_t y1_grid = grid_grid1_y(topography_grid);
-        grid1_t z1_grid = grid_grid1_z(topography_grid);
+        int nx = gsize[0];
+        int ny = gsize[1];
 
+        int mxp = nx + 2 * metrics_padding;
+        int myp = ny + 2 * metrics_padding;
+        int mx = 4 + mxp;
+        int my = 4 + myp + 2 * align;
+        *f = malloc(mx * my * sizeof(prec)); 
 
-        _prec *x1 = malloc(sizeof(x1) * x1_grid.size);
-        _prec *y1 = malloc(sizeof(y1) * y1_grid.size);
-        _prec *z1 = malloc(sizeof(z1) * z1_grid.size);
+        prec *global_f = *f;
 
-        grid_fill1(x1, x1_grid);
-        grid_fill1(y1, y1_grid);
-        grid_fill1(z1, z1_grid);
+        for (int i = 0; i < mxp; ++i) {
+        for (int j = 0; j < myp; ++j) {
+            size_t pos = align + 2 + j + my * (2 + i);
+            global_f[pos] = j + my * i;
+        }
+        }
+}
 
-        _prec *x = malloc(topography_grid.num_bytes);
-        _prec *y = malloc(topography_grid.num_bytes);
-        _prec *z = malloc(topography_grid.num_bytes);
+void init_local_grid(prec **f, const int *lsize, const int3_t coord, const int gmy) {
 
-        grid_fill3_x(x, x1, topography_grid);
-        grid_fill3_y(y, y1, topography_grid);
-        grid_fill3_z(z, z1, topography_grid);
+        int nx = lsize[0];
+        int ny = lsize[1];
+        int mxp = nx + 2 * metrics_padding;
+        int myp = ny + 2 * metrics_padding;
+        int mx = 4 + mxp;
+        int my = 4 + myp + 2 * align;
+        size_t num_bytes = mx * my * sizeof(prec);
+        *f = malloc(num_bytes); 
 
-         _prec3_t hill_width = {.x = 0.5, .y = 0.5, .z = 0};
-         _prec hill_height = 1.0;
-         _prec3_t hill_center = {.x = 0.0, .y = 0.0, .z = 0};
-         _prec3_t canyon_width = {.x = 0.5, .y = 0.5, .z = 0};
-         _prec canyon_height = 0.0;
-         _prec3_t canyon_center = {.x = 2, .y = 2, .z = 0};
+        prec *local_f = *f;
+        memset(local_f, 0, num_bytes);
 
-        geom_gaussian_hill_and_canyon(&metrics_f, x1, y1, topography_grid, 
-                        hill_width, hill_height, hill_center,
-                        canyon_width, canyon_height, canyon_center,
-                        px, py);
 
-        *f = malloc(metrics_sizeof_f(&metrics_f)); 
-        for (int i = 0; i < metrics_f.mem[0]; ++i) {
-        for (int j = 0; j < metrics_f.mem[1]; ++j) {
-                size_t pos = j + metrics_f.mem[1] * i;
-                (*f)[pos] = metrics_f.f[pos];
+        for (int i = 0; i < mxp; ++i) {
+        for (int j = 0; j < myp; ++j) {
+                size_t pos = align + 2 + j + my * (2 + i);
+                local_f[pos] = (j + ny * coord.y) + gmy * (i + nx * coord.x);
         }
         }
 
-        metrics_build_f(&metrics_f);
-        metrics_build_g(&metrics_g);
-
 
-        free(x);
-        free(y);
-        free(z);
-        free(x1);
-        free(y1);
-        free(z1);
 }
 
 void write_geometry(const prec *f, const int *gsize, int rank) {
         if (rank != 0)
                 return;
-        int padding = ngsl;
         int nx = gsize[0];
         int ny = gsize[1];
-        int mx = nx + 2 * ngsl;
-        int my = ny + 2 * ngsl;
+        int mx = nx + 2 * metrics_padding;
+        int my = ny + 2 * metrics_padding;
         FILE *fh = fopen(geometry_file, "wb");
         float *data;
         data = malloc(sizeof data * mx * my);
         fwrite(&nx, sizeof nx, 1, fh);
         fwrite(&ny, sizeof ny, 1, fh);
-        fwrite(&padding, sizeof padding, 1, fh);
+        fwrite(&metrics_padding, sizeof metrics_padding, 1, fh);
         int slice = 4 + my + 2 * align;
 
         for (int i = 0; i < mx; ++i) {
@@ -159,6 +138,7 @@ void write_geometry(const prec *f, const int *gsize, int rank) {
 
         fwrite(data, sizeof(float), mx * my, fh);
         fclose(fh);
+
 }
 
 int test_read_grid(int rank, const _prec *local_f, const int *local_size, const
@@ -172,24 +152,33 @@ int test_read_grid(int rank, const _prec *local_f, const int *local_size, const
         int lnx, lny;
         lnx = local_size[0];
         lny = local_size[1];
-        int lmy = 4 + lny + 2 * ngsl + 2 * align;
+        int lmx = 4 + lnx + 2 * metrics_padding;
+        int lmy = 4 + lny + 2 * metrics_padding + 2 * align;
         prec *read_f;
         int icoord[2] = {coord.x, coord.y};
         int alloc = 1;
 
+        read_f = malloc(sizeof read_f * lmx * lmy); 
+
+        for (int i = 0; i < lmx * lmy; ++i)
+                read_f[i] = 0.0;
+
         err |= topo_read_serial(geometry_file, rank, px, py, icoord, lnx, lny,
                                 alloc, &read_f);
 
+        MPI_Barrier(MPI_COMM_WORLD);
+
         // Compare data read from file with locally computed data
         float sum = 0;
-        for (int i = 0; i < (lnx + 2 * ngsl); ++i) {
-        for (int j = 0; j < (lny + 2 * ngsl); ++j) {
+            
+        for (int i = 0; i < (lnx + 2 * metrics_padding); ++i) {
+        for (int j = 0; j < (lny + 2 * metrics_padding); ++j) {
                 size_t local_pos = 2 + align + j + (i + 2) * lmy;
-                sum += fabs(read_f[local_pos] - local_f[local_pos]); 
+                double val = fabs(read_f[local_pos] - local_f[local_pos]);
+                sum += val; 
         }
         }
 
-
         free(read_f);
         remove(geometry_file);
 
diff --git a/tests/topography/receivers/CMakeLists.txt b/tests/topography/receivers/CMakeLists.txt
index 13f415c..c2f9fd2 100644
--- a/tests/topography/receivers/CMakeLists.txt
+++ b/tests/topography/receivers/CMakeLists.txt
@@ -9,13 +9,13 @@ target_link_libraries(test_topography_receivers
 
 target_include_directories(test_topography_receivers
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 add_test(NAME test_topography_receivers COMMAND
         ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 --oversubscribe
         test_topography_receivers
-        ${AWP_MINI_SOURCE_DIR}/tests/fixtures/receiver.txt)
+        ${AWP_SOURCE_DIR}/tests/fixtures/receiver.txt)
 
 
 
diff --git a/tests/topography/receivers/test_receivers.c b/tests/topography/receivers/test_receivers.c
index 6332199..cb34db7 100644
--- a/tests/topography/receivers/test_receivers.c
+++ b/tests/topography/receivers/test_receivers.c
@@ -96,7 +96,7 @@ int test_receivers(const char *inputfile, int rank, int size, const int px)
         test_t test;
 
         test = test_init(" * receivers_init", rank, size);
-        receivers_init(inputfile, grids, ngrids, NULL, MPI_COMM_WORLD, rank, size);
+        receivers_init(inputfile, grids, NULL, ngrids, NULL, MPI_COMM_WORLD, rank, size);
         err = test_finalize(&test, err);
         
         test = test_init(" * receivers_write", rank, size);
diff --git a/tests/topography/sources/CMakeLists.txt b/tests/topography/sources/CMakeLists.txt
index 552f84d..7af1617 100644
--- a/tests/topography/sources/CMakeLists.txt
+++ b/tests/topography/sources/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_executable(test_topography_sources test_sources.c)
 add_executable(test_topography_sources_dm test_sources_dm.c)
+add_executable(test_topography_source_distribution test_source_distribution.c)
 
 target_link_libraries(test_topography_sources 
         topography
@@ -15,19 +16,38 @@ target_link_libraries(test_topography_sources_dm
         testing
         )
 
+target_link_libraries(test_topography_source_distribution
+        topography
+        buffers
+        ${MPI_C_LIBRARIES}
+        testing
+        )
+
 target_include_directories(test_topography_sources
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
 target_include_directories(test_topography_sources_dm
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
+target_include_directories(test_topography_source_distribution
+        PUBLIC
+        ${AWP_SOURCE_DIR}/include/
+        )
+
+add_test(NAME test_topography_sources_dm COMMAND
+        test_topography_sources_dm)
+
+add_test(NAME test_topography_source_distribution COMMAND
+        test_topography_source_distribution)
+
 add_test(NAME test_topography_sources COMMAND
         ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 --oversubscribe
         test_topography_sources
-        ${AWP_MINI_SOURCE_DIR}/tests/fixtures/source.txt)
+        ${AWP_SOURCE_DIR}/tests/fixtures/source.txt)
+
 
 
diff --git a/tests/topography/sources/test_source_distribution.c b/tests/topography/sources/test_source_distribution.c
new file mode 100644
index 0000000..7f04139
--- /dev/null
+++ b/tests/topography/sources/test_source_distribution.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <mpi.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define STR_LEN 2048
+#define ADDLINENUM 1
+#define ADDRANK 1
+#define RANK rank
+#define STR_LEN 2048
+
+#include <awp/definitions.h>
+#include <test/test.h>
+#include <awp/error.h>
+#include <utils/array.h>
+#include <grid/shift.h>
+#include <topography/grids.h>
+#include <grid/grid_3d.h>
+#include <topography/sources/source.h>
+#include <topography/sources/sources.h>
+#include <topography/receivers/receivers.h>
+#include <readers/input.h>
+
+void init(float **x, float **y, int nx, int ny, int px, int py, float h);
+int inbounds(int nx, int ny, int blocknum, int px, int py, float h, int degree, const enum source_type st);
+
+int main(int argc, char **argv)
+{
+        // Driver: prints the bounds diagnostics of a RECEIVER query point for
+        // px = 0 and px = 1 and reports the combined status to the caller.
+        test_divider();
+        printf("Testing test_source_distribution.c\n");
+
+        int nx = 32;
+        int ny = 32;
+        float h = 1.0f;
+        int degree = 3;
+
+        int err = 0;
+        err |= inbounds(nx, ny, 0, 0, 0, h, degree, RECEIVER);
+        err |= inbounds(nx, ny, 0, 1, 0, h, degree, RECEIVER);
+
+        // Hand the status to CTest instead of always exiting with 0.
+        return err;
+}
+
+
+int inbounds(int nx, int ny, int blocknum, int px, int py, float h, int degree, const enum source_type st) {
+        // Diagnostic for block (px, py): builds the block's x/y coordinates, picks
+        // a query point near the high-index end of the block and prints it next to
+        // the velocity and stress bounds. `blocknum` is unused; always returns 0.
+        int err = 0;
+        float *x, *y;
+        init(&x, &y, nx, ny, px, py, h);
+
+        // Extra margin subtracted from the query point, per source type.
+        int overlap = 0;
+        switch(st) {
+            case MOMENT_TENSOR:
+            case FORCE:
+                overlap = 2;
+                break;
+            case RECEIVER:
+            case SGT:
+            default:
+                overlap = 0;
+                break;
+        }
+
+        int half_width = (degree + 1) / 2;
+        float qx, qy;
+
+        // inbounds
+        {
+            qx = (nx - half_width - 1 - overlap) * h;
+            qy = (ny - half_width - 1 - overlap) * h;
+            printf("Query point: (%g, %g), \n", qx, qy);
+            printf("x = %g %g %g, y = %g %g %g \n", x[0], x[1], x[2], y[0], y[1], y[2]);
+            printf("Velocity bounds. In bounds if %g <= qx <= %g, %g <= qy <= %g \n", 
+                    x[2 + ngsl], x[2 + ngsl + nx - 1],
+                    y[2 + ngsl], y[2 + ngsl + ny - 1]);
+            // Upper stress index uses 3 * ngsl / 2: multiply before dividing,
+            // because `3 / 2 * ngsl` truncates to `1 * ngsl` in integer math.
+            printf("Stress bounds. In bounds if %g <= qx <= %g, %g <= qy <= %g \n", 
+                    x[2 + ngsl / 2], x[2 + 3 * ngsl / 2 + nx - 1],
+                    y[2 + ngsl / 2], y[2 + 3 * ngsl / 2 + ny - 1]);
+        }
+
+        free(x);
+        free(y);
+
+        return err;
+}
+
+
+void init(float **x, float **y, int nx, int ny, int px, int py, float h) {
+        // Builds a grid of size (nx, ny, 1) at coordinate (px, py, 0), forwarding
+        // h to grid_init, and returns its x/y arrays; caller must free() *x, *y.
+        int3_t extent = {nx, ny, 1};
+        int3_t no_shift = {0, 0, 0};
+        int3_t block = {px, py, 0};
+        int3_t lower = {0, 0, 0};
+        int3_t upper = {0, 0, 0};
+        grid3_t g = grid_init(extent, no_shift, block, lower, upper, ngsl + 2, h);
+        grid1_t gx = grid_grid1_x(g);
+        grid1_t gy = grid_grid1_y(g);
+        *x = malloc(sizeof(float) * gx.size);
+        *y = malloc(sizeof(float) * gy.size);
+        grid_fill1(*x, gx, 1);
+        grid_fill_y_dm(*y, gy, 0);
+}
+
diff --git a/tests/topography/sources/test_sources.c b/tests/topography/sources/test_sources.c
index 2e08a4c..330a463 100644
--- a/tests/topography/sources/test_sources.c
+++ b/tests/topography/sources/test_sources.c
@@ -76,7 +76,7 @@ int test_sources(const char *inputfile, int rank, int size, const int px)
         test_t test;
 
         test = test_init(" * sources_init", rank, size);
-        sources_init(inputfile, grids, ngrids, NULL, MPI_COMM_WORLD, rank, size);
+        sources_init(inputfile, grids, NULL, ngrids, NULL, NULL, MPI_COMM_WORLD, rank, size);
         err = test_finalize(&test, err);
 
         input_t input;
diff --git a/tests/topography/sources/test_sources_dm.c b/tests/topography/sources/test_sources_dm.c
index 1520827..596f409 100644
--- a/tests/topography/sources/test_sources_dm.c
+++ b/tests/topography/sources/test_sources_dm.c
@@ -16,10 +16,13 @@
 #include <test/test.h>
 #include <awp/error.h>
 #include <utils/array.h>
+#include <grid/shift.h>
+#include <topography/grids.h>
 #include <topography/sources/source.h>
 #include <topography/sources/sources.h>
+#include <topography/receivers/receivers.h>
 #include <readers/input.h>
-int test_sources_dm(const char *inputfile, int rank, int size, const int px);
+int test_sources_dm(const char *inputfile, int rank, int size, const int px, const enum grid_types grid, const enum eshift grid2, const int3_t src1, const int3_t src2, const int3_t src3, int run_test);
 
 int main(int argc, char **argv)
 {
@@ -29,46 +32,71 @@ int main(int argc, char **argv)
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-        char inputfile[STR_LEN];
+
         int px = 2;
 
-        if (argc == 2) {
-                assert(strlen(argv[1]) < STR_LEN);
-                sprintf(inputfile, "%s", argv[1]);
-        }
-        else {
-                sprintf(inputfile, "../tests/fixtures/source_dm.txt");
-        }
 
-        if (rank == 0) {
-                test_divider();
-                printf("Testing test_sources_dm.c\n");
-        }
+        int3_t src1 = {9, 9, 0};
+        int3_t src2 = {3, 3, 0};
+        int3_t src3 = {1, 1, 0};
 
-        test_sources_dm(inputfile, rank, size, px);
+        if (argc == 2) {
+            printf("Source input file: %s \n", argv[1]);
+            test_sources_dm(argv[1], rank, size, px, XY, GRID_XY, src1, src2, src3, 0);
+            test_sources_dm(argv[1], rank, size, px, XZ, GRID_XZ, src1, src2, src3, 0);
+            test_sources_dm(argv[1], rank, size, px, YZ, GRID_YZ, src1, src2, src3, 0);
+            test_sources_dm(argv[1], rank, size, px, XX, GRID_XX, src1, src2, src3, 0);
+            test_sources_dm(argv[1], rank, size, px, X, GRID_U1, src1, src2, src3, 0);
+            test_sources_dm(argv[1], rank, size, px, Y, GRID_U2, src1, src2, src3, 0);
+            test_sources_dm(argv[1], rank, size, px, Z, GRID_U3, src1, src2, src3, 0);
+        } else {
+
+            if (rank == 0) {
+                    test_divider();
+                    printf("Testing test_sources_dm.c\n");
+            }
+
+            test_sources_dm("source_xy.txt", rank, size, px, XY, GRID_XY, src1, src2, src3, 1);
+            test_sources_dm("source_xz.txt", rank, size, px, XZ, GRID_XZ, src1, src2, src3, 1);
+            test_sources_dm("source_yz.txt", rank, size, px, YZ, GRID_YZ, src1, src2, src3, 1);
+            test_sources_dm("source_xx.txt", rank, size, px, XX, GRID_XX, src1, src2, src3, 1);
+            test_sources_dm("source_x.txt", rank, size, px, X, GRID_U1, src1, src2, src3, 1);
+            test_sources_dm("source_y.txt", rank, size, px, Y, GRID_U2, src1, src2, src3, 1);
+            test_sources_dm("source_z.txt", rank, size, px, Z, GRID_U3, src1, src2, src3, 1);
+
+            if (rank == 0) {
+                    printf("Testing completed.\n");
+                    test_divider();
+            }
 
-        if (rank == 0) {
-                printf("Testing completed.\n");
-                test_divider();
         }
 
+
         MPI_Finalize();
 
         return test_last_error();
 }
 
-int test_sources_dm(const char *inputfile, int rank, int size, const int px) 
+int test_sources_dm(const char *sourcefile, int rank, int size, const int px, const enum grid_types grid, const enum eshift grid2, const int3_t src1, const int3_t src2, const int3_t src3, int run_test)
 {
+ 
+        char inputfile[STR_LEN];
+        if (run_test)
+            sprintf(inputfile, "../../../../tests/fixtures/%s", sourcefile);
+        else 
+            sprintf(inputfile, "%s", sourcefile);
+                                       
         int coord_x = rank / px;
         int coord_y = rank % px;
         int coord_z = 0;
         // Grid points on the coarse grid
         int nx = 11;
         int ny = 11;
-        int nz = 11;
+        int nz = 11;;
         prec h = 1.0;
         int ngrids = 3;
         int err = 0;
+        int nzs[3] = {nz, nz, nz};
         grids_t grids[3] = {
             grids_init(9 * nx, 9 * ny, nz, coord_x, coord_y, coord_z, 0,
                        h),
@@ -79,18 +107,31 @@ int test_sources_dm(const char *inputfile, int rank, int size, const int px)
 
         test_t test;
 
-        test = test_init(" * sources_dm", rank, size);
-        sources_init(inputfile, grids, ngrids, NULL, MPI_COMM_WORLD, rank, size);
-        source_t Mxx = sources_get_source(XX);
+        char testname[STR_LEN];
+        sprintf(testname, " * sources_dm: %s", grid_shift_label(grid2));
+
+        if (run_test) test = test_init(testname, rank, size);
+        source_t M;
+        if (grid2 == GRID_U1 || grid2 == GRID_U2 || grid2 == GRID_U3) {
+            receivers_init(inputfile, grids, NULL, ngrids, NULL, MPI_COMM_WORLD, rank, size);
+            M = receivers_get_receiver(grid);
+        }
+        else {
+            sources_init(inputfile, grids, NULL, ngrids, NULL, NULL, MPI_COMM_WORLD, rank, size);
+            M = sources_get_source(grid);
+        }
 
         for (size_t i = 0; i < (size_t)ngrids; ++i) {
-                grid3_t xx = grids[i].xx;
-                printf("   - Grid: %ld, grid spacing: %g \n", i, xx.gridspacing); 
-                for (size_t j = 0; j < Mxx.lengths[i]; ++j) {
+                grid3_t x = grids_select(grid, &grids[i]);
+
+                if (!run_test) {
+                    printf("   - Grid: %ld, grid spacing: %g \n", i, x.gridspacing); 
+                }
+                for (size_t j = 0; j < M.lengths[i]; ++j) {
 
                 grid3_t vel_grid = grid_init_velocity_grid(
-                                   xx.inner_size, xx.shift, xx.coordinate,
-                                   xx.boundary1, xx.boundary2, xx.gridspacing);
+                                   x.inner_size, x.shift, x.coordinate,
+                                   x.boundary1, x.boundary2, x.gridspacing);
                 grid1_t x_grid = grid_grid1_x(vel_grid);
                 grid1_t y_grid = grid_grid1_y(vel_grid);
                 grid1_t z_grid = grid_grid1_z(vel_grid);
@@ -99,9 +140,9 @@ int test_sources_dm(const char *inputfile, int rank, int size, const int px)
                 prec *y1 = malloc(sizeof y1 * y_grid.size);
                 prec *z1 = malloc(sizeof z1 * z_grid.size);
 
-                grid_fill1(x1, x_grid);
-                grid_fill1(y1, y_grid);
-                grid_fill1(z1, z_grid);
+                grid_fill1(x1, x_grid, 1);
+                grid_fill_y_dm(y1, y_grid, i);
+                grid_fill1(z1, z_grid, 0);
                 // The user coordinate system (user) defines (0, 0, 0) at
                 // material grid point and is a global coordinate system (a
                 // single coordinate system defined for all blocks, irrespective
@@ -112,37 +153,46 @@ int test_sources_dm(const char *inputfile, int rank, int size, const int px)
                 //
                 // However, Mxx.x, Mxx.y, Mxx.z contains the coordinates of the
                 // source at a normal stress position in the internal
-                // coordinates system that shifts by -0.5 * grid spacings in the 
-                // y and z-directions (see shift.c, xx = [0, 1, 1]), 
-                // but with adjustments to the x-direction,
-                // and y-directions due to the DM and due having (0, 0, 0) at a
-                // material point in the user coordinate system.
+                // coordinates system that shifts by (0.5 h, -0.5 h, -0.5 h)
+                // (see shift.c, xx = [1, 1, 1]), 
                 //
                 //
-                int ix = Mxx.interpolation[i].ix[j] - ngsl;
-                int iy = Mxx.interpolation[i].iy[j] - ngsl;
-                int iz = Mxx.interpolation[i].iz[j];
+                int ix = M.interpolation[i].ix[j] - ngsl - 2;
+                int iy = M.interpolation[i].iy[j] - ngsl - 2;
+                int iz = M.interpolation[i].iz[j];
                 
                 // Once setup has been confirmed, we can add some test cases to
                 // ensure that we don't break this configuration in the future.
-                if (i == 0) err |= s_assert(ix == 4);
-                if (i == 1) err |= s_assert(ix == 1);
-                if (i == 2) err |= s_assert(ix == 0);
+                if (run_test) {
+                    if (i == 0) err |= s_assert(ix == src1.x);
+                    if (i == 1) err |= s_assert(ix == src2.x);
+                    if (i == 2) err |= s_assert(ix == src3.x);
+
+                    if (i == 0) err |= s_assert(iy == src1.y);
+                    if (i == 1) err |= s_assert(iy == src2.y);
+                    if (i == 2) err |= s_assert(iy == src3.y);
+                }
 
+                // Check that the global z-coordinate maps to the correct local z-coordinate
+                _prec zloc;
+                int block_index;
+                global_to_local(&zloc, &block_index, M.zu[i][j],
+                     1.0, nzs, 3, 0);
+                err |= s_assert((size_t)block_index == i);
+                err |= s_assert(fabs(zloc - M.z[i][j]) < FLTOL);
 
-                //FIXME: Resolve the y-direction
-                //if (i == 0) err |= s_assert(iy == 2);
 
-                printf("     - Mxx(%ld), index         = [%d, %d, %d]\n"\
+                if (err > 0 || !run_test)
+                printf("     - %s(%ld), index         = [%d, %d, %d]\n"\
                        "               user(x, y, z) = [%g, %g, %g],\n"\
                        "               int(x, y, z)  = [%g, %g, %g]\n"\
                        "               int x = [%g %g %g ... ]\n"\
                        "               int y = [%g %g %g ... ]\n"\
                        "               int z = [%g %g %g ... ]\n", 
-                                j, 
+                                grid_shift_label(grid2), j, 
                                 ix, iy, iz,
-                                Mxx.xu[i][j], Mxx.yu[i][j], Mxx.zu[i][j], 
-                                Mxx.x[i][j], Mxx.y[i][j], Mxx.z[i][j],
+                                M.xu[i][j], M.yu[i][j], M.zu[i][j], 
+                                M.x[i][j], M.y[i][j], M.z[i][j],
                                 x1[0], x1[1], x1[2], 
                                 y1[0], y1[1], y1[2],
 				z1[0], z1[1], z1[2]);
@@ -152,9 +202,14 @@ int test_sources_dm(const char *inputfile, int rank, int size, const int px)
 
                 }
         }
+        if (run_test)
         err = test_finalize(&test, err);
         
-        sources_finalize();
+        if (grid2 == GRID_U1 || grid2 == GRID_U2 || grid2 == GRID_U3) {
+            receivers_finalize();
+        } else {
+            sources_finalize();
+        }
         grids_finalize(grids);
 
         return test_last_error();
diff --git a/tools/OpenFD/openfd/base/gridfunction.py b/tools/OpenFD/openfd/base/gridfunction.py
index 0d57f69..52745c6 100644
--- a/tools/OpenFD/openfd/base/gridfunction.py
+++ b/tools/OpenFD/openfd/base/gridfunction.py
@@ -1,5 +1,5 @@
 from sympy import Expr, Symbol, Tuple, sympify
-from sympy.core.compatibility import NotIterable, string_types, is_sequence
+from sympy.core.compatibility import NotIterable , is_sequence
 from sympy import preorder_traversal
 from sympy.core.cache import clear_cache
 from sympy.core.mul import Mul
@@ -8,6 +8,8 @@
 from .. dev.macro import Macro
 import openfd
 
+string_types=str
+
 def gridfunctions(label, shape, dtype=None, layout=None, struct=False, 
                   remap=None, macro='_'):
     """
diff --git a/tools/OpenFD/openfd/dev/variable.py b/tools/OpenFD/openfd/dev/variable.py
index cebb5d3..6ba9a7e 100644
--- a/tools/OpenFD/openfd/dev/variable.py
+++ b/tools/OpenFD/openfd/dev/variable.py
@@ -1,8 +1,10 @@
 from ..base import GridFunctionExpression, GridFunction
-from sympy.core.compatibility import NotIterable, string_types
+from sympy.core.compatibility import NotIterable
 from sympy import Expr, Symbol, sympify
 from . types import C
 
+string_types=str
+
 class Variable(Expr, NotIterable):
 
     def __new__(cls, label, val=None, dtype=None, 
diff --git a/tools/kernel_generation/Makefile b/tools/kernel_generation/Makefile
index 0ac2140..1086080 100644
--- a/tools/kernel_generation/Makefile
+++ b/tools/kernel_generation/Makefile
@@ -1,5 +1,5 @@
 ifndef $(${AWP_OPT})
-AWP_OPT=../..
+AWP_OPT=../../
 $(warning "The environment variable AWP_OPT is not set. Set it to the path to export kernels to")
 $(info "Using default value: ${AWP_OPT})
 endif
diff --git a/tools/kernel_generation/mms.py b/tools/kernel_generation/mms.py
index cd4ce35..f215641 100644
--- a/tools/kernel_generation/mms.py
+++ b/tools/kernel_generation/mms.py
@@ -8,16 +8,27 @@
 x = sp.symbols('x y z')
 t = sp.symbols('t')
 
-vx = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2]) 
-vy = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2]) 
-vz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])
-
-sxx = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])   
-syy = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])   
-szz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])   
-sxy = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])   
-sxz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])   
-syz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2])   
+# shifts to remove exact solutions from axis of symmetry
+phi_1 = 1.2
+phi_2 = 0.25
+phi_3 = 0.4
+phi_4 = 0.7
+phi_5 = 0.3
+phi_6 = 0.12
+phi_7 = 0.02
+phi_8 = 0.47
+phi_9 = 0.33
+
+vx = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_1) / k 
+vy = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_2) / k 
+vz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_3) / k
+
+sxx = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_4) / k   
+syy = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_5) / k   
+szz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_6) / k   
+sxy = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_7) / k   
+sxz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_8) / k   
+syz = sp.sin(k*x[0])*sp.sin(k*x[1])*sp.sin(k*x[2] + phi_9) / k   
                       
 v = [vx, vy, vz]
 s = sp.zeros(3)
@@ -36,7 +47,7 @@ def mms(state, field, value):
           "const _prec *properties)"%(state, field))
     print("{")
     print("     _prec k = properties[0];");
-    print("     return %s;" % value)
+    print("     return %s;" % sp.ccode(value))
     print("}")
     print("")
 
diff --git a/tools/kernel_generation/scheme.py b/tools/kernel_generation/scheme.py
index 321b35e..a186522 100644
--- a/tools/kernel_generation/scheme.py
+++ b/tools/kernel_generation/scheme.py
@@ -100,9 +100,9 @@ def velocity(label, buf=0, debug=0, debug_ops=0, use_cartesian=0):
 
     print("Generating velocity kernels: %s. "%label)
 
-    rho1 = fd.Variable('rho1', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'y', 1), 'z', 1)) 
-    rho2 = fd.Variable('rho2', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', 1), 'z', 1)) 
-    rho3 = fd.Variable('rho3', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', 1), 'y', 1)) 
+    rho1 = fd.Variable('rho1', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'y', G.u1[1]), 'z', G.u1[2])) 
+    rho2 = fd.Variable('rho2', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', G.u2[0]), 'z', G.u2[2])) 
+    rho3 = fd.Variable('rho3', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', G.u3[0]), 'y', G.u3[1])) 
 
     # Jacobians
     J1 = F.f_1 * F.g3_c
@@ -232,8 +232,8 @@ def velocity(label, buf=0, debug=0, debug_ops=0, use_cartesian=0):
         rhs_indices = lambda idx : (idx[0], idx[1] + rj0, idx[2] - 6)
         index_bounds = (0,1,0)
     else:
-        lhs_indices = lambda idx : (idx[0], idx[1], idx[2] - 6)
-        rhs_indices = lambda idx : (idx[0], idx[1], idx[2] - 6)
+        lhs_indices = lambda idx : (idx[0], idx[1], idx[2])
+        rhs_indices = lambda idx : (idx[0], idx[1], idx[2])
         index_bounds = (1,1,0)
 
     kernels = kg.make_kernel(label, 
@@ -282,9 +282,9 @@ def stress(label, debug=0, debug_ops=0, use_cartesian=0):
         F.u3 = F.w1
 
     a, nu = sp.symbols('a nu')
-    rho1 = fd.Variable('rho1', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'y', 1), 'z', 1)) 
-    rho2 = fd.Variable('rho2', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', 1), 'z', 1)) 
-    rho3 = fd.Variable('rho3', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', 1), 'y', 1)) 
+    rho1 = fd.Variable('rho1', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'y', G.u1[1]), 'z', G.u1[2])) 
+    rho2 = fd.Variable('rho2', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', G.u2[0]), 'z', G.u2[2])) 
+    rho3 = fd.Variable('rho3', dtype=fd.prec, val=Pavg(Pavg(F.rho, 'x', G.u3[0]), 'y', G.u3[1])) 
 
     Jii = fd.Variable('Jii', dtype=fd.prec, val=F.f_c*F.g3_c)
     J12i = fd.Variable('J12i', dtype=fd.prec, val=F.f*F.g3_c)
diff --git a/tools/write_grid/CMakeLists.txt b/tools/write_grid/CMakeLists.txt
index 704157a..725cfbc 100644
--- a/tools/write_grid/CMakeLists.txt
+++ b/tools/write_grid/CMakeLists.txt
@@ -4,10 +4,11 @@ target_link_libraries(write_grid
         ${MPI_C_LIBRARIES} 
         m
         topography_readers
+        mapping
         )
 
 target_include_directories(write_grid
         PUBLIC
-        ${AWP_MINI_SOURCE_DIR}/include/
+        ${AWP_SOURCE_DIR}/include/
         )
 
diff --git a/tools/write_grid/README.md b/tools/write_grid/README.md
index c3c57f2..6b5a52c 100644
--- a/tools/write_grid/README.md
+++ b/tools/write_grid/README.md
@@ -10,7 +10,7 @@ with the values at the nearest regular grid point.
 ## Usage
 
 ```
-write_grid <input> <output> <property> <mesh> <nx> <ny> <nz> <mz> <h> <px> <py> <mesh_out> <rpt>
+write_grid <input> <output> <property> <mesh> <nx> <ny> <nz> <mz> <h> <hb> <ht> <px> <py> <mesh_out>
 ```
 ---------------------------------------------------------------
 |  Argument       |  Description                                  |
@@ -24,10 +24,11 @@ write_grid <input> <output> <property> <mesh> <nx> <ny> <nz> <mz> <h> <px> <py>
 | nz `int`        |   Number of grid points in the z-direction    |
 | mz `int`        |   Number of grid points in the z-direction of the regular property grid  |
 | h `float`       |   Grid spacing                                |
+| hb `float`      |   Bottom grid spacing                         |
+| ht `float`      |   Top grid spacing                            |
 | px `int`        |   Number of MPI partitions in the x-direction |
 | py `int`        |   Number of MPI partitions in the y-direction |
-| mesh_out `int`  |   Generate mesh output (0: disalbe; 1:enable) |
-| rpt `bool`      |   Write the top layer twice (0: disable; default=1: enable)            |
+| mesh_out `int`  |   Generate mesh output (0: disable; 1:enable) |
 
 See
 [awp-benchmarks](https://github.com/SCECcode/awp-benchmarks/tree/master/tests/topography/write_grid)
@@ -87,7 +88,3 @@ neighbor in the property gird (red, `mz` layers). Note that the grid spacing of
 `nz` curvilinear grids are always larger than or equal to that of the `mz` property
 grids.
 ![](https://i.loli.net/2019/11/06/3XvYondONmFSIzH.png)
-
-### Note
-When `rpt == 1`, both the material properties and grid coordinates at the top layer
-will be repeated twice.
diff --git a/tools/write_grid/write_grid.c b/tools/write_grid/write_grid.c
index 70274f3..7bddebe 100644
--- a/tools/write_grid/write_grid.c
+++ b/tools/write_grid/write_grid.c
@@ -3,9 +3,13 @@
  * that contains the grid coordinates (x_i, y_j, z_k) for each grid point in the
  * curvilinear grid.
  *
+ *
+ * Changelog:
+ *  v.3.0.0  Add DM and nonlinear grid stretching compatibility 
+ *
  */ 
-#define VERSION_MAJOR 2
-#define VERSION_MINOR 1
+#define VERSION_MAJOR 3
+#define VERSION_MINOR 0
 #define VERSION_PATCH 0
 
 #include <stdio.h>
@@ -16,23 +20,27 @@
 
 #include <test/test.h>
 #include <topography/readers/serial_reader.h>
+#include <topography/metrics/metrics.h>
+#include <topography/mapping.h>
 #include <awp/definitions.h>
 
 // Command line arguments
-static int nx;
-static int ny;
-static int nz;
-static int mz;
-static prec h;
-static int px;
-static int py;
-static int mesh_out;
-static int rpt;
+int nx;
+int ny;
+int nz;
+int mz;
+prec h;
+prec hb;
+prec ht;
+int px;
+int py;
+int mesh_out;
+int rpt;
 const char *input;
 const char *output;
 const char *property;
 const char *mesh;
-static int nvars = 3;
+int nvars = 3;
 
 struct Mpi
 {
@@ -47,20 +55,18 @@ struct Mpi
   int coord[2];
   MPI_Comm MCW, MC1;
 };
-
 void mpi_init(struct Mpi *m, int argc, char **argv);
 void mpi_cart(struct Mpi *m, const int *size, const int *part);
 MPI_Datatype data_type(const struct Mpi *mpi, int nz);
-
 int main(int argc, char **argv)
 {
         struct Mpi m;
         mpi_init(&m, argc, argv);
 
-        if (argc < 13 && m.rank == 0) {
+        if (argc < 15 && m.rank == 0) {
                 printf(
                     "usage: %s <input> <output> <prop> <mesh> <nx> "
-                    "<ny> <nz> <mz> <h> <px> <py> <rpt>\n",
+                    "<ny> <nz> <mz> <h> <hb> <ht> <px> <py> <mesh_out>\n",
                     argv[0]);
                 printf("AWP curvilinear grid writer, v%d.%d.%d\n",
                        VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH);
@@ -84,20 +90,21 @@ int main(int argc, char **argv)
                 printf(" mz int         Number of grid points in the "
                     "z-direction of the property grid\n");
                 printf(" h float        Grid spacing\n");
+                printf(" hb float        Bottom grid spacing\n");
+                printf(" ht float        Top grid spacing\n");
                 printf(" px int         Number of MPI partitions in "
                     "the x-direction\n");
                 printf(" py int         Number of MPI partitions in "
                     "the y-direction\n");
                 printf(" mesh_out       Whether to output the mesh "
                     "(1 = True, 0 = False) \n");
-                printf(" rpt int        Whether to repeat top layer when "
-                    "writing (1 = True, 0 = False) \n");
-                printf(" Expect at least %d argc, got %d\n", 13, argc);
+                printf(" Expect at least %d argc, got %d\n", 15, argc);
 
                 MPI_Finalize();
                 return -1;
         }
 
+
         input = argv[1];
         output = argv[2];
         property = argv[3];
@@ -107,10 +114,13 @@ int main(int argc, char **argv)
         nz = atoi(argv[7]);
         mz = atoi(argv[8]);
         h = atof(argv[9]);
-        px = atoi(argv[10]);
-        py = atoi(argv[11]);
-        mesh_out = atoi(argv[12]);
-        rpt = argc < 14 ? 1 : atoi(argv[13]);
+        hb = atof(argv[10]);
+        ht = atof(argv[11]);
+        px = atoi(argv[12]);
+        py = atoi(argv[13]);
+        mesh_out = atoi(argv[14]);
+        rpt = 1;
+
 
         if (m.rank == 0) {
                 printf("AWP curvilinear grid writer, v%d.%d.%d\n",
@@ -118,10 +128,9 @@ int main(int argc, char **argv)
                 printf(
                     "input = %s output = %s property file = %s "
                     "mesh file = %s nx = %d ny = %d nz = %d "
-                    "mz = %d h = %g px = %d py = %d mesh_out = %d "
-                    "rpt = %d\n",
+                    "mz = %d h = %g px = %d py = %d mesh_out = %d\n",
                     input, output, property, mesh, nx, ny, nz, mz, h, px,
-                    py, mesh_out, rpt);
+                    py, mesh_out);
                 int size = nvars * nx * ny * nz * sizeof(prec);
                 printf("Expected file size: %d \n", size);
                 if (rpt > 1 || rpt < 0) {
@@ -143,6 +152,7 @@ int main(int argc, char **argv)
                 }
         }
 
+        fflush(stdout);
 
         int size[3] = {nx, ny, nz};
         int part[2] = {px, py};
@@ -172,7 +182,6 @@ int main(int argc, char **argv)
         err |= topo_read_serial(input, m.rank, px, py, m.coord, m.nxt, m.nyt,
                                 alloc, &f);
 
-
         MPI_Datatype readtype = data_type(&m, nz);
         MPI_Datatype readtype_m = data_type(&m, mz);
         MPI_File     fh, fm, fp;
@@ -183,19 +192,20 @@ int main(int argc, char **argv)
         MPICHK(MPI_File_set_view(fh, 0, MPI_FLOAT, readtype, "native", 
                           MPI_INFO_NULL));
         
-
         int buffer_size = m.nxt * m.nyt * nvars;
-        float *buffer = (float*) calloc(buffer_size * nz, sizeof(float));
+        float *buffer = (float*) calloc(buffer_size, sizeof(float));
         float *prop = (float*) calloc(buffer_size * mz, sizeof(float));
         float *buffer_m = (float*) calloc(buffer_size * nz, sizeof(float));
 
         for (int j = 0; j < m.nyt; ++j) {
         for (int i = 0; i < m.nxt; ++i) {
-                buffer[0 + nvars * i + j * nvars * m.nxt] = i * h;
-                buffer[1 + nvars * i + j * nvars * m.nxt] = j * h;
+                buffer[0 + nvars * i + j * nvars * m.nxt] = (m.coord[0]*m.nxt + i) * h;
+                buffer[1 + nvars * i + j * nvars * m.nxt] = (m.coord[1]*m.nyt + j) * h;
+                //buffer[0 + nvars * i + j * nvars * m.nxt] = i * h;
+                //buffer[1 + nvars * i + j * nvars * m.nxt] = j * h;
         }
         }
-
+        
         if (mesh_out == 1) {
             MPICHK(MPI_File_open(m.MCW, mesh, MPI_MODE_WRONLY | MPI_MODE_CREATE,
                                 MPI_INFO_NULL, &fm));
@@ -212,6 +222,7 @@ int main(int argc, char **argv)
                 printf("%d) ERROR! MPI-IO reading property file set view: %s\n",
                             m.rank, mpiErrStr); 
             }
+
             err = MPI_File_read_all(fp, prop, buffer_size * mz, 
                             MPI_FLOAT, &filestatus);
             if (err != MPI_SUCCESS) {
@@ -221,28 +232,47 @@ int main(int argc, char **argv)
             }
         }
 
+
+
         int show_info = (int) (nz / 10);
         show_info = show_info == 0 ? 1 : show_info;
-        double H = (nz - 1 - rpt) * h;
 
         int len = buffer_size * nz;
         if (m.rank == 0) printf("Processing...\n");
 
+        prec H = map_height(nz, h);
+        struct mapping map = map_init(ht / H, hb / H, h / H);
+
         for (int k = 0; k < nz; ++k) {
             // If k > 0 and we need repeat (rpt == 1), 
             // we shift the domain up by 1
             k0 = k == 0 ? k : k - rpt;  
-            double rk = (double) k0 / (double) (nz - 1 - rpt);
+            // kuniform is 0 at the first point of the overlap zone and negative before it
+            int kuniform = k - (nz - 1) + MAPPING_START_POINT;
+            double rk;
+            if (kuniform >= 0)
+                rk = 1.0;
+            else {
+                rk = map_eval(h * k0 / H, &map);
+            }
+
+            MPI_Barrier(MPI_COMM_WORLD);
             for (int i = 0; i < m.nxt; ++i) {
                 for (int j = 0; j < m.nyt; ++j) {
-                    size_t lmy = 4 + m.nyt + 2 * ngsl + 2 * align;
-                    size_t local_pos = 2 + align + (j + ngsl) +
-                                       (2 + i + ngsl) * lmy;
-                    // Depth, k=0 is the surface
-                    double mapping =
-                        (H + f[local_pos]) * (1 - rk) - H;
-                    buffer[2 + nvars * i + j * nvars * m.nxt] =
+
+                    size_t lmy = 4 + m.nyt + 2 * metrics_padding + 2 * align;
+                    size_t local_pos = 2 + align + (j + metrics_padding) +
+                                       (2 + i + metrics_padding) * lmy;
+                    // Use uniform grid spacing in the DM overlap zone
+                    double mapping;
+                    if (kuniform >= 0) 
+                        mapping = -H - h * kuniform;
+                    else
+                    mapping =
+                        (H + (double)f[local_pos]) * (1.0 - rk) - H;
+                    buffer[2 + nvars * i + j * nvars * m.nxt] = 
                             (prec)mapping;
+
                     if (mesh_out == 1) {           
                         // For reading and mesh writing, we start from the
                         // the surface, to keep compatible with the queried
@@ -256,6 +286,7 @@ int main(int argc, char **argv)
                             MPI_Finalize();
                             return(-1);
                         }
+
                         size_t pos = nvars * (idx_z * m.nxt * m.nyt +
                                             j * m.nxt + i);
                         memcpy(buffer_m + nvars * (k * m.nxt * m.nyt +