diff --git a/benchmark.c b/benchmark.c
index aee29f344..a49ab6aaf 100644
--- a/benchmark.c
+++ b/benchmark.c
@@ -45,7 +45,9 @@
 # include <omp.h>
 # include "init/init_openmp.h"
 #endif
+#include "git_hash.h"
 #include "gettime.h"
+#include "io/utils.h"
 #include "su3.h"
 #include "su3adj.h"
 #include "ranlxd.h"
@@ -55,6 +57,8 @@
 #include "boundary.h"
 #include "operator/Hopping_Matrix.h"
 #include "operator/Hopping_Matrix_nocom.h"
+#include "operator/Hopping_Matrix_32.h"
+#include "operator/Hopping_Matrix_32_nocom.h"
 #include "operator/tm_operators.h"
 #include "global.h"
 #include "xchange/xchange.h"
@@ -63,6 +67,7 @@
 #include "operator/D_psi.h"
 #include "phmc.h"
 #include "mpi_init.h"
+#include "linalg/assign_to_32.h"
 
 #ifdef PARALLELT
 #  define SLICE (LX*LY*LZ/2)
@@ -82,6 +87,22 @@
 
 int check_xchange();
 
+double benchmark_hopping(const int j_max, const int k_max, double* antioptaway); /* timing loops: return an elapsed gettime() difference and add a checksum to *antioptaway */
+double benchmark_hopping_nocom(const int j_max, const int k_max, double* antioptaway);
+/* the same two timing loops on the g_spinor_field32 fields, with a float checksum */
+double benchmark_hopping_32(const int j_max, const int k_max, float* antioptaway_32);
+double benchmark_hopping_32_nocom(const int j_max, const int k_max, float* antioptaway_32);
+/* reduces the timing dt over all processes and prints the Mflops figures on g_proc_id == 0 */
+void average_and_print(
+  const double dt, const int j_max, const int k_max, const double antioptaway, 
+  const unsigned int precision);
+/* body is compiled only with TM_USE_MPI: prints the communication-free Mflops and a bandwidth estimate */
+void average_and_compute_bandwidth(
+  const double dt, const double dt_nocom, const int j_max, const int k_max, 
+  const double antioptaway, const unsigned int precision);
+/* NOTE(review): no definition of benchmark_Dpsi is visible in this patch, and it takes (k_max, j_max) while the functions above take (j_max, k_max) - confirm */
+double benchmark_Dpsi(const int k_max, const int j_max);
+
 int main(int argc,char *argv[])
 {
   int j,j_max,k,k_max = 1;
@@ -90,16 +111,16 @@ int main(int argc,char *argv[])
 #endif
   int status = 0;
   
-  static double t1,t2,dt,sdt,dts,qdt,sqdt;
+  static double t1,t2,dt,dt2;
   double antioptaway=0.0;
-
-#ifdef TM_USE_MPI
-  static double dt2;
+  float antioptaway_32=0.0;
   
   DUM_DERI = 6;
   DUM_MATRIX = DUM_DERI+8;
   NO_OF_SPINORFIELDS = DUM_MATRIX+2;
-
+  NO_OF_SPINORFIELDS_32 = 6;
+  
+#ifdef TM_USE_MPI
 #  ifdef TM_USE_OMP
   int mpi_thread_provided;
   MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided);
@@ -107,7 +128,6 @@ int main(int argc,char *argv[])
   MPI_Init(&argc, &argv);
 #  endif
   MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id);
-
 #else
   g_proc_id = 0;
 #endif
@@ -126,64 +146,24 @@ int main(int argc,char *argv[])
 
   tmlqcd_mpi_init(argc, argv);
 
-
-  
-  if(g_proc_id==0) {
-#ifdef SSE
-    printf("# The code was compiled with SSE instructions\n");
-#endif
-#ifdef SSE2
-    printf("# The code was compiled with SSE2 instructions\n");
-#endif
-#ifdef SSE3
-    printf("# The code was compiled with SSE3 instructions\n");
-#endif
-#ifdef P4
-    printf("# The code was compiled for Pentium4\n");
-#endif
-#ifdef OPTERON
-    printf("# The code was compiled for AMD Opteron\n");
-#endif
-#ifdef _GAUGE_COPY
-    printf("# The code was compiled with -D_GAUGE_COPY\n");
-#endif
-#ifdef BGL
-    printf("# The code was compiled for Blue Gene/L\n");
-#endif
-#ifdef BGP
-    printf("# The code was compiled for Blue Gene/P\n");
-#endif
-#ifdef _USE_HALFSPINOR
-    printf("# The code was compiled with -D_USE_HALFSPINOR\n");
-#endif    
-#ifdef _USE_SHMEM
-    printf("# The code was compiled with -D_USE_SHMEM\n");
-#  ifdef _PERSISTENT
-    printf("# The code was compiled for persistent MPI calls (halfspinor only)\n");
-#  endif
-#endif
-#ifdef TM_USE_MPI
-#  ifdef _NON_BLOCKING
-    printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n");
-#  endif
-#endif
-    printf("\n");
-    fflush(stdout);
-  }
-  
+  write_first_messages(NULL,"benchmark",git_hash);
   
 #ifdef _GAUGE_COPY
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
+  j += init_gauge_field_32(VOLUMEPLUSRAND, 1);
 #else
   init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+  j += init_gauge_field_32(VOLUMEPLUSRAND, 0);  
 #endif
   init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
 
   if(even_odd_flag) {
     j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max+1);
+    j += init_spinor_field_32(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS_32);
   }
   else {
     j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max);
+    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);
   }
 
   if ( j!= 0) {
@@ -222,13 +202,11 @@ int main(int argc,char *argv[])
     fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n");
     exit(0);
   }
-  if(g_sloppy_precision_flag == 1) {
-    g_sloppy_precision = 1;
-    j = init_dirac_halfspinor32();
-    if ( j!= 0) {
-      fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n");
-      exit(0);
-    }
+  j = init_dirac_halfspinor32();
+  if (j != 0)
+  {
+    fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
+    exit(-1);
   }
 #  if (defined _PERSISTENT)
   init_xchange_halffield();
@@ -251,131 +229,52 @@ int main(int argc,char *argv[])
   /*For parallelization: exchange the gaugefield */
   xchange_gauge(g_gauge_field);
 #endif
+  convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND);
 
   if(even_odd_flag) {
-    sdt=0.; sqdt=0.0;
     /*initialize the pseudo-fermion fields*/
     for (k = 0; k < k_max; k++) {
       random_spinor_field_eo(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS);
+      assign_to_32(g_spinor_field32[k], g_spinor_field[k], VOLUME/2);
     }
     
-    j_max=512;
-    antioptaway=0.0;
-    /* compute approximately how many applications we need to do to get a reliable measurement */
-#ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-    t1 = gettime();
-    for (j=0;j<j_max;j++) {
-      for (k=0;k<k_max;k++) {
-        Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
-        Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
-        antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
-      }
-    }
-    dt = gettime()-t1;
+    // estimate reasonable number of iterations to do to get good averages
+    j_max=128;
+    dt = benchmark_hopping(j_max, k_max, &antioptaway);
     // division by g_nproc because we will average over processes
     j = (int)(ceil(j_max*31.0/dt/g_nproc));
 #ifdef TM_USE_MPI
-    MPI_Allreduce(&j,&j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(&j, &j_max, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 #else
     j_max = j;
 #endif
 
-
-
-    /* perform the actual benchmark */
+    // do the actual benchmark
+    antioptaway = 0.0;
+    dt = benchmark_hopping(j_max, k_max, &antioptaway);
+    average_and_print(dt, j_max, k_max, antioptaway, 64);
 #ifdef TM_USE_MPI
-    MPI_Barrier(MPI_COMM_WORLD);
-#endif
-    t1 = gettime();
+    /* isolated computation */
     antioptaway=0.0;
-    for (j=0;j<j_max;j++) {
-      for (k=0;k<k_max;k++) {
-        Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]);
-        Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
-        antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0);
-      }
-    }
-    dt = gettime()-t1;
-#ifdef TM_USE_MPI
-    MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-#else
-    sdt = dt;
+    dt2 = benchmark_hopping_nocom(j_max, k_max, &antioptaway);
+    average_and_compute_bandwidth(dt, dt2, j_max, k_max, antioptaway, 64);
 #endif
     
-    qdt=dt*dt;
-#ifdef TM_USE_MPI
-    MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-#else
-    sqdt = qdt;
-#endif
-
-    sdt=sdt/((double)g_nproc);
-    sqdt=sqrt(sqdt/g_nproc-sdt*sdt);
-     
-    dts=dt;
-    sdt=1.0e6f*sdt/((double)(k_max*j_max*(VOLUME)));
-    sqdt=1.0e6f*sqdt/((double)(k_max*j_max*(VOLUME)));
-    
-    if(g_proc_id==0) {
-      printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
-      printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n", sdt, sqdt, j_max);
-#ifdef TM_USE_MPI
-      printf("# Communication switched on: \n");
-#endif
-      printf("\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/sdt),(int)(1608.0f/sdt));
-#ifdef TM_USE_OMP
-      printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*sdt)));
-#endif
-      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
-      fflush(stdout);
-    }
-    
+    // and now in single precision
+    antioptaway_32 = 0.0;
+    dt = benchmark_hopping_32(j_max, k_max, &antioptaway_32);
+    average_and_print(dt, j_max, k_max, (double)antioptaway_32, 32);
 #ifdef TM_USE_MPI
     /* isolated computation */
-    t1 = gettime();
-    antioptaway=0.0;
-    for (j=0;j<j_max;j++) {
-      for (k=0;k<k_max;k++) {
-        Hopping_Matrix_nocom(0, g_spinor_field[k+k_max], g_spinor_field[k]);
-        Hopping_Matrix_nocom(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]);
-        antioptaway += creal(g_spinor_field[2*k_max][0].s0.c0);
-      }
-    }
-    t2 = gettime();
-    dt2 = t2-t1;
-    /* compute the bandwidth */
-    dt=dts-dt2;
-    MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    sdt=sdt/((double)g_nproc);
-    MPI_Allreduce (&dt2, &dt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    dt=dt/((double)g_nproc);
-    dt=1.0e6f*dt/((double)(k_max*j_max*(VOLUME)));
-    if(g_proc_id==0) {
-      printf("# The following result is printed just to make sure that the calculation is not optimized away: %e\n",antioptaway);
-      printf("# Communication switched off: \n\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/dt),(int)(1608.0f/dt));
-#ifdef TM_USE_OMP
-      printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*dt)));
-#endif
-      printf(" [ %d bit arithmetic ]\n\n",(int)(sizeof(spinor)/3)); 
-      fflush(stdout);
-    }
-    sdt=sdt/((double)k_max);
-    sdt=sdt/((double)j_max);
-    sdt=sdt/((double)(2*SLICE));
-    if(g_proc_id==0) {
-      printf("# The size of the package is %d bytes.\n",(SLICE)*192);
-#ifdef _USE_HALFSPINOR
-      printf("# The bandwidth is %5.2f + %5.2f MB/sec\n", 192./sdt/1024/1024, 192./sdt/1024./1024);
-#else
-      printf("# The bandwidth is %5.2f + %5.2f MB/sec\n", 2.*192./sdt/1024/1024, 2.*192./sdt/1024./1024);
-#endif
-    }
+    antioptaway_32=0.0;
+    dt2 = benchmark_hopping_32_nocom(j_max, k_max, &antioptaway_32);
+    average_and_compute_bandwidth(dt, dt2, j_max, k_max, (double)antioptaway_32, 32);
 #endif
+    
     fflush(stdout);
   }
   else {
+    double sdt, qdt, dts, sqdt;
     /* the non even/odd case now */
     /*initialize the pseudo-fermion fields*/
     j_max=128;
@@ -473,3 +372,136 @@ int main(int argc,char *argv[])
 #endif
   return(0);
 }
+
+double benchmark_hopping(const int j_max, const int k_max, double* antioptaway){ // returns the elapsed gettime() difference over j_max*k_max pairs of Hopping_Matrix calls
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD); // all processes reach this point before the clock is read
+#endif
+  double t1 = gettime();
+  for (int j=0;j<j_max;j++) {
+    for (int k=0;k<k_max;k++) {
+      Hopping_Matrix(0, g_spinor_field[k+k_max], g_spinor_field[k]); // presumably writes field k+k_max from field k - confirm argument order in Hopping_Matrix.h
+      Hopping_Matrix(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]); // every k uses the same field 2*k_max as second argument
+      *antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0); // added to the caller's value (never reset here) so the work has an observable result
+    }
+  }
+  return(gettime()-t1);
+}
+
+double benchmark_hopping_nocom(const int j_max, const int k_max, double* antioptaway){ // same loop as benchmark_hopping, but calling Hopping_Matrix_nocom; returns the elapsed gettime() difference
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD); // all processes reach this point before the clock is read
+#endif
+  double t1 = gettime();
+  for (int j=0;j<j_max;j++) {
+    for (int k=0;k<k_max;k++) {
+      Hopping_Matrix_nocom(0, g_spinor_field[k+k_max], g_spinor_field[k]); // presumably the variant without halo exchange (the caller labels this timing "isolated computation") - confirm
+      Hopping_Matrix_nocom(1, g_spinor_field[2*k_max], g_spinor_field[k+k_max]); // every k uses the same field 2*k_max as second argument
+      *antioptaway+=creal(g_spinor_field[2*k_max][0].s0.c0); // added to the caller's value (never reset here) so the work has an observable result
+    }
+  }
+  return(gettime()-t1);
+}
+
+double benchmark_hopping_32(const int j_max, const int k_max, float* antioptaway_32){ // times j_max*k_max pairs of Hopping_Matrix_32 calls on the g_spinor_field32 fields; returns the elapsed gettime() difference
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD); // all processes reach this point before the clock is read
+#endif
+  double t1 = gettime();
+  for (int j=0;j<j_max;j++) {
+    for (int k=0;k<k_max;k++) {
+      Hopping_Matrix_32(0, g_spinor_field32[k+k_max], g_spinor_field32[k]); // presumably writes field k+k_max from field k - confirm argument order in Hopping_Matrix_32.h
+      Hopping_Matrix_32(1, g_spinor_field32[2*k_max], g_spinor_field32[k+k_max]); // every k uses the same field 2*k_max as second argument
+      *antioptaway_32+=crealf(g_spinor_field32[2*k_max][0].s0.c0); // float accumulator; added to the caller's value (never reset here)
+    }
+  }
+  return(gettime()-t1);
+}
+
+double benchmark_hopping_32_nocom(const int j_max, const int k_max, float* antioptaway_32){ // same loop as benchmark_hopping_32, but calling Hopping_Matrix_32_nocom; returns the elapsed gettime() difference
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD); // all processes reach this point before the clock is read
+#endif
+  double t1 = gettime();
+  for (int j=0;j<j_max;j++) {
+    for (int k=0;k<k_max;k++) {
+      Hopping_Matrix_32_nocom(0, g_spinor_field32[k+k_max], g_spinor_field32[k]); // presumably the variant without halo exchange (the caller labels this timing "isolated computation") - confirm
+      Hopping_Matrix_32_nocom(1, g_spinor_field32[2*k_max], g_spinor_field32[k+k_max]); // every k uses the same field 2*k_max as second argument
+      *antioptaway_32+=crealf(g_spinor_field32[2*k_max][0].s0.c0); // float accumulator; added to the caller's value (never reset here)
+    }
+  }
+  return(gettime()-t1);
+}
+
+/* Reduce the timing dt over all processes and print, on g_proc_id == 0, the checksum antioptaway, the normalised mean time with its spread, and the Mflops figures labelled with `precision` bits. */
+void average_and_print(const double dt, const int j_max, const int k_max, const double antioptaway, const unsigned int precision){
+  double sdt, qdt, sqdt; /* sdt: sum, then mean of dt; sqdt: sum of dt*dt, then spread of dt */
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&dt, &sdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  sdt = dt;
+#endif
+  /* second moment of the per-process timings */
+  qdt=dt*dt;
+#ifdef TM_USE_MPI
+  MPI_Allreduce (&qdt, &sqdt, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+#else
+  sqdt = qdt;
+#endif
+  /* <dt^2> - <dt>^2 can round to a tiny negative number; clamp it so sqrt() cannot return NaN */
+  sdt=sdt/((double)g_nproc);
+  sqdt=sqrt(fmax(0.0, sqdt/g_nproc-sdt*sdt));
+  /* scale by 1.0e6 and divide by the amount of work; the product is formed in double because
+     k_max*j_max*VOLUME evaluated in int can exceed INT_MAX (signed overflow is undefined) */
+  sdt=1.0e6f*sdt/((double)k_max*(double)j_max*(double)(VOLUME));
+  sqdt=1.0e6f*sqdt/((double)k_max*(double)j_max*(double)(VOLUME));
+  
+  if(g_proc_id==0) {
+    printf("# The following result is just to make sure that the calculation is not optimized away: %e\n", antioptaway);
+    printf("# Total compute time %e sec, variance of the time %e sec. (%d iterations).\n", sdt, sqdt, j_max); /* NOTE(review): both values are the scaled per-work-item numbers from above, not raw seconds */
+#ifdef TM_USE_MPI
+    printf("# Communication switched on: \n");
+#endif
+    printf("\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/sdt),(int)(1608.0f/sdt)); /* 1608: presumably the flop count per work item - confirm */
+#ifdef TM_USE_OMP
+    printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*sdt)));
+#endif
+    printf(" [ %u bit arithmetic ]\n\n", precision); 
+    fflush(stdout);
+  }
+}
+
+/* MPI builds only (no-op otherwise): from the timings with (dt) and without (dt_nocom) communication, print on g_proc_id == 0 the communication-free Mflops figures and a bandwidth estimate. */
+void average_and_compute_bandwidth(const double dt, const double dt_nocom, const int j_max, const int k_max, const double antioptaway, const unsigned int precision){
+#ifdef TM_USE_MPI
+  double dt_diff, sdt_diff, sdt_nocom;
+  /* time attributed to communication: run with exchange minus run without, averaged over processes */
+  dt_diff=dt-dt_nocom;
+  MPI_Allreduce (&dt_diff, &sdt_diff, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  sdt_diff=sdt_diff/((double)g_nproc);
+  /* the work count is formed in double: k_max*j_max*VOLUME evaluated in int can exceed INT_MAX (undefined behaviour) */
+  MPI_Allreduce (&dt_nocom, &sdt_nocom, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  sdt_nocom=sdt_nocom/((double)g_nproc);
+  sdt_nocom=1.0e6f*sdt_nocom/((double)k_max*(double)j_max*(double)(VOLUME));
+  if(g_proc_id==0) {
+    printf("# The following result is printed just to make sure that the calculation is not optimized away: %e\n",antioptaway);
+    printf("# Communication switched off: \n\n%12d Mflops(total) %8d Mflops(process)", (int)(g_nproc*1608.0f/sdt_nocom),(int)(1608.0f/sdt_nocom));
+#ifdef TM_USE_OMP
+    printf(" %8d Mflops(thread)",(int)(1608.0f/(omp_num_threads*sdt_nocom)));
+#endif
+    printf(" [ %u bit arithmetic ]\n\n", precision); 
+    fflush(stdout);
+  }
+  /* communication time per k, per iteration and per site of the 2*SLICE boundary sites (same division order as before) */
+  sdt_diff=sdt_diff/((double)k_max)/((double)j_max)/((double)(2*SLICE));
+  if(g_proc_id==0) {
+    double bytes_per_site = (double)precision*24.0/8; /* 24 numbers of `precision` bits each, i.e. 192 bytes at 64 bit */
+    printf("# The size of the package is %f bytes.\n", (SLICE)*bytes_per_site );
+#ifdef _USE_HALFSPINOR
+    printf("# The bandwidth is %5.2f + %5.2f MB/sec\n", bytes_per_site/sdt_diff/1024/1024, bytes_per_site/sdt_diff/1024./1024);
+#else
+    printf("# The bandwidth is %5.2f + %5.2f MB/sec\n", 2.*bytes_per_site/sdt_diff/1024/1024, 2.*bytes_per_site/sdt_diff/1024./1024);
+#endif
+  }
+#endif //TM_USE_MPI
+}
diff --git a/invert.c b/invert.c
index 847e84a17..b0cb4f9a7 100644
--- a/invert.c
+++ b/invert.c
@@ -221,11 +221,11 @@ int main(int argc, char *argv[])
   }
   if (even_odd_flag) {
     j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
-    j += init_spinor_field_32(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS_32);   
+    j += init_spinor_field_32(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS_32);
   }
   else {
     j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
-    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);   
+    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);
   }
   if (j != 0) {
     fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
diff --git a/io/utils_write_first_message.c b/io/utils_write_first_message.c
index d357d7d07..8dd9c473e 100644
--- a/io/utils_write_first_message.c
+++ b/io/utils_write_first_message.c
@@ -22,139 +22,124 @@
 #include "utils.ih"
 #include <read_input.h>
 
+/* On the process with g_proc_id == 0 only: write msg verbatim to stdout and, when parameterfile is non-NULL, to that file as well. */
+void print_fprint(FILE* parameterfile, const char * const msg){
+  if(g_proc_id != 0) return;
+  fputs(msg, stdout); /* msg is data, not a format string: a '%' inside it (e.g. from git_hash) must not be interpreted */
+  if(parameterfile != NULL) fputs(msg, parameterfile);
+}
+
 int write_first_messages(FILE * parameterfile, char const * const executable, char const * const git_hash) {
   char message[1024];
-  snprintf(message, 1024, "This is the %s code for twisted mass Wilson QCD\n\nVersion %s, commit %s\n",executable,PACKAGE_VERSION,git_hash);
-  printf("%s",message);
-  fprintf(parameterfile,"%s",message);
-
+  snprintf(message, 1023, "This is the %s code for twisted mass Wilson QCD\n\nVersion %s, commit %s\n",executable,PACKAGE_VERSION,git_hash);
+  print_fprint(parameterfile, message);
+  
 #ifdef SSE
-  printf("# The code is compiled with SSE instructions\n");
-  fprintf(parameterfile, 
-	  "# The code is compiled with SSE instructions\n");
+  snprintf(message, 1023, "# The code is compiled with SSE instructions\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef SSE2
-  printf("# The code is compiled with SSE2 instructions\n");
-  fprintf(parameterfile, 
-	  "# The code is compiled with SSE2 instructions\n");
+  snprintf(message, 1023, "# The code is compiled with SSE2 instructions\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef SSE3
-  printf("# The code is compiled with SSE3 instructions\n");
-  fprintf(parameterfile, 
-	  "# The code is compiled with SSE3 instructions\n");
+  snprintf(message, 1023, "# The code is compiled with SSE3 instructions\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef P4
-  printf("# The code is compiled for Pentium4\n");
-  fprintf(parameterfile, 
-	  "# The code is compiled for Pentium4\n");
+  snprintf(message, 1023, "# The code is compiled for Pentium4\n");
+  print_fprint(parameterfile, message);
 #endif
 #if (defined BGL && !defined BGP)
-  printf("# The code is compiled for Blue Gene/L\n");
-  fprintf(parameterfile, 
-	  "# The code is compiled for Blue Gene/L\n");
+  snprintf(message, 1023, "# The code is compiled for Blue Gene/L\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef BGP
-  printf("# The code is compiled for Blue Gene/P\n");
-  fprintf(parameterfile,
-          "# The code is compiled for Blue Gene/P\n");
+  snprintf(message, 1023, "# The code is compiled for Blue Gene/P\n");
+  print_fprint(parameterfile, message);
 #endif
 #if (defined BGQ && defined XLC)
-  printf("# The code is compiled with QPX intrinsics for Blue Gene/Q\n");
-  fprintf(parameterfile,
-          "# The code is compiled with QPX intrinsics for Blue Gene/Q\n");
+  snprintf(message, 1023, "# The code is compiled for Blue Gene/Q\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef SPI
-  printf("# Compiled with BG/Q SPI communication\n");
-  fprintf(parameterfile,
-	  "# Compiled with IBM Blue Gene/Q SPI communication\n");
+  snprintf(message, 1023, "# The code is compiled with Blue Gene/Q SPI communication\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef OPTERON
-  printf("# The code is compiled for AMD Opteron\n");
-  fprintf(parameterfile,
-	  "# The code is compiled for AMD Opteron\n");
+  snprintf(message, 1023, "# The code is compiled for AMD Opteron\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef _GAUGE_COPY
-  printf("# The code is compiled with -D_GAUGE_COPY\n");
-  fprintf(parameterfile,
-	  "# The code is compiled with -D_GAUGE_COPY\n");
+  snprintf(message, 1023, "# The code is compiled with -D_GAUGE_COPY\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef _USE_HALFSPINOR
-  printf("# The code is compiled with -D_USE_HALFSPINOR\n");
-  fprintf(parameterfile,
-	  "# The code is compiled with -D_USE_HALFSPINOR\n");
+  snprintf(message, 1023, "# the code is compiled with -D_USE_HALFSPINOR\n");
+  print_fprint(parameterfile, message);
 #endif
 #ifdef _USE_SHMEM
-  printf("# the code is compiled with -D_USE_SHMEM\n");
-  fprintf(parameterfile,
-         "# the code is compiled with -D_USE_SHMEM\n");
+  snprintf(message, 1023, "# the code is compiled with -D_USE_SHMEM\n");
+  print_fprint(parameterfile, message);
 #  ifdef _PERSISTENT
-  printf("# the code is compiled for persistent MPI calls (halfspinor only)\n");
-  fprintf(parameterfile,
-         "# the code is compiled for persistent MPI calls (halfspinor only)\n");
+  snprintf(message, 1023, "# the code is compiled for persistent MPI calls (halfspinor only)\n");
+  print_fprint(parameterfile, message);
 #  endif
 #endif
 #ifdef TM_USE_MPI
 #  ifdef _NON_BLOCKING
-  printf("# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
-  fprintf(parameterfile,
-         "# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
+  snprintf(message, 1023, "# the code is compiled for non-blocking MPI calls (spinor and gauge)\n");
+  print_fprint(parameterfile, message);
 #  endif
 #  ifdef HAVE_LIBLEMON
-  printf("# the code is compiled with MPI IO / Lemon\n");
-  fprintf(parameterfile,
-	  "# the code is compiled with MPI IO / Lemon\n");
+  snprintf(message, 1023, "# the code is compiled with MPI IO / Lemon\n");
+  print_fprint(parameterfile, message);
 #  endif
 #endif
 #ifdef TM_USE_OMP
-  printf("# the code is compiled with openMP support\n");
-  fprintf(parameterfile,
-          "# the code is compiled with openMP support\n");
+  snprintf(message, 1023, "# the code is compiled with OpenMP support\n");
+  print_fprint(parameterfile, message);
 #endif
   if( bc_flag == 0 ) {
-    printf("# Periodic boundary conditions are used\n");
-    fprintf(parameterfile, "# Periodic boundary conditions are used\n");
+    snprintf(message, 1023, "# Periodic boundary conditions are used\n");
+    print_fprint(parameterfile, message);
   }
   if( bc_flag == 1 ) {
-    printf("# Schroedinger Functional boundary conditions are used\n");
-    fprintf(parameterfile, "# Schroedinger Functional boundary conditions are used\n");
+    snprintf(message, 1023, "# Schroedinger Functional boundary conditions are used\n");
+    print_fprint(parameterfile, message);
   }
-  printf("# The lattice size is %d x %d x %d x %d\n",
+  snprintf(message, 1023, "# The lattice size is %d x %d x %d x %d\n",
 	 (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(LZ*g_nproc_z));
-  printf("# The local lattice size is %d x %d x %d x %d\n", 
+  print_fprint(parameterfile, message);
+  
+  snprintf(message, 1023, "# The local lattice size is %d x %d x %d x %d\n", 
       (int)(T), (int)(LX), (int)(LY),(int) LZ);
+  print_fprint(parameterfile, message);
+  
+  
   if(even_odd_flag) {
-    printf("# Even/odd preconditioning is used\n");
-    fprintf(parameterfile, "# Even/odd preconditioning is used\n");
+    snprintf(message, 1023, "# Even/odd preconditioning is used\n");
+    print_fprint(parameterfile, message);
   }
   else {
-    printf("# Even/odd preconditioning is not used\n");
-    fprintf(parameterfile, "# Even/odd preconditioning is not used\n");
-  }
-  printf("# beta = %.12f , kappa= %.12f\n", g_beta, g_kappa);
-  printf("# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3);
-  if( strcmp(executable,"hmc") == 0 ) {
-    printf("# mu = %.12f\n", g_mu/2./g_kappa);
-    printf("# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
-    printf("# Using %s precision for the inversions!\n", 
-	   g_relative_precision_flag ? "relative" : "absolute");
+    snprintf(message, 1023, "# Even/odd preconditioning is not used\n");
+    print_fprint(parameterfile, message);
   }
-  fprintf(parameterfile, "# The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), 
-	  (int)(g_nproc_y*LY), (int)(g_nproc_z*LZ));
-  fprintf(parameterfile, "# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ));
-  fprintf(parameterfile, "# g_beta = %.12f , g_kappa= %.12f, c_sw = %.12f \n",g_beta,g_kappa,g_c_sw);
-  fprintf(parameterfile, "# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3);
+  snprintf(message, 1023, "# Using %s precision for the inversions!\n", 
+	         g_relative_precision_flag ? "relative" : "absolute");
+  print_fprint(parameterfile, message);
+
+  snprintf(message, 1023, "# beta = %.12f , kappa= %.12f, mu= %.12f\n", g_beta, g_kappa, g_mu/2/g_kappa);
+  print_fprint(parameterfile, message);
+
+  snprintf(message, 1023, "# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3);
+  print_fprint(parameterfile, message);
+
   if( strcmp(executable,"hmc") == 0 ) {
-    fprintf(parameterfile, "# Nmeas=%d, Nsave=%d \n",
-	    Nmeas,Nsave);
-    fprintf(parameterfile, "# mu = %.12f\n", g_mu/2./g_kappa);
-    fprintf(parameterfile, "# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
-    fprintf(parameterfile, "# Using %s precision for the inversions!\n", 
-	    g_relative_precision_flag ? "relative" : "absolute");
-  }
-  if( strcmp(executable,"invert") == 0 ) {
-    printf("# beta = %.12f, mu = %.12f, kappa = %.12f\n", g_beta, g_mu/2./g_kappa, g_kappa);
-    fprintf(parameterfile,
-	    "# beta = %.12f, mu = %.12f, kappa = %.12f\n", g_beta, g_mu/2./g_kappa, g_kappa);
+    snprintf(message, 1023, "# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1);
+    print_fprint(parameterfile, message);
+    snprintf(message, 1023, "# Nmeas=%d, Nsave=%d \n", Nmeas,Nsave);
+    print_fprint(parameterfile, message);
   }
   fflush(stdout); fflush(parameterfile);
   return(0);
diff --git a/operator/Hopping_Matrix_32_nocom.h b/operator/Hopping_Matrix_32_nocom.h
new file mode 100644
index 000000000..1642f2df4
--- /dev/null
+++ b/operator/Hopping_Matrix_32_nocom.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _HOPPING_MATRIX_32_NOCOM_H
+#define _HOPPING_MATRIX_32_NOCOM_H
+
+#include "su3.h"
+/* NOTE(review): by its name the spinor32 hopping matrix without communication; benchmark.c calls it with ieo 0 or 1 - presumably l is the result and k the source, confirm against the implementation */
+void Hopping_Matrix_32_nocom(const int ieo, spinor32* const l, spinor32* const k);
+
+#endif