Speedup C++ runtime performance using OpenMP.

bing-jian · May 27, 2019 · 36fb490 · 36fb490
1 parent 8ab6701
commit 36fb490
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 32 deletions.
diff --git a/C++/CMakeLists.txt b/C++/CMakeLists.txt
@@ -3,8 +3,20 @@ IF(COMMAND cmake_policy)
     cmake_policy(SET CMP0003 NEW)
 ENDIF(COMMAND cmake_policy)
 
-PROJECT(GMMREG )
+PROJECT(GMMREG)
 SET(CMAKE_CXX_STANDARD 11)
+SET(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Set a default build type if none was specified
+set(default_build_type "Release")
+if(NOT CMAKE_BUILD_TYPE AND NOT WIN32)
+  message(STATUS "Setting build type to '${default_build_type}' as none was specified.")
+  set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE
+      STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
+    "MinSizeRel" "RelWithDebInfo")
+endif()
 
 # The CMake command FIND_PACKAGE(VXL) attempts to find VXL binary
 # installation.  CMake will look in the directory specified by the
@@ -78,6 +90,12 @@ ELSE(WIN32)
   ADD_LIBRARY(port_ini ${INI_LIBRARY_TYPE} port_ini.c)
 ENDIF(WIN32)
 
+find_package(OpenMP)
+if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+endif()
 
 SET(GMMREG_API_SRCS
     gmmreg_api.cpp

diff --git a/C++/gmmreg_transform.cpp b/C++/gmmreg_transform.cpp
@@ -31,15 +31,15 @@ int ThinPlateSplineTransform(const char* f_config) {
   if (LoadMatrixFromTxt(f_scene, scene) < 0) {
     return -1;
   }
-  assert(scene.cols() == d);
+  // assert(scene.cols() == d);
 
   char f_ctrl_pts[256] = {0};
   GetPrivateProfileString(common_section, "ctrl_pts", NULL,
       f_ctrl_pts, 256, f_config);
   if (LoadMatrixFromTxt(f_ctrl_pts, ctrl_pts) < 0) {
     return -1;
   }
-  assert(ctrl_pts.cols() == d);
+  // assert(ctrl_pts.cols() == d);
   int n = ctrl_pts.rows();
 
   char f_source[256] = {0};
@@ -48,7 +48,7 @@ int ThinPlateSplineTransform(const char* f_config) {
   if (LoadMatrixFromTxt(f_source, source) < 0) {
     return -1;
   }
-  assert(source.cols() == d);
+  // assert(source.cols() == d);
   int m = source.rows();
 
   vnl_matrix<double> affine, tps;
@@ -58,16 +58,16 @@ int ThinPlateSplineTransform(const char* f_config) {
   if (LoadMatrixFromTxt(f_init_affine, affine) < 0) {
     return -1;
   }
-  assert(affine.cols() == d);
-  assert(affine.rows() == d+1);
+  // assert(affine.cols() == d);
+  // assert(affine.rows() == d+1);
 
   GetPrivateProfileString(common_section, "init_tps", NULL,
       f_init_tps, 256, f_config);
   if (LoadMatrixFromTxt(f_init_tps, tps) < 0) {
     return -1;
   }
-  assert(tps.cols() == d);
-  assert(tps.rows() == (n - d - 1));
+  // assert(tps.cols() == d);
+  // assert(tps.rows() == (n - d - 1));
 
   vnl_matrix<double> param_all;
   param_all.set_size(n, d);

diff --git a/C++/gmmreg_utils.h b/C++/gmmreg_utils.h
@@ -6,6 +6,7 @@
 
 namespace gmmreg {
 
+
 template<typename T>
 T GaussTransform(const vnl_matrix<T>& A,
                  const vnl_matrix<T>& B,
@@ -18,8 +19,6 @@ T GaussTransform(const vnl_matrix<T>& A,
                  vnl_matrix<T>& gradient);
 
 template<typename T>
-//T GaussTransform(const vnl_matrix<T>& A, const vnl_matrix<T>& B, T scale, T* gradient);
-//T GaussTransform(const T* A, const T* B, int m, int n, int dim, T scale, T* grad);
 void ComputeTPSKernel(const vnl_matrix<T>& model,
                       const vnl_matrix<T>& ctrl_pts,
                       vnl_matrix<T>& U,
@@ -37,8 +36,6 @@ void ComputeSquaredDistanceMatrix(const vnl_matrix<T>& A,
                                   const vnl_matrix<T>& B,
                                   vnl_matrix<T>& D);
 
-//void normalize(const vnl_matrix<T>& x, vnl_vector<T>& centroid, T& scale, vnl_matrix<T>& normalized_x);
-//void denormalize(const vnl_matrix<T>& x, const vnl_vector<T>& centroid, const T scale, vnl_matrix<T>& denormalized_x);
 template<typename T>
 void Normalize(vnl_matrix<T>& x,
                vnl_vector<T>& centroid,
@@ -86,13 +83,16 @@ template<typename T>
 void GaussianAffinityMatrix(const T* A, const T* B,
     int m, int n, int dim, T scale, T* dist) {
   scale = -2.0*SQR(scale);
-  for (int i = 0, k = 0; i < m; ++i) {
-    for (int j = 0; j < n; ++j, ++k) {
+  int k = 0;
+  #pragma omp for
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
       T dist_ij = 0;
       for (int d = 0; d < dim; ++d) {
         dist_ij +=  SQR(A[i * dim + d] - B[j * dim + d]);
       }
       dist[k] = exp(dist_ij / scale);
+      ++k;
     }
   }
 }
@@ -101,11 +101,15 @@ template<typename T>
 T GaussTransform(const T* A, const T* B,
     int m, int n, int dim, T scale, T* grad) {
   T cross_term = 0;
+
+  #pragma omp for
   for (int i = 0; i < m * dim; ++i) {
     grad[i] = 0;
   }
 
   scale = SQR(scale);
+
+  #pragma omp parallel for reduction(+:cross_term)
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
       T dist_ij = 0;
@@ -118,9 +122,10 @@ T GaussTransform(const T* A, const T* B,
       }
       cross_term += cost_ij;
     }
-    /* printf("cross_term = %.3f\n", cross_term);  */
   }
+
   scale *= m * n;
+  #pragma omp for
   for (int i = 0; i < m * dim; ++i) {
     grad[i] /= scale;
   }
@@ -145,7 +150,7 @@ T GaussTransform(const vnl_matrix<T>& A,
       gradient.data_block());
 }
 
-// todo: add one more version when the model is same as ctrl_pts
+// TODO: add one more version when the model is same as ctrl_pts
 // reference:  Landmark-based Image Analysis, Karl Rohr, p195
 template<typename T>
 void ComputeTPSKernel(const vnl_matrix<T>& model,
@@ -161,10 +166,10 @@ void ComputeTPSKernel(const vnl_matrix<T>& model,
   U.fill(0);
   T eps = 1e-006;
 
-  vnl_vector<T> v_ij;
+  #pragma omp for
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
-      v_ij = model.get_row(i) - ctrl_pts.get_row(j);
+      vnl_vector<T> v_ij = model.get_row(i) - ctrl_pts.get_row(j);
       if (d == 2) {
         T r = v_ij.squared_magnitude();
         if (r > eps) {
@@ -176,9 +181,11 @@ void ComputeTPSKernel(const vnl_matrix<T>& model,
       }
     }
   }
+
+  #pragma omp for
   for (int i = 0; i < n; ++i) {
     for (int j = i + 1; j < n; ++j) {
-      v_ij = ctrl_pts.get_row(i) - ctrl_pts.get_row(j);
+      vnl_vector<T> v_ij = ctrl_pts.get_row(i) - ctrl_pts.get_row(j);
       if (d == 2) {
         T r = v_ij.squared_magnitude();
         if (r > eps) {
@@ -191,6 +198,8 @@ void ComputeTPSKernel(const vnl_matrix<T>& model,
       }
     }
   }
+
+  #pragma omp for
   for (int i = 0; i < n; ++i) {
     for (int j = 0; j < i; ++j) {
       K(i, j) = K(j, i);
@@ -242,6 +251,8 @@ void ComputeSquaredDistanceMatrix(const vnl_matrix<T>& A,
   //asssert(A.cols()==B.cols());
   D.set_size(m,n);
   vnl_vector<T> v_ij;
+
+  #pragma omp for
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < n; ++j) {
       v_ij = A.get_row(i) - B.get_row(j);
@@ -259,10 +270,12 @@ void Normalize(vnl_matrix<T>& x,
   centroid.set_size(d);
 
   vnl_vector<T> col;
+  #pragma omp for
   for (int i = 0; i < d; ++i) {
     col = x.get_column(i);
     centroid(i) = col.mean();
   }
+  #pragma omp for
   for (int i = 0; i < n; ++i) {
     x.set_row(i, x.get_row(i) - centroid);
   }
@@ -271,56 +284,57 @@ void Normalize(vnl_matrix<T>& x,
 }
 
 template<typename T>
-void Denormalize(vnl_matrix<T>& x, const vnl_vector<T>& centroid, const T scale) {
+void Denormalize(vnl_matrix<T>& x, const vnl_vector<T>& centroid,
+        const T scale) {
   int n = x.rows();
   if (n==0) return;
   int d = x.cols();
+  #pragma omp for
   for (int i = 0; i < n; ++i) {
     x.set_row(i, x.get_row(i) * scale + centroid);
   }
 }
 
 template<typename T>
-void ComputeP(const vnl_matrix<T>& x,const vnl_matrix<T>& y, vnl_matrix<T>& P, T &E, T sigma, int outliers) {
+void ComputeP(const vnl_matrix<T>& x, const vnl_matrix<T>& y, vnl_matrix<T>& P,
+        T &E, T sigma, int outliers) {
   T k;
   k = -2*sigma*sigma;
 
-  //P.set_size(m,n); P.fill(0);
-  //vnl_vector<T> v_ij;
-
   vnl_vector<T> column_sum;
   int m = x.rows();
   int s = y.rows();
   int d = x.cols();
   column_sum.set_size(s);
   column_sum.fill(0);
-  T outlier_term = outliers*pow((2*sigma*sigma*3.1415926),0.5*d);
+  T outlier_term = outliers * pow((2*sigma*sigma*3.1415926), 0.5*d);
+  #pragma omp for
   for (int i = 0; i < m; ++i) {
     for (int j = 0; j < s; ++j) {
       T r = 0;
       for (int t = 0; t < d; ++t) {
-        r += (x(i,t) - y(j,t))*(x(i,t) - y(j,t));
+        r += (x(i, t) - y(j, t)) * (x(i, t) - y(j, t));
       }
       P(i,j) = exp(r/k);
-      column_sum[j]+=P(i,j);
+      column_sum[j] += P(i,j);
     }
   }
 
 
-  if (outliers!=0) {
+  if (outliers != 0) {
+    #pragma omp for
     for (int i = 0; i < s; ++i)
       column_sum[i] += outlier_term;
   }
-  if (column_sum.min_value()>(1e-12)) {
+  if (column_sum.min_value( )> (1e-12)) {
     E = 0;
+    #pragma omp for
     for (int i = 0; i < s; ++i) {
       for (int j = 0; j < m; ++j){
-        P(j,i) = P(j,i)/column_sum[i];
+        P(j, i) = P(j, i) / column_sum[i];
       }
       E -= log(column_sum[i]);
     }
-    //vcl_cerr< < s;
-    //vcl_cerr<<P.get_column(10);
   }
   else {
     P.empty();