Enable SSE code for LBFGS (openmm#4327)

* Updated to new version of libLBFGS
* Enable SSE code for LBFGS
erbad · Nov 30, 2023 · ff3432f · ff3432f
1 parent f6e6b6e
commit ff3432f
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 47 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -244,6 +244,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
 ENDFOREACH(subdir)
 IF(X86)
     SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-DHAVE_SSE2=1")
+    SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/lbfgs/src/lbfgs.cpp PROPERTIES COMPILE_FLAGS "-DUSE_SSE=1 -DHAVE_EMMINTRIN_H=1")
 ELSE()
     SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-UHAVE_SSE2")
 ENDIF()

diff --git a/libraries/lbfgs/include/lbfgs.h b/libraries/lbfgs/include/lbfgs.h
@@ -24,7 +24,7 @@
  * THE SOFTWARE.
  */
 
-/* $Id: lbfgs.h 65 2010-01-29 12:19:16Z naoaki $ */
+/* $Id$ */
 
 #ifndef __LBFGS_H__
 #define __LBFGS_H__
@@ -243,7 +243,7 @@ typedef struct {
      *      (f' - f) / f < \ref delta,
      *  where f' is the objective value of \ref past iterations ago, and f is
      *  the objective value of the current iteration.
-     *  The default value is \c 0.
+     *  The default value is \c 1e-5.
      */
     lbfgsfloatval_t delta;
 
@@ -267,7 +267,7 @@ typedef struct {
     /**
      * The maximum number of trials for the line search.
      *  This parameter controls the number of function and gradients evaluations
-     *  per iteration for the line search routine. The default value is \c 20.
+     *  per iteration for the line search routine. The default value is \c 40.
      */
     int             max_linesearch;
 
@@ -313,7 +313,7 @@ typedef struct {
      *  evaluations are inexpensive with respect to the cost of the
      *  iteration (which is sometimes the case when solving very large
      *  problems) it may be advantageous to set this parameter to a small
-     *  value. A typical small value is \c 0.1. This parameter shuold be
+     *  value. A typical small value is \c 0.1. This parameter should be
      *  greater than the \ref ftol parameter (\c 1e-4) and smaller than
      *  \c 1.0.
      */
@@ -525,6 +525,13 @@ lbfgsfloatval_t WINDOWS_EXPORT * lbfgs_malloc(int n);
  */
 void WINDOWS_EXPORT lbfgs_free(lbfgsfloatval_t *x);
 
+/**
+ * Get string description of an lbfgs() return code.
+ *
+ *  @param err          A value returned by lbfgs().
+ */
+const char* lbfgs_strerror(int err);
+
 /** @} */
 
 #ifdef  __cplusplus
@@ -583,7 +590,7 @@ Among the various ports of L-BFGS, this library provides several features:
   The library is thread-safe, which is the secondary gain from the callback
   interface.
 - <b>Cross platform.</b> The source code can be compiled on Microsoft Visual
-  Studio 2005, GNU C Compiler (gcc), etc.
+  Studio 2010, GNU C Compiler (gcc), etc.
 - <b>Configurable precision</b>: A user can choose single-precision (float)
   or double-precision (double) accuracy by changing ::LBFGS_FLOAT macro.
 - <b>SSE/SSE2 optimization</b>:
@@ -597,17 +604,28 @@ This library is used by:
 - <a href="http://www.chokkan.org/software/classias/">Classias: A collection of machine-learning algorithms for classification</a>
 - <a href="http://www.public.iastate.edu/~gdancik/mlegp/">mlegp: an R package for maximum likelihood estimates for Gaussian processes</a>
 - <a href="http://infmath.uibk.ac.at/~matthiasf/imaging2/">imaging2: the imaging2 class library</a>
-- <a href="http://search.cpan.org/~laye/Algorithm-LBFGS-0.16/">Algorithm::LBFGS - Perl extension for L-BFGS</a>
-- <a href="http://www.cs.kuleuven.be/~bernd/yap-lbfgs/">YAP-LBFGS (an interface to call libLBFGS from YAP Prolog)</a>
 
 @section download Download
 
-- <a href="http://www.chokkan.org/software/dist/liblbfgs-1.9.tar.gz">Source code</a>
+- <a href="https://github.com/downloads/chokkan/liblbfgs/liblbfgs-1.10.tar.gz">Source code</a>
+- <a href="https://github.com/chokkan/liblbfgs">GitHub repository</a>
 
 libLBFGS is distributed under the terms of the
 <a href="http://opensource.org/licenses/mit-license.php">MIT license</a>.
 
+@section modules Third-party modules
+- <a href="http://cran.r-project.org/web/packages/lbfgs/index.html">lbfgs: Limited-memory BFGS Optimization (a wrapper for R)</a> maintained by Antonio Coppola.
+- <a href="http://search.cpan.org/~laye/Algorithm-LBFGS-0.16/">Algorithm::LBFGS - Perl extension for L-BFGS</a> maintained by Lei Sun.
+- <a href="http://www.cs.kuleuven.be/~bernd/yap-lbfgs/">YAP-LBFGS (an interface to call libLBFGS from YAP Prolog)</a> maintained by Bernd Gutmann.
+
 @section changelog History
+- Version 1.10 (2010-12-22):
+    - Fixed compiling errors on Mac OS X; this patch was kindly submitted by
+      Nic Schraudolph.
+    - Reduced compiling warnings on Mac OS X; this patch was kindly submitted
+      by Tamas Nepusz.
+    - Replaced memalign() with posix_memalign().
+    - Updated solution and project files for Microsoft Visual Studio 2010.
 - Version 1.9 (2010-01-29):
     - Fixed a mistake in checking the validity of the parameters "ftol" and
       "wolfe"; this was discovered by Kevin S. Van Horn.
@@ -728,6 +746,7 @@ Special thanks go to:
     - Yoshimasa Tsuruoka and Daisuke Okanohara for technical information about
       OWL-QN
     - Takashi Imamichi for the useful enhancements of the backtracking method
+    - Kevin S. Van Horn, Nic Schraudolph, and Tamas Nepusz for bug fixes
 
 Finally I would like to thank the original author, Jorge Nocedal, who has been
 distributing the efficient and explanatory implementation in an open source

diff --git a/libraries/lbfgs/src/arithmetic_ansi.h b/libraries/lbfgs/src/arithmetic_ansi.h
@@ -23,10 +23,10 @@
  * THE SOFTWARE.
  */
 
-/* $Id: arithmetic_ansi.h 65 2010-01-29 12:19:16Z naoaki $ */
+/* $Id$ */
 
 #include <stdlib.h>
-#include <string.h>
+#include <memory.h>
 
 #if     LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
 #define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)

diff --git a/libraries/lbfgs/src/arithmetic_sse_double.h b/libraries/lbfgs/src/arithmetic_sse_double.h
@@ -23,10 +23,12 @@
  * THE SOFTWARE.
  */
 
-/* $Id: arithmetic_sse_double.h 65 2010-01-29 12:19:16Z naoaki $ */
+/* $Id$ */
 
 #include <stdlib.h>
+#ifndef __APPLE__
 #include <malloc.h>
+#endif
 #include <memory.h>
 
 #if     1400 <= _MSC_VER
@@ -39,10 +41,15 @@
 
 inline static void* vecalloc(size_t size)
 {
-#ifdef	_MSC_VER
+#if     defined(_WIN32)
     void *memblock = _aligned_malloc(size, 16);
+#elif   defined(__APPLE__)  /* OS X always aligns on 16-byte boundaries */
+    void *memblock = malloc(size);
 #else
-    void *memblock = memalign(16, size);
+    void *memblock = NULL, *p = NULL;
+    if (posix_memalign(&p, 16, size) == 0) {
+        memblock = p;
+    }
 #endif
     if (memblock != NULL) {
         memset(memblock, 0, size);
@@ -192,7 +199,7 @@ inline static void vecfree(void *memblock)
 
 
 
-#if     3 <= __SSE__
+#if     3 <= __SSE__ || defined(__SSE3__)
 /*
     Horizontal add with haddps SSE3 instruction. The work register (rw)
     is unused.

diff --git a/libraries/lbfgs/src/arithmetic_sse_float.h b/libraries/lbfgs/src/arithmetic_sse_float.h
@@ -23,10 +23,12 @@
  * THE SOFTWARE.
  */
 
-/* $Id: arithmetic_sse_float.h 65 2010-01-29 12:19:16Z naoaki $ */
+/* $Id$ */
 
 #include <stdlib.h>
+#ifndef __APPLE__
 #include <malloc.h>
+#endif
 #include <memory.h>
 
 #if     1400 <= _MSC_VER
@@ -45,7 +47,16 @@
 
 inline static void* vecalloc(size_t size)
 {
+#if     defined(_MSC_VER)
     void *memblock = _aligned_malloc(size, 16);
+#elif   defined(__APPLE__)  /* OS X always aligns on 16-byte boundaries */
+    void *memblock = malloc(size);
+#else
+    void *memblock = NULL, *p = NULL;
+    if (posix_memalign(&p, 16, size) == 0) {
+        memblock = p;
+    }
+#endif
     if (memblock != NULL) {
         memset(memblock, 0, size);
     }
@@ -54,7 +65,11 @@ inline static void* vecalloc(size_t size)
 
 inline static void vecfree(void *memblock)
 {
+#ifdef	_MSC_VER
     _aligned_free(memblock);
+#else
+    free(memblock);
+#endif
 }
 
 #define vecset(x, c, n) \
@@ -185,7 +200,7 @@ inline static void vecfree(void *memblock)
 
 
 
-#if     3 <= __SSE__
+#if     3 <= __SSE__ || defined(__SSE3__)
 /*
     Horizontal add with haddps SSE3 instruction. The work register (rw)
     is unused.