Fix builds with both USE_MPI=ON and USE_GPU=ON: always detect MPI in CMake and repair the #if-guarded if/else result-combining paths

pcdslab · Jul 15, 2023 · 9b3c41c · 9b3c41c
1 parent 8c757fb
commit 9b3c41c
Show file tree

Hide file tree

Showing 7 changed files with 100 additions and 89 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -84,11 +84,8 @@ if(OpenMP_FOUND)
 
 endif()
 
-if (USE_MPI AND NOT USE_GPU)
+if (USE_MPI)
     find_package(MPI)
-else()
-    message(WARNING "USE_MPI and USE_GPU not fully supported together due to filesystem support.")
-    message(STATUS "=> Setting USE_MPI=OFF")
 endif()
 
 if(MPI_FOUND AND USE_MPI)

diff --git a/source/apps/argp/argp.hpp b/source/apps/argp/argp.hpp
@@ -57,7 +57,7 @@ auto getcurrtimeanddate()
 
 auto getcurrpath()
 {
-    // COMPILER VERSION GCC 9.1.0+ required 
+    // COMPILER VERSION GCC 9.1.0+ required
 #if __GNUC__ > 9 || (__GNUC__ == 9 && (__GNUC_MINOR__ >= 1))
     // COMPILER VERSION GCC 9.1.0+ required for std::filesystem calls
     static string_t currpath = std::filesystem::current_path();
@@ -97,7 +97,7 @@ auto sanitize_dM(T &dM)
 //
 // structure to store parsed params
 //
-struct params_t : public argparse::Args 
+struct params_t : public argparse::Args
 {
 
     //
@@ -155,7 +155,7 @@ struct params_t : public argparse::Args
     // base intensity x1000
     int &base_int                        = kwarg("base,base_int", "base noramlized peak intensity for MS/MS data x1000").set_default(1000);
 
-    // MS/MS peak cut off ratio 
+    // MS/MS peak cut off ratio
     double &cutoff                       = kwarg("cutoff_ratio", "cutoff peak ratio wrt base intensity (e.g. 1% = 0.01)").set_default(0.01);
 
     // m/z axis resolution
@@ -172,14 +172,14 @@ struct params_t : public argparse::Args
 
     // LBE distribution policy
 
-    // DistPolicy_t requires magic_enum submodule. 
+    // DistPolicy_t requires magic_enum submodule.
     DistPolicy_t &lbe_policy             = kwarg("policy", "LBE Distribution policy (cyclic, chunk, zigzag)").set_default(DistPolicy_t::cyclic);
 
     // scratch pad memory in MB
     int &bufferMBs                       = kwarg("buff,spad_mem", "buffer (scratch pad) RAM memory in MB (recommended: 2048MB+)").set_default(2048);
 
     // this should be an optional parameter
-    std::optional<std::vector<std::string>> &mods     
+    std::optional<std::vector<std::string>> &mods
                                          = kwarg("m,mods", "list of variable post-translational modifications (PTMs)").multi_argument();
 
     // do not keep the full database index on GPU
@@ -229,7 +229,7 @@ void getParams(gParams &params)
 
 #if !defined(ARGP_ONLY)
 
-    // COMPILER VERSION GCC 9.1.0+ required 
+    // COMPILER VERSION GCC 9.1.0+ required
 #if __GNUC__ > 9 || (__GNUC__ == 9 && (__GNUC_MINOR__ >= 1))
     // COMPILER VERSION GCC 9.1.0+ required for std::filesystem calls
     std::filesystem::create_directory(parser.workspace);
@@ -256,6 +256,7 @@ void getParams(gParams &params)
         params.gputhreads = parser.gputhreads;
 #else
         params.gputhreads = 0;
+        params.useGPU = false;
 #endif // USE_GPU
 
         // Get the min peptide length
@@ -281,7 +282,7 @@ void getParams(gParams &params)
         params.dM = parser.deltaM;
         sanitize_dM(params.dM);
 
-        // Get the min mass 
+        // Get the min mass
         params.min_mass = parser.minprecmass;
 
         // Get the max mass
@@ -325,7 +326,7 @@ void getParams(gParams &params)
             params.vModInfo.num_vars = modslist.size();
             params.modconditions = std::to_string(params.vModInfo.vmods_per_pep);
 
-            // process the strings: AA:MASS.0:NUM 
+            // process the strings: AA:MASS.0:NUM
             for (auto md = 0; md < params.vModInfo.num_vars; md++)
             {
                 // for each mod string
@@ -430,7 +431,7 @@ void printParser()
     /* Get the max fragment charge */
     printVar(parser.maxz);
 
-    // Get the m/z axis resolution 
+    // Get the m/z axis resolution
     printVar(parser.resolution);
 
     // Get the fragment mass tolerance
@@ -439,7 +440,7 @@ void printParser()
     // Get the precursor mass tolerance
     printVar(parser.deltaM);
 
-    // Get the min mass 
+    // Get the min mass
     printVar(parser.minprecmass);
 
     // Get the max mass

diff --git a/source/core/cuda/superstep2/kernel.cu b/source/core/cuda/superstep2/kernel.cu
@@ -45,7 +45,7 @@ const int BATCHSIZE = 20000;
 
 // -------------------------------------------------------------------------------------------- //
 
-namespace hcp 
+namespace hcp
 {
 
 namespace gpu
@@ -288,10 +288,10 @@ status_t ArraySort(spectype_t *intns, spectype_t *mzs, int *lens, int &idx, int
     hcp::gpu::cuda::error_check(hcp::gpu::cuda::H2D(d_lens, lens, count, driver->stream[2]));
 
     // memory for arraynums
-    hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_arraynums, rawsize, driver->stream[2])); 
+    hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_arraynums, rawsize, driver->stream[2]));
 
     // memory for indices
-    hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_indices, rawsize, driver->stream[3])); 
+    hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_indices, rawsize, driver->stream[3]));
 
     // memory for processed intensities
     hcp::gpu::cuda::error_check(hcp::gpu::cuda::device_allocate_async(d_m_intns, QALEN * BATCHSIZE, driver->stream[3]));
@@ -500,7 +500,7 @@ std::array<int, 2> readAndPreprocess(string_t &filename)
 
                         // flush to the binary file
                         MSQuery::flushBinaryFile(&filename, m_mzs, m_intns, rtimes, prec_mz, z, lens, count);
-                        
+
                         count = 0;
                         m_idx = 0;
                         largestspec_loc = 0;

diff --git a/source/core/cuda/superstep3/kernel.cu b/source/core/cuda/superstep3/kernel.cu
@@ -66,7 +66,7 @@ extern gParams params;
 
 // -------------------------------------------------------------------------------------------- //
 
-namespace hcp 
+namespace hcp
 {
 
 namespace gpu
@@ -95,8 +95,8 @@ namespace s3
 
 // -------------------------------------------------------------------------------------------- //
 
-__global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *dQ_idx, int *dQ_minlimits, int *dQ_maxlimits, 
-                        uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP, int maxchunk, double *d_survival, int *d_cpsms, 
+__global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *dQ_idx, int *dQ_minlimits, int *dQ_maxlimits,
+                        uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP, int maxchunk, double *d_survival, int *d_cpsms,
                         dhCell *d_topscore, int dF, int speclen, int maxmass, int scale, short min_shp, int ixx);
 
 // database search kernel host wrapper
@@ -193,7 +193,7 @@ void dQueries<T>::H2D(Queries<T> *rhs)
 {
     auto driver = hcp::gpu::cuda::driver::get_instance();
     int chunksize = rhs->numSpecs;
-    
+
     this->numSpecs = rhs->numSpecs;
     this->numPeaks = rhs->numPeaks;
 
@@ -254,7 +254,7 @@ __host__ status_t initialize()
     static auto h_lgfact = hcp::utils::lgfact<hcp::utils::maxshp>();
 
     // copy to CUDA constant arrays
-    hcp::gpu::cuda::error_check(cudaMemcpyToSymbol(d_lgFact, &h_lgfact.val, sizeof(double_t) * hcp::utils::maxshp)); 
+    hcp::gpu::cuda::error_check(cudaMemcpyToSymbol(d_lgFact, &h_lgfact.val, sizeof(double_t) * hcp::utils::maxshp));
 
     return SLM_SUCCESS;
 
@@ -317,7 +317,7 @@ std::pair<BYC *, int>& getBYC(int chunksize)
 void freeBYC()
 {
     auto driver = hcp::gpu::cuda::driver::get_instance();
-    
+
     auto pBYC = getBYC();
 
     auto d_BYC = std::get<0>(pBYC);
@@ -354,7 +354,7 @@ dScores *& getScorecard()
 void freeScorecard()
 {
     auto driver = hcp::gpu::cuda::driver::get_instance();
-    
+
     auto&& d_Scores = getScorecard();
 
     if (d_Scores)
@@ -369,7 +369,7 @@ void freeScorecard()
 __host__ dQueries<spectype_t> *& getdQueries()
 {
     static thread_local dQueries<spectype_t> *dqueries = nullptr;
-    
+
     if (!dqueries)
         dqueries = new dQueries<spectype_t>();
 
@@ -405,7 +405,7 @@ __host__ status_t search(Queries<spectype_t> *gWorkPtr, Index *index, uint_t idx
     static thread_local auto driver = hcp::gpu::cuda::driver::get_instance();
 
     dIndex *d_Index = nullptr;
-    
+
     if (params.gpuindex)
         d_Index = hcp::gpu::cuda::s1::getdIndex(index);
 
@@ -460,7 +460,7 @@ __host__ status_t search(Queries<spectype_t> *gWorkPtr, Index *index, uint_t idx
                     hcp::gpu::cuda::error_check(hcp::gpu::cuda::H2D(d_mat->iA, iAPtr, iAsize, driver->stream[DATA_STREAM]));
                 }
 
-                // copy the At rows to device 
+                // copy the At rows to device
                 d_mat->bA = hcp::gpu::cuda::s1::getbA();
                 uint_t bAsize = ((uint_t)(params.max_mass * params.scale)) + 1;
 
@@ -480,15 +480,19 @@ __host__ status_t search(Queries<spectype_t> *gWorkPtr, Index *index, uint_t idx
         }
     }
 
-#ifdef USE_MPI
-
     if (params.nodes > 1)
+    {
+#if defined (USE_MPI)
+        static bool once = [](){ std::cout << "WARNING: Experimental support for GPU+MPI only. Problems expected." << std::endl; return true;}();
         status = hcp::gpu::cuda::s4::getIResults(index, gWorkPtr, gpucurrSpecID, CandidatePSMS);
-    else
 #else
+        std::cerr << "ABORT: params.nodes > 1 without MPI. Build with -DUSE_MPI=ON" << std::endl;
+        exit(-1);
+#endif // USE_MPI
+    }
+    else
         // combine the results
         status = hcp::gpu::cuda::s4::processResults(index, gWorkPtr, gpucurrSpecID);
-#endif // USE_MPI
 
     hcp::gpu::cuda::s3::reset_dScores();
 
@@ -650,8 +654,8 @@ __global__ void resetdScores(double *survival, int *cpsms, dhCell *topscore)
 // -------------------------------------------------------------------------------------------- //
 
 __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *dQ_idx, int *dQ_minlimits,
-                         int *dQ_maxlimits, uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP, 
-                         int maxchunk, double *d_survival, int *d_cpsms, dhCell *d_topscore, int dF, 
+                         int *dQ_maxlimits, uint_t* d_bA, uint_t *d_iA, int iter, BYC *bycP,
+                         int maxchunk, double *d_survival, int *d_cpsms, dhCell *d_topscore, int dF,
                          int speclen, int maxmass, int scale, short min_shp, int ixx)
 {
     BYC *bycPtr = &bycP[blockIdx.x * maxchunk];
@@ -701,7 +705,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
             auto data = d_iA + d_bA[bin];
 
             int n = d_bA[bin + 1] - d_bA[bin];
-    
+
             /* If no ions in the bin */
             if (n < 1)
                 continue;
@@ -750,7 +754,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
                 __syncthreads();
 
                 //
-                // reduce the BYC elements to avoid 
+                // reduce the BYC elements to avoid
                 // race conditions and locking
                 //
 
@@ -853,7 +857,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
         ushort_t shpk = bcc + ycc;
 
         // filter by the min shared peaks
-        if (shpk >= min_shp) 
+        if (shpk >= min_shp)
         {
             // get the precomputed log(factorial(x))
 

diff --git a/source/core/dslim_scproc.cpp b/source/core/dslim_scproc.cpp
@@ -109,13 +109,19 @@ status_t DSLIM_DistScoreManager()
 #endif // USE_TIMEMORY
 
             // use GPU if available
-#if defined (USE_GPU)
             if (params.useGPU)
+            {
+#if defined (USE_GPU)
+                static bool once = [](){ std::cout << "WARNING: Experimental support for GPU+MPI only. Problems expected." << std::endl; return true;}();
                 status = ScoreHandle->GPUCombineResults();
-            else
 #else
-            status = ScoreHandle->CombineResults();
+                // should never logically reach here unless maliciously hacked into.
+                std::cerr << "ABORT: params.useGPU=true not available. Build with -DUSE_GPU=ON." << std::endl;
+                exit(-1);
 #endif // USE_GPU
+            }
+            else
+                status = ScoreHandle->CombineResults();
 
 #if defined (USE_TIMEMORY)
             merge_instr.stop();

diff --git a/source/core/ms2prep.cpp b/source/core/ms2prep.cpp
@@ -74,7 +74,7 @@ status_t synchronize()
 
 //
 // FUNCTION: get_instance
-// 
+//
 MSQuery **& get_instance()
 {
     static MSQuery** ptrs = new MSQuery*[queryfiles.size()];
@@ -83,7 +83,7 @@ MSQuery **& get_instance()
 
 //
 // FUNCTION: initialize
-// 
+//
 status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
 {
     status_t status = SLM_SUCCESS;
@@ -126,7 +126,7 @@ status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
             ms2local[0] = params.myid;
 
             // rest of the files in cyclic order
-            std::generate(std::begin(ms2local) + 1, std::end(ms2local), 
+            std::generate(std::begin(ms2local) + 1, std::end(ms2local),
                           [n=params.myid] () mutable { return n += params.nodes; });
 
 #if defined(USE_GPU)
@@ -143,7 +143,7 @@ status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
 
             //
             // lambda function to initialize the MSQuery instances
-            // 
+            //
             auto workerthread = [&](bool gpu)
             {
                 auto loc_fid = 0;
@@ -175,7 +175,7 @@ status_t initialize(lwqueue<MSQuery *>** qfPtrs, int_t& nBatches, int_t& dssize)
                         hcp::gpu::cuda::s2::preprocess(ptrs[loc_fid], queryfiles[loc_fid], loc_fid);
                     else
                         ptrs[loc_fid]->initialize(&queryfiles[loc_fid], loc_fid);
-                
+
                 // archive the index if using MPI
 #ifdef USE_MPI
                     ptrs[loc_fid]->archive(loc_fid);