performance optimizations from tuning + roofline

pcdslab · Oct 29, 2022 · 45ccae6 · 45ccae6
1 parent 5fbf4ff
commit 45ccae6
Show file tree

Hide file tree

Showing 10 changed files with 55 additions and 62 deletions.
diff --git a/cmake/Templates/config.hpp.in b/cmake/Templates/config.hpp.in
@@ -67,6 +67,9 @@
 // number of X-axis samples in the histogram
 #define Xsamples                128
 
+// y-axis multiplier for experimental spectra data
+#define YAXISMULTIPLIER         100
+
 // FUTURE:  Only b- and y-ions used in current implementation
 
 // Define the Ion Series and add it to iSERIES

diff --git a/source/apps/argp/argp.hpp b/source/apps/argp/argp.hpp
@@ -300,7 +300,7 @@ void getParams(gParams &params)
         params.min_cpsm = parser.hits;
 
         // Base Intensity x 1000
-        params.base_int = parser.base_int * 1000;
+        params.base_int = parser.base_int * YAXISMULTIPLIER;
 
         // Cutoff intensity ratio (add 0.5 for nearest rounding)
         params.min_int = static_cast<double_t>(params.base_int) * parser.cutoff + 0.5;

diff --git a/source/core/cuda/superstep1/kernel.cu b/source/core/cuda/superstep1/kernel.cu
@@ -730,9 +730,6 @@ __global__ void GenerateFragIonData(uint_t *d_fragIon, pepEntry *d_pepEntry, cha
             Spectrum[idx] = (myVal + PROTONS(myCharge)) * scale / myCharge;
         }
     }
-
-    // do we need this?
-    // __syncthreads();
 }
 
 // -------------------------------------------------------------------------------------------- //

diff --git a/source/core/cuda/superstep2/kernel.cu b/source/core/cuda/superstep2/kernel.cu
@@ -580,7 +580,7 @@ std::array<int, 2> readAndPreprocess(string_t &filename)
                 if constexpr (std::is_same<int, spectype_t>::value)
                 {
                     mzs[m_idx] = std::atof(mz.c_str()) * params.scale;
-                    intns[m_idx] = std::atof(intn.c_str()) * 1000;
+                    intns[m_idx] = std::atof(intn.c_str()) * YAXISMULTIPLIER;
                 }
                 else
                 {

diff --git a/source/core/cuda/superstep3/kernel.cu b/source/core/cuda/superstep3/kernel.cu
@@ -580,19 +580,28 @@ __host__ status_t SearchKernel(spmat_t *mat, Queries<spectype_t> *gWorkPtr, int
     auto d_BYC = std::get<0>(pBYC);
     auto maxchunk = std::get<1>(pBYC);
 
-    // set the shared memory to 48KB
-    cudaFuncSetAttribute(SpSpGEMM, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(48));
+    // block size is 512 for open-search
+    int blocksize = 512;
+
+    // set block size to 320 for closed-search
+    if (params.dM < 50)
+        blocksize = 320;
+
+    // calculate shmembytes needed
+    int shmembytes = std::max((unsigned long)HISTOGRAM_SIZE, (sizeof(BYC) + sizeof(double)) * blocksize);
+
+    // set the shared memory to 16KB
+    cudaFuncSetAttribute(SpSpGEMM, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(16));
 
     for (int iter = 0 ; iter < niters ; iter++)
     {
         int nblocks = itersize;
-        const int blocksize = 320; // block size of 256 or 320 are good for better load factor 
 
         // if last iteration, adjust the number of blocks
         if (iter == niters - 1)
             nblocks = nspectra - iter * itersize;
 
-        hcp::gpu::cuda::s3::SpSpGEMM<<<nblocks, blocksize, KBYTES(32), driver->stream[SEARCH_STREAM]>>>(d_WorkPtr->moz, d_WorkPtr->intensity, d_WorkPtr->idx, d_WorkPtr->minlimits, d_WorkPtr->maxlimits, d_bA, d_iA, iter * itersize, d_BYC, maxchunk, d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, params.dF, speclen, params.max_mass, params.scale, params.min_shp, ixx);
+        hcp::gpu::cuda::s3::SpSpGEMM<<<nblocks, blocksize, shmembytes, driver->stream[SEARCH_STREAM]>>>(d_WorkPtr->moz, d_WorkPtr->intensity, d_WorkPtr->idx, d_WorkPtr->minlimits, d_WorkPtr->maxlimits, d_bA, d_iA, iter * itersize, d_BYC, maxchunk, d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, params.dF, speclen, params.max_mass, params.scale, params.min_shp, ixx);
 
         // synchronize the stream
         driver->stream_sync(SEARCH_STREAM);
@@ -721,26 +730,19 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
                 {
                     uint_t raw = d_iA[ion];
 
-                    /* Calculate parent peptide ID */
-                    int_t ppid = (raw / speclen);
-
-                    /* Calculate the residue */
-                    int_t residue = (raw % speclen);
+                    /* Calculate the residue (raw % speclen) and isY */
+                    short isY = (((raw % speclen)) / halfspeclen);
 
-                    /* Either 0 or 1 */
-                    int isY = residue / halfspeclen;
-                    int isB = 1 - isY;
-
-                    // key - ppid
-                    myKey = ppid;
+                    // key = parent peptide ID
+                    myKey = (raw / speclen);
 
                     // write to keys
                     keys[threadIdx.x] = myKey;
 
                     /* Get the map element */
                     myVal = &vals[threadIdx.x];
-                    myVal->bc = isB;
-                    myVal->ibc = intn * isB;
+                    myVal->bc = (1 - isY);
+                    myVal->ibc = intn * (1 - isY);
                     myVal->yc = isY;
                     myVal->iyc = intn * isY;
                 }
@@ -755,7 +757,7 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
                 // number of active threads
                 int activethds = min(blockDim.x, ends - ion + threadIdx.x + 1);
 
-                int iters = hcp::gpu::cuda::s1::log2ceil(activethds);
+                short iters = hcp::gpu::cuda::s1::log2ceil(activethds);
 
                 // is this thread a part of a localized group (and not group leader)
                 bool isGroup = false;
@@ -853,37 +855,28 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
         // filter by the min shared peaks
         if (shpk >= min_shp) 
         {
-            // Create a heap cell
-            dhCell cell;
-
             // get the precomputed log(factorial(x))
-            double_t h1 = d_lgFact[bcc] + d_lgFact[ycc];
 
-            // Fill in the information
-            cell.hyperscore = h1 + log10f(1 + bycPtr[it].ibc) + log10f(1 + bycPtr[it].iyc) - 6;
+            double_t hyperscore = d_lgFact[bcc] + d_lgFact[ycc] + log10f(1 + bycPtr[it].ibc) + log10f(1 + bycPtr[it].iyc) - 4;
 
             // hyperscore < 0 means either b- or y- ions were not matched
-            if (cell.hyperscore > 0)
+            if (hyperscore > 0)
             {
-                if (cell.hyperscore >= MAX_HYPERSCORE)
-                    cell.hyperscore = MAX_HYPERSCORE - 1;
-
-                cell.idxoffset = ixx;
-                cell.psid = it;
-                cell.sharedions = shpk;
+                if (hyperscore >= MAX_HYPERSCORE)
+                    hyperscore = MAX_HYPERSCORE - 1;
 
                 // increment local candidate psms by +1
                 cpss +=1;
 
                 // Update the histogram
-                atomicAdd(&histogram[(int)(cell.hyperscore * 10 + 0.5)], 1);
+                atomicAdd(&histogram[(int)(hyperscore * 10 + 0.5)], 1);
 
-                if (cell.hyperscore > topscores.hyperscore)
+                if (hyperscore > topscores.hyperscore)
                 {
-                    topscores.hyperscore = cell.hyperscore;
-                    topscores.psid       = cell.psid;
-                    topscores.idxoffset  = cell.idxoffset;
-                    topscores.sharedions = cell.sharedions;
+                    topscores.hyperscore = hyperscore;
+                    topscores.psid       = it;
+                    topscores.idxoffset  = ixx;
+                    topscores.sharedions = shpk;
                 }
             }
         }
@@ -908,23 +901,23 @@ __global__ void SpSpGEMM(spectype_t *dQ_moz, spectype_t *dQ_intensity, uint_t *d
     }
 
     // local candidate psms
-    int cpsms_loc = 0;
+    int l_cpsms = 0;
 
     // sum the local cpsms to get the count
-    hcp::gpu::cuda::s3::blockSum(cpss, cpsms_loc);
+    hcp::gpu::cuda::s3::blockSum(cpss, l_cpsms);
 
     // add to the global memory sum
     if (!threadIdx.x)
-        *cpsms = *cpsms + cpsms_loc;
+        *cpsms = *cpsms + l_cpsms;
 
-    // need this to avoid race conditions. Why?
+    // need this to avoid race conditions
     __syncthreads();
 
     // copy histogram to the global memory
     for (int ii = threadIdx.x; ii < HISTOGRAM_SIZE; ii+=blockDim.x)
         survival[ii] += histogram[ii];
 
-    // need this to avoid race conditions. Why?
+    // need this to avoid race conditions
     __syncthreads();
 
     // reset the bycPtr

diff --git a/source/core/cuda/superstep4/kernel.cu b/source/core/cuda/superstep4/kernel.cu
@@ -533,18 +533,18 @@ __host__ status_t processResults(Index *index, Queries<spectype_t> *gWorkPtr, in
     auto TailFit_ptr = static_cast<void (*)(double *, int *, dhCell *, double *, short)>(&TailFit);
 
     // IMPORTANT: make sure at least 32KB+ shared memory is available to the TailFit kernel
-    cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
+    cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
 
     // the tailfit kernel
-    hcp::gpu::cuda::s4::TailFit<<<numSpecs, blockSize, KBYTES(32), driver->get_stream(SEARCH_STREAM)>>>(d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, d_evalues, (short)params.min_cpsm);
+    hcp::gpu::cuda::s4::TailFit<<<numSpecs, blockSize, KBYTES(24), driver->get_stream(SEARCH_STREAM)>>>(d_Scores->survival, d_Scores->cpsms, d_Scores->topscore, d_evalues, (short)params.min_cpsm);
 
 #else
 
     // IMPORTANT: make sure at least 32KB+ shared memory is available to the logWeibullfit kernel
-    //cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
+    //cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
 
     // the logWeibullfit kernel
-    //hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(32), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
+    //hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(24), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
 
 #endif // TAILFIT
 
@@ -777,16 +777,16 @@ __host__ void processResults(double *h_data, float *h_hyp, int *h_cpsms, double
     auto TailFit_ptr = static_cast<void (*)(double *, float *, int *, double *, short)>(&TailFit);
 
     // IMPORTANT: make sure at least 32KB+ shared memory is available to the TailFit kernel
-    cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
+    cudaFuncSetAttribute(*TailFit_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
 
     // the tailfit kernel
-    hcp::gpu::cuda::s4::TailFit<<<bsize, blockSize, KBYTES(32), driver->get_stream()>>>(d_data, d_hyp, d_cpsms, d_evalues, (short)params.min_cpsm);
+    hcp::gpu::cuda::s4::TailFit<<<bsize, blockSize, KBYTES(24), driver->get_stream()>>>(d_data, d_hyp, d_cpsms, d_evalues, (short)params.min_cpsm);
 #else
     // IMPORTANT: make sure at least 32KB+ shared memory is available to the logWeibullfit kernel
-    //cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(32));
+    //cudaFuncSetAttribute(logWeibullFit, cudaFuncAttributeMaxDynamicSharedMemorySize, KBYTES(24));
 
     // the logWeibullfit kernel
-    //hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(32), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
+    //hcp::gpu::cuda::s4::logWeibullFit<<<numSpecs, blockSize, KBYTES(24), driver->get_stream(SEARCH_STREAM)>>>(d_Scores, d_evalues, min_cpsm);
 #endif // TAILFIT
 
     // D2H

diff --git a/source/core/dslim_query.cpp b/source/core/dslim_query.cpp
@@ -951,7 +951,7 @@ status_t DSLIM_QuerySpectrum(Queries<spectype_t> *ss, Index *index, uint_t idxch
                             double_t h1 = lgfact[bcc] + lgfact[ycc];
 
                             /* Fill in the information */
-                            cell.hyperscore = h1 + log10(1 + bycPtr[it].ibc) + log10(1 + bycPtr[it].iyc) - 6;
+                            cell.hyperscore = h1 + log10(1 + bycPtr[it].ibc) + log10(1 + bycPtr[it].iyc) - 4;
 
                             /* hyperscore < 0 means either b- or y- ions were not matched */
                             if (cell.hyperscore > 0)

diff --git a/source/core/include/cuda/superstep3/kernel.hpp b/source/core/include/cuda/superstep3/kernel.hpp
@@ -24,7 +24,7 @@
 #include "slmerr.h"
 #include "dslim.h"
 
-const int SEARCHINSTANCES = 320;
+const int SEARCHINSTANCES = 512;
 
 const int SEARCH_STREAM = 0;
 const int DATA_STREAM = 1;

diff --git a/source/core/include/slm_dsts.h b/source/core/include/slm_dsts.h
@@ -291,8 +291,8 @@ struct BYC
 {
     short_t      bc; // b ion count
     short_t      yc; // y ion count
-    long_t      ibc; // b ion intensities
-    long_t      iyc; // y ion intensities
+    uint_t      ibc; // b ion intensities
+    uint_t      iyc; // y ion intensities
 };
 
 struct DSLIM_Matrix

diff --git a/source/core/msquery.cpp b/source/core/msquery.cpp
@@ -286,7 +286,7 @@ std::array<int, 2> MSQuery::convertAndprepMS2bin(string *filename)
                 if constexpr (std::is_same<int, spectype_t>::value)
                 {
                     mzs.push_back(std::move(std::atof(mz.c_str()) * params.scale));
-                    intns.push_back(std::move(std::atof(intn.c_str()) * 1000));
+                    intns.push_back(std::move(std::atof(intn.c_str()) * YAXISMULTIPLIER));
                 }
                 else
                 {
@@ -920,7 +920,7 @@ VOID MSQuery::readMS2spectrum()
                 }
 
                 spectrum.mz[speclen] = (uint_t)((double_t)std::atof(mz.c_str()) * params.scale);
-                spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * 1000);
+                spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * YAXISMULTIPLIER);
 
                 speclen++;
             }
@@ -1015,7 +1015,7 @@ VOID MSQuery::readMS2spectrum()
                 }
 
                 spectrum.mz[speclen] = (uint_t)((double_t)std::atof(mz.c_str()) * params.scale);
-                spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * 1000);
+                spectrum.intn[speclen] = (uint_t)((double_t)std::atof(intn.c_str()) * YAXISMULTIPLIER);
 
                 speclen++;
             }